In [46]:
from textblob import TextBlob
from sklearn import model_selection,preprocessing,linear_model,naive_bayes,metrics
from sklearn.feature_extraction.text import TfidfVectorizer,CountVectorizer
from sklearn import decomposition,ensemble

import pandas,xgboost,numpy,textblob,string
from keras.preprocessing import text,sequence
from keras import layers,models,optimizers

from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score


import warnings
warnings.filterwarnings("ignore")

In [47]:
import pandas as pd
import numpy as np

data=pd.read_excel("dataset.xls")
data=data.dropna()

In [48]:
df=pd.DataFrame()
df["text"]=data["ACIKLAMA"]
df["label"]=data["ILISKILI_BIRIM"]

In [49]:
# Camel case Pascal case Transformation:
# Büyük harf - Küçük harf Dönüşümü:
df["text"]=df["text"].apply(lambda x: " ".join(x.lower() for x in x.split()))

In [50]:
# Cleaning Punctuation:
# Noktalama İşaretlerini Silme:
df["text"]=df["text"].str.replace("[^\w\s]","")

In [51]:
# Cleaning numbers:
# Sayıları silme:
df["text"]=df["text"].str.replace("\d","")

In [52]:
# Stemming:
# Kelimeleri köklerine indirgeme:
from trnlp import TrnlpWord

obj=TrnlpWord()

def stemming(text):
    words=[]
    for i in text.split():
        obj.setword(i)
        words.append(obj.get_stem)
    result=" ".join(words)
    return result
df["text"]=df["text"].apply(stemming)

In [53]:
# STOPWORDS:
# Etkisiz Kelimeler:
import nltk
from nltk.corpus import stopwords
sw=stopwords.words("turkish")
df["text"]=df["text"].apply(lambda x: " ".join(x for x in x.split() if x not in sw))

In [54]:
# Deletion of less used word:
# Seyrek kelimelerin silinmesi:
sil=pd.Series(" ".join(df["text"]).split()).value_counts()[-1000:]
df["text"]=df["text"].apply(lambda x: " ".join(x for x in x.split() if x not in sil))

In [55]:
# LEMMATIZATION:
from textblob import Word
df["text"]=df["text"].apply(lambda x: " ".join([Word(word).lemmatize() for word in x.split() ]))

In [56]:
train_x,test_x,train_y,test_y=model_selection.train_test_split(df["text"],
                                                              df["label"],
                                                              random_state=42)

In [57]:
encoder=preprocessing.LabelEncoder()

In [58]:
train_y=encoder.fit_transform(train_y)
test_y=encoder.fit_transform(test_y)

In [59]:
test_y[0:20]

array([2, 1, 3, 0, 2, 2, 2, 2, 2, 1, 1, 2, 2, 2, 2, 2, 0, 0, 2, 0])

# N - GRAM LEVEL TF - IDF

In [60]:
tf_idf_ngram_vectorizer=TfidfVectorizer(ngram_range=(1,3))
tf_idf_ngram_vectorizer.fit(train_x)

TfidfVectorizer(ngram_range=(1, 3))

In [61]:
x_train_tf_idf_ngram=tf_idf_ngram_vectorizer.transform(train_x)
x_test_tf_idf_ngram=tf_idf_ngram_vectorizer.transform(test_x)

### ML : NAIVE BAYES

In [63]:
nb=naive_bayes.MultinomialNB()
nb_model=nb.fit(x_train_tf_idf_ngram,train_y)
accuracy=model_selection.cross_val_score(nb_model,
                                        x_test_tf_idf_ngram,
                                        test_y,
                                        cv=10).mean()
print("Word Level TF IDF Doğruluk Oranı: ",accuracy)

Word Level TF IDF Doğruluk Oranı:  0.6973877551020408


### ML: RANDOM FOREST

In [64]:
rf=ensemble.RandomForestClassifier()
rf_model=rf.fit(x_train_tf_idf_ngram,train_y)
accuracy=model_selection.cross_val_score(rf_model,
                                        x_test_tf_idf_ngram,
                                        test_y,
                                        cv=10).mean()
print("N-Gram TF IDF Doğruluk Oranı: ",accuracy)

N-Gram TF IDF Doğruluk Oranı:  0.715469387755102


### ML: XGBoost

In [65]:
xgb=xgboost.XGBClassifier()
xgb_model=xgb.fit(x_train_tf_idf_ngram,train_y)
accuracy=model_selection.cross_val_score(xgb_model,
                                        x_test_tf_idf_ngram,
                                        test_y,
                                        cv=10).mean()
print("N-Gram TF IDF Doğruluk Oranı: ",accuracy)

N-Gram TF IDF Doğruluk Oranı:  0.7674285714285715


### ML: SVM

In [66]:
svm=SVC()
svm_model=svm.fit(x_train_tf_idf_ngram,train_y)
accuracy=model_selection.cross_val_score(svm_model,
                                        x_test_tf_idf_ngram,
                                        test_y,cv=10).mean()
print("N-Gram TF IDF Doğruluk Oranı: ",accuracy)

N-Gram TF IDF Doğruluk Oranı:  0.7114693877551019


### ML: KNN

In [67]:
knn=KNeighborsClassifier()
knn_model=knn.fit(x_train_tf_idf_ngram,train_y)
accuracy=model_selection.cross_val_score(knn_model,
                                        x_test_tf_idf_ngram,
                                        test_y,
                                        cv=10).mean()
print("N-Gram TF IDF Doğruluk Oranı: ",accuracy)

N-Gram TF IDF Doğruluk Oranı:  0.7855918367346939


# ngram_range=(2,3)

## Naive Bayes: 0.7033877551020409

###### Random Forest: 0.6274693877551021

###### XGBoost: 0.6933469387755101

## SVM: 0.7094693877551019

###### KNN: 0.5232653061224489

# ngram_range=(1,3)

###### Naive Bayes: 0.6973877551020408

###### Random Forest: 0.715469387755102

## XGBoost: 0.7674285714285715

###### SVM: 0.7114693877551019

## KNN: 0.7855918367346939