In [31]:
from textblob import TextBlob
from sklearn import model_selection,preprocessing,linear_model,naive_bayes,metrics
from sklearn.feature_extraction.text import TfidfVectorizer,CountVectorizer
from sklearn import decomposition,ensemble

import pandas,xgboost,numpy,textblob,string
from keras.preprocessing import text,sequence
from keras import layers,models,optimizers

from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score


import warnings
warnings.filterwarnings("ignore")

In [32]:
import pandas as pd
import numpy as np

data=pd.read_excel("dataset.xls")
data=data.dropna()

In [33]:
df=pd.DataFrame()
df["text"]=data["ACIKLAMA"]
df["label"]=data["ILISKILI_BIRIM"]

In [34]:
# Camel case Pascal case Transformation:
# Büyük harf - Küçük harf Dönüşümü:
df["text"]=df["text"].apply(lambda x: " ".join(x.lower() for x in x.split()))

In [35]:
# Cleaning Punctuation:
# Noktalama İşaretlerini Silme:
df["text"]=df["text"].str.replace("[^\w\s]","")

In [36]:
# Cleaning numbers:
# Sayıları silme:
df["text"]=df["text"].str.replace("\d","")

In [37]:
# Stemming:
# Kelimeleri köklerine indirgeme:
from trnlp import TrnlpWord

obj=TrnlpWord()

def stemming(text):
    words=[]
    for i in text.split():
        obj.setword(i)
        words.append(obj.get_stem)
    result=" ".join(words)
    return result
df["text"]=df["text"].apply(stemming)

In [38]:
# STOPWORDS:
# Etkisiz Kelimeler:
import nltk
from nltk.corpus import stopwords
sw=stopwords.words("turkish")
df["text"]=df["text"].apply(lambda x: " ".join(x for x in x.split() if x not in sw))

In [39]:
# Deletion of less used word:
# Seyrek kelimelerin silinmesi:
sil=pd.Series(" ".join(df["text"]).split()).value_counts()[-1000:]
df["text"]=df["text"].apply(lambda x: " ".join(x for x in x.split() if x not in sil))

In [40]:
# LEMMATIZATION:
from textblob import Word
df["text"]=df["text"].apply(lambda x: " ".join([Word(word).lemmatize() for word in x.split()]))

In [41]:
train_x,test_x,train_y,test_y=model_selection.train_test_split(df["text"],
                                                              df["label"],
                                                              random_state=42)

In [42]:
train_x.shape

(1494,)

In [43]:
train_y.shape

(1494,)

In [44]:
encoder=preprocessing.LabelEncoder()

In [45]:
train_y=encoder.fit_transform(train_y)
test_y=encoder.fit_transform(test_y)

In [46]:
test_y[0:20]

array([2, 1, 3, 0, 2, 2, 2, 2, 2, 1, 1, 2, 2, 2, 2, 2, 0, 0, 2, 0])

# CHARACTERS LEVEL TF - IDF

In [47]:
tf_idf_char_vectorizer=TfidfVectorizer(analyzer="char",ngram_range=(1,3))
tf_idf_char_vectorizer.fit(train_x)

TfidfVectorizer(analyzer='char', ngram_range=(1, 3))

In [48]:
x_train_tf_idf_char=tf_idf_char_vectorizer.transform(train_x)
x_test_tf_idf_char=tf_idf_char_vectorizer.transform(test_x)

## ML: LOGISTIC REGRESSION

In [49]:
loj=linear_model.LogisticRegression()
loj_model=loj.fit(x_train_tf_idf_char,train_x)
accuracy=model_selection.cross_val_score(loj_model,
                                        x_test_tf_idf_char,
                                        test_y,
                                        cv=10).mean()
print("Char TF IDF Doğruluk Oranı: ",accuracy)

Char TF IDF Doğruluk Oranı:  0.7254285714285713


## ML: NAIVE BAYES

In [50]:
nb=naive_bayes.MultinomialNB()
nb_model=nb.fit(x_train_tf_idf_char,train_y)
accuracy=model_selection.cross_val_score(nb_model,
                                        x_test_tf_idf_char,
                                        test_y,
                                        cv=10).mean()
print("Char TF IDF Doğruluk Oranı: ",accuracy)

Char TF IDF Doğruluk Oranı:  0.6973877551020408


## ML: RANDOM FOREST

In [51]:
rf=ensemble.RandomForestClassifier()
rf_model=rf.fit(x_train_tf_idf_char,train_y)
accuracy=model_selection.cross_val_score(rf_model,
                                        x_test_tf_idf_char,
                                        test_y,
                                        cv=10).mean()
print("Char TF IDF Doğruluk Oranı: ",accuracy)

Char TF IDF Doğruluk Oranı:  0.7595102040816326


## ML: XGBoost CLASSIFIER

In [52]:
xgb=xgboost.XGBClassifier()
xgb_model=xgb.fit(x_train_tf_idf_char,train_y)
accuracy=model_selection.cross_val_score(xgb_model,
                                        x_test_tf_idf_char,
                                        test_y,
                                        cv=10).mean()
print("Char TF IDF Doğruluk Oranı: ",accuracy)

Char TF IDF Doğruluk Oranı:  0.7575102040816326


## ML: SVM

In [53]:
svm=SVC()
svm_model=svm.fit(x_train_tf_idf_char,train_y)
accuracy=model_selection.cross_val_score(svm_model,
                                        x_test_tf_idf_char,
                                        test_y,
                                        cv=10).mean()
print("Char TF IDF Doğruluk Oranı: ",accuracy)

Char TF IDF Doğruluk Oranı:  0.7375102040816326


## ML: KNN

In [54]:
knn=KNeighborsClassifier()
knn_model=knn.fit(x_train_tf_idf_char,train_y)
accuracy=model_selection.cross_val_score(knn_model,
                                        x_test_tf_idf_char,
                                        test_y,
                                        cv=10).mean()
print("Char TF IDF Doğruluk Oranı: ",accuracy)

Char TF IDF Doğruluk Oranı:  0.7455510204081632


###### Logistic Ragression: 0.7254285714285713

###### Naive Bayes: 0.6973877551020408

# Random Forest: 0.7595102040816326
    
# XGBoost: 0.7575102040816326
    
###### SVM: 0.7375102040816326
    
###### KNN: 0.7455510204081632