In [33]:
from textblob import TextBlob
from sklearn import model_selection,preprocessing,linear_model,naive_bayes,metrics
from sklearn.feature_extraction.text import TfidfVectorizer,CountVectorizer
from sklearn import decomposition,ensemble

import pandas,xgboost,numpy,textblob,string
from keras.preprocessing import text,sequence
from keras import layers,models,optimizers

from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score


import warnings
warnings.filterwarnings("ignore")

In [34]:
import pandas as pd
import numpy as np

data=pd.read_excel("dataset.xls")
data=data.dropna()


In [35]:
df=pd.DataFrame()
df["text"]=data["ACIKLAMA"]
df["label"]=data["ILISKILI_BIRIM"]

In [36]:
# Camel case Pascal case Transformation:
# Büyük harf - Küçük harf Dönüşümü:
df["text"]=df["text"].apply(lambda x: " ".join(x.lower() for x in x.split()))

In [37]:
# Cleaning Punctuation:
# Noktalama İşaretlerini Silme:
df["text"]=df["text"].str.replace("[^\w\s]","")

In [38]:
# Cleaning numbers:
# Sayıları silme:
df["text"]=df["text"].str.replace("\d","")

In [39]:
# STEMMING
# Kelimeleri köklerine indirgeme:
from trnlp import TrnlpWord

obj=TrnlpWord()

def stemming(text):
    words=[]
    for i in text.split():
        obj.setword(i)
        words.append(obj.get_stem)
    result=" ".join(words)
    return result
df["text"]=df["text"].apply(stemming)

In [40]:
# STOPWORDS:
# Etkisiz Kelimeler:
import nltk
from nltk.corpus import stopwords
sw=stopwords.words("turkish")
df["text"]=df["text"].apply(lambda x: " ".join(x for x in x.split() if x not in sw))

In [41]:
# Deletion of less used word:
# Seyrek kelimelerin silinmesi:
sil=pd.Series(" ".join(df["text"]).split()).value_counts()[-1000:]
df["text"]=df["text"].apply(lambda x: " ".join(x for x in x.split() if x not in sil))

In [42]:
# LEMMATIZATION:
from textblob import Word
df["text"]=df["text"].apply(lambda x: " ".join([Word(word).lemmatize() for word in x.split() ]))

In [43]:
train_x,test_x,train_y,test_y=model_selection.train_test_split(df["text"],
                                                              df["label"],
                                                              random_state=42)

In [44]:
encoder=preprocessing.LabelEncoder()

In [45]:
train_y=encoder.fit_transform(train_y)
test_y=encoder.fit_transform(test_y)

In [46]:
type(train_y)

numpy.ndarray

# WORD LEVEL TF - IDF

In [47]:
tf_idf_word_vectorizer=TfidfVectorizer()
tf_idf_word_vectorizer.fit(train_x)

TfidfVectorizer()

In [48]:
x_train_tf_idf_word=tf_idf_word_vectorizer.transform(train_x)
x_test_tf_idf_word=tf_idf_word_vectorizer.transform(test_x)

## ML: LOGISTIC REGRESSION

In [50]:
loj=linear_model.LogisticRegression()
loj_model=loj.fit(x_train_tf_idf_word,train_y)
accuracy=model_selection.cross_val_score(loj_model,
                                        x_test_tf_idf_word,
                                        test_y,
                                        cv=10).mean()
print("Word Level TF IDF Doğruluk Oranı: ",accuracy)

Word Level TF IDF Doğruluk Oranı:  0.7315102040816326


## ML: NAIVE BAYES

In [51]:
nb=naive_bayes.MultinomialNB()
nb_model=nb.fit(x_train_tf_idf_word,train_y)
accuracy=model_selection.cross_val_score(nb_model,
                                        x_test_tf_idf_word,
                                        test_y,
                                        cv=10).mean()
print("Word Level TF IDF Doğruluk Oranı: ",accuracy)

Word Level TF IDF Doğruluk Oranı:  0.6973877551020408


## ML: RANDOM FOREST

In [52]:
rf=ensemble.RandomForestClassifier()
rf_model=rf.fit(x_train_tf_idf_word,train_y)
accuracy=model_selection.cross_val_score(rf_model,
                                        x_test_tf_idf_word,
                                        test_y,
                                        cv=10).mean()
print("Word Level TF IDF Doğruluk Oranı: ",accuracy)

Word Level TF IDF Doğruluk Oranı:  0.7495102040816326


### ML: XGBoost

In [53]:
xgb=xgboost.XGBClassifier()
xgb_model=xgb.fit(x_train_tf_idf_word,train_y)
accuracy=model_selection.cross_val_score(xgb_model,
                                        x_test_tf_idf_word,
                                        test_y,
                                        cv=10).mean()
print("Word Level TF IDF Doğruluk Oranı: ",accuracy)

Word Level TF IDF Doğruluk Oranı:  0.765469387755102


### ML: SVM

In [54]:
svm=SVC()
svm_model=svm.fit(x_train_tf_idf_word,train_y)
accuracy=model_selection.cross_val_score(svm_model,
                                        x_test_tf_idf_word,
                                        test_y,
                                        cv=10).mean()
print("Word Level TF IDF Doğruluk Oranı: ",accuracy)

Word Level TF IDF Doğruluk Oranı:  0.7235102040816326


### ML: KNN

In [55]:
knn=KNeighborsClassifier()
knn_model=knn.fit(x_train_tf_idf_word,train_y)
accuracy=model_selection.cross_val_score(knn_model,
                                        x_test_tf_idf_word,
                                        test_y,
                                        cv=10).mean()
print("Word Level TF IDF Doğruluk Oranı: ",accuracy)

Word Level TF IDF Doğruluk Oranı:  0.7636326530612245


###### Logistic Regression: 0.7315102040816326

###### Naive Bayes: 0.6973877551020408

###### Random Forest: 0.7495102040816326

# XGBoost: 0.765469387755102

###### SVM: 0.7235102040816326

# KNN: 0.7636326530612245