In [3]:
from textblob import TextBlob
from sklearn import model_selection,preprocessing,linear_model,naive_bayes,metrics
from sklearn.feature_extraction.text import TfidfVectorizer,CountVectorizer
from sklearn import decomposition,ensemble

import pandas,xgboost,numpy,textblob,string
from keras.preprocessing import text,sequence
from keras import layers,models,optimizers

import warnings
# Silence every warning for the rest of the session.
# NOTE(review): a blanket "ignore" also hides deprecation and convergence
# warnings raised by the libraries used below — confirm that is intended.
warnings.filterwarnings("ignore")

In [4]:
import pandas as pd
import numpy as np

# Read the spreadsheet and discard rows that contain missing values.
data = (
    pd.read_excel("dataset.xls")
    .dropna()
)

In [5]:
# Working frame with just the two columns the pipeline uses:
# the free-text description ("text") and the class to predict ("label").
df = pd.DataFrame({
    "text": data["ACIKLAMA"],
    "label": data["ILISKILI_BIRIM"],
})

In [6]:
# Lowercase conversion. Splitting and re-joining also collapses every run of
# whitespace into a single space.
# NOTE(review): str.lower() is not Turkish-aware ("I" becomes "i", not "ı") —
# confirm this is acceptable for the dataset.
df["text"] = df["text"].apply(lambda sentence: " ".join(sentence.lower().split()))

In [7]:
# Remove punctuation: delete every character that is neither a word character
# nor whitespace.
# regex=True is passed explicitly because pandas 2.0 changed the default of
# Series.str.replace to a literal match, which would leave the text untouched.
# The raw string keeps "\w" / "\s" from being read as (invalid) string escapes.
df["text"] = df["text"].str.replace(r"[^\w\s]", "", regex=True)

In [8]:
# Remove numbers: delete every digit character.
# regex=True is passed explicitly because pandas 2.0 changed the default of
# Series.str.replace to a literal match, which would only remove the literal
# two-character text "\d". The raw string avoids an invalid string escape.
df["text"] = df["text"].str.replace(r"\d", "", regex=True)

In [9]:
# Stemming: reduce each word to its stem.
from trnlp import TrnlpWord

# Single analyzer instance, used by stemming() for every word.
obj=TrnlpWord()

def stemming(text):
    """Return `text` with every whitespace-separated token replaced by its stem.

    Each token is passed to the module-level analyzer ``obj`` via ``setword``
    and the analyzer's ``get_stem`` value is read immediately afterwards.
    The stems are joined back together with single spaces.
    """
    def stem_of(token):
        # Same order as a plain loop would use: set the word, then read its stem.
        obj.setword(token)
        return obj.get_stem

    return " ".join(stem_of(token) for token in text.split())
# Run the stemmer over every document in the text column.
df["text"]=df["text"].apply(stemming)

In [10]:
# Stop-word removal: drop every token that appears in NLTK's Turkish
# stop-word list.
import nltk
from nltk.corpus import stopwords
sw = stopwords.words("turkish")

# Membership is tested once per token of every document, so look tokens up in
# a set instead of scanning the list each time. `sw` itself stays a list.
stop_word_set = frozenset(sw)
df["text"] = df["text"].apply(
    lambda x: " ".join(token for token in x.split() if token not in stop_word_set)
)

In [11]:
# Rare-word removal: count every token across the whole corpus, take the last
# 1000 entries of the frequency table and strip those words from each document.
word_counts = pd.Series(" ".join(df["text"]).split()).value_counts()
sil = word_counts[-1000:]

# The words are the index labels of `sil`; collect them for the membership test.
rare_words = set(sil.index)
df["text"] = df["text"].apply(
    lambda x: " ".join(token for token in x.split() if token not in rare_words)
)

In [12]:
# Lemmatization: replace every token with the lemma TextBlob returns for it.
# NOTE(review): TextBlob's lemmatizer appears to be English-oriented, while the
# rest of this pipeline uses Turkish resources — confirm it has the intended effect.
from textblob import Word
df["text"] = df["text"].apply(
    lambda sentence: " ".join(Word(token).lemmatize() for token in sentence.split())
)

In [13]:
# Split texts and labels into train and test parts; the fixed seed keeps the
# split identical from run to run.
features, labels = df["text"], df["label"]
train_x, test_x, train_y, test_y = model_selection.train_test_split(
    features, labels, random_state=42
)

In [14]:
# Encoder used below to turn the class labels into integer codes.
encoder=preprocessing.LabelEncoder()

In [15]:
# Learn the label -> integer mapping from the training labels, then reuse that
# same mapping for the test labels. Calling fit_transform on the test labels
# would refit the encoder, so the same class could get different codes in the
# two splits whenever their sets of classes differ.
# transform() raises ValueError if the test split contains a label that never
# occurs in the training split.
train_y = encoder.fit_transform(train_y)
test_y = encoder.transform(test_y)

In [16]:
# First 20 encoded training labels.
train_y[:20]

array([2, 2, 2, 0, 2, 2, 2, 1, 2, 1, 1, 0, 0, 2, 2, 2, 1, 2, 2, 0])

In [17]:
# First 20 encoded test labels.
test_y[:20]

array([2, 1, 3, 0, 2, 2, 2, 2, 2, 1, 1, 2, 2, 2, 2, 2, 0, 0, 2, 0])

# COUNT VECTORS

In [18]:
# Build the count vectorizer and fit it on the training texts only; the test
# texts are not seen at this stage.
vectorizer=CountVectorizer()
vectorizer.fit(train_x)

CountVectorizer()

In [19]:
# Turn both splits into count-vector matrices with the already-fitted vectorizer.
x_train_count, x_test_count = (
    vectorizer.transform(texts) for texts in (train_x, test_x)
)

### ML MODEL: LOGISTIC REGRESSION

In [21]:
# Logistic regression fitted on the count-vector training features.
loj = linear_model.LogisticRegression()
loj_model = loj.fit(x_train_count, train_y)

# NOTE(review): per scikit-learn's documentation, cross_val_score refits a
# clone of the estimator on folds of the data it is given, so this score is a
# 10-fold CV over the test split and does not use the fit above — confirm
# that this is the intended evaluation.
fold_scores = model_selection.cross_val_score(loj_model, x_test_count, test_y, cv=10)
accuracy = fold_scores.mean()
print("Count Vectors Doğruluk Oranı: ", accuracy)

Count Vectors Doğruluk Oranı:  0.7855102040816326


### ML MODEL: NAIVE BAYES

In [22]:
# Multinomial naive Bayes fitted on the count-vector training features.
nb = naive_bayes.MultinomialNB()
nb_model = nb.fit(x_train_count, train_y)

# NOTE(review): per scikit-learn's documentation, cross_val_score refits a
# clone of the estimator on folds of the data it is given, so this score is a
# 10-fold CV over the test split and does not use the fit above — confirm
# that this is the intended evaluation.
fold_scores = model_selection.cross_val_score(nb_model, x_test_count, test_y, cv=10)
accuracy = fold_scores.mean()
print("Count Vectors Doğruluk Oranı: ", accuracy)

Count Vectors Doğruluk Oranı:  0.7454693877551021


### ML MODEL: RANDOM FOREST

In [23]:
# Random forest fitted on the count-vector training features.
# random_state is pinned because the forest is built with randomness; without
# a seed the reported accuracy changes from run to run.
rf = ensemble.RandomForestClassifier(random_state=42)
rf_model = rf.fit(x_train_count, train_y)

# NOTE(review): per scikit-learn's documentation, cross_val_score refits a
# clone of the estimator on folds of the data it is given, so this score is a
# 10-fold CV over the test split and does not use the fit above — confirm
# that this is the intended evaluation.
accuracy = model_selection.cross_val_score(rf_model,
                                           x_test_count,
                                           test_y,
                                           cv=10).mean()
print("Count Vectors Doğruluk Oranı: ", accuracy)

Count Vectors Doğruluk Oranı:  0.7415102040816326


### ML MODEL: XGBoost

In [24]:
# XGBoost classifier fitted on the count-vector training features.
xgb = xgboost.XGBClassifier()
xgb_model = xgb.fit(x_train_count, train_y)

# NOTE(review): per scikit-learn's documentation, cross_val_score refits a
# clone of the estimator on folds of the data it is given, so this score is a
# 10-fold CV over the test split and does not use the fit above — confirm
# that this is the intended evaluation.
fold_scores = model_selection.cross_val_score(xgb_model, x_test_count, test_y, cv=10)
accuracy = fold_scores.mean()
print("Count Vectors Doğruluk Oranı:", accuracy)

Count Vectors Doğruluk Oranı: 0.773469387755102


### ML MODEL: SVM

In [25]:
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score

In [26]:
# Support vector classifier fitted on the count-vector training features.
svm = SVC()
svm_model = svm.fit(x_train_count, train_y)

# NOTE(review): per scikit-learn's documentation, cross_val_score refits a
# clone of the estimator on folds of the data it is given, so this score is a
# 10-fold CV over the test split and does not use the fit above — confirm
# that this is the intended evaluation.
fold_scores = model_selection.cross_val_score(svm_model, x_test_count, test_y, cv=10)
accuracy = fold_scores.mean()
print("Count Vectors Doğruluk Oranı:", accuracy)

Count Vectors Doğruluk Oranı: 0.7174693877551019


### ML MODEL: KNN

In [27]:
from sklearn.neighbors import KNeighborsClassifier

In [28]:
# k-nearest-neighbours classifier fitted on the count-vector training features.
knn = KNeighborsClassifier()
knn_model = knn.fit(x_train_count, train_y)

# NOTE(review): per scikit-learn's documentation, cross_val_score refits a
# clone of the estimator on folds of the data it is given, so this score is a
# 10-fold CV over the test split and does not use the fit above — confirm
# that this is the intended evaluation.
fold_scores = model_selection.cross_val_score(knn_model, x_test_count, test_y, cv=10)
accuracy = fold_scores.mean()
print("Count Vectors Doğruluk Oranı:", accuracy)

Count Vectors Doğruluk Oranı: 0.7034285714285715


# Logistic Regression : 0.7855102040816326 

###### Naive Bayes: 0.7454693877551021

###### Random Forest: 0.7415102040816326

# XGBoost: 0.773469387755102

###### SVM: 0.7174693877551019

###### KNN: 0.7034285714285715