# Turkish Cyber Bullying 

The data I have used : https://www.kaggle.com/abozyigit/turkish-cyberbullying

In [1]:
import numpy as np 
import pandas as pd 

import matplotlib
import matplotlib.pyplot as plt

import sklearn

import warnings
warnings.filterwarnings('ignore')

pd.set_option('max_colwidth', 800)

%matplotlib inline

In [2]:
from sklearn.svm import LinearSVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier

In [3]:
print("Pandas", pd.__version__)
print("Numpy", np.__version__)
print("Matplotlib", matplotlib.__version__)
print("Sclearn", sklearn.__version__ )

Pandas 0.22.0
Numpy 1.13.3
Matplotlib 2.2.2
Sclearn 0.20.0


## Get the Data

In [4]:
df = pd.read_csv('dataset/turkish-cyber-bullying.csv')
df.head()

Unnamed: 0,message,cyberbullying
0,rabbim kalan ömrünü geçen ömründen hayırlı eylesin,0
1,bir ateist olarak bu resmi gördükçe gözyaşlarıma mani olamıyorum,0
2,oo süpersin azıcık bize de bulaşsa,0
3,bende biliyorum benden bı bok olmicak,1
4,nerdesin len tirrek,1


In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3001 entries, 0 to 3000
Data columns (total 2 columns):
message          3001 non-null object
cyberbullying    3001 non-null int64
dtypes: int64(1), object(1)
memory usage: 47.0+ KB


In [6]:
print("Samples for binary classification: {}".format(np.bincount(df.cyberbullying)))

Samples for binary classification: [1498 1503]


_1498_ 0s and _1503_ 1s in the cyberbullying column

## Create a Test Set

In [7]:
from sklearn.model_selection import StratifiedShuffleSplit

split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)

for train_index, test_index in split.split(df, df.cyberbullying):
    strat_train_set = df.loc[train_index]
    strat_test_set = df.loc[test_index]

In [8]:
strat_test_set.cyberbullying.value_counts() / len(strat_test_set)

1    0.500832
0    0.499168
Name: cyberbullying, dtype: float64

In [9]:
strat_train_set.cyberbullying.value_counts() / len(strat_train_set)

1    0.500833
0    0.499167
Name: cyberbullying, dtype: float64

In [10]:
strat_train_set.shape

(2400, 2)

In [11]:
strat_test_set.shape

(601, 2)

## Discover the Data

In [12]:
cyber = strat_train_set.copy()

In [13]:
x_cyber = cyber.message

In [14]:
x_cyber.shape

(2400,)

### CountVectorizer

In [15]:
from sklearn.feature_extraction.text import CountVectorizer

vect = CountVectorizer()

vect.fit(x_cyber)

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)

In [16]:
print("The number of vocabularies: {}".format(len(vect.vocabulary_)))
print("Vocabulary 'affet' occurs : {} times.".format(vect.vocabulary_['affet']))

The number of vocabularies: 9687
Vocabulary 'affet' occurs : 178 times.


In [17]:
bag_of_words = vect.transform(x_cyber)
repr(bag_of_words)

"<2400x9687 sparse matrix of type '<class 'numpy.int64'>'\n\twith 21995 stored elements in Compressed Sparse Row format>"

In [18]:
feature_names = vect.get_feature_names()

print("First 20 vocabularies:\n{}".format(feature_names[:20]))
print("Vocabularies 3010 to 3030:\n{}".format(feature_names[3010:3030]))

First 20 vocabularies:
['09052018', '0islam', '10', '100', '100m', '1085', '10yıl', '11', '110', '12', '1200', '13', '14', '15', '1600', '165', '18', '1957', '1975', '1takımdan']
Vocabularies 3010 to 3030:
['flamingo', 'flamingolar', 'flamingolara', 'flamingoları', 'flamingoların', 'flood', 'floryadan', 'flört', 'fondotensiz', 'football', 'forma', 'formasına', 'formasıyla', 'format', 'forumlar', 'forvet', 'foseptikten', 'foto', 'fotolarım', 'fotolarını']


#### CountVectorizer `min_df`

In [19]:
vect = CountVectorizer(min_df=5)

vect.fit(x_cyber)

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=5,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)

In [20]:
bag_of_words = vect.transform(x_cyber)
repr(bag_of_words)

"<2400x649 sparse matrix of type '<class 'numpy.int64'>'\n\twith 10378 stored elements in Compressed Sparse Row format>"

In [21]:
feature_names = vect.get_feature_names()

print("First 20 vocabularies:\n{}".format(feature_names[:20]))
print("Vocabularies 3010 to 3030:\n{}".format(feature_names[3010:3030]))

First 20 vocabularies:
['10', '20', 'aa', 'abaza', 'abi', 'acaba', 'acı', 'adalet', 'adam', 'adamsın', 'adem', 'adi', 'adı', 'ahmak', 'akşam', 'akşamlar', 'al', 'alan', 'aldım', 'allah']
Vocabularies 3010 to 3030:
[]


#### CountVectorizer `stop_words`

In [22]:
from nltk.corpus import stopwords
stops = stopwords.words("turkish")

In [23]:
vect = CountVectorizer(min_df=5, stop_words=stops)

vect.fit(x_cyber)

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=5,
        ngram_range=(1, 1), preprocessor=None,
        stop_words=['acaba', 'ama', 'aslında', 'az', 'bazı', 'belki', 'biri', 'birkaç', 'birşey', 'biz', 'bu', 'çok', 'çünkü', 'da', 'daha', 'de', 'defa', 'diye', 'eğer', 'en', 'gibi', 'hem', 'hep', 'hepsi', 'her', 'hiç', 'için', 'ile', 'ise', 'kez', 'ki', 'kim', 'mı', 'mu', 'mü', 'nasıl', 'ne', 'neden', 'nerde', 'nerede', 'nereye', 'niçin', 'niye', 'o', 'sanki', 'şey', 'siz', 'şu', 'tüm', 've', 'veya', 'ya', 'yani'],
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)

In [24]:
bag_of_words = vect.transform(x_cyber)
repr(bag_of_words)

"<2400x607 sparse matrix of type '<class 'numpy.int64'>'\n\twith 8020 stored elements in Compressed Sparse Row format>"

In [25]:
feature_names = vect.get_feature_names()

print("First 20 vocabularies:\n{}".format(feature_names[:20]))
print("Vocabularies 3010 to 3030:\n{}".format(feature_names[3010:3030]))

First 20 vocabularies:
['10', '20', 'aa', 'abaza', 'abi', 'acı', 'adalet', 'adam', 'adamsın', 'adem', 'adi', 'adı', 'ahmak', 'akşam', 'akşamlar', 'al', 'alan', 'aldım', 'allah', 'allahım']
Vocabularies 3010 to 3030:
[]


#### CountVectorizer `ngram`

In [26]:
vect = CountVectorizer(min_df=5, stop_words=stops, ngram_range=(1,3))

vect.fit(x_cyber)

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=5,
        ngram_range=(1, 3), preprocessor=None,
        stop_words=['acaba', 'ama', 'aslında', 'az', 'bazı', 'belki', 'biri', 'birkaç', 'birşey', 'biz', 'bu', 'çok', 'çünkü', 'da', 'daha', 'de', 'defa', 'diye', 'eğer', 'en', 'gibi', 'hem', 'hep', 'hepsi', 'her', 'hiç', 'için', 'ile', 'ise', 'kez', 'ki', 'kim', 'mı', 'mu', 'mü', 'nasıl', 'ne', 'neden', 'nerde', 'nerede', 'nereye', 'niçin', 'niye', 'o', 'sanki', 'şey', 'siz', 'şu', 'tüm', 've', 'veya', 'ya', 'yani'],
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)

In [27]:
bag_of_words = vect.transform(x_cyber)
repr(bag_of_words)

"<2400x632 sparse matrix of type '<class 'numpy.int64'>'\n\twith 8236 stored elements in Compressed Sparse Row format>"

In [28]:
feature_names = vect.get_feature_names()

print("First 20 vocabularies:\n{}".format(feature_names[:20]))
print("Vocabularies 3010 to 3030:\n{}".format(feature_names[3010:3030]))

First 20 vocabularies:
['10', '20', 'aa', 'abaza', 'abi', 'acı', 'adalet', 'adam', 'adamsın', 'adem', 'adi', 'adı', 'ahmak', 'akşam', 'akşamlar', 'al', 'alan', 'aldım', 'allah', 'allahım']
Vocabularies 3010 to 3030:
[]


### TfidfVectorizer

In [29]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [30]:
for min_df in [1,2,3,4,5,6]:
    for n_gram in [(1,1), (1,2), (1,3), (2,3)]:
        
        tf = TfidfVectorizer(min_df=min_df, stop_words=stops, ngram_range=n_gram)

        x_cyber_train = tf.fit_transform(x_cyber)

        max_value = x_cyber_train.max(axis=0).toarray().ravel()

        sorted_by_tfidf = max_value.argsort()

        feature_names = np.array(tf.get_feature_names())

        print("Vocabularies using min_df={} and n_gram={} with highest tfidf: \n{}".format(min_df, n_gram, feature_names[sorted_by_tfidf[-20:]]))

        print("The number of vocabularies: {}".format(len(tf.vocabulary_)))

        sorted_by_idf = np.argsort(tf.idf_)
        print("Vocabularies with lowest idf:\n{}".format(feature_names[sorted_by_idf[:20]]))
        print('-----------------------------------')

Vocabularies using min_df=1 and n_gram=(1, 1) with highest tfidf: 
['bam' 'sınav' 'gittin' 'sıffır' 'uzat' 'olmuyor' 'hödük' 'vay' 'tarzdır'
 'hee' 'şişko' 'sisko' 'tırrek' 'diyarbakır' 'tatlısın' 'tatlı' 'styling'
 'andaval' 'güzelsin' '09052018']
The number of vocabularies: 9635
Vocabularies with lowest idf:
['bir' 'sen' 'ben' 'kadar' 'senin' 'var' 'bi' 'orospu' 'amk' 'seni' 'olsun'
 'mutluyum' 'lan' 'pislik' 'gerizeka' 'yok' 'sana' 'mi' 'hayırlı' 'güzel']
-----------------------------------
Vocabularies using min_df=1 and n_gram=(1, 2) with highest tfidf: 
['angut var' 'mutluyum aq' 'güzelsin çocuk' 'harikasın sen' 'güzelsin kız'
 'embesil seni' 'güzelsin güzel' 'uzat' 'ulan amk' 'hee' 'şişko' 'andaval'
 'styling' 'tırrek' 'sisko' 'güzelsin' 'tatlı' 'tatlısın' 'diyarbakır'
 '09052018']
The number of vocabularies: 26375
Vocabularies with lowest idf:
['bir' 'sen' 'ben' 'kadar' 'senin' 'var' 'bi' 'orospu' 'amk' 'seni' 'olsun'
 'lan' 'mutluyum' 'pislik' 'gerizeka' 'yok' 'sana' 'mi' 'hay

## Select and Train Models

In [31]:
X_train = strat_train_set.message
y_train = strat_train_set.cyberbullying

In [32]:
X_test = strat_test_set.message
y_test = strat_test_set.cyberbullying

### Train Models for `CountVectorizer`

In [53]:
from sklearn.pipeline import Pipeline

svc_pipeline = Pipeline([
        ('countvectorizer', CountVectorizer()),
        ('linearsvc', LinearSVC(max_iter=1000))
])

naive_pipeline = Pipeline([
        ('countvectorizer', CountVectorizer()),
        ('multinomialnb', MultinomialNB())
])

tree_pipeline = Pipeline([
        ('countvectorizer', CountVectorizer()),
        ('decisiontreeclassifier', DecisionTreeClassifier())
])

forest_pipeline = Pipeline([
        ('countvectorizer', CountVectorizer()),
        ('randomforestclassifier', RandomForestClassifier(n_estimators=100))
])

In [54]:
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_score, recall_score
from sklearn.metrics import f1_score

model_names = ["Linear SVC", "Multinomial Naive Bayes", "Decision Tree", "Random Forest"]

In [184]:
param_svc = [ 
    {
        'linearsvc__C': [0.01, 0.1, 1, 10, 100], 
        'countvectorizer__min_df': [1,2,3,4,5], 
        'countvectorizer__stop_words': [None, stops],
        'countvectorizer__ngram_range': [(1, 1), (1, 2), (1, 3), (2, 3)]
    } 
]

params_general = [ 
    {
        'countvectorizer__min_df': [1,2,3,4,5], 
        'countvectorizer__stop_words': [None, stops],
        'countvectorizer__ngram_range': [(1, 1), (1, 2), (1, 3), (2, 3)]
    }
]

for model, params, name in zip([svc_pipeline, naive_pipeline, tree_pipeline, forest_pipeline],
                               [param_svc, params_general, params_general, params_general],
                                model_names):

    grid = GridSearchCV(model, params, cv=5)
    grid.fit(X_train, y_train)
    print(name)
    print("Best cross-validation score: {:.2f}".format(grid.best_score_ * 100))
    print("Best parameters: ", grid.best_params_)
    
    final_model = grid.best_estimator_
    
    final_train_prediction = final_model.score(X_train, y_train)
    print("Train score: {:.2f}%".format(final_train_prediction * 100))    
    
    y_train_pred = final_model.predict(X_train)
    print(confusion_matrix(y_train, y_train_pred))

    print("Recall score of X_train: {:.2f}%".format(recall_score(y_train, y_train_pred) * 100))
    print("Precision score of X_train: {:.2f}%".format(precision_score(y_train, y_train_pred) * 100))
    print("F1_score: {:.2f}%".format(f1_score(y_train, y_train_pred) * 100))
    
    final_test_prediction = final_model.score(X_test, y_test)
    print("Test score: {:.2f}%".format(final_test_prediction * 100))    
    print("--------------------------")

Linear SVC
Best cross-validation score: 89.75
Best parameters:  {'countvectorizer__stop_words': ['acaba', 'ama', 'aslında', 'az', 'bazı', 'belki', 'biri', 'birkaç', 'birşey', 'biz', 'bu', 'çok', 'çünkü', 'da', 'daha', 'de', 'defa', 'diye', 'eğer', 'en', 'gibi', 'hem', 'hep', 'hepsi', 'her', 'hiç', 'için', 'ile', 'ise', 'kez', 'ki', 'kim', 'mı', 'mu', 'mü', 'nasıl', 'ne', 'neden', 'nerde', 'nerede', 'nereye', 'niçin', 'niye', 'o', 'sanki', 'şey', 'siz', 'şu', 'tüm', 've', 'veya', 'ya', 'yani'], 'countvectorizer__min_df': 1, 'countvectorizer__ngram_range': (1, 2), 'linearsvc__C': 1}
Train score: 99.96%
[[1198    0]
 [   1 1201]]
Recall score of X_train: 99.92%
Precision score of X_train: 100.00%
F1_score: 99.96%
Test score: 89.85%
--------------------------
Multinomial Naive Bayes
Best cross-validation score: 87.04
Best parameters:  {'countvectorizer__stop_words': ['acaba', 'ama', 'aslında', 'az', 'bazı', 'belki', 'biri', 'birkaç', 'birşey', 'biz', 'bu', 'çok', 'çünkü', 'da', 'daha', 'de

### Train Models for `TfidfVectorizer`

In [56]:
svc_tfv_pipeline = Pipeline([
        ('tfidfvectorizer', TfidfVectorizer()),
        ('linearsvc', LinearSVC(max_iter=1000))
])

naive_tfv_pipeline = Pipeline([
        ('tfidfvectorizer', TfidfVectorizer()),
        ('multinomialnb', MultinomialNB())
])

tree_tfv_pipeline = Pipeline([
        ('tfidfvectorizer', TfidfVectorizer()),
        ('decisiontreeclassifier', DecisionTreeClassifier())
])

forest_tfv_pipeline = Pipeline([
        ('tfidfvectorizer', TfidfVectorizer()),
        ('randomforestclassifier', RandomForestClassifier(n_estimators=100))
])

In [130]:
param_svc = [ 
    {
        'linearsvc__C': [0.01, 0.1, 1, 10, 100], 
        'tfidfvectorizer__min_df': [1,2,3,4,5], 
        'tfidfvectorizer__stop_words': [stops],
        'tfidfvectorizer__ngram_range': [(1, 1), (1, 2), (1, 3), (2, 3)]
    } 
]

params_general = [ 
    {
        'tfidfvectorizer__min_df': [1,2,3,4,5], 
        'tfidfvectorizer__stop_words': [stops],
        'tfidfvectorizer__ngram_range': [(1, 1), (1, 2), (1, 3), (2, 3)]
    }
]

for model, params, name in zip([svc_tfv_pipeline, naive_tfv_pipeline, tree_tfv_pipeline, forest_tfv_pipeline],
                               [param_svc, params_general, params_general, params_general],
                                model_names):

    grid = GridSearchCV(model, params, cv=10)
    grid.fit(X_train, y_train)
    print(name)
    print("Best cross-validation score: {:.2f}".format(grid.best_score_ * 100))
    print("Best parameters: ", grid.best_params_)
    
    final_model = grid.best_estimator_
    y_train_pred = final_model.predict(X_train)
    
    final_train_prediction = final_model.score(X_train, y_train)
    print("Train score: {:.2f}%".format(final_train_prediction * 100))    
    
    print(confusion_matrix(y_train, y_train_pred))

    print("Recall score of X_train: {:.2f}%".format(recall_score(y_train, y_train_pred) * 100))
    print("Precision score of X_train: {:.2f}%".format(precision_score(y_train, y_train_pred) * 100))
    print("F1_score: {:.2f}%".format(f1_score(y_train, y_train_pred) * 100))
    
    final_test_prediction = final_model.score(X_test, y_test)
    print("Test score: {:.2f}%".format(final_test_prediction * 100))    
    
    y_test_pred = final_model.predict(X_test)
    print("Precision test score: {:.2f}%".format(100 * precision_score(y_test, y_test_pred)))
    print("Recall test score: {:.2f}%".format(100 * recall_score(y_test, y_test_pred)))
    
    print("--------------------------")

Linear SVC
Best cross-validation score: 89.79
Best parameters:  {'tfidfvectorizer__min_df': 1, 'tfidfvectorizer__ngram_range': (1, 1), 'tfidfvectorizer__stop_words': ['acaba', 'ama', 'aslında', 'az', 'bazı', 'belki', 'biri', 'birkaç', 'birşey', 'biz', 'bu', 'çok', 'çünkü', 'da', 'daha', 'de', 'defa', 'diye', 'eğer', 'en', 'gibi', 'hem', 'hep', 'hepsi', 'her', 'hiç', 'için', 'ile', 'ise', 'kez', 'ki', 'kim', 'mı', 'mu', 'mü', 'nasıl', 'ne', 'neden', 'nerde', 'nerede', 'nereye', 'niçin', 'niye', 'o', 'sanki', 'şey', 'siz', 'şu', 'tüm', 've', 'veya', 'ya', 'yani'], 'linearsvc__C': 1}
Train score: 99.96%
[[1198    0]
 [   1 1201]]
Recall score of X_train: 99.92%
Precision score of X_train: 100.00%
F1_score: 99.96%
Test score: 88.69%
Precision test score: 86.75%
Recall test score: 91.36%
--------------------------
Multinomial Naive Bayes
Best cross-validation score: 87.96
Best parameters:  {'tfidfvectorizer__min_df': 1, 'tfidfvectorizer__ngram_range': (1, 2), 'tfidfvectorizer__stop_words': 

## Pick the 'best' model

### LinearSVC with CountVectorizer

In [186]:
final_svc_pipeline = Pipeline([
        ('countvectorizer', CountVectorizer(min_df=1, ngram_range=(1,2), stop_words=stops)),
        ('linearsvc', LinearSVC(max_iter=1000, C=1))
])

In [187]:
final_svc_pipeline.fit(X_train, y_train)

Pipeline(memory=None,
     steps=[('countvectorizer', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 2), preprocessor=None,
        stop_words=...ax_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0))])

In [188]:
final_svc_pipeline.score(X_train, y_train)

0.99958333333333338

In [189]:
final_test_prediction = final_svc_pipeline.score(X_test, y_test)
print("Final Test Score:", final_test_prediction)

y_test_pred = final_svc_pipeline.predict(X_test)
print(confusion_matrix(y_test, y_test_pred))

Final Test Score: 0.89850249584
[[275  25]
 [ 36 265]]


In [190]:
final_svc_pipeline.predict(['Toplumsal '\
                            'cinsiyetçilik içeren kelime yapılarına çalışmalarında yer vermemesi,'\
                            ' davalı idarenin uluslararası ve ulusal normlardan kaynaklanan görevidir'])

array([0])

In [192]:
final_svc_pipeline.predict(['Yapay zeka (AI), bir dijital bilgisayarın veya bilgisayar'\
                                   'kontrollü robotun, akıllı varlıklar ile yaygın olarak '\
                                   'ilişkili görevleri gerçekleştirme kabiliyetidir.'])

array([0])

In [191]:
final_svc_pipeline.predict(['bu ne salak'])

array([1])

### LinearSVC with TfidfVectorizer

In [196]:
final_svc_pipeline = Pipeline([
        ('tfidfvectorizer', TfidfVectorizer(min_df=1, ngram_range=(1,1), stop_words=stops)),
        ('linearsvc', LinearSVC(max_iter=1000, C=1))
])

In [197]:
final_svc_pipeline.fit(X_train, y_train)

Pipeline(memory=None,
     steps=[('tfidfvectorizer', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth...ax_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0))])

In [198]:
final_svc_pipeline.steps

[('tfidfvectorizer',
  TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
          dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
          lowercase=True, max_df=1.0, max_features=None, min_df=1,
          ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
          stop_words=['acaba', 'ama', 'aslında', 'az', 'bazı', 'belki', 'biri', 'birkaç', 'birşey', 'biz', 'bu', 'çok', 'çünkü', 'da', 'daha', 'de', 'defa', 'diye', 'eğer', 'en', 'gibi', 'hem', 'hep', 'hepsi', 'her', 'hiç', 'için', 'ile', 'ise', 'kez', 'ki', 'kim', 'mı', 'mu', 'mü', 'nasıl', 'ne', 'neden', 'nerde', 'nerede', 'nereye', 'niçin', 'niye', 'o', 'sanki', 'şey', 'siz', 'şu', 'tüm', 've', 'veya', 'ya', 'yani'],
          strip_accents=None, sublinear_tf=False,
          token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
          vocabulary=None)),
 ('linearsvc', LinearSVC(C=1, class_weight=None, dual=True, fit_intercept=True,
       intercept_scaling=1, l

In [199]:
final_svc_pipeline.score(X_train, y_train)

0.99958333333333338

In [200]:
final_test_prediction = final_svc_pipeline.score(X_test, y_test)
print("Final Test Score:", final_test_prediction)

y_test_pred = final_svc_pipeline.predict(X_test)
print(confusion_matrix(y_test, y_test_pred))

Final Test Score: 0.886855241265
[[258  42]
 [ 26 275]]


In [201]:
final_svc_pipeline.predict(['Toplumsal '\
                            'cinsiyetçilik içeren kelime yapılarına çalışmalarında yer vermemesi,'\
                            ' davalı idarenin uluslararası ve ulusal normlardan kaynaklanan görevidir'])

array([1])

In [202]:
final_svc_pipeline.predict(['bu ne salak'])

array([1])

### RandomForestClassifier with TfidfVectorizer

In [86]:
params_general = [ 
    {
        'tfidfvectorizer__min_df': [1,2,3,4,5], 
        'tfidfvectorizer__stop_words': [stops],
        'tfidfvectorizer__ngram_range': [(1, 1), (1, 2), (1, 3), (2, 3)],
        'randomforestclassifier__n_estimators': [5, 10, 20, 30, 50, 60, 70, 80, 90, 100]
    }
]

In [87]:
forest_tfv_pipeline = Pipeline([
        ('tfidfvectorizer', TfidfVectorizer()),
        ('randomforestclassifier', RandomForestClassifier())
])

In [88]:
grid = GridSearchCV(forest_tfv_pipeline, params_general, cv=10)
grid.fit(X_train, y_train)

GridSearchCV(cv=10, error_score='raise-deprecating',
       estimator=Pipeline(memory=None,
     steps=[('tfidfvectorizer', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth...obs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False))]),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid=[{'tfidfvectorizer__min_df': [1, 2, 3, 4, 5], 'tfidfvectorizer__ngram_range': [(1, 1), (1, 2), (1, 3), (2, 3)], 'tfidfvectorizer__stop_words': [['acaba', 'ama', 'aslında', 'az', 'bazı', 'belki', 'biri', 'birkaç', 'birşey', 'biz', 'bu', 'çok', 'çünkü', 'da', 'daha', 'de', 'defa', 'diye', '..., 'ya', 'yani']], 'randomforestclassifier__n_estimators': [5, 10, 20, 30, 50, 60, 70, 80, 90, 100]}],
       pre_dispatch='2*n_jobs', refi

In [99]:
print(grid.best_params_)

{'tfidfvectorizer__min_df': 2, 'randomforestclassifier__n_estimators': 90, 'tfidfvectorizer__ngram_range': (1, 1), 'tfidfvectorizer__stop_words': ['acaba', 'ama', 'aslında', 'az', 'bazı', 'belki', 'biri', 'birkaç', 'birşey', 'biz', 'bu', 'çok', 'çünkü', 'da', 'daha', 'de', 'defa', 'diye', 'eğer', 'en', 'gibi', 'hem', 'hep', 'hepsi', 'her', 'hiç', 'için', 'ile', 'ise', 'kez', 'ki', 'kim', 'mı', 'mu', 'mü', 'nasıl', 'ne', 'neden', 'nerde', 'nerede', 'nereye', 'niçin', 'niye', 'o', 'sanki', 'şey', 'siz', 'şu', 'tüm', 've', 'veya', 'ya', 'yani']}


In [155]:
forest_tfv_final_pipeline = Pipeline([
        ('tfidfvectorizer', TfidfVectorizer(min_df=2, stop_words=stops, ngram_range=(1,1))),
        ('randomforestclassifier', RandomForestClassifier(n_estimators=90))
])

In [156]:
forest_tfv_final_pipeline.fit(X_train, y_train)

Pipeline(memory=None,
     steps=[('tfidfvectorizer', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=2,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth...obs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False))])

In [157]:
forest_tfv_final_pipeline.score(X_train, y_train)

0.9966666666666667

In [158]:
forest_tfv_final_pipeline.score(X_test, y_test)

0.86356073211314477

In [159]:
forest_tfv_final_pipeline.predict(['Yapay zeka (AI), bir dijital bilgisayarın veya bilgisayar'\
                                   'kontrollü robotun, akıllı varlıklar ile yaygın olarak '\
                                   'ilişkili görevleri gerçekleştirme kabiliyetidir.'])

array([0])

In [160]:
forest_tfv_final_pipeline.predict(['Toplumsal '\
                            'cinsiyetçilik içeren kelime yapılarına çalışmalarında yer vermemesi,'\
                            ' davalı idarenin uluslararası ve ulusal normlardan kaynaklanan görevidir'])

array([0])

In [161]:
forest_tfv_final_pipeline.predict(['düşüncelerini kendine sakla'])

array([0])

In [162]:
forest_tfv_final_pipeline.predict(['bu ne salak'])

array([0])

### Save the model

In [193]:
from sklearn.externals import joblib

joblib.dump(final_svc_pipeline, "final_model.pkl")

['final_model.pkl']

### Load the model

In [194]:
my_best_model = joblib.load("final_model.pkl")

final_test_prediction = my_best_model.score(X_test, y_test)
print("Final Test Score:", final_test_prediction)

Final Test Score: 0.89850249584


## Predict unseen data

In [195]:
my_best_model.predict(['gayet iyi', 'konuşma lan', 'bu ne', 'bu ne salak'])

array([0, 1, 0, 1])