# TODO 2nd
# Sklearn Classifiers

In [21]:
# data analysis pkg
import pandas as pd

# Feature extraction and splitting pkg
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer, TfidfTransformer
from hazm import word_tokenize, stopwords_list

# For defining models and metrics pkg
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import (LogisticRegression, RidgeClassifier, SGDClassifier)
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import BernoulliNB
from sklearn.metrics import (accuracy_score, recall_score, confusion_matrix, f1_score, 
                             roc_auc_score, classification_report)
from sklearn.dummy import DummyClassifier

from sklearn.pipeline import Pipeline
# Avoiding warnings
import warnings
import os
########### Prevent Warnings ###########
warnings.filterwarnings(action='ignore')
########### Prevent Warnings ###########

In [22]:
df = pd.read_csv('data/cleaned/data.csv')
df.title = df.title.astype('str')
df.text = df.text.astype('str')
df.comment = df.comment.astype('str')
df.rate = df.rate.astype('category')

In [23]:
features = ['text']
label = ['verification_status']

X_train, X_test, Y_train, Y_test = train_test_split(df[features], df[label], test_size=0.3)

In [24]:
X_train = [text for text in X_train.text]
Y_train = [label for label in Y_train.verification_status]

X_test = [text for text in X_test.text]
Y_test = [label for label in Y_test.verification_status]

In [25]:
tfidf = TfidfVectorizer(use_idf=True, max_features=20000,tokenizer=word_tokenize)

In [26]:
tfidf_train = tfidf.fit_transform(X_train)

In [27]:
tfidf_train.shape

(125369, 20000)

In [89]:
XGB = Pipeline([
    ('tfidf',TfidfVectorizer(use_idf=True, max_features=20000,tokenizer=word_tokenize)), 
    ('XGB', XGBClassifier())
])


NB = Pipeline([
    ('tfidf',TfidfVectorizer(use_idf=True, max_features=20000,tokenizer=word_tokenize)), 
    ('NB', BernoulliNB())
])

DT = Pipeline([
    ('tfidf',TfidfVectorizer(use_idf=True, max_features=20000,tokenizer=word_tokenize)), 
    ('DT', DecisionTreeClassifier())
])

RF = Pipeline([
    ('tfidf',TfidfVectorizer(use_idf=True, max_features=20000,tokenizer=word_tokenize)), 
    ('RF', RandomForestClassifier())
])

SGD = Pipeline([
    ('tfidf',TfidfVectorizer(use_idf=True, max_features=20000,tokenizer=word_tokenize)), 
    ('SGD', SGDClassifier())
])

LR = Pipeline([
    ('tfidf',TfidfVectorizer(use_idf=True, max_features=20000,tokenizer=word_tokenize)), 
    ('LR', LogisticRegression())
])

GD = Pipeline([
    ('tfidf',TfidfVectorizer(use_idf=True, max_features=20000,tokenizer=word_tokenize)), 
    ('GD', GradientBoostingClassifier())
])

DM = Pipeline([
    ('tfidf',TfidfVectorizer(use_idf=True, max_features=20000,tokenizer=word_tokenize)), 
    ('Dummy', DummyClassifier())
])

tfidf_clfs = [GD, LR, SGD, NB, XGB, RF, DT]



In [90]:
for clf in tfidf_clfs:
    clf.fit(X_train, Y_train)
    preds = clf.predict(X_test)
    print('- tfidf: ', clf.steps[1][0], ': accuracy' , accuracy_score(preds, Y_test))
    print('- tfidf: ', clf.steps[1][0], ': RECALL ' , recall_score(preds, Y_test))
    print('- tfidf: ', clf.steps[1][0], ': F1_SCORE' , f1_score(preds, Y_test))
    print('')
    print('-------------')
    
DM.fit(X_train, Y_train)
preds = DM.predict(X_test)
print('- tfidf: ', DM.steps[1][0], ': accuracy' , accuracy_score(preds, Y_test))
print('- tfidf: ', DM.steps[1][0], ': RECALL ' , recall_score(preds, Y_test))
print('- tfidf: ', DM.steps[1][0], ': F1_SCORE' , f1_score(preds, Y_test))


- tfidf:  GD : accuracy 0.8766983063465476
- tfidf:  GD : RECALL  0.8543956043956044
- tfidf:  GD : F1_SCORE 0.45798903706127786
- tfidf:  GD : ROC 0.8662710174037324

-------------
- tfidf:  LR : accuracy 0.8976363297971338
- tfidf:  LR : RECALL  0.8315060588574726
- tfidf:  LR : F1_SCORE 0.6111975116640747
- tfidf:  LR : ROC 0.8681133764234408

-------------
- tfidf:  SGD : accuracy 0.8810534152242695
- tfidf:  SGD : RECALL  0.8776595744680851
- tfidf:  SGD : F1_SCORE 0.48171275646743983
- tfidf:  SGD : ROC 0.8794705531340148

-------------
- tfidf:  NB : accuracy 0.8805509026614554
- tfidf:  NB : RECALL  0.7381804482953476
- tfidf:  NB : F1_SCORE 0.5498035914702581
- tfidf:  NB : ROC 0.8171706024959111

-------------
- tfidf:  XGB : accuracy 0.8742043551088777
- tfidf:  XGB : RECALL  0.8605800922874094
- tfidf:  XGB : F1_SCORE 0.43585677322427174
- tfidf:  XGB : ROC 0.8677999088547667

-------------
- tfidf:  RF : accuracy 0.8836962590731435
- tfidf:  RF : RECALL  0.8235011990407674

In [91]:
XGB = Pipeline([
    ('countvect',CountVectorizer(max_features=20000, tokenizer=word_tokenize, stop_words=stopwords_list())), 
    ('XGB', XGBClassifier())
])


NB = Pipeline([
    ('countvect',CountVectorizer(max_features=20000, tokenizer=word_tokenize, stop_words=stopwords_list())), 
    ('NB', BernoulliNB())
])

DT = Pipeline([
    ('countvect',CountVectorizer(max_features=20000, tokenizer=word_tokenize, stop_words=stopwords_list())), 
    ('DT', DecisionTreeClassifier())
])

RF = Pipeline([
    ('countvect',CountVectorizer(max_features=20000, tokenizer=word_tokenize, stop_words=stopwords_list())), 
    ('RF', RandomForestClassifier())
])

SGD = Pipeline([
    ('countvect',CountVectorizer(max_features=20000, tokenizer=word_tokenize, stop_words=stopwords_list())), 
    ('SGD', SGDClassifier())
])

LR = Pipeline([
    ('countvect', CountVectorizer(max_features=20000, tokenizer=word_tokenize, stop_words=stopwords_list())), 
    ('LR', LogisticRegression())
])

GD = Pipeline([
    ('countvect', CountVectorizer(max_features=20000, tokenizer=word_tokenize, stop_words=stopwords_list())), 
    ('GD', GradientBoostingClassifier())
])

DM = Pipeline([
    ('countvect', CountVectorizer(max_features=20000, tokenizer=word_tokenize, stop_words=stopwords_list())), 
    ('Dummy', DummyClassifier())
])

countvect_clfs = [GD, LR, SGD, NB, XGB, RF, DT]

In [92]:
for clf in countvect_clfs:
    clf.fit(X_train, Y_train)
    preds = clf.predict(X_test)
    print('- countvect: ', clf.steps[1][0], ': accuracy' , accuracy_score(preds, Y_test))
    print('- countvect: ', clf.steps[1][0], ': RECALL ' , recall_score(preds, Y_test))
    print('- countvect: ', clf.steps[1][0], ': F1_SCORE' , f1_score(preds, Y_test))
    print('')
    
DM.fit(X_train, Y_train)
preds = DM.predict(X_test)
print('- tfidf: ', clf.steps[1][0], ': accuracy' , accuracy_score(preds, Y_test))
print('- tfidf: ', clf.steps[1][0], ': RECALL ' , recall_score(preds, Y_test))
print('- tfidf: ', clf.steps[1][0], ': F1_SCORE' , f1_score(preds, Y_test))


- countvect:  GD : accuracy 0.8742229666852782
- countvect:  GD : RECALL  0.8283828382838284
- countvect:  GD : F1_SCORE 0.4496742671009773
- countvect:  GD : ROC 0.8528187183859169
- countvect:  LR : accuracy 0.8914200632793597
- countvect:  LR : RECALL  0.7673939185706923
- countvect:  LR : F1_SCORE 0.6049566630552545
- countvect:  LR : ROC 0.8369416523492799
- countvect:  SGD : accuracy 0.8916620137725665
- countvect:  SGD : RECALL  0.8040856031128405
- countvect:  SGD : F1_SCORE 0.5867821395612975
- countvect:  SGD : ROC 0.852505859798857
- countvect:  NB : accuracy 0.8805509026614554
- countvect:  NB : RECALL  0.7381804482953476
- countvect:  NB : F1_SCORE 0.5498035914702581
- countvect:  NB : ROC 0.8171706024959111
- countvect:  XGB : accuracy 0.8726968174204355
- countvect:  XGB : RECALL  0.8354027379815345
- countvect:  XGB : F1_SCORE 0.4341495698213105
- countvect:  XGB : ROC 0.8552075462229719
- countvect:  RF : accuracy 0.879322538619021
- countvect:  RF : RECALL  0.77397107

### BEST RESULTS

Word2Vec - > max_featres = 500000 words

TF-IDF vectorizer = tokenizer=word_tokenize, stop_words=stopwords_list()

- XGBoost  ROC with TF-IDF 0.8698764600833205
- XGBoost  RECALL with TF-IDF 0.8632228719948019
- XGBoost  F1_SCORE with TF-IDF 0.44323963633330554
- XGBoost  ROC with CountVectorizer 0.8632321810604328
- XGBoost  RECALL with TF-IDF 0.850452196382429
- XGBoost  F1_SCORE with TF-IDF 0.4385774964603982


- DecissionTree  ROC with TF-IDF 0.7331272294078325
- DecissionTree  RECALL with TF-IDF 0.563496078927397
- DecissionTree  F1_SCORE with TF-IDF 0.5298210144496641
- DecissionTree  ROC with CountVectorizer 0.7190879149589758
- DecissionTree  RECALL with TF-IDF 0.5359281437125748
- DecissionTree  F1_SCORE with TF-IDF 0.5185099356931812


- RandomForest  ROC with TF-IDF 0.8548763832596875
- RandomForest  RECALL with TF-IDF 0.8246113989637306
- RandomForest  F1_SCORE with TF-IDF 0.49847310312426585
- RandomForest  ROC with CountVectorizer 0.8348634819172763
- RandomForest  RECALL with TF-IDF 0.7804933242815116
- RandomForest  F1_SCORE with TF-IDF 0.5174793698424607



- StochasticGD  ROC with TF-IDF 0.8841407191082541
- StochasticGD  RECALL with TF-IDF 0.8857988165680474
- StochasticGD  F1_SCORE with TF-IDF 0.4871857456675617
- StochasticGD  ROC with CountVectorizer 0.8633177818641357
- StochasticGD  RECALL with TF-IDF 0.8245125348189415
- StochasticGD  F1_SCORE with TF-IDF 0.5946760421898544


- LogisticRegression  ROC with TF-IDF 0.8720273079056319
- LogisticRegression  RECALL with TF-IDF 0.8391389432485323
- LogisticRegression  F1_SCORE with TF-IDF 0.6116539476499536
- LogisticRegression  ROC with CountVectorizer 0.8398848822483052
- LogisticRegression  RECALL with TF-IDF 0.7724913494809689
- LogisticRegression  F1_SCORE with TF-IDF 0.6078551494112041


- Gradient Descent_ensembele  ROC with TF-IDF 0.8672471652995906
- Gradient Descent_ensembele  RECALL with TF-IDF 0.8555725190839695
- Gradient Descent_ensembele  F1_SCORE with TF-IDF 0.45987198424421466
- Gradient Descent_ensembele  ROC with CountVectorizer 0.8586466391934725
- Gradient Descent_ensembele  RECALL with TF-IDF 0.8386322735452909
- Gradient Descent_ensembele  F1_SCORE with TF-IDF 0.4566761943650469



- BernouliNB  ROC with TF-IDF 0.8250477680396926
- BernouliNB  RECALL with TF-IDF 0.7519863791146425
- BernouliNB  F1_SCORE with TF-IDF 0.5599774600267663
- BernouliNB  ROC with CountVectorizer 0.8250477680396926
- BernouliNB  RECALL with TF-IDF 0.7519863791146425
- BernouliNB  F1_SCORE with TF-IDF 0.5599774600267663


