#### **This notebook compares the results of different algorithms for the tweet classifier**

In [1]:
import pandas as pd

import importlib

#### packages
import helper.strategy_helper as st
import helper.visualization as viz_hp
import config.config as config_hp
import matplotlib.pyplot as plt
import helper.stat_helper as stat_hp

In [2]:
# Reload the local helper modules so edits made on disk are picked up without
# restarting the kernel. `importlib` is already imported in the first cell, so
# it is not re-imported here.
importlib.reload(stat_hp)
importlib.reload(config_hp)

# Look up the location of the final statistics table in the STATS section of
# the project configuration.
config = config_hp.config()
stat = config['STATS']
final_stat = stat['final_stat']

# NOTE(review): unpickling can execute arbitrary code; only load files that
# were produced by this project.
df_all_stat = pd.read_pickle(final_stat)

In [3]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.ensemble import AdaBoostClassifier

def _build_estimator(model_type):
    '''
    Returns an unfitted classifier for the given algorithm key
    :param model_type: One of 'logistic', 'random', 'ada', 'tree', 'naive'
    :raises ValueError: If model_type is not one of the supported keys
    '''
    if model_type == 'logistic':
        return LogisticRegression(random_state=0)
    if model_type == 'random':
        print('Running Random Forest')
        return RandomForestClassifier(n_estimators=100,
                                      random_state=42
                                     )
    if model_type == 'ada':
        from sklearn.ensemble import AdaBoostClassifier
        return AdaBoostClassifier(n_estimators=100,
                                  algorithm="SAMME", random_state=0)
    if model_type == 'tree':
        # Seeded like the other estimators so repeated runs give the same splits.
        return tree.DecisionTreeClassifier(random_state=42)
    if model_type == 'naive':
        from sklearn.naive_bayes import GaussianNB
        return GaussianNB()

    # Fail early with a clear message instead of an UnboundLocalError later on.
    raise ValueError(
        f"Unknown model_type {model_type!r}; expected one of "
        "'logistic', 'random', 'ada', 'tree', 'naive'"
    )


def run_model(df,
              columns_not_include=None,
              model_type='random', 
              y_column = 'tweet_label',
              filename=None,
             ):
    '''
    Cross-validates a threshold-tuned classifier and returns the per-split results
    :param df: Dataframe holding the feature columns and the label column
    :param columns_not_include: columns to not include as features
        (None means ['list_age']); the passed list is never modified.
        'poster_tweetid', 'tweet_label', 'replier_userid', 'replier_label'
        and y_column are always excluded as well
    :param model_type: Type of model: 'logistic', 'random', 'ada', 'tree' or 'naive'
    :param y_column: Name of the label column
    :param filename: Currently unused; kept so existing calls keep working
    :return: Dataframe with one row per cross-validation split containing the
        cross_validate output (timings, fitted estimator, train/test precision,
        recall, f1 and roc_auc) plus the tuned 'threshold' and the 'algorithm'
    :raises ValueError: If model_type is not supported
    '''
    print(f'\n **** {model_type} ****')
    
    ### Remove unnecessary columns
    # Build a fresh set so neither the caller's list nor a shared default is
    # mutated, and always drop the label column to avoid target leakage.
    excluded = set(['list_age'] if columns_not_include is None
                   else columns_not_include)
    excluded.update(
        ['poster_tweetid', 'tweet_label', 'replier_userid', 'replier_label',
         y_column])
    
    # Keep the dataframe's own column order: a set difference would order the
    # features arbitrarily and make seeded models differ between kernels.
    columns_to_keep = [column for column in df.columns
                       if column not in excluded]

    X = df[columns_to_keep]
    y = df[y_column]
  
    ### Choose model
    model = _build_estimator(model_type)
    
    ### Choose scoring function
    from sklearn.metrics import make_scorer, precision_score, recall_score, f1_score, roc_auc_score

    # Creating a dictionary of scorers
    # `response_method` replaces `needs_proba`, which was deprecated in
    # scikit-learn 1.4 and removed in 1.6.
    scoring = {
        'precision': make_scorer(precision_score, average='binary'),
        'recall': make_scorer(recall_score, average='binary'),
        'f1': make_scorer(f1_score, average='binary'),
        'roc_auc': make_scorer(roc_auc_score, response_method='predict_proba')
    }

    from sklearn.model_selection import TunedThresholdClassifierCV
    from sklearn.pipeline import make_pipeline
    from sklearn.model_selection import RepeatedStratifiedKFold

    # Scale inside the pipeline so the scaler is fitted on training folds only.
    model = make_pipeline(StandardScaler(), model)
    cv = RepeatedStratifiedKFold(n_splits=5, n_repeats=10, random_state=42)
    # The decision threshold is tuned for f1 on each outer training split.
    tuned_model = TunedThresholdClassifierCV(estimator=model,
                                             scoring='f1',
                                             store_cv_results = True,
                                             n_jobs=-1
                                            )

    cv_results_tuned_model = pd.DataFrame(
        cross_validate(
            tuned_model,
            X,
            y,
            scoring=scoring,
            cv=cv,
            return_train_score=True,
            return_estimator=True,
        )
    )
   
    # Record the threshold each fitted estimator settled on, row by row.
    cv_results_tuned_model['threshold'] = [
        est.best_threshold_ for est in cv_results_tuned_model["estimator"]
    ]
    
    cv_results_tuned_model['algorithm'] = model_type
    
    return cv_results_tuned_model

In [4]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_curve
from sklearn.metrics import roc_auc_score
from sklearn.metrics import auc

from scipy.stats import ks_2samp
from scipy.stats import mannwhitneyu
from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_recall_fscore_support

from sklearn.model_selection import cross_validate
from sklearn import tree

In [20]:
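# NOTE: stale example call - the current run_model() signature has no `pca`,
# `just_f1` or `find_threshold` parameters; drop them before uncommenting.
# run_model(df_all_stat,
#           columns_not_include=[],
#           model_type='logistic', 
#           pca=False,
#           y_column = 'tweet_label',
#           filename=None,
#          just_f1=False,
#          find_threshold=False
#          )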

#### **Run for all algorithms**

In [None]:
# Cross-validate every algorithm on the same statistics table, collecting one
# result frame per algorithm.
algorithms = ['logistic', 'ada', 'random', 'tree', 'naive']
all_results = []
for algo in algorithms:
    df_result = run_model(
        df_all_stat,
        columns_not_include=['list_age'],
        model_type=algo,
        y_column='tweet_label',
        filename=None,
    )
    all_results.append(df_result)

# Stack the per-algorithm frames and persist them for the analysis cells.
pd.concat(all_results, ignore_index=True).to_pickle(
    './data/tweet_classifier_different_algorithm.pkl.gz'
)

In [12]:
# Load the combined cross-validation results written by the run-all cell.
df_result = pd.read_pickle('./data/tweet_classifier_different_algorithm.pkl.gz')

In [28]:
# Average the numeric cross-validation columns for the logistic-regression runs.
# numeric_only=True skips the non-numeric 'algorithm' and 'estimator' columns
# explicitly; relying on pandas to drop them silently is deprecated and raises
# in pandas 2.x.
df_result.loc[df_result['algorithm'] == 'logistic'].mean(numeric_only=True)

  df_result.loc[df_result['algorithm'] == 'logistic'].mean()


fit_time           0.812179
score_time         0.010199
test_precision     0.657619
train_precision    0.662751
test_recall        0.862572
train_recall       0.869936
test_f1            0.745949
train_f1           0.752084
test_roc_auc       0.803156
train_roc_auc      0.814571
threshold          0.353939
dtype: float64

In [29]:
# Mean of every numeric metric per algorithm, best test ROC AUC first.
# numeric_only=True keeps the non-numeric 'estimator' column out of the
# aggregation explicitly; pandas 2.x raises instead of dropping it silently.
df_grp = (df_result
          .groupby(['algorithm'])
          .mean(numeric_only=True)
          .reset_index()
          .sort_values(by='test_roc_auc',
                       ascending=False
                      )
         )

  .mean()


In [30]:
# Show which columns the per-algorithm summary table contains.
df_grp.columns

Index(['algorithm', 'fit_time', 'score_time', 'test_precision',
       'train_precision', 'test_recall', 'train_recall', 'test_f1', 'train_f1',
       'test_roc_auc', 'train_roc_auc', 'threshold'],
      dtype='object')

In [34]:
# Print the held-out metrics of each algorithm, in the order of df_grp
# (sorted by test ROC AUC, best first).
test_metric_columns = ['test_precision', 'test_recall', 'test_f1', 'test_roc_auc']
for index, df_row in df_grp.iterrows():
    algorithm_name = df_row['algorithm']
    print(f'Algorith : {algorithm_name}')
    print(df_row[test_metric_columns])
    print('*************** \n')


Algorith : random
test_precision    0.738555
test_recall       0.878118
test_f1           0.801964
test_roc_auc      0.884198
Name: 3, dtype: object
*************** 

Algorith : ada
test_precision    0.646652
test_recall       0.891828
test_f1           0.749283
test_roc_auc      0.812638
Name: 0, dtype: object
*************** 

Algorith : logistic
test_precision    0.657619
test_recall       0.862572
test_f1           0.745949
test_roc_auc      0.803156
Name: 1, dtype: object
*************** 

Algorith : tree
test_precision    0.522464
test_recall       0.956309
test_f1           0.665887
test_roc_auc      0.699343
Name: 4, dtype: object
*************** 

Algorith : naive
test_precision    0.494626
test_recall            1.0
test_f1           0.661873
test_roc_auc      0.685112
Name: 2, dtype: object
*************** 

