#### **This notebook compares the results of different algorithms for the tweet classifier**

In [2]:
import pandas as pd

import importlib

#### packages
import helper.strategy_helper as st
import helper.visualization as viz_hp
import config.config as config_hp
import matplotlib.pyplot as plt
import helper.stat_helper as stat_hp

In [2]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_curve
from sklearn.metrics import roc_auc_score
from sklearn.metrics import auc

from sklearn.metrics import precision_recall_fscore_support

from sklearn.model_selection import cross_validate
from sklearn import tree

#### **Load data**

In [3]:
# Import before first use so this cell does not depend on an earlier cell
# having already imported importlib.
import importlib

# Pick up any edits made to the helper module since it was first imported.
importlib.reload(stat_hp)

# Feature table for the tweet classifier (path is relative to the working directory).
tweet_features = './../data/RQ2_tweet_classifier_features.csv'

df_all_stat = pd.read_csv(tweet_features)

In [4]:
# NOTE(review): the "- 2" presumably discounts two non-feature columns
# (e.g. an id and the label) -- TODO confirm which two are meant.
print('No. of features: ', df_all_stat.shape[1] - 2)

No. of features:  99


In [5]:
# Print the column index of the loaded table (features plus target/id columns).
print('All features with target: \n', df_all_stat.columns)

All features with target: 
 Index(['std_retweet_count', 'range_reply_count', 'entropy_num_hashtags',
       'std_num_url', 'kurtosis_like_count', 'range_like_count',
       'skew_like_count', 'skew_reply_count', '50%_mention_count',
       'kurtosis_cosine',
       ...
       'max_like_count', 'mean_diff_min', 'entropy_retweet_count',
       'mean_retweet_count', 'min_cosine', 'max_num_hashtags',
       'entropy_mention_count', 'range_num_hashtags', 'entropy_cosine',
       'range_retweet_count'],
      dtype='object', length=101)


#### **Code to train model**

In [6]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.ensemble import AdaBoostClassifier

def run_model(df,
              columns_not_include=None,
              model_type='random', 
              y_column = 'tweet_label',
              filename=None,
             ):
    '''
    Cross-validates one classifier with a tuned decision threshold and
    returns the per-fold scores.

    The chosen estimator is wrapped in a StandardScaler pipeline and in a
    TunedThresholdClassifierCV (threshold tuned for F1), then evaluated with
    repeated stratified k-fold cross-validation.

    :param df: Dataframe holding the feature columns and the target column
    :param columns_not_include: Extra columns to drop from the features.
        The id/label columns ('poster_tweetid', 'tweet_label',
        'replier_userid', 'replier_label') and ``y_column`` are always
        dropped. The passed list is not modified.
    :param model_type: One of 'logistic', 'random', 'ada', 'tree', 'naive'
    :param y_column: Name of the target column
    :param filename: Currently unused; kept for backward compatibility

    :return: Dataframe with one row per cross-validation fold containing the
        ``cross_validate`` output (train/test precision, recall, f1, roc_auc
        and the fitted estimator) plus the columns 'threshold' (tuned
        decision threshold of that fold) and 'algorithm' (``model_type``)
    :raises ValueError: If ``model_type`` is not a supported algorithm
    '''
    # Fail fast on typos instead of hitting a NameError further down.
    supported_models = ('logistic', 'random', 'ada', 'tree', 'naive')
    if model_type not in supported_models:
        raise ValueError(
            f'Unknown model_type {model_type!r}; '
            f'expected one of {supported_models}'
        )

    # These names are not imported at notebook level, so import them here.
    from sklearn.metrics import (make_scorer, precision_score, recall_score,
                                 f1_score, roc_auc_score)
    from sklearn.model_selection import (RepeatedStratifiedKFold,
                                         TunedThresholdClassifierCV)
    from sklearn.pipeline import make_pipeline

    print(f'\n **** {model_type} ****')

    ### Remove unnecessary columns
    # Build a fresh set so neither the caller's list nor a shared default
    # is mutated between calls.
    excluded = set(columns_not_include or [])
    excluded.update(
        ['poster_tweetid', 'tweet_label', 'replier_userid', 'replier_label'])
    # The target must never be part of the features, whatever it is called.
    excluded.add(y_column)

    # Keep the dataframe's own column order: a set difference would give a
    # hash-dependent order that can differ between kernel sessions and
    # change the results of order-sensitive (tree-based) models.
    columns_to_keep = [col for col in df.columns if col not in excluded]

    X = df[columns_to_keep]
    y = df[y_column]

    ### Choose model
    if model_type == 'logistic':
        model = LogisticRegression(random_state=0)
    elif model_type == 'random':
        print('Running Random Forest')
        model = RandomForestClassifier(n_estimators=100,
                                       random_state=42
                                      )
    elif model_type == 'ada':
        from sklearn.ensemble import AdaBoostClassifier
        model = AdaBoostClassifier(n_estimators=100,
                                   algorithm="SAMME",
                                   random_state=0
                                  )
    elif model_type == 'tree':
        # NOTE(review): no random_state is set here, so repeated runs of the
        # decision tree may differ slightly -- confirm whether that is wanted.
        model = tree.DecisionTreeClassifier()
    else:  # 'naive' (validated above)
        from sklearn.naive_bayes import GaussianNB
        model = GaussianNB()

    ### Choose scoring function
    # roc_auc needs probabilities; response_method replaces the deprecated
    # needs_proba flag of make_scorer.
    scoring = {
        'precision': make_scorer(precision_score, average='binary'),
        'recall': make_scorer(recall_score, average='binary'),
        'f1': make_scorer(f1_score, average='binary'),
        'roc_auc': make_scorer(roc_auc_score,
                               response_method='predict_proba'),
    }

    # Pipeline: standardize the features, then fit the classifier.
    model = make_pipeline(StandardScaler(), model)

    # Outer evaluation: 10 stratified folds repeated 5 times (50 fits).
    cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=5, random_state=42)

    # Inside every outer training split the decision threshold is tuned for
    # F1 using TunedThresholdClassifierCV's own internal cross-validation
    # (library default, since no cv is passed to it).
    tuned_model = TunedThresholdClassifierCV(estimator=model,
                                             scoring='f1',
                                             store_cv_results = True,
                                             n_jobs=-1
                                            )

    cv_results_tuned_model = pd.DataFrame(
        cross_validate(
            tuned_model,
            X,
            y,
            scoring=scoring,
            cv=cv,
            return_train_score=True,
            return_estimator=True,
        )
    )

    # Record the threshold each fold's fitted estimator settled on.
    cv_results_tuned_model['threshold'] = [
        est.best_threshold_ for est in cv_results_tuned_model['estimator']
    ]

    cv_results_tuned_model['algorithm'] = model_type

    return cv_results_tuned_model

#### **Run for all algorithms**

In [None]:
algorithms = ['logistic', 'ada', 'random', 'tree', 'naive']

# Collect one cross-validation result frame per algorithm, in the order above.
all_results = []
for algo in algorithms:
    df_result = run_model(
        df_all_stat,
        columns_not_include=['list_age'],
        model_type=algo,
        y_column='tweet_label',
        filename=None,
    )
    all_results.append(df_result)

# Stack every algorithm's folds into one frame and persist it for later cells.
pd.concat(all_results, ignore_index=True).to_pickle(
    './../results/tweet_classifier_different_algorithm.pkl.gz'
)

#### **Load the results**

In [3]:
# Load the cross-validation results written by the "Run for all algorithms" cell.
# NOTE(review): unpickling can execute arbitrary code -- only load files
# produced by this project.
df_result = pd.read_pickle(
    './../results/tweet_classifier_different_algorithm.pkl.gz'
)

In [19]:
columns = ['test_precision', 'test_recall',
           'test_f1', 'test_roc_auc',
           'algorithm'
          ]

# Mean test score per algorithm across all folds, best ROC AUC first.
df_grp = (
    df_result[columns]
    .groupby('algorithm', as_index=False)
    .mean()
    .sort_values(by='test_roc_auc', ascending=False)
)

In [20]:
# Check which columns the aggregated table contains.
df_grp.columns

Index(['algorithm', 'test_precision', 'test_recall', 'test_f1',
       'test_roc_auc'],
      dtype='object')

In [24]:
# Display the mean test scores per algorithm.
df_grp

Unnamed: 0,algorithm,test_precision,test_recall,test_f1,test_roc_auc
3,random,0.738555,0.878118,0.801964,0.884198
0,ada,0.646652,0.891828,0.749283,0.812638
1,logistic,0.657619,0.862572,0.745949,0.803156
4,tree,0.522464,0.956309,0.665887,0.699343
2,naive,0.494626,1.0,0.661873,0.685112


In [4]:
def print_standard_error(values, label):
    '''
    Prints the mean of the values together with its standard error.

    The standard error is computed as std / sqrt(n), where std is the
    value returned by ``np.std`` with its default settings (ddof=0, i.e.
    the population standard deviation).

    :param values: Sequence of numeric values (e.g. a list or a
    pandas Series) to summarize
    :param label: What is the label for values (used in the printed line)

    :return mean_values: Mean of values
    :return std_values: Standard deviation of values (note: this is the
    standard deviation, not the standard error that is printed)
    '''
    # Only numpy is needed; imported locally because the notebook does not
    # import it at the top level.
    import numpy as np

    mean_values = np.mean(values)

    # Standard deviation, and the standard error of the mean derived from it
    std_values = np.std(values)
    error = std_values / np.sqrt(len(values))

    print(f"Mean {label}: {mean_values:.3f} ± standard error {error}")

    return mean_values, std_values

In [5]:
# NOTE(review): this rebinds df_grp (previously the aggregated mean table) to
# a GroupBy object; cells that expect the table must be re-run before reuse.
df_grp = (df_result
          .groupby(['algorithm'])
         )

for grp, df_values in df_grp:
    # Grouping by a one-element list yields 1-tuples as keys on pandas >= 2.0
    # and plain scalars on older versions; accept both so that the full
    # algorithm name is printed rather than its first character.
    algorithm_name = grp[0] if isinstance(grp, tuple) else grp
    print('Algorithm :', algorithm_name)

    # Each call prints "mean ± standard error" and returns (mean, std).
    mean_precision, std_prec = print_standard_error(
        df_values['test_precision'], 'precision'
    )
    mean_recall, std_recall = print_standard_error(
        df_values['test_recall'], 'recall'
    )
    mean_f1, std_f1 = print_standard_error(
        df_values['test_f1'], 'f1'
    )
    mean_auc, std_auc = print_standard_error(
        df_values['test_roc_auc'], 'AUC'
    )

    print('\n ******************** \n\n')

Algorithm : ada
Mean precision: 0.647 ± standard error 0.002208958853097435
Mean recall: 0.892 ± standard error 0.0033999301346241597
Mean f1: 0.749 ± standard error 0.0009829964361432405
Mean AUC: 0.813 ± standard error 0.0011568361817921474

 ******************** 


Algorithm : logistic
Mean precision: 0.658 ± standard error 0.0019474902940183534
Mean recall: 0.863 ± standard error 0.0030967212309957674
Mean f1: 0.746 ± standard error 0.0009926883617619366
Mean AUC: 0.803 ± standard error 0.0012178880279946317

 ******************** 


Algorithm : naive
Mean precision: 0.495 ± standard error 1.8281269613566455e-05
Mean recall: 1.000 ± standard error 0.0
Mean f1: 0.662 ± standard error 1.6364951812737582e-05
Mean AUC: 0.685 ± standard error 0.0017211828767611777

 ******************** 


Algorithm : random
Mean precision: 0.739 ± standard error 0.0026694748875367644
Mean recall: 0.878 ± standard error 0.0024688878343244593
Mean f1: 0.802 ± standard error 0.001163087918525277
Mean AUC: