#### **This notebook test for different algorithm results for tweet classifier**

In [1]:
import pandas as pd

import importlib

#### packages
import helper.strategy_helper as st
import helper.visualization as viz_hp
import config.config as config_hp
import matplotlib.pyplot as plt
import helper.stat_helper as stat_hp

In [2]:
importlib.reload(stat_hp)

import importlib

importlib.reload(config_hp)

config = config_hp.config()
stat = config['STATS']

final_stat = stat['final_stat']

df_all_stat = pd.read_pickle(final_stat)

In [3]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.ensemble import AdaBoostClassifier

def run_model(df,
              columns_not_include=['list_age'],
              model_type='random', 
              pca=False,
              y_column = 'tweet_label',
              filename=None,
              just_f1=False,
              find_threshold=True
             ):
    '''
    Trains the model and prints the result
    :param df: Dataframe
    :param model_type: Type of model
    :param pca: Whether to do PCA or not
    :param columns_not_include: columns to not include
    '''
    print(f'\n **** {model_type} ****')
    
    import pickle

    model_filename =' k' #'user_classifier_without_pca_ran.sav'
    
    columns_not_include.extend(
        ['poster_tweetid','tweet_label', 'replier_userid', 'replier_label'])
    
    columns_to_keep = list(set(df.columns) - set(columns_not_include))

    X = df[columns_to_keep]
    y = df[y_column]
  
    if 'mean_tensor' in columns_to_keep:
        t = df['mean_tensor'].tolist()
        t = torch.stack(t)
        t = t[:, :100]
        
        columns_to_keep.remove('mean_tensor')
        
        z = df[columns_to_keep]
        k = torch.tensor(z.values)
        X = torch.cat((t, k), dim=1)
    else:
        print(df[y_column].unique())
        X = df[columns_to_keep]
        
    #PCA 
    scaler = StandardScaler()
    X = scaler.fit_transform(X)
    indices = df.index
    
    if pca == True:
        print('here')
        print(len(columns_to_keep))
        pca = PCA()

        # Fit the PCA object to the data and transform the data
        X = pca.fit_transform(X)
        print('After PCA shape ', X.shape)

    X_train, X_test, y_train, y_test, indices_train, indices_test = train_test_split(X,
                                                                                     y,
                                                                                     indices,
                                                        random_state=104, 
                                                        stratify=y,
                                                        test_size=0.20, 
                                                        shuffle=True)

    print('Xtrain: ', len(X_train))
    print('Xtrain shape: ', X_train.shape)
    print('Xtest: ', len(X_test))
    print('Ytrain: ', len(y_train))
    print('Ytest: ', len(y_test))

    if model_type == 'logistic':
        model = LogisticRegression(random_state=0)
    elif model_type == 'random':
        print('Running Random Forest')
        model = RandomForestClassifier(n_estimators=100, 
                                   random_state=42
                                  )
    elif model_type == 'ada':
        model = AdaBoostClassifier(n_estimators=100,
                                 algorithm="SAMME", random_state=0)
    elif model_type == 'tree':
        model = tree.DecisionTreeClassifier()
    elif model_type == 'naive':
        from sklearn.naive_bayes import GaussianNB
        model = GaussianNB()

    model.fit(X_train, y_train)
            
            # pickle.dump(model, open(model_filename, 'wb'))
    
    print(model.score(X_train, y_train))

    y_pred = model.predict(X_test)

    result = classification_report(y_test, y_pred, 
                                   labels=[0,1])
    prf_1 = precision_recall_fscore_support(y_test, 
                                y_pred,
                                average='binary',
                                pos_label=1
                               )    
    print(result)
    prf_0 = precision_recall_fscore_support(y_test, 
                                y_pred,
                                average='binary',
                                pos_label=0
                               )    
    cm = confusion_matrix(y_test, y_pred)

    print(cm)
    
    #Cross validation
    scoring = {'precision', 
               'recall',
               'f1',
               'roc_auc'
              }

    scores = cross_validate(model, X, y, scoring=scoring, cv=10)
    mean_score_f1 = round(scores['test_f1'].mean(), 2)
    std_score_f1 = round(scores['test_f1'].std(), 2)
    
    mean_score_precision = round(scores['test_precision'].mean(), 2)
    std_score_precision = round(scores['test_precision'].std(), 2)
    
    mean_score_recall = round(scores['test_recall'].mean(), 2)
    std_score_recall = round(scores['test_recall'].std(), 2)
    
     
    mean_score_auc = round(scores['test_roc_auc'].mean(), 2)
    std_score_auc = round(scores['test_roc_auc'].std(), 2)
    
    print(f'Cross validation: mean {mean_score_f1} f1 with a standard deviation of {std_score_f1}')
    
    print(f'Cross validation: mean {mean_score_precision} precision with a standard deviation of {std_score_precision}')
    
    print(f'Cross validation: mean {mean_score_recall} recall with a standard deviation of {std_score_recall}')
    
        
    print(f'Cross validation: mean {mean_score_auc} auc with standard deviation of {std_score_auc}')
    
    final_score = {
            'mean_f1': mean_score_f1,
            'mean_precision': mean_score_precision,
            'mean_recall': mean_score_recall,
            'mean_auc': mean_score_auc
    }
        
    #ROC curve
    lr_probs = model.predict_proba(X_test)
    
    fpr, tpr, thresholds = roc_curve(y_test, lr_probs[:, 1])
    
    # Compute the AUC score
    roc_auc = auc(fpr, tpr)

    if filename != None:
        fig.savefig(f'{filename}')
    
    from sklearn.metrics import precision_recall_curve

    # y_true and y_scores are the true labels and predicted scores, respectively
    precision, recall, thresholds = precision_recall_curve(y_test,
                                                        lr_probs[:, 1])
    df_pred = df.loc[indices_test]
    df_pred['pred'] = y_pred
    
    return model, df_pred, roc_auc, prf_1, prf_0, mean_score_f1, std_score_f1, final_score

In [4]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_curve
from sklearn.metrics import roc_auc_score
from sklearn.metrics import auc

from scipy.stats import ks_2samp
from scipy.stats import mannwhitneyu
from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_recall_fscore_support

from sklearn.model_selection import cross_validate
from sklearn import tree

In [9]:
algorithms = ['logistic', 'ada', 'random', 'tree', 'naive']
all_results = []
for algo in algorithms:

    model, df_return, roc, prf_1, prf_0, mean_score_f1, std_score_f1, final_score = run_model(df_all_stat,
                                  columns_not_include=[],
                                  model_type=algo, 
                                  pca=False,
                                  y_column = 'tweet_label',
                                  filename=None,
                                 just_f1=False,
                                 find_threshold=False)
    
    all_results.append([algo, final_score['mean_f1'],
                        final_score['mean_precision'],
                        final_score['mean_recall'],
                        final_score['mean_auc']
                       ]
                      )
    
(pd.DataFrame(data=all_results,
              columns=['algorithm', 'mean_f1',
                       'mean_precision', 'mean_recall',
                       'mean_auc'
                      ]
             )
).to_pickle('./data/tweet_classifier_different_algorithm.pkl.gz')


 **** logistic ****
[1 0]
Xtrain:  6252
Xtrain shape:  (6252, 99)
Xtest:  1564
Ytrain:  6252
Ytest:  1564


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


0.7404030710172744
              precision    recall  f1-score   support

           0       0.73      0.74      0.74       790
           1       0.73      0.72      0.73       774

    accuracy                           0.73      1564
   macro avg       0.73      0.73      0.73      1564
weighted avg       0.73      0.73      0.73      1564

[[585 205]
 [213 561]]


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

Cross validation: mean 0.72 f1 with a standard deviation of 0.04
Cross validation: mean 0.73 precision with a standard deviation of 0.06
Cross validation: mean 0.71 recall with a standard deviation of 0.06
Cross validation: mean 0.79 auc with standard deviation of 0.06

 **** ada ****
[1 0]
Xtrain:  6252
Xtrain shape:  (6252, 99)
Xtest:  1564
Ytrain:  6252
Ytest:  1564
0.7496801023672425
              precision    recall  f1-score   support

           0       0.73      0.71      0.72       790
           1       0.72      0.73      0.72       774

    accuracy                           0.72      1564
   macro avg       0.72      0.72      0.72      1564
weighted avg       0.72      0.72      0.72      1564

[[564 226]
 [206 568]]
Cross validation: mean 0.71 f1 with a standard deviation of 0.06
Cross validation: mean 0.71 precision with a standard deviation of 0.06
Cross validation: mean 0.71 recall with a standard deviation of 0.09
Cross validation: mean 0.8 auc with standard deviatio

In [10]:
df_result = pd.read_pickle(
    './data/tweet_classifier_different_algorithm.pkl.gz'
)

df_result[[
    'algorithm', 
    'mean_precision', 
    'mean_recall',
    'mean_f1', 
    'mean_auc'
]].sort_values(by='mean_auc',
               ascending=False
              )

Unnamed: 0,algorithm,mean_precision,mean_recall,mean_f1,mean_auc
2,random,0.76,0.76,0.76,0.84
1,ada,0.71,0.71,0.71,0.8
0,logistic,0.73,0.71,0.72,0.79
4,naive,0.71,0.2,0.31,0.68
3,tree,0.66,0.65,0.66,0.66
