#### **This notebook trains and tests the replier model across different campaigns**

In [5]:
import pandas as pd
import numpy as np

import importlib

#### packages
import helper.strategy_helper as st_hp
import helper.visualization as vz_hp
import config.config as config_hp

import gc
import os

#### **Testing whether all the data are present**

In [6]:
# Campaigns for which a per-campaign replier feature file is expected.
list_camp = [
    'saudi_arabia_112019',
    'serbia_022020',
    'turkey_052020',
    'egypt_022020',
    'honduras_022020',
    'remain'
]

# Directory holding the feature pickles, relative to the notebook.
replier_camp_features = './../data'

# Report every campaign whose feature file is missing. Only paths are
# checked here -- no DataFrame is loaded -- so there is nothing to delete
# or garbage-collect afterwards.
missing_feature_files = []
for camp in list_camp:
    campaign_feat = os.path.join(
        replier_camp_features,
        f'replier_classification_{camp}_features.pkl.gz'
    )
    if not os.path.exists(campaign_feat):
        missing_feature_files.append(campaign_feat)
        print(campaign_feat , ' Not found')

#### **Train Test Functions**

In [7]:
from sklearn.metrics import classification_report
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_curve
from sklearn.metrics import roc_auc_score
from sklearn.metrics import auc
from sklearn.metrics import precision_recall_fscore_support
import helper.strategy_helper as st_hp
import helper.stat_helper as stat_hp
import pickle
from sklearn.metrics import make_scorer, precision_score, recall_score, f1_score, roc_auc_score
from sklearn.model_selection import TunedThresholdClassifierCV
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import RepeatedStratifiedKFold

# Re-import the stat helper module so that edits made to it on disk are
# picked up without restarting the kernel.
importlib.reload(stat_hp)

def train_model(df_filtered,
                y_column,
                campaign
               ):
    '''
    Train the replier model for one campaign and collect the results.

    ``stat_hp.run_imbalanced_model`` is called ten times on the full
    (un-resampled) feature frame and the frames it returns are stacked.

    :param df_filtered: Dataframe with features and the target column.
        It is not modified; the integer cast is applied to a copy.
    :param y_column: target column
    :param campaign: Name of campaign, stored in the ``campaign`` column

    :return: Dataframe with the concatenated helper results plus the
        ``campaign`` and ``index`` (repetition number) columns
    '''
    n_repetitions = 10

    # Cast the target on a copy so the caller's frame is left untouched.
    df_model = df_filtered.assign(
        **{y_column: df_filtered[y_column].astype(int)}
    )

    # The helper receives the full frame on every pass. A balanced
    # under-sample used to be drawn here as well, but it was never handed
    # to the helper and `sample` raised ValueError as soon as the shrinking
    # negative pool held fewer rows than the positive class.
    # NOTE(review): whether the repetitions differ depends on randomness
    # inside run_imbalanced_model, which is not visible from here -- confirm.
    all_df = []
    for i in range(n_repetitions):
        df_result = stat_hp.run_imbalanced_model(
            df_model,
            columns_not_include=[],
            model_type='random',
            y_column=y_column,
            filename=None,
        )

        df_result['campaign'] = campaign
        df_result['index'] = i

        all_df.append(df_result)

    return pd.concat(all_df, ignore_index=True)


def test(df_test, model, threshold, y_column):
    '''
    Test the model on one dataframe.

    :param df_test: Dataframe with features and the target column.
        It is not modified.
    :param model: Fitted estimator exposing ``predict_proba``
    :param threshold: Probability cut-off; rows whose positive-class
        probability is >= threshold are predicted as 1
    :param y_column: target column

    :return: List with the four values returned by
        ``precision_recall_fscore_support`` for label 1, the same four
        values for label 0, and finally the ROC AUC (9 items)
    '''
    columns_not_include = ['poster_tweetid', 'tweet_label',
                           'replier_userid', 'replier_label'
                          ]
    excluded = set(columns_not_include) | {y_column}

    # Keep the frame's own column order. Building the list from a set made
    # the order arbitrary, which can disagree with the order the estimator
    # was fitted on.
    feature_columns = [col for col in df_test.columns if col not in excluded]

    # When the estimator recorded its training feature names and they are
    # exactly the same columns, follow the fitted order.
    fitted_names = getattr(model, 'feature_names_in_', None)
    if fitted_names is not None and set(fitted_names) == set(feature_columns):
        feature_columns = list(fitted_names)

    X_test = df_test[feature_columns]
    # Cast a copy of the target instead of overwriting the caller's column.
    y_test = df_test[y_column].astype(int)

    probabilities = model.predict_proba(X_test)[:, 1]

    predictions = (probabilities >= threshold).astype(int)

    prf_1 = precision_recall_fscore_support(y_test,
                                            predictions,
                                            average='binary',
                                            pos_label=1
                                           )
    prf_0 = precision_recall_fscore_support(y_test,
                                            predictions,
                                            average='binary',
                                            pos_label=0
                                           )

    # AUC is computed on the raw probabilities, so it does not depend on
    # the threshold.
    roc_auc = roc_auc_score(y_test, probabilities)

    print('ROC :', roc_auc)

    return [*prf_1, *prf_0, roc_auc]

In [8]:
def test_all_campaign():
    '''
    Evaluate each campaign's best model on every other campaign.

    For every campaign in the pickled training results, the row with the
    highest ``test_f1`` is selected, its ``estimator`` and that estimator's
    ``best_threshold_`` are taken, and ``test`` is run on the feature file
    of each remaining campaign. One row per (train, test) pair is written
    to ``./../data/replier_model_test_result_cross_campaign.pkl.gz``.
    '''
    replier_camp_features = './../data/'

    result_columns = ['precision_1', 'recall_1', 'f1_1', 'support_1',
                      'precision_0', 'recall_0', 'f1_0', 'support_0',
                      'auc',
                      'test', 'test_campaign',
                      'train', 'train_campaign'
                     ]

    # Load data that has trained models
    df_models = pd.read_pickle(
        os.path.join(replier_camp_features,
                     'replier_model_cross_camp_train.pkl.gz')
    )
    campaigns = df_models['campaign'].unique()

    all_result = []
    for camp in campaigns:
        df_camp = df_models.loc[df_models['campaign'] == camp]

        # Best run of this campaign according to held-out F1.
        max_f1_row = df_camp.loc[df_camp['test_f1'].idxmax()]

        model = max_f1_row['estimator']
        threshold = model.best_threshold_

        for camp_test in campaigns:
            if camp_test == camp:
                continue

            print(camp , ' train : test ', camp_test)

            df_test = pd.read_pickle(
                os.path.join(
                    replier_camp_features,
                    f'replier_classification_{camp_test}_features.pkl.gz'
                )
            )
            # Training frames are zero-filled before fitting, so apply the
            # same missing-value handling to the evaluation frames.
            df_test = df_test.fillna(0)

            result = test(df_test,
                          model,
                          threshold,
                          'replier_label'
                         )
            result.extend(['test', camp_test, 'train', camp])
            all_result.append(result)

    # Write to the file name the notebook loads the cross-campaign results
    # from; the earlier "..._train_cross_campaign" name was never read back.
    pd.DataFrame(all_result, columns=result_columns).to_pickle(
        os.path.join(replier_camp_features,
                     'replier_model_test_result_cross_campaign.pkl.gz')
    )
    


In [9]:
def train_all_campaign():
    '''
    Fit the replier model on every campaign and pickle the stacked results.

    Each campaign's feature file is read, missing values are replaced by 0,
    and the frame is handed to ``train_model`` with ``replier_label`` as the
    target. The per-campaign results are concatenated and written to
    ``./../data/replier_model_cross_camp_train.pkl.gz``.
    '''
    replier_camp_features = './../data'

    list_camp = [
        'saudi_arabia_112019',
        'serbia_022020',
        'turkey_052020',
        'egypt_022020',
        'honduras_022020',
        'remain'
    ]

    result = []
    for campaign in list_camp:
        print(f'****** Train on {campaign} ************')

        feature_file = (
            f'{replier_camp_features}/replier_classification_{campaign}_features.pkl.gz'
        )
        # Zero-fill missing feature values before training.
        df_train = pd.read_pickle(feature_file).fillna(0)

        print(campaign)
        print(len(df_train.columns))

        result.append(
            train_model(df_train, 'replier_label', campaign)
        )

    combined = pd.concat(result, ignore_index=True)
    combined.to_pickle('./../data/replier_model_cross_camp_train.pkl.gz')
            

In [10]:
# Uncomment to re-run training and cross-campaign testing. Both functions
# re-read the campaign feature files and overwrite their result pickles
# under ./../data.
# train_all_campaign()
# test_all_campaign()

#### **Load the results for models trained and tested on the same campaign**

In [11]:
# Training results written by train_all_campaign().
df_models = pd.read_pickle('./../data/replier_model_cross_camp_train.pkl.gz')

# Held-out metrics to average per campaign.
columns = ['test_precision', 'test_recall', 'test_f1']

print(df_models.columns)

# Mean of each metric over all rows belonging to a campaign.
df_grp_result = df_models.groupby('campaign')[columns].mean()

df_grp_result

Index(['fit_time', 'score_time', 'estimator', 'test_precision',
       'train_precision', 'test_recall', 'train_recall', 'test_f1', 'train_f1',
       'test_roc_auc', 'train_roc_auc', 'threshold', 'algorithm', 'campaign'],
      dtype='object')


Unnamed: 0_level_0,test_precision,test_recall,test_f1
campaign,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
egypt_022020,0.88401,0.98105,0.929679
honduras_022020,0.921575,0.986947,0.953075
remain,0.91445,0.985664,0.948691
saudi_arabia_112019,0.89922,0.980112,0.937703
serbia_022020,0.841366,0.963488,0.897961
turkey_052020,0.882405,0.970611,0.924195


#### **Results for models trained on one campaign and tested on another campaign**

In [12]:
# Cross-campaign evaluation results, with `train_campaign` and
# `test_campaign` columns identifying each row.
cross_campaign_path = './../data/replier_model_test_result_cross_campaign.pkl.gz'
df_result = pd.read_pickle(cross_campaign_path)

In [13]:
# For every training campaign, print the `f1_1` score obtained on each
# campaign it was tested on.
campaigns = df_result['train_campaign'].unique()
print(campaigns)

shown_columns = ['f1_1', 'test_campaign', 'train_campaign']
for camp in campaigns:
    print(f' \n *** Trained on : {camp} **** \n')
    print(f'*** Test on: ****')
    trained_on_camp = df_result['train_campaign'] == camp
    df_temp = df_result.loc[trained_on_camp]
    print(df_temp[shown_columns])


['saudi_arabia_112019' 'serbia_022020' 'turkey_052020' 'egypt_022020'
 'honduras_022020' 'remain']
 
 *** Trained on : saudi_arabia_112019 **** 

*** Test on: ****
       f1_1    test_campaign       train_campaign
0  0.794935    serbia_022020  saudi_arabia_112019
1  0.867761    turkey_052020  saudi_arabia_112019
2  0.865296     egypt_022020  saudi_arabia_112019
3  0.558044  honduras_022020  saudi_arabia_112019
4  0.900870           remain  saudi_arabia_112019
 
 *** Trained on : serbia_022020 **** 

*** Test on: ****
       f1_1        test_campaign train_campaign
5  0.537352  saudi_arabia_112019  serbia_022020
6  0.761450        turkey_052020  serbia_022020
7  0.538059         egypt_022020  serbia_022020
8  0.483117      honduras_022020  serbia_022020
9  0.497664               remain  serbia_022020
 
 *** Trained on : turkey_052020 **** 

*** Test on: ****
        f1_1        test_campaign train_campaign
10  0.905575  saudi_arabia_112019  turkey_052020
11  0.911500        serbia_02202