#### **This notebook tests the API**

In [2]:
import pandas as pd
import numpy as np

import importlib

#### packages
import helper.strategy_helper as st_hp
import helper.visualization as vz_hp
import config.config as config_hp

#### **Train Test Functions**

In [19]:
from sklearn.metrics import classification_report
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_curve
from sklearn.metrics import roc_auc_score
from sklearn.metrics import auc
from sklearn.metrics import precision_recall_fscore_support
import helper.strategy_helper as st_hp

def train_model(df_filtered,
                y_column,
                campaign,
                result
               ):
    '''
    Train a threshold-tuned random forest on one campaign's feature frame.

    A StandardScaler + RandomForestClassifier pipeline is wrapped in
    TunedThresholdClassifierCV, which selects the decision threshold that
    maximises the F1 score of the positive class (label 1) under
    cross-validation.

    :param df_filtered: Dataframe with features (left unmodified)
    :param y_column: target column
    :param campaign: Name of campaign (used for logging and the result row)
    :param result: Global result list; one 'train' row is appended in place

    :return result, model, best_threshold
    '''
    # All imports come first. Importing f1_score further down (as before) makes
    # it a function-local name, so using it earlier raised UnboundLocalError.
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.metrics import f1_score, make_scorer
    from sklearn.model_selection import RepeatedStratifiedKFold
    from sklearn.model_selection import TunedThresholdClassifierCV
    from sklearn.pipeline import make_pipeline
    from sklearn.preprocessing import StandardScaler

    total_data = len(df_filtered)

    # Identifier and label columns are never features. The target itself is
    # excluded too, so a y_column other than 'tweet_label' cannot leak into X.
    columns_not_include = ['poster_tweetid',
                           'tweet_label',
                           y_column,
                          ]

    columns_to_keep = list(set(df_filtered.columns) - set(columns_not_include))

    X = df_filtered[columns_to_keep]
    # Cast a derived Series instead of writing back into the caller's frame.
    y = df_filtered[y_column].astype(int)

    # model
    pos_label = 1
    scorer = make_scorer(f1_score, pos_label=pos_label)

    model = RandomForestClassifier(n_estimators=100,
                                   random_state=42
                                  )
    model = make_pipeline(StandardScaler(), model)
    cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=1, random_state=42)
    # cv must be passed explicitly; otherwise the tuner silently falls back to
    # its default splitter and the seeded 10-fold scheme above is never used.
    tuned_model = TunedThresholdClassifierCV(estimator=model,
                                             scoring=scorer,
                                             cv=cv,
                                             store_cv_results=False,
                                             n_jobs=-1
                                            )

    tuned_model.fit(X, y)
    best_f1 = tuned_model.best_score_
    best_threshold = tuned_model.best_threshold_

    print(f'Best f1 for {campaign} :', best_f1)
    print(f'Best threshold for {campaign} :',best_threshold)

    # Row layout matches the columns used when the results are saved:
    # campaign, total_data, best_f1_1, threshold, best_f1_0, type, trainded_on
    result.append([campaign, total_data,
                   best_f1, best_threshold,
                   None, 'train', campaign
                  ])

    return result, tuned_model, best_threshold


def test(df_test, model, threshold, y_column):
    '''
    Score a fitted model on a held-out campaign at a fixed decision threshold.

    :param df_test: Dataframe with test data (left unmodified)
    :param model: Trained model exposing predict_proba
    :param threshold: Threshold to use for the classification
    :param y_column: target column

    :return prf_1, prf_0, roc_auc: precision/recall/F1/support tuples for the
        positive (1) and negative (0) class, and the ROC AUC of the scores
    '''
    # Identifiers, labels and the target itself are never passed as features.
    columns_not_include = ['poster_tweetid', 'tweet_label',
                           'replier_userid', 'replier_label',
                           y_column,
                          ]

    columns_to_keep = list(set(df_test.columns) - set(columns_not_include))

    # Set iteration order is arbitrary, while scikit-learn requires the column
    # order seen during fit. When the model recorded its feature names and they
    # are the same set, reuse that order; otherwise leave the selection as is
    # and let scikit-learn report the mismatch.
    fitted_names = getattr(model, 'feature_names_in_', None)
    if fitted_names is not None and set(fitted_names) == set(columns_to_keep):
        columns_to_keep = list(fitted_names)

    X_test = df_test[columns_to_keep]
    # Cast a derived Series instead of writing back into the caller's frame.
    y_test = df_test[y_column].astype(int)

    # Probability of the positive class, thresholded with the supplied value
    # (the unused extra model.predict pass was dropped).
    probabilities = model.predict_proba(X_test)[:, 1]

    predictions = (probabilities >= threshold).astype(int)

    prf_1 = precision_recall_fscore_support(y_test,
                                predictions,
                                average='binary',
                                pos_label=1
                               )

    prf_0 = precision_recall_fscore_support(y_test,
                                predictions,
                                average='binary',
                                pos_label=0
                               )

    # AUC is computed from the raw scores, so it does not depend on threshold.
    roc_auc = roc_auc_score(y_test, probabilities)

    print('ROC :', roc_auc)

    return prf_1, prf_0, roc_auc

#### **Get top 5 campaigns**

In [9]:
# All targeted tweets and their replies
balanced_pos_conversation = './../data/balanced_positive_conversation.pkl.gz'
df_pos = pd.read_pickle(balanced_pos_conversation)

# Number of distinct poster tweets per campaign, largest campaign first
tweets_per_campaign = (
    df_pos.groupby(['campaign'])['poster_tweetid'].nunique(dropna=False)
)
df_grp_camp = (
    tweets_per_campaign
    .to_frame('count')
    .reset_index()
    .sort_values(by=['count'], ascending=False)
)

In [10]:
# Preview the five campaigns with the most distinct poster tweets
df_grp_camp.head()['campaign']

20          serbia_022020
19    saudi_arabia_112019
23          turkey_052020
8            egypt_022020
18        sa_eg_ae_022020
Name: campaign, dtype: object

In [11]:
# Names of the five largest campaigns, as a plain Python list
list_campaign = df_grp_camp.head()['campaign'].tolist()
list_campaign

['serbia_022020',
 'saudi_arabia_112019',
 'turkey_052020',
 'egypt_022020',
 'sa_eg_ae_022020']

#### **Train and test the models**

In [None]:
import helper.stat_helper as stat_hp
importlib.reload(stat_hp)
import pickle
from sklearn.metrics import make_scorer, precision_score, recall_score, f1_score, roc_auc_score
from sklearn.model_selection import TunedThresholdClassifierCV
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.metrics import f1_score

# Directory holding the per-campaign feature frames; models and the final
# result table are written to the same place.
camp_stats = './../results'

# Five largest campaigns plus the pooled 'remain' bucket.
list_campaign = df_grp_camp.head()['campaign'].tolist()
result =[]
list_campaign.extend(['remain'])

for campaign in list_campaign:

    print(f'****** Train on {campaign} ************')

    df_train = pd.read_pickle(f'{camp_stats}/tweet_classifier_{campaign}_features.pkl.gz')

    print(campaign)

    # Fit on this campaign; appends one 'train' row to result.
    result, model, threshold = train_model(df_train, 'tweet_label',
                                           campaign, result)

    # Persist the fitted model next to the features (same directory as before,
    # now derived from camp_stats instead of a duplicated literal).
    with open(f'{camp_stats}/tweet_classifier_{campaign}_model.pkl', 'wb') as f:
        pickle.dump(model, f)

    # Evaluate the model on every other campaign.
    for camp_test in list_campaign:
        if campaign == camp_test:
            continue

        print(f'********* Test on {camp_test} *******************')
        df_test = pd.read_pickle(f'{camp_stats}/tweet_classifier_{camp_test}_features.pkl.gz')

        total_data = len(df_test)

        # Distinct poster tweets per class, then overall
        print(df_test.loc[df_test['tweet_label'] == 1, 'poster_tweetid'].nunique())
        print(df_test.loc[df_test['tweet_label'] == 0, 'poster_tweetid'].nunique())
        # info() prints its own report and returns None; wrapping it in print()
        # only added a stray "None" line to the output.
        df_test.info()
        print(df_test['poster_tweetid'].nunique())

        # test
        prf_1, prf_0, roc_auc = test(df_test, model,
                                     threshold, 'tweet_label'
                                    )

        # Index 2 of each precision/recall/F1/support tuple is the F1 score.
        result.append([camp_test, total_data,
                       prf_1[2], threshold,
                       prf_0[2], 'test', campaign,
                      ])


# NOTE: the 'trainded_on' spelling is kept because the cells that read this
# file back select the column by that exact name.
(pd.DataFrame(result,
              columns=['campaign', 'total_data',
                       'best_f1_1', 'threshold',
                       'best_f1_0', 'type',
                       'trainded_on'
                      ]
              )
).to_pickle(f'{camp_stats}/tweet_classifier_train_test_cross_campaign.pkl.gz')

#### **Load the result**

In [3]:
 print(f'********* Test on remaining *******************')
df_test = pd.read_pickle('./../results/tweet_classifier_train_test_cross_campaign.pkl.gz')

print(df_test.columns)

********* Test on remaining *******************
Index(['campaign', 'total_data', 'best_f1_1', 'threshold', 'best_f1_0', 'type',
       'trainded_on'],
      dtype='object')


#### **Trained and tested on same campaign**

Note: the `remain` campaign corresponds to the "Others" group.

In [4]:
# Keep only the 'train' rows: the cross-validated F1 of each model on the
# campaign it was fitted on.
is_train_row = df_test['type'] == 'train'
df_train = df_test.loc[is_train_row]

df_train[['campaign', 'best_f1_1', 'trainded_on']]

Unnamed: 0,campaign,best_f1_1,trainded_on
0,serbia_022020,0.854742,serbia_022020
6,saudi_arabia_112019,0.764838,saudi_arabia_112019
12,turkey_052020,0.747362,turkey_052020
18,egypt_022020,0.657902,egypt_022020
24,sa_eg_ae_022020,0.732042,sa_eg_ae_022020
30,remain,0.74862,remain


#### **Trained on one campaign and tested on the others**

##### **Serbia**

In [7]:
# 'test' rows for the model fitted on serbia_022020
is_test_row = df_test['type'] == 'test'
from_this_model = df_test['trainded_on'] == 'serbia_022020'
df_train = df_test.loc[is_test_row & from_this_model]

df_train[['campaign', 'best_f1_1', 'trainded_on']]

Unnamed: 0,campaign,best_f1_1,trainded_on
1,saudi_arabia_112019,0.547735,serbia_022020
2,turkey_052020,0.614035,serbia_022020
3,egypt_022020,0.564816,serbia_022020
4,sa_eg_ae_022020,0.527687,serbia_022020
5,remain,0.659993,serbia_022020


##### **saudi_arabia_112019**

In [8]:
# 'test' rows for the model fitted on saudi_arabia_112019
is_test_row = df_test['type'] == 'test'
from_this_model = df_test['trainded_on'] == 'saudi_arabia_112019'
df_train = df_test.loc[is_test_row & from_this_model]

df_train[['campaign', 'best_f1_1', 'trainded_on']]

Unnamed: 0,campaign,best_f1_1,trainded_on
7,serbia_022020,0.553689,saudi_arabia_112019
8,turkey_052020,0.531314,saudi_arabia_112019
9,egypt_022020,0.632082,saudi_arabia_112019
10,sa_eg_ae_022020,0.74374,saudi_arabia_112019
11,remain,0.682728,saudi_arabia_112019


##### **turkey_052020**

In [14]:
# 'test' rows for the model fitted on turkey_052020
is_test_row = df_test['type'] == 'test'
from_this_model = df_test['trainded_on'] == 'turkey_052020'
df_train = df_test.loc[is_test_row & from_this_model]

df_train[['campaign', 'best_f1_1', 'trainded_on']]

Unnamed: 0,campaign,best_f1_1,trainded_on
13,serbia_022020,0.615898,turkey_052020
14,saudi_arabia_112019,0.608516,turkey_052020
15,egypt_022020,0.632352,turkey_052020
16,sa_eg_ae_022020,0.651697,turkey_052020
17,remain,0.685958,turkey_052020


##### **egypt_022020**

In [15]:
# 'test' rows for the model fitted on egypt_022020
is_test_row = df_test['type'] == 'test'
from_this_model = df_test['trainded_on'] == 'egypt_022020'
df_train = df_test.loc[is_test_row & from_this_model]

df_train[['campaign', 'best_f1_1', 'trainded_on']]

Unnamed: 0,campaign,best_f1_1,trainded_on
19,serbia_022020,0.436723,egypt_022020
20,saudi_arabia_112019,0.373896,egypt_022020
21,turkey_052020,0.36736,egypt_022020
22,sa_eg_ae_022020,0.386059,egypt_022020
23,remain,0.39906,egypt_022020


##### **sa_eg_ae_022020**

In [16]:
# 'test' rows for the model fitted on sa_eg_ae_022020
is_test_row = df_test['type'] == 'test'
from_this_model = df_test['trainded_on'] == 'sa_eg_ae_022020'
df_train = df_test.loc[is_test_row & from_this_model]

df_train[['campaign', 'best_f1_1', 'trainded_on']]

Unnamed: 0,campaign,best_f1_1,trainded_on
25,serbia_022020,0.47514,sa_eg_ae_022020
26,saudi_arabia_112019,0.580475,sa_eg_ae_022020
27,turkey_052020,0.571922,sa_eg_ae_022020
28,egypt_022020,0.60614,sa_eg_ae_022020
29,remain,0.482398,sa_eg_ae_022020


##### **remain**

In [17]:
# 'test' rows for the model fitted on the pooled 'remain' bucket
is_test_row = df_test['type'] == 'test'
from_this_model = df_test['trainded_on'] == 'remain'
df_train = df_test.loc[is_test_row & from_this_model]

df_train[['campaign', 'best_f1_1', 'trainded_on']]

Unnamed: 0,campaign,best_f1_1,trainded_on
31,serbia_022020,0.62957,remain
32,saudi_arabia_112019,0.596991,remain
33,turkey_052020,0.636496,remain
34,egypt_022020,0.561988,remain
35,sa_eg_ae_022020,0.521053,remain
