#### **This notebook tests the different features of the user classifier**

#### **This script tests the result of each feature set in the replier classifier**

In [1]:
import pandas as pd
import numpy as np
import warnings
import matplotlib.pyplot as plt
import matplotlib.colors as pltc

import importlib

#### packages
import helper.visualization as vz_hp
import config.config as config_hp
import helper.stat_helper as stat_hp

#### **Load replier features**

In [2]:
# CSV holding the replier-classifier feature table (path is relative to this notebook).
all_feature = './../data/RQ3_replier_classifier_features.csv'

# Loaded once here; the "Run function" cell passes `df_stat` to individual_feature_test.
df_stat = pd.read_csv(all_feature)

#### **Functions**

In [9]:
def test(df_test, all_result, feat):
    '''
    Runs 10 fold cross validation in 10 different dataset with
    different algorithm
    :param df_stat: Dataframe with features
    :param alo_list: List of algorithms to try
    :param filename: Filename to save the result

    '''
    df_1 = df_test.loc[df_test['replier_label'] == 1]
    df_0 = df_test.loc[df_test['replier_label'] == 0]
    
    for i in range(0,10):
        df_sample = df_0.sample(len(df_1), random_state=i)

        df_0 = df_0.loc[~df_0['replier_userid'].isin(
            df_sample['replier_userid']
        )]

        df_all = df_1.append(df_sample)

        total_col = len(df_all.columns) - 2

        df_result = \
    stat_hp.run_model_with_best_threshold(df_test,
                      columns_not_include=[],
                      model_type='random', 
                      y_column = 'replier_label',
                      filename=None,
                     )

        df_result['feature'] = feat
        df_result['index'] = i
        
        all_result.append(df_result)
        
        print(i)
        
    return all_result

In [10]:
def individual_feature_test(df_stat, filename):
    '''
    Evaluates every feature set on its own and pickles the combined results.

    A feature set is a regular expression over column names. For each set,
    the matching columns (plus ``replier_label`` and ``replier_userid``)
    are selected from ``df_stat`` and passed to ``test``; all result
    dataframes are then concatenated and written to ``filename``.

    :param df_stat: Feature dataframe
    :param filename: Path the concatenated results are pickled to
    '''
    # Columns `test` needs regardless of which feature set is evaluated.
    always_keep = '|replier_label|replier_userid'

    # Feature-set label -> regex over column names.
    # NOTE(review): the patterns are unanchored, so a short one such as
    # 'age' also selects any other column whose name contains it.
    feature_patterns = {
        'Delay': 'diff_min',
        'Engagement': 'like_count|retweet_count|reply_count',
        'Similarity': 'cosine',
        'Entites': 'mention|hashtags|url',
        'Followers rate': 'followers_ratio',
        'Following rate': 'following_ratio',
        'Activity rate': 'activity_rate',
        'Age': 'age',
    }

    collected = []
    for feature, pattern in feature_patterns.items():
        print(f'********** {feature} ***********')

        subset = df_stat.filter(regex=pattern + always_keep, axis=1)
        print(subset.columns)

        collected = test(subset, collected, feature)

    combined = pd.concat(collected, ignore_index=True)
    combined.to_pickle(filename)

#### **Run function**

In [None]:
# Write the results to the path that the "Load results" cell reads from
# (./../results/data/), so a fresh Restart & Run All loads what it just produced.
filename = './../results/data/different_feature_result_replier_classifier.pkl.gz'
individual_feature_test(df_stat, filename)

#### **Load results**

In [19]:
# Pickled per-feature-set results (same file name the "Run function" cell writes).
filename = './../results/data/different_feature_result_replier_classifier.pkl.gz'

# NOTE: unpickling can execute arbitrary code - only load files generated by this project.
df_diff_feat = pd.read_pickle(filename)

In [22]:
# Average the result columns per feature set over its runs. `index` is only
# the run counter (0-9) set in `test`, so its mean carries no meaning.
df_diff_feat.groupby('feature').mean()

Unnamed: 0_level_0,total_data,mean_precision,mean_recall,mean_f1,mean_auc,index
feature,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Activity rate,1.0,0.605,0.63,0.615,0.65,4.5
Age,1.0,0.581,0.666,0.611,0.663,4.5
Delay,9.0,0.577,0.602,0.587,0.62,4.5
Engagement,27.0,0.573,0.574,0.531,0.587,4.5
Entites,27.0,0.635,0.504,0.537,0.631,4.5
Followers rate,1.0,0.555,0.519,0.536,0.564,4.5
Following rate,1.0,0.546,0.52,0.532,0.56,4.5
Similarity,9.0,0.856,0.841,0.848,0.929,4.5
