#### **This notebook test the model with each features set**

In [1]:
import pandas as pd
import numpy as np
import warnings
import os

import importlib

#### packages
import helper.visualization as viz_hp
import helper.stat_helper as stat_hp
import config.config as config_hp

#### **Load Files**

In [2]:
importlib.reload(stat_hp)

import importlib

tweet_features = './../data/tweet_classifier_features.pkl.gz'

df_all_stat = pd.read_pickle(tweet_features)

In [3]:
print('Features :', len(df_all_stat.columns)-2)

Features : 99


In [4]:
print('Feature names')
df_all_stat.columns

Feature names


Index(['std_retweet_count', 'range_reply_count', 'entropy_num_hashtags',
       'std_num_url', 'kurtosis_like_count', 'range_like_count',
       'skew_like_count', 'skew_reply_count', '50%_mention_count',
       'kurtosis_cosine',
       ...
       'max_like_count', 'mean_diff_min', 'entropy_retweet_count',
       'mean_retweet_count', 'min_cosine', 'max_num_hashtags',
       'entropy_mention_count', 'range_num_hashtags', 'entropy_cosine',
       'range_retweet_count'],
      dtype='object', length=101)

#### **Tweet Classifier: Model test different feature sets**

In [26]:
importlib.reload(stat_hp)
importlib.reload(config_hp)

diff_features = {'Reply Time': 'diff_min',
                 'Num of reply targeted tweet got': 'org_reply_count',
                 'Engagement metric': 'like_count|retweet_count|reply_count',
                 'Reply similarity': 'cosine',
                 'Entites': 'mention_count|num_hashtags|num_url',
                 'Posted Retweet': 'posted_retweet_count', 
                 'Posted Like': 'posted_like_count'
                }

print('Running the model')

all_result = []
for feature in diff_features:
    print('Running for feature :', feature)
    #Filter the set of attributes
    feat = diff_features[feature] + '|tweet_label|poster_tweetid'
        
    df_filtered = df_all_stat.filter(regex=feat, axis=1)
    
    total_col = len(df_filtered.columns)
    
    print(total_col)

    #Run the model
    df_result = \
    stat_hp.run_model_with_best_threshold(df_filtered,
                      columns_not_include=[],
                      model_type='random', 
                      y_column = 'tweet_label',
                     )

    df_result['feature'] = feature
    all_result.append(df_result)


filename = './../results/tweet_classifier_different_feature_result.pkl.gz'

(pd.concat(all_result, ignore_index=True)
).to_pickle(filename)

Running the model
Running for feature : Reply Time
14

 **** random ****
Running Random Forest
Running for feature : Num of reply targeted tweet got
3

 **** random ****
Running Random Forest
Running for feature : Engagement metric
41

 **** random ****
Running Random Forest
Running for feature : Reply similarity
14

 **** random ****
Running Random Forest
Running for feature : Entites
38

 **** random ****
Running Random Forest
Running for feature : Posted Retweet
3

 **** random ****
Running Random Forest
Running for feature : Posted Like
3

 **** random ****
Running Random Forest


#### **Load result**

In [5]:
df_diff_feat = pd.read_pickle(
    './../results/tweet_classifier_different_feature_result.pkl.gz'
)

In [6]:
df_diff_feat.columns

Index(['fit_time', 'score_time', 'estimator', 'test_precision',
       'train_precision', 'test_recall', 'train_recall', 'test_f1', 'train_f1',
       'test_roc_auc', 'train_roc_auc', 'threshold', 'algorithm', 'feature'],
      dtype='object')

In [18]:
df_diff_feat['feature'].unique()

array(['Reply Time', 'Num of reply targeted tweet got',
       'Engagement metric', 'Reply similarity', 'Entites',
       'Posted Retweet', 'Posted Like'], dtype=object)

In [20]:
replace_values = {'Reply Time': 'Delay',
                 'Num of reply targeted tweet got': 'reply_count',
                 'Engagement metric': 'Engagement',
                 'Reply similarity': 'Similarity',
                 'Entites': 'Entites',
                 'Posted Retweet': 'retweet_count', 
                 'Posted Like': 'like_count'
                }
df_diff_feat['feature'] = df_diff_feat['feature'].replace(replace_values)

In [21]:
columns = [
           'test_precision', 
           'test_recall', 
           'test_f1', 
           'test_roc_auc'
          ]
df_diff_feat.groupby('feature')[columns].mean().sort_values(by='test_f1',
                                                                 ascending=False
                                                                )[columns]

Unnamed: 0_level_0,test_precision,test_recall,test_f1,test_roc_auc
feature,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Engagement,0.694851,0.869892,0.772459,0.841409
Similarity,0.540483,0.962747,0.692058,0.684654
Entites,0.522257,0.954472,0.675083,0.651249
reply_count,0.509214,0.994309,0.673503,0.59069
Delay,0.516997,0.961727,0.672278,0.665159
like_count,0.494626,1.0,0.661873,0.528683
retweet_count,0.494626,1.0,0.661873,0.541797
