#### **This notebook train and test different campaign**

In [1]:
import pandas as pd
import numpy as np
import warnings
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.colors as pltc

import importlib
import os

#### packages
import helper.visualization as viz_hp
import config.config as config_hp

import copy
import helper.stat_helper as stat_hp

#### **Have campaign data in positive**

In [2]:
config = config_hp.config()

balanced_pos = config['BALANCED']['balanced_pos_conversation']

df_pos = pd.read_pickle(balanced_pos)

In [3]:
'campaign' in df_pos.columns

True

In [4]:
df_pos['poster_tweetid'].nunique()

3866

In [5]:
df_grp_pos = (df_pos
              .groupby(['poster_tweetid', 'campaign'])
              .first()
              .reset_index()
             )

In [6]:
df_grp_pos['poster_tweetid'].nunique()

3866

In [7]:
df_grp_pos['campaign'].unique()

array(['serbia_022020', 'saudi_arabia_112019', 'ecuador_082019',
       'turkey_052020', 'sa_eg_ae_022020', 'uae_082019', 'iran_201906',
       'honduras_022020', 'egypt_022020', 'iran_201901_1', 'iranian',
       'uganda_0621', 'china_082019', 'qatar_082020', 'iran_202012',
       'venezuela_201901', 'indonesia_022020', 'spain_082019',
       'egypt_uae_082019', 'cuba_082020', 'china_052020',
       'thailand_092020', 'MX_0621', 'Tanzania_0621', 'CNHU_0621',
       'Venezuela_0621', 'ira', 'russia_201901_1'], dtype=object)

In [8]:
df_new = df_pos.merge(df_grp_pos[['poster_tweetid', 'campaign']],
                      on='poster_tweetid',
                      how='left'
                     )

In [9]:
df_new.columns

Index(['tweet_text', 'conversation_id', 'replier_tweetid', 'replier_userid',
       'poster_userid', 'poster_tweetid', 'tweet_time', 'tweet_language',
       'replier_label', 'year', 'campaign_x', 'tweet_label', 'tweet_time_year',
       'common', 'id', 'username', 'campaign_y'],
      dtype='object')

In [10]:
df_new.drop(columns=['campaign_x'], inplace=True)
df_new.rename(columns={'campaign_y': 'campaign'},
              inplace=True)

In [11]:
len(df_new)

3446443

In [12]:
df_new.loc[df_new['campaign'].isnull()]

Unnamed: 0,tweet_text,conversation_id,replier_tweetid,replier_userid,poster_userid,poster_tweetid,tweet_time,tweet_language,replier_label,year,tweet_label,tweet_time_year,common,id,username,campaign


In [13]:
df_new_1 = df_new.loc[df_new['replier_label'] == 1]
df_new_0 = df_new.loc[df_new['replier_label'] == 0]

In [14]:
print('Positive :', df_new_1['replier_userid'].nunique())
print('Negative :', df_new_0['replier_userid'].nunique())

Positive : 7670
Negative : 874248


In [15]:
print('Positive :', df_new_1['campaign'].unique())
print('Negative :', df_new_0['campaign'].unique())

Positive : ['turkey_052020' 'saudi_arabia_112019' 'serbia_022020' 'uae_082019'
 'egypt_022020' 'sa_eg_ae_022020' 'honduras_022020' 'egypt_uae_082019'
 'iran_201906' 'iran_202012' 'iranian' 'thailand_092020' 'spain_082019'
 'iran_201901_1' 'china_052020' 'uganda_0621' 'venezuela_201901' 'MX_0621'
 'indonesia_022020' 'Tanzania_0621' 'ecuador_082019' 'china_082019'
 'cuba_082020' 'qatar_082020' 'ira' 'CNHU_0621' 'Venezuela_0621'
 'russia_201901_1']
Negative : ['turkey_052020' 'saudi_arabia_112019' 'serbia_022020' 'uae_082019'
 'egypt_022020' 'sa_eg_ae_022020' 'honduras_022020' 'egypt_uae_082019'
 'iran_201906' 'iran_202012' 'iranian' 'thailand_092020' 'spain_082019'
 'iran_201901_1' 'china_052020' 'uganda_0621' 'venezuela_201901' 'MX_0621'
 'indonesia_022020' 'Tanzania_0621' 'ecuador_082019' 'china_082019'
 'cuba_082020' 'qatar_082020' 'ira' 'CNHU_0621' 'Venezuela_0621'
 'russia_201901_1']


#### **Get top campagin with more positive cases**

In [16]:
df_1_grp = (df_new_1
            .groupby(['campaign'])['replier_userid']
            .nunique()
            .to_frame('count_1')
            .reset_index()
           )
df_0_grp = (df_new_0
            .groupby(['campaign'])['replier_userid']
            .nunique()
            .to_frame('count_0')
            .reset_index()
           )

df_merge_grp = df_1_grp.merge(df_0_grp,
                              on='campaign'
                             )

In [17]:
df_merge_grp = df_merge_grp.sort_values(by='count_1',
                                        ascending=False
                                       )

#### **Load statistics dataset**

In [18]:
importlib.reload(config_hp)

config = config_hp.config()
stat = config['USER_FEATURES']

final_feature = stat['all_feature']

df_stat = pd.read_pickle(final_feature)

In [19]:
len(df_stat)

794645

In [20]:
df_stat['replier_userid'].nunique()

794645

In [21]:
df_stat_camp_1 = df_stat.loc[df_stat['replier_label'] == 1]
df_stat_camp_0 = df_stat.loc[df_stat['replier_label'] == 0]

print('Total len 1: ', len(df_stat_camp_1))
print('Total len 0: ', len(df_stat_camp_0))
print('Positive :', df_stat_camp_1['replier_userid'].nunique())
print('Negative :', df_stat_camp_0['replier_userid'].nunique())

Total len 1:  7670
Total len 0:  786975
Positive : 7670
Negative : 786975


In [22]:
df_new['replier_userid'].nunique()

881918

In [23]:
df_one_camp = (df_new[['replier_userid', 'campaign', 'replier_label']]
               .groupby('replier_userid')
               .first()
               .reset_index()
              )

In [24]:
df_stat_camp_1 = df_one_camp.loc[df_one_camp['replier_label'] == 1]
df_stat_camp_0 = df_one_camp.loc[df_one_camp['replier_label'] == 0]

print('Total len 1: ', len(df_stat_camp_1))
print('Total len 0: ', len(df_stat_camp_0))
print('Positive :', df_stat_camp_1['replier_userid'].nunique())
print('Negative :', df_stat_camp_0['replier_userid'].nunique())

Total len 1:  7670
Total len 0:  874248
Positive : 7670
Negative : 874248


In [25]:
df_one_camp['replier_userid'] = df_one_camp['replier_userid'].astype(str)
df_stat['replier_userid'] = df_stat['replier_userid'].astype(str)

df_stat_camp = df_stat.merge(
    df_one_camp[['replier_userid', 'campaign']],
    on='replier_userid',
)

In [26]:
df_stat_camp.loc[df_stat_camp['campaign'].isnull()]

Unnamed: 0,replier_userid,replier_label,age,mean_mention_count,min_mention_count,25%_mention_count,50%_mention_count,75%_mention_count,max_mention_count,range_mention_count,...,mean_cosine,min_cosine,25%_cosine,50%_cosine,75%_cosine,max_cosine,range_cosine,iqr_cosine,entropy_cosine,campaign


In [27]:
df_stat_camp_1 = df_stat_camp.loc[df_stat_camp['replier_label'] == 1]
df_stat_camp_0 = df_stat_camp.loc[df_stat_camp['replier_label'] == 0]

print('Total len 1: ', len(df_stat_camp_1))
print('Total len 0: ', len(df_stat_camp_0))
print('Positive :', df_stat_camp_1['replier_userid'].nunique())
print('Negative :', df_stat_camp_0['replier_userid'].nunique())

Total len 1:  7678
Total len 0:  786975
Positive : 7670
Negative : 786975


In [28]:
df_stat_camp_1 = df_stat_camp.loc[df_stat_camp['replier_label'] == 1]
df_stat_camp_0 = df_stat_camp.loc[df_stat_camp['replier_label'] == 0]


df_1_grp = (df_stat_camp_1
            .groupby(['campaign'])['replier_userid']
            .nunique()
            .to_frame('count_1')
            .reset_index()
            .sort_values(
                by='count_1',
                ascending=False
            )
           )

print(df_1_grp.head())

df_0_grp = (df_stat_camp_0
            .groupby(['campaign'])['replier_userid']
            .nunique()
            .to_frame('count_0')
            .reset_index()
            .sort_values(
                by='count_0',
                ascending=False
            )
           )

print(df_0_grp.head())


               campaign  count_1
18  saudi_arabia_112019     1243
19        serbia_022020     1152
22        turkey_052020      999
8          egypt_022020      913
10      honduras_022020      875
               campaign  count_0
15      sa_eg_ae_022020   306945
16  saudi_arabia_112019   177248
20        turkey_052020   144614
6          egypt_022020    76700
3          china_082019    13318


In [29]:
df_stat_ok = df_1_grp.merge(
    df_0_grp[['campaign', 'count_0']],
    on='campaign',
)

In [30]:
df_stat_ok.head()

Unnamed: 0,campaign,count_1,count_0
0,saudi_arabia_112019,1243,177248
1,serbia_022020,1152,11885
2,turkey_052020,999,144614
3,egypt_022020,913,76700
4,honduras_022020,875,5383


#### **Check number of data in each campaign**

In [31]:
list_campaign = df_stat_ok['campaign'].head()

In [32]:
list_campaign

0    saudi_arabia_112019
1          serbia_022020
2          turkey_052020
3           egypt_022020
4        honduras_022020
Name: campaign, dtype: object

In [42]:
# df_merge

def get_statistics(campagin_name=None, 
                   df=None
                  ):
    '''
    Gets the features for the campaign data
    '''
    print(campagin_name)
    
    config = config_hp.config()
    camp_feat = config['USER_CAMP']['replier_camp_features']
    campaign_feat = os.path.join(camp_feat, 
                                 f'{campagin_name}_features.pkl.gz'
                                )
    df_camp_stat = df.loc[
        df['campaign'] == camp
    ]
   
    print('1 :', len(df_camp_stat.loc[df_camp_stat['replier_label'] == 1]))
    print('0 :', len(df_camp_stat.loc[df_camp_stat['replier_label'] == 0]))
    
    column_to_include = [x for x in df.columns if x not in ['campaign']]
    print(len(column_to_include))
    
    df_camp_stat[column_to_include].to_pickle(f'{campaign_feat}')
    
    print('** All features saved **')

In [43]:
for camp in list_campaign:
    print('Camp--- :', camp)
    
    get_statistics(campagin_name=camp, 
                   df=df_stat_camp
                  )
    print(camp, ' Done!')

Camp--- : saudi_arabia_112019
saudi_arabia_112019
1 : 1245
0 : 177248
78
** All features saved **
saudi_arabia_112019  Done!
Camp--- : serbia_022020
serbia_022020
1 : 1154
0 : 11885
78
** All features saved **
serbia_022020  Done!
Camp--- : turkey_052020
turkey_052020
1 : 999
0 : 144614
78
** All features saved **
turkey_052020  Done!
Camp--- : egypt_022020
egypt_022020
1 : 914
0 : 76700
78
** All features saved **
egypt_022020  Done!
Camp--- : honduras_022020
honduras_022020
1 : 875
0 : 5383
78
** All features saved **
honduras_022020  Done!


#### **Remaining dataset**

In [35]:
list_campaign

0    saudi_arabia_112019
1          serbia_022020
2          turkey_052020
3           egypt_022020
4        honduras_022020
Name: campaign, dtype: object

In [46]:
df_remain = df_stat_camp.loc[~df_stat_camp['campaign'].isin(list_campaign.tolist())]

print('0 :', df_remain.loc[df_remain['replier_label'] == 0]['replier_userid'].nunique())
print('1 :', df_remain.loc[df_remain['replier_label'] == 1]['replier_userid'].nunique())

df_remain['replier_userid'] = df_remain['replier_userid'].astype(str)

column_to_include = [x for x in df_remain.columns if x not in ['campaign']]
print(len(column_to_include))

config = config_hp.config()
camp_feat = config['USER_CAMP']['replier_camp_features']
campaign_feat = os.path.join(camp_feat, 
                             f'remain_features.pkl.gz'
                            )

df_remain.to_pickle(f'{campaign_feat}')

0 : 371145
1 : 2489
78


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_remain['replier_userid'] = df_remain['replier_userid'].astype(str)


#### **Check results**

In [37]:
list_campaign

0    saudi_arabia_112019
1          serbia_022020
2          turkey_052020
3           egypt_022020
4        honduras_022020
Name: campaign, dtype: object

In [47]:
for camp in ['remain']:
    config = config_hp.config()
    camp_feat = config['USER_CAMP']['replier_camp_features']
    campaign_feat = os.path.join(camp_feat, 
                                 f'{camp}_features.pkl.gz'
                                )
    print(camp, ' *** ')
    
    df_test = pd.read_pickle(campaign_feat)
    
    print(len(df_test.loc[df_test['replier_label'] == 1]))
    print(len(df_test.loc[df_test['replier_label'] == 0]))
    
    print(len(df_test.columns))
    

remain  *** 
2491
371145
79


In [50]:
df_test['replier_label'].unique()

array([0, 1])