#### **This notebook creates campaign-wise features for the tweet classifier**

In [2]:
import pandas as pd
import numpy as np
import warnings
import os

import importlib

#### packages
import helper.strategy_helper as st_hp
import config.config as config_hp
import helper.stat_helper as stat_hp

#### **Saving only derived data for tweets and their campaign**

In [3]:
#All targeted tweets and their replies
# balanced_pos_conversation = './../data/balanced_positive_conversation.pkl.gz'

# df_pos = pd.read_pickle(balanced_pos_conversation)

# print(df_pos.columns)

# df_pos[['poster_tweetid', 'campaign', 
#         'replier_userid', 'replier_label',
#         'replier_tweetid'
#        ]].to_pickle('./../data/positive_poster_tweetid_campaign.pkl.gz')


#All control tweets and their replies
# balanced_neg_conversation = './../data/balanced_negative_conversation.pkl.gz'

# df_neg = pd.read_pickle(balanced_neg_conversation)

# df_neg[['poster_tweetid', 'campaign', 
#         'replier_userid', 'replier_label',
#         'replier_tweetid'
#        ]].to_pickle('./../data/control_poster_tweetid_campaign.pkl.gz')

Index(['tweet_text', 'conversation_id', 'replier_tweetid', 'replier_userid',
       'poster_userid', 'poster_tweetid', 'tweet_time', 'tweet_language',
       'replier_label', 'year', 'campaign', 'tweet_label', 'tweet_time_year',
       'common', 'id', 'username'],
      dtype='object')


#### **Load the targeted and control data files (tweets and their replies)**

In [None]:
# Read the derived targeted (positive) and control (negative) frames.
# NOTE(review): pd.read_pickle can execute arbitrary code; only load files
# produced by this project.
positive_path = './../data/positive_poster_tweetid_campaign.pkl.gz'
control_path = './../data/control_poster_tweetid_campaign.pkl.gz'

df_pos = pd.read_pickle(positive_path)
df_neg = pd.read_pickle(control_path)

In [4]:
# Row count of the targeted (positive) frame
len(df_pos)

2160484

In [5]:
# Row count of the control (negative) frame
len(df_neg)

512607

#### **Get campaign related data**

In [3]:
import random

def get_camp_data(df_pos, df_neg, campaign, top_campaigns=None):
    '''
    Split the targeted and control data into one campaign and the rest.

    :param df_pos: Targeted (positive) rows. Must contain the columns
        'campaign', 'poster_tweetid' and 'poster_userid'
    :param df_neg: Control (negative) rows. Must contain the columns
        'poster_tweetid' and 'poster_userid'
    :param campaign: Name of the campaign, or 'remain' to select every
        targeted row whose campaign is NOT one of the top campaigns
    :param top_campaigns: Campaign names that 'remain' excludes. When None,
        the five campaigns with the most unique poster tweets in df_pos
        are used, so the function does not rely on any notebook global

    :return: (df_camp, df_rest) -- the campaign rows (targeted + control)
        and all remaining rows (targeted + control)
    '''

    if campaign == 'remain':
        if top_campaigns is None:
            # Same ranking the notebook uses for its top-5 list:
            # unique poster tweets per campaign, largest first.
            top_campaigns = (df_pos
                             .groupby(['campaign'])['poster_tweetid']
                             .nunique(dropna=False)
                             .to_frame('count')
                             .reset_index()
                             .sort_values(by=['count'],
                                          ascending=False
                                         )
                             ['campaign']
                             .head()
                             .tolist()
                            )
        df_pos_camp = df_pos.loc[~df_pos['campaign'].isin(list(top_campaigns))]
    else:
        df_pos_camp = df_pos.loc[df_pos['campaign'] == campaign]

    # Control rows are attached through the poster: keep the control tweets
    # written by users who posted a targeted tweet in this campaign.
    df_neg_camp = df_neg.loc[df_neg['poster_userid'].isin(
        df_pos_camp['poster_userid']
    )]

    # Everything that was not selected above, matched on poster tweet id
    df_rest_pos = df_pos.loc[
        ~df_pos['poster_tweetid'].isin(df_pos_camp['poster_tweetid'])
    ]

    df_rest_neg = df_neg.loc[
        ~df_neg['poster_tweetid'].isin(df_neg_camp['poster_tweetid'])
    ]

    df_rest = pd.concat([df_rest_pos, df_rest_neg],
                        ignore_index=True
                       )

    print('Total +ve :', df_pos_camp['poster_tweetid'].nunique())
    print('Total -ve :', df_neg_camp['poster_tweetid'].nunique())
    print('Total Rest +ve :', df_rest_pos['poster_tweetid'].nunique())
    print('Total Rest -ve :', df_rest_neg['poster_tweetid'].nunique())

    df_camp = pd.concat([df_pos_camp, df_neg_camp],
                        ignore_index=True
                       )

    return df_camp, df_rest

#### **Get features for each campaign**

In [4]:
import datetime
import re
import helper.helper as hp

In [5]:
def get_statistics(df_camp=None, 
                   campagin_name=None, 
                   df_stat=None
                  ):
    '''
    Save the feature rows that belong to one campaign.

    Rows of df_stat whose 'poster_tweetid' (compared as strings) appears
    in df_camp are written to
    ./../results/tweet_classifier_<campagin_name>_features.pkl.gz.
    The saved 'poster_tweetid' column is stored as strings.
    Neither input frame is modified.

    :param df_camp: Campaign rows; must contain 'poster_tweetid'
    :param campagin_name: Campaign name used in the output file name
    :param df_stat: Tweet features; must contain 'poster_tweetid'

    :raises ValueError: if df_camp or df_stat is not given
    '''
    if df_camp is None or df_stat is None:
        raise ValueError('df_camp and df_stat are both required')

    print(campagin_name)
    
    campaign_feat = os.path.join('./../results', 
                                 f'tweet_classifier_{campagin_name}_features.pkl.gz'
                                )
    print(campaign_feat)
    
    # Cast on local Series so the caller's frames keep their dtypes
    camp_ids = df_camp['poster_tweetid'].astype(str)
    stat_ids = df_stat['poster_tweetid'].astype(str)
    
    in_campaign = stat_ids.isin(camp_ids)
    
    # The saved ids are strings, matching what the comparison used
    df_camp_stat = (df_stat
                    .loc[in_campaign]
                    .assign(poster_tweetid=stat_ids.loc[in_campaign])
                   )
   
    # to_pickle does not create missing parent directories
    os.makedirs(os.path.dirname(campaign_feat), exist_ok=True)
    df_camp_stat.to_pickle(campaign_feat)
    
    print('** All features saved **')

#### **Features for the top 5 campaigns and the rest (others)**

In [6]:
# Rank campaigns by how many distinct poster tweets each one has,
# largest first. Columns of the result: 'campaign', 'count'.
unique_tweets_per_campaign = (
    df_pos.groupby(['campaign'])['poster_tweetid'].nunique(dropna=False)
)

df_grp_camp = (
    unique_tweets_per_campaign
    .to_frame('count')
    .reset_index()
    .sort_values(by=['count'], ascending=False)
)

In [7]:
# First five campaign names in df_grp_camp's current row order
df_grp_camp['campaign'].head()

21          serbia_022020
20    saudi_arabia_112019
24          turkey_052020
19        sa_eg_ae_022020
8            egypt_022020
Name: campaign, dtype: object

#### **Check if there are common tweets**

In [8]:
# Sanity check: count the poster tweet ids shared between every ordered pair
# of the top campaigns, and between the 'remain' group and each top campaign.
list_camp = df_grp_camp['campaign'].head().tolist()

# Build each campaign's id set once instead of re-filtering the full frame
# inside the nested loop.
tweet_ids_by_camp = {
    camp: set(df_pos.loc[df_pos['campaign'] == camp, 'poster_tweetid'])
    for camp in list_camp
}

for camp_x in list_camp:
    for camp_y in list_camp:
        if camp_x == camp_y:
            continue

        set_common = tweet_ids_by_camp[camp_x] & tweet_ids_by_camp[camp_y]
        print(f'{camp_x} and {camp_y}:', len(set_common))

# 'remain' = every targeted row whose campaign is not in the top list
remain_tweet_ids = set(
    df_pos.loc[~df_pos['campaign'].isin(list_camp), 'poster_tweetid']
)
for camp_y in list_camp:
    set_common = remain_tweet_ids & tweet_ids_by_camp[camp_y]
    print(f'remain and {camp_y}:', len(set_common))



serbia_022020 and saudi_arabia_112019: 0
serbia_022020 and turkey_052020: 0
serbia_022020 and sa_eg_ae_022020: 0
serbia_022020 and egypt_022020: 0
saudi_arabia_112019 and serbia_022020: 0
saudi_arabia_112019 and turkey_052020: 0
saudi_arabia_112019 and sa_eg_ae_022020: 0
saudi_arabia_112019 and egypt_022020: 0
turkey_052020 and serbia_022020: 0
turkey_052020 and saudi_arabia_112019: 0
turkey_052020 and sa_eg_ae_022020: 0
turkey_052020 and egypt_022020: 0
sa_eg_ae_022020 and serbia_022020: 0
sa_eg_ae_022020 and saudi_arabia_112019: 0
sa_eg_ae_022020 and turkey_052020: 0
sa_eg_ae_022020 and egypt_022020: 0
egypt_022020 and serbia_022020: 0
egypt_022020 and saudi_arabia_112019: 0
egypt_022020 and turkey_052020: 0
egypt_022020 and sa_eg_ae_022020: 0
remain and serbia_022020: 0
remain and saudi_arabia_112019: 0
remain and turkey_052020: 0
remain and sa_eg_ae_022020: 0
remain and egypt_022020: 0


#### **Get campaign features**

In [9]:
# Save one feature file per top campaign, plus one for 'remain'
camp_list = df_grp_camp['campaign'].head().tolist()
camp_list = camp_list + ['remain']

tweet_features = './../data/tweet_classifier_features.pkl.gz'

# NOTE(review): pd.read_pickle can execute arbitrary code; only load files
# produced by this project.
df_stat = pd.read_pickle(tweet_features)

for campaign in camp_list:
    print('Campaign ', campaign) 
    
    # df_pos is the targeted frame loaded in this notebook and the one
    # df_grp_camp was built from; the previous argument, df_new, is not
    # defined in any cell shown here.
    df_camp, df_rest = get_camp_data(df_pos, df_neg, campaign)
    
    get_statistics(df_camp,
                   campaign, 
                   df_stat
                   )

Campaign  serbia_022020
Campaign  saudi_arabia_112019
Campaign  turkey_052020
Campaign  sa_eg_ae_022020
Campaign  egypt_022020
Campaign  remain
