#### **This notebook creates campaign-wise features for the tweet classifier**

In [3]:
import pandas as pd
import numpy as np
import warnings
import os

import importlib

#### packages
import helper.strategy_helper as st_hp
import config.config as config_hp
import helper.stat_helper as stat_hp

#### **Load targeted and control data files: tweets and replies**

In [5]:
# Locations of the balanced conversation pickles (tweets and their replies)
balanced_pos_conversation = './../data/balanced_positive_conversation.pkl.gz'
balanced_neg_conversation = './../data/balanced_negative_conversation.pkl.gz'

# Targeted (positive) tweets and their replies
df_pos = pd.read_pickle(balanced_pos_conversation)

# Control (negative) tweets and their replies
df_neg = pd.read_pickle(balanced_neg_conversation)

# Show which columns are available in the targeted set
print(df_pos.columns)

Index(['tweet_text', 'conversation_id', 'replier_tweetid', 'replier_userid',
       'poster_userid', 'poster_tweetid', 'tweet_time', 'tweet_language',
       'replier_label', 'year', 'campaign', 'tweet_label', 'tweet_time_year',
       'common', 'id', 'username'],
      dtype='object')

#### **Get campaign related data**

In [None]:
import random

def get_camp_data(df_pos, df_neg, campaign, top_campaigns=None, seed=None):
    '''
    Builds a balanced positive/negative dataset for a single campaign.

    Positive rows are those of ``df_pos`` that belong to ``campaign``. For
    every poster in that subset, negative tweets of the same poster are
    sampled from ``df_neg`` so that the number of sampled negative tweets
    equals min(#positive tweets, #negative tweets) of that poster.

    :param df_pos: Dataframe of targeted tweets; needs the columns
        'campaign', 'poster_userid' and 'poster_tweetid'
    :param df_neg: Dataframe of control tweets; needs the columns
        'poster_userid' and 'poster_tweetid'
    :param campaign: Campaign name, or 'remain' to select every campaign
        that is not one of the top campaigns
    :param top_campaigns: Campaigns excluded when campaign == 'remain'.
        If None, the five campaigns with the most unique 'poster_tweetid'
        in df_pos are used
    :param seed: Optional seed for the negative sampling. If None, the
        module level ``random`` state is used

    :return Tuple (df_camp, df_rest): campaign positives plus sampled
        negatives, and all remaining positive and negative rows
    '''
    if campaign == 'remain':
        if top_campaigns is None:
            # Derived from the argument instead of a notebook global that
            # is only defined in a later cell
            top_campaigns = (df_pos
                             .groupby('campaign')['poster_tweetid']
                             .nunique(dropna=False)
                             .sort_values(ascending=False)
                             .head()
                             .index
                             .tolist()
                            )
        df_pos_camp = df_pos.loc[~df_pos['campaign'].isin(top_campaigns)]
    else:
        df_pos_camp = df_pos.loc[df_pos['campaign'] == campaign]

    #Number of unique positive tweets of each poster
    count_pos = (df_pos_camp
                 .groupby('poster_userid')['poster_tweetid']
                 .nunique()
                )

    #Negative rows of the posters that appear in the campaign
    df_neg_camp = df_neg.loc[df_neg['poster_userid'].isin(
        df_pos_camp['poster_userid']
    )]

    #Keep the global random state when no seed is given
    rng = random if seed is None else random.Random(seed)

    #Balance the negative data poster by poster. Groups are iterated
    #explicitly so each group still carries 'poster_userid'
    pieces = []
    for user_id, group in df_neg_camp.groupby('poster_userid'):
        #Sorted so that a given seed always draws the same tweets
        neg_ids = sorted(group['poster_tweetid'].dropna().unique())
        n_keep = min(int(count_pos.get(user_id, 0)), len(neg_ids))
        if n_keep == 0:
            continue

        chosen = rng.sample(neg_ids, n_keep)
        pieces.append(group.loc[group['poster_tweetid'].isin(chosen)])

    if pieces:
        sampled_df = pd.concat(pieces, ignore_index=True)
    else:
        sampled_df = df_neg_camp.iloc[0:0]

    #Non in-place drops: df_pos_camp is a slice of the caller's frame
    sampled_df = sampled_df.drop(columns=['common'], errors='ignore')
    df_pos_camp = df_pos_camp.drop(columns=['common'], errors='ignore')

    #Get rest of the data
    df_rest_pos = df_pos.loc[
        ~df_pos['poster_tweetid'].isin(df_pos_camp['poster_tweetid'])
    ]

    df_rest_neg = df_neg.loc[
        ~df_neg['poster_tweetid'].isin(sampled_df['poster_tweetid'])
    ]

    df_rest = pd.concat([df_rest_pos, df_rest_neg],
                        ignore_index=True
                       )

    print('Total +ve :', df_pos_camp['poster_tweetid'].nunique())
    print('Total -ve :', sampled_df['poster_tweetid'].nunique())
    print('Total Rest +ve :', df_rest_pos['poster_tweetid'].nunique())
    print('Total Rest -ve :', df_rest_neg['poster_tweetid'].nunique())

    df_camp = pd.concat([df_pos_camp, sampled_df],
                        ignore_index=True
                       )

    return df_camp, df_rest

In [4]:
def save_list_cosine():
    '''
    Groups the negative class cosine values per poster tweet and saves them.

    Reads the pickle at config['EMBEDDINGS_PATH']['neg_cosine_with_replier_info'],
    collects the 'cosine' values of every 'poster_tweetid' into a list and
    writes the result to 'negative_list_cosine.pkl.gz' inside
    config['EMBEDDINGS_PATH']['embedding_path'].

    :return None, the grouped dataframe is only written to disk
    '''
    config = config_hp.config()
    neg_cosine = config['EMBEDDINGS_PATH']['neg_cosine_with_replier_info']
    
    print('\n\n Load the data \n\n')
    
    df_neg = pd.read_pickle(neg_cosine)
    
    #Tweet ids are used as string keys for the grouping
    df_neg['poster_tweetid'] = df_neg['poster_tweetid'].astype(str)
    
    save_path = os.path.join(config['EMBEDDINGS_PATH']['embedding_path'],
                             'negative_list_cosine.pkl.gz'
                            )
    print('\n\n Grouping the data \n\n')
    print(f'\n Saving the data {save_path} \n')
    
    #One row per poster tweet, holding the list of its cosine values
    (df_neg
     .groupby(['poster_tweetid'])['cosine'].apply(list)
     .reset_index()
    ).to_pickle(save_path)

In [15]:
def load_cosine():
    '''
    Reads the two pickles referenced by the 'pos_list_cosine' and
    'neg_list_cosine' entries of config['EMBEDDINGS_PATH'] and prints
    the head of each.

    :return Tuple of the positive and the negative dataframe
    '''
    embedding_paths = config_hp.config()['EMBEDDINGS_PATH']
    pos_path = embedding_paths['pos_list_cosine']
    neg_path = embedding_paths['neg_list_cosine']

    print('\n\n Load the data \n\n')

    #Read both files first, then preview them in the same order
    loaded = [pd.read_pickle(path) for path in (pos_path, neg_path)]
    for frame in loaded:
        print(frame.head())

    return loaded[0], loaded[1]

In [11]:
# df_pos_cosine, df_neg_cosine = load_cosine()

In [16]:
def load_cosine_stat():
    '''
    Reads the pickle referenced by config['STATS']['cosine'].

    :return The unpickled object (used as a dataframe by the caller)
    '''
    stats_path = config_hp.config()['STATS']['cosine']

    print('\n\n Load the data \n\n')

    return pd.read_pickle(stats_path)

In [17]:
# Load the cosine statistics pickle configured under STATS/cosine.
# NOTE(review): df_cosine_stat is not referenced by any later cell shown here - confirm it is still needed
df_cosine_stat = load_cosine_stat()



 Load the data 




#### **Get features for each campaign**

In [34]:
import datetime
import re
import helper.helper as hp

In [39]:
def get_statistics(df_camp=None, 
                   campagin_name=None, 
                   df_stat=None
                  ):
    '''
    Saves the feature rows that belong to one campaign.

    Selects the rows of df_stat whose 'poster_tweetid' (compared as
    string) occurs in df_camp and pickles them to
    './../data/<campagin_name>_features.pkl.gz'. The saved
    'poster_tweetid' column is of string type. Neither input dataframe
    is modified.

    :param df_camp: Dataframe of the campaign with 'poster_tweetid'
    :param campagin_name: Name of the campaign, used as file name prefix
    :param df_stat: Dataframe of features with 'poster_tweetid'

    :return None, the selected features are only written to disk
    '''
    if df_camp is None or df_stat is None:
        raise TypeError('df_camp and df_stat are both required')

    print(campagin_name)
    
    campaign_feat = os.path.join('./../data', 
                                 f'{campagin_name}_features.pkl.gz'
                                )
    print(campaign_feat)
    
    #String copies of the ids; the caller's frames stay untouched
    camp_ids = df_camp['poster_tweetid'].astype(str)
    stat_ids = df_stat['poster_tweetid'].astype(str)
    
    in_campaign = stat_ids.isin(camp_ids)
    
    #Keep the string ids in the saved file, as before
    df_camp_stat = (df_stat
                    .loc[in_campaign]
                    .assign(poster_tweetid=stat_ids.loc[in_campaign])
                   )
   
    df_camp_stat.to_pickle(f'{campaign_feat}')
    
    print('** All features saved **')

#### **Features for the top 5 campaigns and the rest (others)**

In [None]:
# Number of unique poster tweets per campaign, largest campaign first
df_grp_camp = (
    df_pos
    .groupby(['campaign'])['poster_tweetid']
    .nunique(dropna=False)
    .reset_index(name='count')
    .sort_values(by=['count'], ascending=False)
)

In [40]:
# First five campaigns of the count-sorted frame, i.e. those with the most unique poster tweets
df_grp_camp['campaign'].head()

20          serbia_022020
19    saudi_arabia_112019
23          turkey_052020
8            egypt_022020
18        sa_eg_ae_022020
Name: campaign, dtype: object

In [None]:
# Top five campaigns plus a catch-all 'remain' bucket for all other campaigns
camp_list = df_grp_camp['campaign'].head().tolist()
camp_list = camp_list + ['remain']

tweet_features = './../data/tweet_classifier_features.pkl.gz'

df_stat = pd.read_pickle(tweet_features)

for campaign in camp_list:
    print('Campaign ', campaign) 
    
    # NOTE(review): this call used `df_new`, which is never defined in this
    # notebook and fails on a fresh kernel. `df_pos` is assumed to be the
    # intended frame (df_grp_camp is built from it) - confirm.
    df_camp, df_rest = get_camp_data(df_pos, df_neg, campaign)
    
    get_statistics(df_camp,
                   campaign, 
                   df_stat
                   )