#### **This notebook creates campaign-wise features for the tweet classifier**

In [3]:
import pandas as pd
import numpy as np
import warnings
import os

import importlib

#### packages
import helper.strategy_helper as st_hp
import config.config as config_hp
import helper.stat_helper as stat_hp

#### **Load files**

In [4]:
# Read the project configuration
config = config_hp.config()

# Locations of the two conversation pickles, taken from the BALANCED section
balanced_pos = config['BALANCED']['balanced_pos_conversation']
balanced_neg = config['BALANCED']['balanced_neg_conversation']

# NOTE(review): unpickling can run arbitrary code; only load trusted files
df_pos = pd.read_pickle(balanced_pos)
df_neg = pd.read_pickle(balanced_neg)

In [5]:
# Show which columns the positive frame provides
df_pos.columns

Index(['tweet_text', 'conversation_id', 'replier_tweetid', 'replier_userid',
       'poster_userid', 'poster_tweetid', 'tweet_time', 'tweet_language',
       'replier_label', 'year', 'campaign', 'tweet_label', 'tweet_time_year',
       'common', 'id', 'username'],
      dtype='object')

#### **Add campaign info to all poster tweets**

In [6]:
def add_camapign_info(df):
    '''
    Fills the campaign of every row from the campaign known for its poster tweet

    The first non-null campaign of each poster_tweetid is looked up and merged
    back with an inner join, so rows whose poster_tweetid never has a campaign
    are left out of the result. The campaign column ends up last.

    :param df: Dataframe with the columns poster_tweetid and campaign

    :return Dataframe with the campaign column replaced by the looked-up value
    '''
    has_campaign = ~df['campaign'].isnull()

    # One campaign per poster tweet
    campaign_lookup = (
        df.loc[has_campaign]
        .groupby(['poster_tweetid'])['campaign']
        .first()
        .reset_index()
    )

    # Both sides carry 'campaign', so the merge yields campaign_x / campaign_y
    merged = df.merge(campaign_lookup, on='poster_tweetid')

    return (
        merged
        .drop(columns=['campaign_x'])
        .rename(columns={'campaign_y': 'campaign'})
    )

In [7]:
# Rows of the positive frame that have a campaign value
df_no_null = df_pos.loc[~df_pos['campaign'].isnull()]

# Number of distinct poster tweets among those rows
df_no_null['poster_tweetid'].nunique()

3866

In [8]:
# One row per poster tweet: the first campaign value found for that poster_tweetid
df_grp_name = (df_no_null
              .groupby(['poster_tweetid'])['campaign']
               .first()
               .reset_index()
              )

Add the campaign info to all the tweets by merging the per-tweet campaign back onto the positive data.

In [9]:
# Inner merge (pandas default). Both frames have a 'campaign' column, so the
# result carries campaign_x (from df_pos) and campaign_y (from df_grp_name);
# rows whose poster_tweetid is not in df_grp_name are not kept.
df_new = df_pos.merge(df_grp_name,
                      on='poster_tweetid',
                     )

In [10]:
# Keep only the looked-up campaign column. Written without inplace and guarded
# on campaign_x so that re-running this cell is a no-op instead of a KeyError.
if 'campaign_x' in df_new.columns:
    df_new = (df_new
              .drop(columns=['campaign_x'])
              .rename(columns={'campaign_y': 'campaign'})
             )

In [11]:
# df_new.loc[df_new['campaign'].isnull()]

#### **Check the number of poster tweets in each campaign**

Counts the unique poster tweets per campaign; these counts are used below to split the data into the campaign-specific part and the rest.

In [12]:
# Number of distinct poster tweets per campaign, largest campaign first.
# get_camp_data reads the first five rows of this frame for the 'remain' case.
df_grp_camp = (df_new
               .groupby(['campaign'])['poster_tweetid']
               .nunique(dropna=False)
               .to_frame('count')
               .reset_index()
               .sort_values(by=['count'],
                            ascending=False
                           )
               
              )

In [13]:
# The five campaigns with the most poster tweets
df_grp_camp.head()

Unnamed: 0,campaign,count
20,serbia_022020,1140
19,saudi_arabia_112019,888
23,turkey_052020,660
8,egypt_022020,330
18,sa_eg_ae_022020,219


#### **Get data related to campaign**

In [14]:
import random

def get_camp_data(df_pos, df_neg, campaign, top_campaigns=None, seed=None):
    '''
    Builds the balanced data of one campaign and the data left over

    For every poster that appears in both classes, the number of negative
    poster tweets kept is the smaller of the poster's positive tweet count in
    the campaign and the poster's negative tweet count. The negative tweets
    are picked at random.

    :param df_pos: Dataframe of positive class with the columns
        poster_userid, poster_tweetid, campaign and common
    :param df_neg: Dataframe of negative class with the columns
        poster_userid, poster_tweetid and common
    :param campaign: Campaign name, or 'remain' for all campaigns that are
        not in top_campaigns
    :param top_campaigns: Campaigns left out when campaign is 'remain'.
        If None, the first five campaigns of the notebook-level
        df_grp_camp are used
    :param seed: Seed for the sampling of negative tweets. If None, the
        module-level random generator is used

    :return Tuple (df_camp, df_rest): campaign rows of both classes without
        the common column, and all remaining rows of both classes
    '''
    if campaign == 'remain':
        if top_campaigns is None:
            # Fallback on notebook state: df_grp_camp must already exist
            top_campaigns = df_grp_camp['campaign'].head().tolist()
        df_pos_camp = df_pos.loc[~df_pos['campaign'].isin(top_campaigns)]
    else:
        df_pos_camp = df_pos.loc[df_pos['campaign'] == campaign]

    #find number of tweets of each user in positive
    df_pos_grp = (df_pos_camp
                  .groupby('poster_userid')['poster_tweetid']
                  .nunique()
                  .to_frame('count_pos')
                  .reset_index()
                 )

    print(df_pos_camp['poster_tweetid'].nunique())

    #Get users in negative
    df_neg_camp = df_neg.loc[df_neg['poster_userid'].isin(
        df_pos_camp['poster_userid']
    )]

    #find out how many tweets in negative set
    df_neg_grp = (df_neg_camp
                  .groupby('poster_userid')['poster_tweetid']
                  .nunique()
                  .to_frame('count_neg')
                  .reset_index()
                 )

    #Find least of all
    df_grp = df_pos_grp.merge(df_neg_grp,
                              on='poster_userid',
                             )
    df_grp['min_count'] = df_grp[['count_pos', 'count_neg']].min(axis=1)

    #Attach the per-user counts to the negative rows (inner merge keeps
    #only users present in both classes)
    df_neg_common = df_neg.merge(df_grp,
                                 on='poster_userid'
                                )

    #Balance the data in negative
    rng = random if seed is None else random.Random(seed)
    sampled_parts = []
    # Explicit iteration keeps poster_userid in every part; groupby.apply
    # handles the grouping column differently across pandas versions.
    for _, group in df_neg_common.groupby('poster_userid'):
        # Order of appearance (not set order) so a seed gives a stable sample
        tweet_ids = group['poster_tweetid'].drop_duplicates().tolist()
        n_keep = int(group['min_count'].iloc[0])
        keep_ids = rng.sample(tweet_ids, n_keep)
        sampled_parts.append(
            group.loc[group['poster_tweetid'].isin(keep_ids)]
        )

    if sampled_parts:
        sampled_df = pd.concat(sampled_parts, ignore_index=True)
    else:
        # No user is shared between the classes: keep an empty frame
        # with the expected columns
        sampled_df = df_neg_common.iloc[0:0].reset_index(drop=True)

    sampled_df = sampled_df.drop(columns=['min_count',
                                          'count_pos',
                                          'count_neg',
                                          'common'
                                         ])

    #Get rest of the data
    df_rest_pos = df_pos.loc[
        ~df_pos['poster_tweetid'].isin(df_pos_camp['poster_tweetid'])
    ]

    df_rest_neg = df_neg.loc[
        ~df_neg['poster_tweetid'].isin(sampled_df['poster_tweetid'])
    ]

    df_rest = pd.concat([df_rest_pos, df_rest_neg],
                        ignore_index=True
                       )

    # Non-inplace drop: df_pos_camp is a slice of df_pos, and an inplace
    # drop on it triggers SettingWithCopyWarning
    df_pos_camp = df_pos_camp.drop(columns=['common'])

    print('Total +ve :', df_pos_camp['poster_tweetid'].nunique())
    print('Total -ve :', sampled_df['poster_tweetid'].nunique())
    print('Total Rest +ve :', df_rest_pos['poster_tweetid'].nunique())
    print('Total Rest -ve :', df_rest_neg['poster_tweetid'].nunique())

    df_camp = pd.concat([df_pos_camp, sampled_df],
                        ignore_index=True
                       )

    return df_camp, df_rest

#### **Testing if there are equal tweets in both positive and negative**

In [4]:
def save_list_cosine():
    '''
    Groups the cosine values of the negative class per poster tweet and
    saves them

    Reads the pickle at EMBEDDINGS_PATH/neg_cosine_with_replier_info, casts
    poster_tweetid to str, collects the 'cosine' values of each
    poster_tweetid into a list and writes the result to
    negative_list_cosine.pkl.gz in EMBEDDINGS_PATH/embedding_path.

    :return None
    '''
    config = config_hp.config()
    neg_cosine = config['EMBEDDINGS_PATH']['neg_cosine_with_replier_info']
    save_path = os.path.join(config['EMBEDDINGS_PATH']['embedding_path'],
                             'negative_list_cosine.pkl.gz'
                            )

    print('\n\n Load the data \n\n')

    df_neg = pd.read_pickle(neg_cosine)
    df_neg['poster_tweetid'] = df_neg['poster_tweetid'].astype(str)

    print('\n\n Grouping the data \n\n')

    df_list_cosine = (df_neg
                      .groupby(['poster_tweetid'])['cosine']
                      .apply(list)
                      .reset_index()
                     )

    print(f'\n Saving the data {save_path} \n')

    df_list_cosine.to_pickle(save_path)

In [15]:
def load_cosine():
    '''
    Loads the two frames stored under the pos_list_cosine and
    neg_list_cosine keys of EMBEDDINGS_PATH and prints their first rows

    :return Tuple (positive dataframe, negative dataframe)
    '''
    embeddings_cfg = config_hp.config()['EMBEDDINGS_PATH']
    path_pos = embeddings_cfg['pos_list_cosine']
    path_neg = embeddings_cfg['neg_list_cosine']

    print('\n\n Load the data \n\n')

    positive = pd.read_pickle(path_pos)
    negative = pd.read_pickle(path_neg)

    # Quick look at both frames
    for frame in (positive, negative):
        print(frame.head())

    return positive, negative

In [11]:
# df_pos_cosine, df_neg_cosine = load_cosine()

In [16]:
def load_cosine_stat():
    '''
    Loads the dataframe stored at the STATS/cosine path of the config

    :return Dataframe read from that pickle
    '''
    stat_path = config_hp.config()['STATS']['cosine']

    print('\n\n Load the data \n\n')

    return pd.read_pickle(stat_path)

In [17]:
# Frame passed to cosine() as df_cosine_stat in the loop further down
df_cosine_stat = load_cosine_stat()



 Load the data 




#### **Find campaign specific cosine**

In [30]:
def cosine(df_camp=None, 
           campaign=None,
           df_cosine_stat=None
          ):
    '''
    Selects the cosine statistics of one campaign and caches them on disk

    If <campaign>_cosine.pkl.gz already exists in CAMP_FEAT/camp_features,
    that file is returned as is and the inputs are not looked at; delete the
    file to recompute. Otherwise the rows of df_cosine_stat whose
    poster_tweetid occurs in df_camp are kept, the columns list_cosine and
    var_cosine are dropped and the result is saved to that file. The inputs
    are not modified.

    :param df_camp: Dataframe of campaign with the column poster_tweetid
    :param campaign: Name of the campaign, used in the file name
    :param df_cosine_stat: Dataframe with the columns poster_tweetid,
        tweet_label, list_cosine and var_cosine

    :return Dataframe of the campaign rows, poster_tweetid cast to str
    '''

    print('**** Cosine starting ****')

    config = config_hp.config()
    camp_cosine = config['CAMP_FEAT']['camp_features']
    campaign_cosine = os.path.join(camp_cosine, 
                                   f'{campaign}_cosine.pkl.gz'
                                  )

    if os.path.exists(campaign_cosine):
        return pd.read_pickle(campaign_cosine)

    # Compare ids as strings without writing the cast back into the
    # caller's frames
    camp_ids = df_camp['poster_tweetid'].astype(str)
    stat_ids = df_cosine_stat['poster_tweetid'].astype(str)

    df_camp_stat = df_cosine_stat.loc[stat_ids.isin(camp_ids)].copy()
    df_camp_stat['poster_tweetid'] = (df_camp_stat['poster_tweetid']
                                      .astype(str)
                                     )

    print('Class :', df_camp_stat['tweet_label'].unique())

    df_camp_stat = df_camp_stat.drop(columns=['list_cosine', 'var_cosine'])

    df_camp_stat.to_pickle(f'{campaign_cosine}')

    print('file saved')

    return df_camp_stat


#### **Save campaign specific cosine score**

In [None]:
# Five largest campaigns plus 'remain' (get_camp_data treats 'remain' as
# every campaign outside those five)
camp_list = df_grp_camp['campaign'].head().tolist()
camp_list.append('remain')

for campaign in camp_list:
    print('Campaign ', campaign) 
    df_camp, df_rest = get_camp_data(df_new, df_neg, campaign)
    
    # Returns the cached file when one already exists for this campaign
    cosine(df_camp=df_camp, 
           campaign=campaign,
           df_cosine_stat=df_cosine_stat
          )
    
    # break

#### **Get statistics**

In [34]:
import datetime
import re
import helper.helper as hp

In [39]:
def get_statistics(df_camp=None, 
                   campagin_name=None, 
                   df_stat=None
                  ):
    '''
    Saves the rows of the feature table that belong to the campaign data

    Keeps the rows of df_stat whose poster_tweetid occurs in df_camp and
    writes them to <campagin_name>_features.pkl.gz in
    CAMP_FEAT/camp_features. An existing file is overwritten. The inputs
    are not modified.

    :param df_camp: Dataframe of campaign with the column poster_tweetid
    :param campagin_name: Name of the campaign, used in the file name
    :param df_stat: Dataframe of features with the column poster_tweetid

    :return None
    '''
    print(campagin_name)

    config = config_hp.config()
    camp_feat = config['CAMP_FEAT']['camp_features']
    campaign_feat = os.path.join(camp_feat, 
                                 f'{campagin_name}_features.pkl.gz'
                                )
    print(campaign_feat)

    # Compare ids as strings without writing the cast back into the
    # caller's frames
    camp_ids = df_camp['poster_tweetid'].astype(str)
    stat_ids = df_stat['poster_tweetid'].astype(str)

    df_camp_stat = df_stat.loc[stat_ids.isin(camp_ids)].copy()
    df_camp_stat['poster_tweetid'] = (df_camp_stat['poster_tweetid']
                                      .astype(str)
                                     )

    df_camp_stat.to_pickle(f'{campaign_feat}')

    print('** All features saved **')

#### **Run stat for each campaign**

In [40]:
# Names of the five largest campaigns
df_grp_camp['campaign'].head()

20          serbia_022020
19    saudi_arabia_112019
23          turkey_052020
8            egypt_022020
18        sa_eg_ae_022020
Name: campaign, dtype: object

In [43]:
# NOTE(review): total is not used in the visible cells
total = 0
# camp_list = df_grp_camp['campaign'].head().tolist()
# Only the 'remain' split is processed in this run; the line above selects
# the five largest campaigns instead
camp_list = ['remain']

# Feature table read from the STATS/final_stat path of the config
config = config_hp.config()
final_stat = config['STATS']['final_stat']
df_stat = pd.read_pickle(final_stat)

for campaign in camp_list:
    print('Campaign ', campaign) 
    
    # The negative tweets are sampled at random inside get_camp_data, so
    # df_camp can differ between runs
    df_camp, df_rest = get_camp_data(df_new, df_neg, campaign)
    
    get_statistics(df_camp,
                   campaign, 
                   df_stat
                   )
    
    # break

Campaign  remain
629


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_pos_camp.drop('common',


Total +ve : 629
Total -ve : 629
Total Rest +ve : 3237
Total Rest -ve : 3237
remain
/N/slate/potem/data/derived/camp_features/remain_features.pkl.gz
** All features saved **
