##### **This notebook tests whether there is common user replying behavior between IO and control accounts.**
The control data was collected by Alex.

In [None]:
import pandas as pd
import numpy as np
import warnings
import glob
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.colors as pltc
from tqdm import tqdm
import sys
import os

import importlib

#### packages
import helper.strategy_helper as st
import helper.visualization as vz
import helper.helper as hp
import helper.file_helper as file_hp
import config.config as config

#### Loading config

In [None]:
# Load the project configuration and read the data locations from its
# 'PATHS' section.
# NOTE(review): this rebinds the name `config` from the imported module to
# the loaded configuration object, so re-running this cell without
# re-importing will presumably fail -- confirm before renaming.
config = config.config()
path = config['PATHS']

derived_path, all_tweet_data = path['derived_path'], path['all_tweet_data']

In [None]:
# Campaign used for the single-campaign check further down.
test_campaign = 'iran_202012'
year = '2020_12'

# Both files live under <all_tweet_data>/<year>/<campaign>/; the control
# file sits in the 'DriversControl' sub-directory.
campaign_dir = os.path.join(all_tweet_data, year, test_campaign)
ops_file_path = os.path.join(campaign_dir,
                             f'{test_campaign}_tweets.pkl.gz')
control_file_path = os.path.join(campaign_dir,
                                 'DriversControl',
                                 f'{test_campaign}_control.pkl.gz')

# The helper returns a mapping keyed by the labels passed as third argument.
data = st.read_ops_control_data(ops_file_path,
                                control_file_path,
                                ['ops', 'control'])
df_ops, df_control = data['ops'], data['control']

In [None]:
# df_ops.info()

In [None]:
def check_if_common_user(df_ops, df_control):
    '''
    Counts the replied-to tweets and replied-to users that appear in
    both dataframes. Both counts are printed and returned.

    Only rows with a non-null in_reply_to_tweetid (replies) are
    considered in either dataframe.

    :param df_ops: First Dataframe; needs the in_reply_to_tweetid and
    in_reply_to_userid columns
    :param df_control: Second Dataframe; needs the same two columns

    :return Tuple of (number of distinct in_reply_to_tweetid values
    found in both dataframes, number of distinct in_reply_to_userid
    values found in both dataframes)
    '''

    def _reply_targets(df, column):
        # Distinct integer ids found in `column` among the reply rows.
        replies = df.loc[df['in_reply_to_tweetid'].notna(), column]

        # Drop missing targets so the integer cast cannot fail on NaN,
        # and cast to an explicit 64-bit integer so the width does not
        # depend on the platform's default int.
        return set(replies.dropna().astype('int64').tolist())

    # A set intersection gives the same distinct count as merging the
    # two frames on the key, without building the many-to-many row
    # product that the merge creates for frequently replied-to ids.
    common_tweetids = (_reply_targets(df_ops, 'in_reply_to_tweetid')
                       & _reply_targets(df_control, 'in_reply_to_tweetid'))
    total_tweetid = len(common_tweetids)

    print('common in_reply_to_tweetid ', total_tweetid)

    common_userids = (_reply_targets(df_ops, 'in_reply_to_userid')
                      & _reply_targets(df_control, 'in_reply_to_userid'))
    total_user = len(common_userids)

    print('Common users ', total_user)

    return total_tweetid, total_user
    
def get_original_tweet(df):
    '''
    Keeps only the rows that are original tweets: not a reply, not a
    retweet and, when the quoted_tweet_tweetid column exists, not a
    quote

    :param df: Dataframe with the in_reply_to_tweetid and is_retweet
    columns

    :return Dataframe restricted to the original tweets
    '''

    # A row qualifies when it replies to nothing and is_retweet is False.
    keep = df['in_reply_to_tweetid'].isna() & df['is_retweet'].eq(False)

    # The quote filter only applies to frames that carry the column.
    if 'quoted_tweet_tweetid' in df.columns:
        keep = keep & df['quoted_tweet_tweetid'].isna()

    return df.loc[keep]


def extract_mentions_from_user_mentions(df,
                                        explode=True):
    '''
    Extracts mentions from the user_mentions column in which the
    list of mentioned userids is stored in string form, e.g.
    '[11, 22]'. The result has one row per mentioned userid, with
    user_mentions cast to a 64-bit integer. The input dataframe is
    left untouched.

    :param df: DataFrame with a user_mentions column
    :param explode: When true, rows whose mentioned userid equals 0
    are dropped. NOTE(review): despite its name this flag does not
    control the explode step, the list is always exploded -- confirm
    the intended meaning before relying on explode=False.

    :return DataFrame with one row per mention
    '''

    df_mentions = df.loc[df['user_mentions'].notna()]

    # assign() builds a new frame instead of writing into the .loc
    # slice, which avoids pandas' SettingWithCopyWarning.
    # '[11, 22]' -> '11, 22'
    df_mentions = df_mentions.assign(
        user_mentions=df_mentions['user_mentions'].apply(
            lambda x: x.strip('][')
        )
    )

    # An empty string is what remains of an empty list '[]'.
    df_mentions = df_mentions.loc[
        df_mentions['user_mentions'].map(len) != 0]

    # '11, 22' -> ['11', '22']
    df_mentions = df_mentions.assign(
        user_mentions=df_mentions['user_mentions'].apply(
            lambda x: x.split(', ')
        )
    )

    df_mentions = df_mentions.explode('user_mentions')

    # Explicit 64-bit cast so the width does not depend on the
    # platform's default int.
    df_mentions = df_mentions.astype({
        'user_mentions': 'int64'
    })

    if explode:
        df_mentions = df_mentions.loc[df_mentions['user_mentions'] != 0]

    return df_mentions
    

def check_common_mentions(df_ops, df_control):
    '''
    Prints and returns the number of distinct userids that are
    mentioned in the original tweets of both dataframes

    :param df_ops: first dataframe; its user_mentions column is parsed
    with extract_mentions_from_user_mentions (list in string form)
    :param df_control: second dataframe; its user_mentions column is
    parsed with extract_mentions_from_control (list of dictionaries)

    :return Number of distinct mentioned userids found in both
    dataframes
    '''

    df_ops = get_original_tweet(df_ops)
    df_control = get_original_tweet(df_control)
    df_ops = extract_mentions_from_user_mentions(df_ops)
    df_control = extract_mentions_from_control(df_control)

    # Both extractors return user_mentions as integers, so a set
    # intersection gives the same distinct count as merging the two
    # frames on user_mentions, without building the many-to-many row
    # product for frequently mentioned users.
    ops_mentions = set(df_ops['user_mentions'].tolist())
    control_mentions = set(df_control['user_mentions'].tolist())

    total_mentions = len(ops_mentions & control_mentions)

    print('common mentions ', total_mentions)

    return total_mentions
    

def extract_mentions_from_control(df, explode=True):
    '''
    Extracts the mentioned userids from a user_mentions column that
    holds, per row, a list of dictionaries with an 'id' key. The
    result has one row per mentioned userid, with user_mentions cast
    to a 64-bit integer. The input dataframe is left untouched.

    :param df: DataFrame with a user_mentions column
    :param explode: Boolean to whether to explode the user mentions
    list. NOTE(review): currently unused, the list is always
    exploded; kept so existing callers keep working.

    :return DataFrame with one row per mention
    '''

    # Rows without any mention information are skipped, as in
    # extract_mentions_from_user_mentions. The copy keeps the caller's
    # frame intact: overwriting its user_mentions column would make a
    # second call fail, because the dictionaries would already have
    # been replaced by plain ids.
    df_mentions = df.loc[df['user_mentions'].notna()].copy()

    df_mentions['user_mentions'] = df_mentions['user_mentions'].apply(
        lambda x: [y['id'] for y in x]
    )

    # Drop rows whose mention list is empty.
    df_mentions = df_mentions.loc[
        df_mentions['user_mentions'].map(len) != 0]

    df_mentions = df_mentions.explode('user_mentions')

    # Explicit 64-bit cast so the width does not depend on the
    # platform's default int.
    df_mentions = df_mentions.astype({
        'user_mentions': 'int64'
    })

    return df_mentions

In [None]:
# Count the replied-to tweets and replied-to users shared by the ops and
# control data of the test campaign loaded above.
check_if_common_user(df_ops, df_control)

In [None]:
#-------------------------------------------------------------------
## The number of common replies between control and ops is 18,
## but the number I got from the
## conversation id is just 3.

#-------------------------------------------------------------------
## Replying to the same user can be an indicator
## while mixing all control and ops data.
## What else can we add?

In [None]:
# Run the overlap checks for every campaign returned by the helper and
# collect one summary row per campaign that has control data.
all_campaigns, names = st.bundle_campaign()
total_info = []

for row in all_campaigns:
    for year in row:
        for new_campaign in row[year]:

            print(f'\n ------START: {year}: {new_campaign} ------- \n')

            # <all_tweet_data>/<year>/<campaign>/ holds the ops file; the
            # control files sit in its 'DriversControl' sub-directory.
            campaign_dir = os.path.join(all_tweet_data, year, new_campaign)
            control_dir = os.path.join(campaign_dir, 'DriversControl')

            ops_file_path = os.path.join(
                campaign_dir, f'{new_campaign}_tweets.pkl.gz')
            control_file_path = os.path.join(
                control_dir, f'{new_campaign}_tweets_control.pkl.gz')

            data = st.read_ops_control_data(ops_file_path,
                                            control_file_path,
                                            ['ops', 'control'])

            if len(data['control']) == 0:
                # Nothing under the first file name: retry with the
                # alternative control file name.
                control_file_path = os.path.join(
                    control_dir, f'{new_campaign}_control.pkl.gz')
                fallback = st.read_ops_control_data(ops_file_path,
                                                    control_file_path,
                                                    ['control'])
                data['control'] = fallback['control']

            # Campaigns that still have no control data are skipped.
            if len(data['control']) != 0:
                df_ops = data['ops']
                df_control = data['control']

                total_tweetid, total_user = check_if_common_user(
                    df_ops, df_control)
                total_mentions = check_common_mentions(df_ops, df_control)

                total_info.append([
                    total_tweetid, total_user, total_mentions,
                    year, new_campaign,
                ])

            print(f'\n ------END: {year}: {new_campaign} ------- \n')


# Last expression of the cell: the per-campaign summary table.
pd.DataFrame(
    total_info,
    columns=[
        'common_in_reply_to_tweetid',
        'common_users',
        'common_mentions',
        'year',
        'campaign',
    ],
)

In [None]:
### The overlap of replied-to tweetids between
### control and IO data is very small; however,
### the overlap of replied-to users is high


### For mentions overlap, 
### there are quite few overlaps between users