### **This notebook creates the reply coordination graph. The reply here is to accounts not involved in IO**

In [1]:
import pandas as pd
import numpy as np
import warnings
import glob
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.colors as pltc
from tqdm import tqdm
import sys
import os

import importlib

#### packages
import helper.strategy_helper as st
import helper.visualization as vz
import helper.helper as hp
import helper.file_helper as file_hp
import config.config as config

In [2]:
config = config.config()
path = config['PATHS']

derived_path = path['derived_path']
all_tweet_data = path['all_tweet_data']

In [3]:
def convert_tweetids_to_string(df_time, field='tweetid',
                               filter_threshold=False,
                               threshold=None
                              ) -> pd.DataFrame:
    '''
    Converts the time binned tweets to single single dataframe and convert the tweets \
    to single document tweets for each users
    
    :param df_time: dictionary of time binned dataframes
    
    :return pandas dataframe
    '''

    df_time = df_time.astype({field: int})
    df_time = df_time.astype({field: str})
    
    df_time = (df_time
                   .groupby('userid')[field]
                   .apply(list)
                   .reset_index(name='tweet_ids'))
        
    if filter_threshold:
        df_time = df_time.loc[df_time['tweet_ids'].map(len) > threshold]
        
    
    df_time['tweet_ids'] = df_time['tweet_ids'].apply(lambda x: ' '.join(x))
    
    return df_time

In [4]:
def coordination_reply(df, output_path, campaign_name):
    print('\n----- Start: Filtering tweets ---------')
    df = filter_reply_count(df, 5)
    print('----- End: Filtering tweets ---------\n')
    
    print('\n----- Start: Create bi-partite network -----')
    df_network = create_user_projection(df, 'in_reply_to_userid')
    print('\n----- End: Create bi-partite network -----')
    
    print('\n----- Start: Create co-reply strings  -----')
    df_string = convert_tweetids_to_string(df, 
                                           field='in_reply_to_userid')
    print('\n----- End: Create co-reply strings -----')
    
    print('\n----- Start: Calculate tfidf vector -----')
    df_string = calculate_tf_idf_vector(df_string)
    print('\n------ End: Calculate tfidf vector -----')
    
    print('\n----- Start: Retweet user projection network ------')
    df_new = calculate_cosine_similarity(df_network, df_string)
    print('\n----- End: Retweet user projection network -----')
       
    pkl_path = os.path.join(output_path, 
                            f'{campaign_name}_network.pkl.gz')
    
    df_new.to_pickle(f'{pkl_path}')
    
    print('\n ----- Start: Create co-retweet graph -----')
    create_graph(df_new, 
                 output_path,
                 campaign_name=campaign_name,
                 source_column='source',
                 target_column='target',
                 weight_column='cosine',
                 type_text='co-reply')
    print(f'----- End: Creating user projection network ---------\n')
        
    return df_new

In [5]:
create_folder(save_path, 'reply_coordination')

coordination_path = os.path.join(save_path, 
                                 'reply_coordination')
total_path = os.path.join(save_path, 
                          'reply_coordination', 
                          f'{iran_202012_campaign}_network.pkl.gz')

if os.path.isfile(total_path) == False:
    df_ops = reply_to_external_users(df_ops)
    df_coordination = coordination_reply(df_ops, 
                                         coordination_path,
                                         iran_202012_campaign)
    
    #Top 5% users
    per_5 = int(len(df_coordination) * 0.05)

    df_coordination = df_coordination.sort_values(by=['cosine'],
                                                  ascending=False
                                                 )

    df_head = df_coordination.head(per_5)

    users = list(set(list(df_head['source']) + list(df_head['target'])))

    print('Total unique users: ', len(users))   


    #Text for manual inspection
    df_text_user_1 = df_ops.loc[df_ops['userid'].isin(df_head['source'])][['in_reply_to_userid',
                                              'in_reply_to_tweetid',
                                              'userid', 'tweet_text'
                                             ]]
    df_text_user_1 = df_text_user_1.sort_values(by=['in_reply_to_userid', 
                                      'in_reply_to_tweetid',
                                      'userid', 'tweet_text'
                                     ])

    df_text_user_2 = df_ops.loc[df_ops['userid'].isin(df_head['target'])][['in_reply_to_userid',
                                              'in_reply_to_tweetid',
                                              'userid', 'tweet_text'
                                             ]]
    df_text_user_2 = df_text_user_2.sort_values(by=['in_reply_to_userid', 
                                      'in_reply_to_tweetid',
                                      'userid', 'tweet_text'
                                     ])

    df_text = df_text_user_1.merge(df_text_user_2, 
                              on=['in_reply_to_userid', 
                                  'in_reply_to_tweetid'],
                             )

    df_text = df_text.loc[df_text['userid_x'] != df_text['userid_y']]

    df_text = df_text.fillna(0)

    df_text = df_text.astype({
        'in_reply_to_userid': int,
        'in_reply_to_tweetid': int
    })

    # df_text

    df_text.to_csv('test_reply_coordination_manual_insp.csv')

NameError: name 'create_folder' is not defined