#### **This notebook saves tweet metrics**

In [1]:
import pandas as pd
import numpy as np
import warnings
import glob
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.colors as pltc
from tqdm import tqdm
import sys
import os

import importlib

#### packages
import helper.strategy_helper as st
import helper.visualization as vz
import helper.helper as hp
import helper.file_helper as file_hp
import config.config as config_hp

#### **Load files**

In [2]:
config = config_hp.config()
path = config['PATHS']

derived_path = path['derived_path']
all_tweet_data = path['all_tweet_data']
plot_path = path['plot_path']

In [3]:
file_hp.create_folder(plot_path, 'mix_replies')
reply_plot_path = os.path.join(plot_path, 'mix_replies')

In [4]:
all_campaigns, names = st.bundle_campaign()

In [5]:
def get_path(path, 
             year, 
             campaign,
            ):
    '''
    Gets path to control and IO tweet file
    
    :param path: path to the directory where tweet is present
    :param year: year of compaign
    :param campaign: name of campaign
    :param check_control: whether to check if control is present
    
    return list
    '''
    
    ops_file_path = os.path.join(path, 
                                 year, 
                                 campaign, 
                                 f'{campaign}_tweets.pkl.gz')
    control_file_path = os.path.join(path, 
                                     year, 
                                     campaign, 
                                     'DriversControl', 
                                     f'{campaign}_control.pkl.gz')
    
    if os.path.isfile(control_file_path) == False:
        control_file_path = os.path.join(path, 
                                     year, 
                                     campaign, 
                                     'DriversControl', 
                                     f'{campaign}_tweets_control.pkl.gz')
        
    ops_flag = os.path.isfile(ops_file_path) == True
    
    if ops_flag:
        return {'ops': ops_file_path,
                'control': control_file_path
               }
    
    print('Files not found')
    
    return None

In [21]:
def combine_all_replies(all_campaigns, 
                        derived_path, 
                        type_of, 
                        filename, 
                        external=False,
                        language=None
                       ):
    '''
    Combines all the replies of data
    
    :param all_campaigns: dictionary of all campaign
    :param derived_path: path where the data to be saved
    :param type_of: type of data either information operation or control
    :param filename: name of file to be saved
    '''

    df = pd.DataFrame()
    for row in all_campaigns:
        for year in row:
            for new_campaign in row[year]:

                print(f'\n ------START: {year}: {new_campaign} ------- \n')

                data_path = get_path(all_tweet_data, year, 
                                     new_campaign)

                data = st.read_ops_control_data(data_path['ops'],
                                                data_path['control'], 
                                             [type_of])
                df_test = data[type_of]
                
                if len(df_test) == 0:
                    print(f'\n ------END: {year}: {new_campaign} ------- \n')
                    continue
                    
                if 'in_reply_to_tweetid' not in df_test.columns:
                    continue

                df_ops = df_test.loc[~df_test['in_reply_to_tweetid'].isnull()]
                
                # if language is not None:
                #     df_ops = df_ops.loc[df_ops['tweet_language'] == language]
                    
                df_ops['year'] = year
                df_ops['campaign'] = new_campaign
                
                if external == True:
                    df_ops = st.reply_to_external_users(df_ops)
                    
                print(df_ops.columns)
                
                return 
            
                df = df.append(df_ops[['tweetid', 
                                       'userid',
                                       'quote_count',
                                       'reply_count',
                                       'like_count',
                                       'retweet_count',
                                       'year', 
                                       'campaign']], 
                               ignore_index=True)
                

                print(len(df))
                
    df.rename(columns={'tweetid': 'replier_tweetid',
                        'userid': 'replier_userid',
                        'quote_count': 'quote_count',
                        'reply_count': 'reply_count',
                        'like_count': 'like_count',
                       'retweet_count': 'retweet_count',
                        'year': 'year',
                        'campaign': 'campaign'
                       },
               inplace=True)
    
    # path = os.path.join(derived_path, filename)

    df.to_pickle(f'{filename}')
                

#### **Run the function**

In [22]:
importlib.reload(config_hp)

config = config_hp.config()
metric = config['METRIC']

tweet_metric = metric['tweet_metric']
combine_all_replies(all_campaigns, 
                    derived_path, 
                    'ops', 
                    tweet_metric,
                    False,
                    None
                   )


 ------START: 2021_12: CNHU_0621 ------- 

Index(['tweetid', 'userid', 'user_display_name', 'user_screen_name',
       'user_reported_location', 'user_profile_description',
       'user_profile_url', 'follower_count', 'following_count',
       'account_creation_date', 'account_language', 'tweet_language',
       'tweet_text', 'tweet_time', 'tweet_client_name', 'in_reply_to_userid',
       'in_reply_to_tweetid', 'quoted_tweet_tweetid', 'is_retweet',
       'retweet_userid', 'retweet_tweetid', 'latitude', 'longitude',
       'quote_count', 'reply_count', 'like_count', 'retweet_count', 'hashtags',
       'urls', 'user_mentions', 'year', 'campaign'],
      dtype='object')


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_ops['year'] = year
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_ops['campaign'] = new_campaign


#### **Test file is there**

In [9]:
importlib.reload(config_hp)

config = config_hp.config()
metric = config['METRIC']

tweet_metric = metric['tweet_metric']


df_metric = pd.read_pickle(tweet_metric)

In [11]:
df_metric.head()

Unnamed: 0,replier_tweetid,replier_userid,quote_count,reply_count,like_count,retweet_count,year,campaign
0,1214512781459771394,1181565607675756544,0,0,1,1,2021_12,CNHU_0621
1,1321737129039290368,1291564118487339011,0,0,1,0,2021_12,CNHU_0621
2,1257247935470821376,1205428631473778689,0,0,0,0,2021_12,CNHU_0621
3,1308232949108670464,1181565607675756544,0,0,0,0,2021_12,CNHU_0621
4,1265941843696422915,1181763761591308289,0,0,0,0,2021_12,CNHU_0621


In [12]:
df_metric['quote_count'].max()

1592.0

In [15]:
df_metric['reply_count'].max()

2008.0

In [16]:
df_metric['like_count'].max()

37518.0

In [17]:
df_metric['retweet_count'].max()

33616.0

In [27]:
# df_metric.loc[df_metric['retweet_count'].isnull()]