##### **This notebook manually verifies whether there is coordinated reply**

In [42]:
import pandas as pd
import numpy as np
import datetime
import warnings
import glob
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.colors as pltc
from tqdm import tqdm
import sys
import os

import importlib

#### packages
import helper.strategy_helper as st
import helper.visualization as vz
import helper.helper as hp
import helper.file_helper as file_hp
import config.config as config
import helper.pandas_helper as pd_hp

##### **Configuration files**

In [43]:
config = config.config()
path = config['PATHS']

derived_path = path['derived_path']
all_tweet_data = path['all_tweet_data']
plot_path = path['plot_path']
external_reply = path['external_reply']


file_hp.create_folder(plot_path, 'reply_count')
reply_path = os.path.join(plot_path, 'reply_count')

##### **Bundle for all Campaign**

In [3]:
all_campaigns, names = st.bundle_campaign()

##### **Test campaign**

In [4]:
year = '2020_12'
campaign = 'iran_202012'
type_of = 'ops'

data_path = st.get_data_path(all_tweet_data, 
                                 year, 
                                 campaign)

df_iran = st.read_ops_control_data(data_path['ops'],
                                data_path['control'], 
                             [type_of])['ops']

In [5]:
df_grps = (df_iran.groupby(['in_reply_to_userid',
                            'in_reply_to_tweetid'])['userid'].nunique()
                              # .size()
                              .to_frame('count_replies')
                              .reset_index())

In [6]:
len(df_grps.loc[df_grps['count_replies'] >= 5])

23

In [7]:
df_grps_userid = (df_iran.groupby(['in_reply_to_userid',
                                       'userid'
                                      ])['in_reply_to_tweetid'].nunique()
                              # .size()
                              .to_frame('count_replies')
                              .reset_index())

In [8]:
(df_grps_userid.loc[df_grps_userid['count_replies'] >= 5]
 .sort_values(by=['count_replies'],
              ascending=False)
).head(2)

Unnamed: 0,in_reply_to_userid,userid,count_replies
16308,8.847826e+17,884782586328776709,1673
24334,1.169137e+18,1169137349508382720,1662


##### **All campaigns considered**

In [9]:
def plot_histogram(df, xlabel, ylabel,
                   plot_path, title
                  ):
    fig, ax = plt.subplots()
    num_bins = max(df['count_replies'])

    n = ax.hist(df['count_replies'],
                               num_bins, )
    ax.set_xlabel(xlabel)
    ax.set_ylabel(ylabel)

    plt.yscale('log')
    plt.title(title)
    plt.xscale('log')
    fig.savefig(f'{plot_path}/{title}', 
              facecolor='white', 
              transparent=False)
    plt.show()

In [11]:
def test_count(path, external=True,
               threshold = 10):
    type_of = 'ops'
    info =[]

    for row in all_campaigns:
        for year in row:
            for new_campaign in row[year]:

                print(f'\n ------START: {year}: {new_campaign} ------- \n')

                data_path = st.get_data_path(all_tweet_data, year, 
                                     new_campaign)

                data = st.read_ops_control_data(data_path['ops'],
                                                data_path['control'], 
                                             [type_of])
                df_w = data[type_of]

                if 'tweet_language' not in df_w.columns:
                    continue

                # df_test = df_w.loc[df_w['tweet_language'] == 'en']
                df_test = df_w
                
                df = df_test.loc[~df_test['in_reply_to_tweetid'].isnull()]
                
                if external == True:
                    df = st.reply_to_external_users(df)

                # print('Eng lang replies ', len(df))
                
                df_grps = (df.groupby([
                    # 'in_reply_to_userid',
                                       'in_reply_to_tweetid'
                                      ])['userid']
                              .nunique()
                              .to_frame('count_replies')
                              .reset_index())
                
                df_grps_size = df_grps.loc[df_grps['count_replies'] >= threshold]
                total = len(df_grps_size)
                
                print(f'Greater than equal to {threshold} :', 
                      total)
                
                if total == 0:
                    min_val = 0
                    max_val = 0
                else:
                    min_val = min(df_grps_size['count_replies'])
                    max_val = max(df_grps_size['count_replies'])

                info.append([year,
                             new_campaign,
                             total,
                             min_val,
                             max_val
                            ])
                
                if len(df_grps_size) == 0:
                    continue

                xlabel = 'Number of users replying to each tweet'
                ylabel = 'Raw count of tweets'

                print('Min no. of reply ', 
                      min(df_grps_size['count_replies']))
                print('Max no. of reply ', 
                      max(df_grps_size['count_replies']))

    (pd.DataFrame(data=info,
                  columns=['year', 'campaign', 
                           f'total_number_of_tweet_with_{threshold}_or_more_replies', 
                           'min','max']
                 )
     .to_csv(f'{path}')
    )

#### **Number of tweets with 5 or more replies in english: 1977**
#### **Number of tweets with 10 or more replies in english: 351**

In [14]:
filename = 'replies_5_english_tweet_level_reply_info.csv'
filename ='5_tweet_level_reply_info.csv'

# test_count(filename, 
#            external=True, 
#            threshold = 5)

In [16]:
df_5 = pd.read_csv(filename)

print('Number of tweets with 5 or more replies :',
      df_5['total_number_of_tweet_with_5_or_more_replies'].sum())

Number of tweets with 5 or more replies : 94630


##### **10 or more replies**

In [20]:
filename ='10_or_more_replies_external_info.csv'

# test_count(filename, 
#            external=True, 
#            threshold = 10)

In [23]:
df_10 = pd.read_csv(filename)

print('Number of tweets with 10 or more replies :',
      df_10['total_number_of_tweet_with_10_or_more_replies'].sum())

Number of tweets with 10 or more replies : 30816


#### **Testing the time period of between consequtive replies**

In [24]:
import imp

imp.reload(st)

df_replies = pd.read_pickle(external_reply)

  import imp


In [25]:
df_replies.columns

Index(['replier_tweetid', 'replier_userid', 'poster_tweetid', 'poster_userid',
       'tweet_language', 'tweet_time', 'year', 'campaign'],
      dtype='object')

In [37]:
df_replies = df_replies.drop_duplicates()

df_count = (df_replies.groupby(['poster_tweetid'])['replier_userid']
             .nunique()
             .to_frame('replier_count')
             .reset_index()
            )

In [38]:
df_count_5 = df_count.loc[df_count['replier_count'] >= 5]

In [39]:
print('Total tweets with 5 or more replies: ', a
      len(df_count_5))

Total tweets with 5 or more replies:  96041


##### **Saving the conversation ids with greater than equal to 5 replies**

In [47]:
df_count_5 = df_count_5.astype({'poster_tweetid': int})

In [48]:
df_count_5.head()

Unnamed: 0,poster_tweetid,replier_count
189165,64953578015571968,7
319572,138715327608524800,7
398962,175649582045347840,8
400677,176283333385396224,5
403667,177373540574695424,5


In [57]:
conversation_path = path['conversation_path']
rows = df_count_5['poster_tweetid']
file_name = '5_or_more_replies_conversation.txt'

file_hp.write_to_file_row_each_line(conversation_path, 
                            file_name, 
                            rows)

In [56]:
df_count_5.loc[df_count_5['poster_tweetid'].isnull()]

Unnamed: 0,poster_tweetid,replier_count


##### **Time duration of replies**

In [77]:
df_time = df_replies.loc[
    df_replies['poster_tweetid'].isin(df_count_5['poster_tweetid'])]

In [78]:
df_time = df_time.astype({
    'poster_tweetid': int
})

df_time['tweet_time'] =  pd.to_datetime(df_time['tweet_time'])

In [79]:
df_time = df_time.sort_values(by=['tweet_time'], 
                    ascending=True).drop_duplicates()

In [84]:
df_time['diff'] = df_time.groupby('poster_tweetid')['tweet_time'].diff()

In [99]:
df_time = df_time.dropna(subset=['diff'])

In [115]:
print('Min time diff :', df_time['diff'].min())
print('Max time diff :', df_time['diff'].max())

Min time diff : 0 days 00:00:00
Max time diff : 624 days 17:42:00


In [131]:
print('Total unique conversations :', len(df_count_5))
print('Total data points with time difference data :', len(df_time))
print('Time difference in consecutive replying less than 60 min filtering :', 
      len(df_time.loc[df_time['diff'] <= datetime.timedelta(minutes=60)]))

Total unique conversations : 3475
Total data points with time difference data : 30773
Time difference in consecutive replying less than 60 min filtering : 26565


##### **Considering time difference between first and last reply**

In [129]:
df_first_last = pd_hp.read_first_last_row_of_grp(df_time, 
                                                 'tweet_time',
                                                 'tweet_time'
                                                )

df_first_last['diff'] = df_first_last.groupby('poster_tweetid')['tweet_time'].diff()
df_first_last = df_first_last.dropna(subset=['diff'])
total_60 = df_first_last.loc[df_first_last['diff'] <= datetime.timedelta(minutes=60)]['poster_tweetid'].nunique()

print('Total conversation after droping the null :', 
      df_first_last['poster_tweetid'].nunique())

print('Min difference in first and last reply: ', 
      df_first_last['diff'].min())
print('Max difference in first and last reply: ',
      df_first_last['diff'].max())
print('Total conversation with replies within 60 mins :', total_60)

Total conversation after droping the null : 3419
Min difference in first and last reply:  0 days 00:00:00
Max difference in first and last reply:  624 days 17:42:00
Total conversation with replies within 60 mins : 3318


In [119]:
bins = np.arange(0, 1000, 60)

pd.cut(df_time['diff'], 
       bins=[pd.Timedelta(f'{i}s') for i in bins]+[pd.Timedelta.max],
       labels=[f'>{i}s' for i in bins])


380580    >960s
380652      >0s
380624      >0s
380558     >60s
224777      NaN
          ...  
98154     >960s
99637     >960s
98455     >960s
99575     >960s
99491     >960s
Name: diff, Length: 30773, dtype: category
Categories (17, object): ['>0s' < '>60s' < '>120s' < '>180s' ... '>780s' < '>840s' < '>900s' < '>960s']

In [None]:
def time_binned_histogram(df, column='diff'):
    

### **Isolation Forest for iran campaign**

In [31]:
year = '2020_12'
campaign = 'iran_202012'
type_of = 'ops'

data_path = st.get_data_path(all_tweet_data, 
                                 year, 
                                 campaign)

df_iran = st.read_ops_control_data(data_path['ops'],
                                data_path['control'], 
                             [type_of])['ops']

In [32]:
from sklearn.ensemble import IsolationForest

In [34]:
df_w = df_iran

if 'tweet_language' not in df_w.columns:
    print('No language column')

# df_test = df_w 
df_test = df_w.loc[df_w['tweet_language'] == 'en']

df = df_test.loc[~df_test['in_reply_to_tweetid'].isnull()]

df = st.reply_to_external_users(df)

df_grps = (df.groupby(['in_reply_to_userid',
                       'in_reply_to_tweetid'
                      ])['userid']
              .nunique()
              .to_frame('count_replies')
              .reset_index())

# threshold = 10
# df_grps_size = df_grps.loc[df_grps['count_replies']>=threshold]
# total = len(df_grps_size)
                

In [37]:
iforest = IsolationForest(n_estimators = 100, 
                          contamination = 0.03,
                          max_samples ='auto')
prediction = iforest.fit_predict(df_grps[['count_replies']])
print(prediction[:20])
print("Number of outliers detected: {}".format(prediction[prediction < 0].sum()))
print("Number of normal samples detected: {}".format(prediction[prediction > 0].sum()))

[1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1]
Number of outliers detected: -559
Number of normal samples detected: 28851


In [39]:
df_grps['prediction'] = prediction

In [42]:
df_grps.loc[df_grps['prediction'] == -1]['count_replies'].unique()

array([2, 3, 4, 6])

### **Isolation Forest for all replies**

In [81]:
replies_to_external_user = 'all_external_replies_campaign.pkl.gz'

all_replies = os.path.join(derived_path, 
                           replies_to_external_user)

In [83]:
df_all = pd.read_pickle(all_replies)

print('All replies :', len(df_all))

df_all = df_all.loc[df_all['tweet_language'] == 'en']

print('Only english replies :', len(df_all))

All replies : 1060042
Only english replies : 1060042


In [85]:
df_all['tweet_language'].unique()

array(['en'], dtype=object)

In [89]:
df_all.head()

Unnamed: 0,replier_tweetid,replier_userid,poster_tweetid,poster_userid,tweet_language,tweet_time,year,campaign,tweet_time_year
0,1214512781459771394,1181565607675756544,1173568360870629376,355989100.0,en,2020-01-07 11:43:00,2021_12,CNHU_0621,2020-01-07
1,1265941843696422915,1181763761591308289,1265907888381480960,599065100.0,en,2020-05-28 09:43:00,2021_12,CNHU_0621,2020-05-28
2,1250993547488026624,1181763761591308289,1250547774111784960,8.652669e+17,en,2020-04-17 03:44:00,2021_12,CNHU_0621,2020-04-17
3,1256894720732565505,1181565607675756544,1256583556786270208,473970200.0,en,2020-05-03 10:33:00,2021_12,CNHU_0621,2020-05-03
4,1299265857063809024,1181565607675756544,1299020570445656064,202610300.0,en,2020-08-28 08:41:00,2021_12,CNHU_0621,2020-08-28


In [86]:
df_all = df_all.astype({
    'poster_tweetid': int
    })

df_all = st.add_YYYY_MM_DD(df_all)

In [90]:
df_all_grp = (df_all.groupby(['poster_tweetid'])['replier_userid']
              .nunique()
              .to_frame('count_replies')
              .reset_index())

print('Total data after grouping ', len(df_all_grp))

Total data after grouping  729936


In [92]:
total_5 = len(df_all_grp.loc[df_all_grp['count_replies'] >= 5])

print('Tweet with 5 or more replies ', total_5)

Tweet with 5 or more replies  1994


In [93]:
print('Maximum replies :', df_all_grp['count_replies'].max())

Maximum replies : 108


In [94]:
iforest = IsolationForest(n_estimators = 100, 
                          contamination = 0.03,
                          max_samples ='auto')
prediction = iforest.fit_predict(df_all_grp[['count_replies']])

df_all_grp['prediction'] = prediction

In [96]:
df_all_grp.loc[df_all_grp['prediction'] == -1]['count_replies'].unique()

array([  2,   3,   5,   7,   4,   6,  10,   8,  11,   9,  15,  43,  46,
        26,  14,  28,  13,  16,  31,  33,  12,  30,  22,  35,  20,  34,
        19,  37,  21,  23,  18,  17,  90,  38,  40,  48,  25,  24, 102,
        77,  63,  81,  44,  27,  56,  45,  53,  52,  47,  32, 108,  64])

In [None]:
### Get all conversation_ids/tweets with more and equal than 5