##### **This notebook gets information on data sets (external replies only considered)**

In [205]:
import pandas as pd
import numpy as np
import datetime
import warnings
import glob
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.colors as pltc
import json
import sys
import os

import importlib

#### packages
import helper.strategy_helper as st
import helper.visualization as vz
import helper.helper as hp
import helper.file_helper as file_hp
import config.config as config_hp
import helper.pandas_helper as pd_hp

##### **Load config files**

In [199]:
config = config.config()
path = config['PATHS']
poster_path = config['POSTER_PATH']

external_reply = path['external_reply']
conversation_ids_5 = path['conversation_ids_5']
poster_id_path = poster_path['poster_id_path']
poster_info_file = poster_path['poster_info_file']

##### **Information on all replies**

In [56]:
df_replies = pd.read_pickle(external_reply)

In [57]:
total = df_replies['replier_tweetid'].nunique()

In [58]:
print('Total replies in data :', total)

Total replies in data : 17873714


In [60]:
df_tweets = (df_replies.groupby(['poster_tweetid',
                                  # 'poster_userid'
                                 ])['replier_userid']
                              .nunique()
                              .to_frame('count_replies')
                              .reset_index())
total_tweet = len(df_tweets)

print('Total tweets in datasets :', total_tweet)

Total tweets in datasets : 15256547


In [71]:
total_posters = df_replies['poster_userid'].nunique()

print('Total posters in all data :', total_posters)

Total posters in all data : 1763084


##### **Information on number of tweets with 5 or more replies**

In [None]:
conv_ids = file_hp.read_file(conversation_ids_5)

In [25]:
print('Total number of tweets with 5 or more replies :', 
      len(conv_ids))

Total number of tweets with 5 or more replies : 96041


##### **Information on number of tweets with 10 or more replies**

In [65]:
df_replies = pd.read_pickle(external_reply)

In [None]:
df_replies = df_replies.astype({
    'poster_tweetid': str,
    'poster_userid': str
})

In [67]:
df_count = (df_replies.groupby(['poster_tweetid',
                                  'poster_userid'
                                 ])['replier_userid']
                              .nunique()
                              .to_frame('count_replies')
                              .reset_index())

In [68]:
total_10 = len(df_count.loc[df_count['count_replies'] >= 10])

print('Total tweets with 10 or more replies from IO accounts :', total_10)

Total tweets with 10 or more replies from IO accounts : 31301


##### **Verifying 5 or more replies**

In [73]:
df_5 = df_count.loc[df_count['count_replies'] >= 5]

print('Total tweets with 5 or more replies from IO accounts :', 
      len(df_5))

Total tweets with 5 or more replies from IO accounts : 96040


In [80]:
df_5 = df_5.astype({
    'poster_tweetid': str,
    'poster_userid': str
})

In [81]:
posters_org = df_5['poster_userid'].unique()

print('Total unique posters with 5 or more replies :',
      len(posters_org))

Total unique posters with 5 or more replies : 15014


##### **Information on number of posters that got 5 or more replies on their tweets**

In [83]:
poster_ids = file_hp.read_file(poster_id_path)


print('Remaining posters : ', set(poster_ids) - set(posters_org))
print('Total number of posters that got 5 or more replies :', 
      len(poster_ids))

Remaining posters :  {'1134445195863437312', '757779763406704640'}
Total number of posters that got 5 or more replies : 15016


##### **Information on status of posters**

In [40]:
df_poster = pd.read_pickle(poster_info_file)

In [227]:
# df_poster.head()

In [45]:
df_poster['verified'].unique()

array(['not found', 'suspended', True, False], dtype=object)

In [44]:
suspended = len(df_poster.loc[df_poster['verified'] == 'suspended'])

print('Suspended accounts :', suspended)

Suspended accounts : 5041


In [47]:
not_found = len(df_poster.loc[df_poster['verified'] == 'not found'])

print('Number of accounts not found :', not_found)

Number of accounts not found : 3992


In [49]:
verified = len(df_poster.loc[df_poster['verified'] == True])

print('Verified accounts :', verified)

Verified accounts : 2031


In [48]:
not_verified = len(df_poster.loc[df_poster['verified'] == False])

print('Not verified accounts :', not_verified)

Not verified accounts : 3952


##### **Information on poster tweets**

In [89]:
config = config.config()

poster_path = config['POSTER_PATH']

In [90]:
parsed_poster_org_tweets = poster_path['parsed_poster_org_tweets']

In [91]:
df_org = pd.read_pickle(parsed_poster_org_tweets)

In [130]:
df_org['tweetid'].nunique()

96041

In [126]:
total_not_found = df_org.loc[df_org['created_at'] == 'Not Found Error']['tweetid'].nunique()

In [127]:
print('Total deleted tweets :', total_not_found)

Total deleted tweets : 43048


In [123]:
total_auth = df_org.loc[df_org['created_at'] == 'Authorization Error']['tweetid'].nunique()

In [124]:
print('Total tweets are not authorized to view  :', 
      total_auth)

Total tweets are not authorized to view  : 18808


In [115]:
print('Remaining tweets :', 96401 - 43048 - 18808)

Remaining tweets : 34545


In [141]:
43048 + 18808

61856

In [152]:
df_remaining_tweets = df_org.loc[~df_org['author_id'].isnull()]

In [153]:
df_remaining_tweets['tweetid'].nunique()

34185

##### **Remove unalive poster tweet ids**

In [150]:
config = config.config()

path = config['PATHS']

splited_reply_ids = path['splited_reply_ids']
poster_path = config['POSTER_PATH']
parsed_poster_org_tweets = poster_path['parsed_poster_org_tweets']

In [154]:
df_org = pd.read_pickle(parsed_poster_org_tweets)

In [169]:
df_live = df_org.loc[~df_org['author_id'].isnull()]
org_tweets = set(df_live['tweetid'].unique().tolist())

In [191]:
def remove_unalive_tweets(path, save_path, org_list):
    '''
    Removes unwanted rows from file
    :param path: Path where files are
    :param save_path: Path where the files to be saved
    :param org_list: List of original data without unwanted rows
    '''
    for file in glob.glob(path):
        filename = file.split(os.sep)[-1]
        ids = set(file_hp.read_file(file))
        remaining = ids - org_list #unalive tweets
        
        if len(remaining) == 0:
            print(0)
            continue
            
        ids = ids - remaining
        temp_file = open(f'{file}', 'r+')
        
        temp_file.truncate(0)
        
        file_hp.write_to_file_row_each_line(save_path, 
                                            filename, 
                                            ids)

            
id_path = splited_reply_ids + os.sep + '*.txt'

# remove_unalive_tweets(id_path, splited_reply_ids, org_tweets)

In [190]:
def remove_empty_files(path):
    '''
    Checks the number of ids in txt files
    :param path: Path of files
    '''
    for file in glob.glob(path):
        ids = set(file_hp.read_file(file))
        print(file, len(ids))
        
id_path = splited_reply_ids + os.sep + '*temp_1.txt'

# remove_empty_files(id_path)

##### **Save dead tweetids**

In [192]:
df_org = pd.read_pickle(parsed_poster_org_tweets)

In [193]:
df_dead = df_org.loc[df_org['author_id'].isnull()]
dead_tweets = set(df_dead['tweetid'].unique().tolist())

In [218]:
import imp

imp.reload(config_hp)
imp.reload(file_hp)

config = config_hp.config()

poster_path = config['POSTER_PATH']
poster_dead_tweet_file = poster_path['poster_dead_tweet_file']

In [208]:
# file_hp.write_to_file_row_each_line(poster_dead_tweet_file,
#                                     None, 
#                                     dead_tweets)

##### **Save unalive posters**

In [225]:
import imp

imp.reload(config_hp)
imp.reload(file_hp)

config = config_hp.config()

poster_path = config['POSTER_PATH']

poster_info_file = poster_path['poster_info_file']
poster_dead_file = poster_path['poster_dead_file']
poster_id_path = poster_path['poster_id_path']

In [213]:
df_poster = pd.read_pickle(poster_info_file)

In [216]:
df_alive_posters = df_poster.loc[~df_poster['verified'].isin(
    ['suspended', 'not found'])]

alive_posters = df_alive_posters['id'].unique().tolist()

In [217]:
# file_hp.write_to_file_row_each_line(poster_dead_file,
#                                     None, 
#                                     alive_posters)

In [226]:
file_hp.remove_row_in_file(poster_id_path, alive_posters)