##### **This notebook contains information about the control and IO datasets**

In [1]:
import pandas as pd
import numpy as np
import warnings
import glob
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.colors as pltc
from tqdm import tqdm
import sys
import os
import networkx as nx
import datetime

import importlib

#### packages
import helper.strategy_helper as st
import helper.visualization as vz
import helper.helper as hp
import helper.file_helper as file_hp
import config.config as config

In [2]:
all_campaigns, names = st.bundle_campaign()

In [5]:
total = 0
for element in all_campaigns:
    for key in element:
        total = total + len(element[key])
print('Total IO Datasets :', total)

Total IO Datasets : 43


In [6]:
config = config.config()
path = config['PATHS']

derived_path = path['derived_path']
all_tweet_data = path['all_tweet_data']
plot_path = path['plot_path']

In [7]:
year = '2020_12'
campaign = 'iran_202012'
type_of = 'ops'

##### **IO data**

In [8]:
data_path = st.get_data_path(all_tweet_data, 
                                 year, 
                                 campaign)

df = st.read_ops_control_data(data_path['ops'],
                                data_path['control'], 
                             [type_of])['ops']

In [None]:
#Checking time

In [13]:
print('Min time: ', df['tweet_time'].min())
print('Max time: ', df['tweet_time'].max())

Min time:  2009-09-06 13:56
Max time:  2020-12-27 18:08


### **Checking for single account if the data is its whole timeline**

In [16]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 560571 entries, 0 to 560570
Data columns (total 30 columns):
 #   Column                    Non-Null Count   Dtype  
---  ------                    --------------   -----  
 0   tweetid                   560571 non-null  int64  
 1   userid                    560571 non-null  int64  
 2   user_display_name         560571 non-null  object 
 3   user_screen_name          560571 non-null  object 
 4   user_reported_location    417523 non-null  object 
 5   user_profile_description  530518 non-null  object 
 6   user_profile_url          338535 non-null  object 
 7   follower_count            560571 non-null  int64  
 8   following_count           560571 non-null  int64  
 9   account_creation_date     560571 non-null  object 
 10  account_language          560571 non-null  object 
 11  tweet_language            444758 non-null  object 
 12  tweet_text                560571 non-null  object 
 13  tweet_time                560571 non-null  o

In [18]:
df_user = df.loc[df['userid'] == 1262795522206773251]

Min time:  2009-09-06 13:56
Max time:  2020-12-27 18:08


In [24]:
df = st.add_YYYY_MM_DD(df)

In [26]:
print('Min time: ', df['tweet_time_year'].min())
print('Max time: ', df['tweet_time_year'].max())

Min time:  2009-09-06
Max time:  2020-12-27


In [31]:
print('Min account creation date :',
      df['account_creation_date'].min())

Min account creation date : 2009-09-06


In [34]:
df.loc[df['account_creation_date'] == df['account_creation_date'].min()]['userid'].unique()

array([72037886])

In [35]:
df_user = df.loc[df['userid'] == 72037886]

In [47]:
df_user['tweet_time_year'].unique()

array(['2018-01-02', '2017-02-25', '2018-06-13', ..., '2016-10-09',
       '2020-05-14', '2020-03-19'], dtype=object)

In [42]:
 df.loc[df['userid'] == 1313773810504654849]['account_creation_date'].unique()

array(['2020-10-07'], dtype=object)

In [46]:
df.loc[df['userid'] == 1313773810504654849]['tweet_time_year'].unique()

array(['2020-10-08', '2020-10-15', '2020-10-16', '2020-10-12',
       '2020-10-18', '2020-10-07', '2020-10-09', '2020-10-20',
       '2020-10-11', '2020-10-14', '2020-10-13', '2020-10-10',
       '2020-10-17', '2020-10-19'], dtype=object)

In [45]:
## This shows that the data is the whole timeline of all the accounts
# that were suspended

##### **Testing for control dataset**

In [51]:
type_of = 'control'

df_control = st.read_ops_control_data(data_path['ops'],
                                data_path['control'], 
                             [type_of])['control']

In [56]:
df_control.columns

Index(['account_creation_date', 'user_display_name', 'userid',
       'user_screen_name', 'user_profile_url', 'user_profile_description',
       'follower_count', 'following_count', 'user_reported_location',
       'tweetid', 'tweet_text', 'tweet_language', 'geo', 'place',
       'in_reply_to_tweetid', 'in_reply_to_screen_name', 'in_reply_to_userid',
       'tweet_time', 'user_mentions', 'hashtags', 'symbols', 'urls',
       'tweet_client_name', 'is_retweet', 'retweeted_tweetid',
       'retweeted_user_id', 'tweet_time_year'],
      dtype='object')

In [54]:
print('Min time: ', df_control['tweet_time'].min())
print('Max time: ', df_control['tweet_time'].max())

Min time:  2010-10-21 00:34:24+00:00
Max time:  2020-12-27 23:59:58+00:00


In [57]:
print('Min account creation date :',
      df_control['account_creation_date'].min())

Min account creation date : Fri Apr 01 03:04:52 +0000 2016


In [60]:
df_control.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 729241 entries, 0 to 729240
Data columns (total 27 columns):
 #   Column                    Non-Null Count   Dtype              
---  ------                    --------------   -----              
 0   account_creation_date     729241 non-null  object             
 1   user_display_name         729241 non-null  object             
 2   userid                    729241 non-null  object             
 3   user_screen_name          729241 non-null  object             
 4   user_profile_url          729241 non-null  object             
 5   user_profile_description  729241 non-null  object             
 6   follower_count            729241 non-null  int64              
 7   following_count           729241 non-null  int64              
 8   user_reported_location    729241 non-null  object             
 9   tweetid                   729241 non-null  object             
 10  tweet_text                729241 non-null  object             
 11  

In [62]:
# df_control['account_creation_date'] = pd.to_datetime(df_control['account_creation_date'], 
#                                         format="%Y-%m-%d, %H:%M:%S")

In [65]:
df_control.loc[df_control['tweet_time'] == df_control['tweet_time'].max()]

Unnamed: 0,account_creation_date,user_display_name,userid,user_screen_name,user_profile_url,user_profile_description,follower_count,following_count,user_reported_location,tweetid,...,tweet_time,user_mentions,hashtags,symbols,urls,tweet_client_name,is_retweet,retweeted_tweetid,retweeted_user_id,tweet_time_year
728608,Thu Apr 30 13:15:41 +0000 2020,News Sound Bites,1255848224574763009,bites_sound,,A 🤖 curating news that keeps you informed with...,57,18,0,1343345914820370432,...,2020-12-27 23:59:58+00:00,"[{'id': '256833444', 'screen_name': 'Annastaci...",[],[],[],Twitter,True,1343333089360003073,256833444,2020-12-27
728707,Sat Jul 11 04:26:01 +0000 2009,Ray Chick,55756918,chickr,http://t.co/8Vtaab2DwA,,184,721,Colorado,1343345913977163776,...,2020-12-27 23:59:58+00:00,"[{'id': '18166778', 'screen_name': 'Jim_Jordan...","[{'text': 'COVID19', 'indices': [20, 28]}]",[],[],Twitter Web App,True,1343266698745491456,18166778,2020-12-27
728723,Fri Apr 25 20:18:17 +0000 2008,dar_bish,14533436,dar_bish0,,,52,128,"Washington, DC",1343345912509308929,...,2020-12-27 23:59:58+00:00,"[{'id': '12272322', 'screen_name': 'PoliticsRe...","[{'text': 'COVID19', 'indices': [51, 59]}]",[],[],Twitter for iPhone,True,1343333495985352704,12272322,2020-12-27
728735,Mon May 30 15:50:26 +0000 2011,bitcoin connection,307956719,bitcoinconnect1,http://t.co/8fW7QtwhKR,,66,0,0,1343345912165371904,...,2020-12-27 23:59:58+00:00,[],"[{'text': 'nintendoswitch', 'indices': [110, 1...",[],"[{'url': 'https://t.co/T9ZWaoBmzN', 'expanded_...",Twitter Web App,False,0,0,2020-12-27
728737,Wed Feb 25 18:48:36 +0000 2009,Felicia Dugan,21901162,themessage8,,"Dog Lover and Rescuer, Hope dealer, Giving bac...",1868,2236,"Tulsa, Oklahoma",1343345911297142785,...,2020-12-27 23:59:58+00:00,"[{'id': '1275285589818998786', 'screen_name': ...","[{'text': 'COVID19', 'indices': [105, 113]}]",[],[],Twitter for iPhone,True,1343342548027322369,1275285589818998786,2020-12-27


In [69]:
df_control.loc[df_control['userid'] == '1255848224574763009']['tweet_time_year']

728608    2020-12-27
728609    2020-12-27
728610    2020-12-27
728611    2020-12-27
728612    2020-12-27
             ...    
728702    2020-12-27
728703    2020-12-27
728704    2020-12-27
728705    2020-12-27
728706    2020-12-27
Name: tweet_time_year, Length: 99, dtype: object

In [70]:
## Maximum 100 tweets that were posted on the same day using same hashtags