# Time slice analysis

python=3.7

|topic      |platform   |language   |
|-----------|-----------|-----------|
|QAnon  |Twitter    |en         |

In [None]:
import pandas as pd
import re
import warnings
# Suppress every warning for the rest of the session.
warnings.filterwarnings('ignore')


# Do not truncate columns or cell contents when DataFrames are displayed.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_colwidth', None)

# Suffix embedded in the name of every data file read or written in this notebook.
name_suffix = "[v2][debunking=keywords][lang=en][topic=QAnon][platform=Twitter]"

## Load debunking retweets

In [2]:
# Read the retweet table for the dataset selected by name_suffix;
# the trailing expression displays its (rows, columns) shape.
df_rt = pd.read_csv(f"data/retweet{name_suffix}.csv")
df_rt.shape

(102104, 83)

## Group by date

In [3]:
# Force every text to str so the join below never meets a non-string value.
df_rt['text'] = df_rt['text'].astype(str)
# Convert 'created_at' column to datetime
df_rt['created_at'] = pd.to_datetime(df_rt['created_at'])
# Extract date from 'created_at' column; one calendar day == one time slice
df_rt['date'] = df_rt['created_at'].dt.date
# Group by date
grouped_df = df_rt.groupby('date')


def _join_unique_texts(texts):
    """Join the distinct values of ``texts`` with single spaces.

    Duplicates are removed while keeping first-seen order. A plain ``set`` is
    avoided on purpose: its iteration order for strings can change from one
    interpreter run to the next, which would make the joined text (later used
    as a lookup key for the per-text scores) differ between runs.
    """
    return ' '.join(dict.fromkeys(texts))


# In each time slice, aggregate texts for each user
time_slices = dict()
for name, df in grouped_df:
    time_slices[name] = df.groupby(by='author.username').agg(text=("text", _join_unique_texts))

len(time_slices)

384

In [4]:
# Flatten the {date: per-user frame} mapping into one long DataFrame.
# Each per-user frame is first tagged (in place) with the date it belongs to.
for slice_date, slice_frame in time_slices.items():
    slice_frame['date'] = slice_date

df_merge_slices = pd.concat(time_slices.values()).sort_values(by='date')
# The username currently lives in the index; expose it as a regular column too.
df_merge_slices['author.username'] = df_merge_slices.index

## Simple text cleaning

In [None]:
import html
def simple_text_cleaning(text):
    """Strip URLs and tag-like markup from ``text`` and decode HTML entities.

    Steps, in order: remove ``http(s)://...`` and ``www....`` tokens, unescape
    HTML entities (e.g. ``&amp;`` -> ``&``), then remove ``<...>`` spans.
    Surrounding whitespace is left as it is.
    """
    without_urls = re.sub(r'https?://\S+|www\.\S+', '', text)
    unescaped = html.unescape(without_urls)
    # Runs after unescaping, so entity-encoded tags (&lt;b&gt;) are removed too.
    return re.sub(r'<.*?>+', '', unescaped)

# df_txt is a second name for the same DataFrame object (no copy is made), so the
# column added below also appears on df_merge_slices.
df_txt = df_merge_slices

from pandarallel import pandarallel
# Configure pandarallel with 10 workers and a progress bar, then clean every text.
pandarallel.initialize(progress_bar=True, nb_workers=10)
df_txt['text_simply_cleaned'] = df_txt['text'].parallel_apply(simple_text_cleaning)

# Re-binds the name to the object it already refers to.
df_merge_slices = df_txt

INFO: Pandarallel will run on 10 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=9968), Label(value='0 / 9968'))), …

In [6]:
# Write the merged slices to disk. index=False leaves the index out of the file;
# the usernames are still saved through the 'author.username' column.
df_merge_slices.to_csv(f"data/time_slices{name_suffix}.csv", index=False)
df_merge_slices.info()

<class 'pandas.core.frame.DataFrame'>
Index: 99679 entries, 40AcresBuilt to zorrooro
Data columns (total 4 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   text                 99679 non-null  object
 1   date                 99679 non-null  object
 2   author.username      99679 non-null  object
 3   text_simply_cleaned  99679 non-null  object
dtypes: object(4)
memory usage: 3.8+ MB


In [None]:

# Keep each distinct cleaned text once, in a one-column frame named 'text',
# and write that list to disk.
texts = (
    df_merge_slices['text_simply_cleaned']
    .drop_duplicates()
    .rename('text')
    .to_frame()
)
texts.to_csv(f"data/texts_of_time_slices{name_suffix}.csv", index=False)
print(f"number of texts: {len(texts)}")

number of texts: 6911


## Toxicity detection

In [None]:
# Load the texts together with their 'perspective_api_results' column.
# NOTE(review): this file is not written anywhere in the cells above; presumably it
# comes from a separate Perspective API step run on the exported texts. Confirm.
texts = pd.read_csv(f"data/toxicity_of_texts_of_time_slices{name_suffix}.csv")

def get_score_from_json(x):
    """Extract the first score value from a stringified API result.

    Parameters
    ----------
    x : str or missing value
        Text containing a ``'score': {'value': <number>, ...`` fragment.

    Returns
    -------
    float or None
        The number following the first ``'score': {'value': `` occurrence, or
        ``None`` when ``x`` is missing, empty, or contains no such fragment.
    """
    if pd.isna(x) or not x:
        return None
    s = re.search("'score': {'value': (.+?),", x)
    if s is None:
        # A result without a score (e.g. an error payload) counts as missing
        # instead of raising AttributeError on s.group.
        return None
    return float(s.group(1))

# Parse one toxicity value per text out of the raw API result strings.
texts['toxicity'] = texts['perspective_api_results'].apply(get_score_from_json)
# Same path as the file loaded into `texts`: the file is overwritten with the
# additional 'toxicity' column.
texts.to_csv(f"data/toxicity_of_texts_of_time_slices{name_suffix}.csv", index=False)
texts.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6911 entries, 0 to 6910
Data columns (total 3 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   text                     6911 non-null   object 
 1   perspective_api_results  6887 non-null   object 
 2   toxicity                 6887 non-null   float64
dtypes: float64(1), object(2)
memory usage: 162.1+ KB


## Sentiment detection

In [None]:
# import nltk
# nltk.download('vader_lexicon')

from nltk.sentiment import SentimentIntensityAnalyzer
# One shared analyzer instance for the whole notebook.
nltk_analyzer = SentimentIntensityAnalyzer()

# Score an empty string just to discover which score names the analyzer returns.
example = nltk_analyzer.polarity_scores("")
sentiment_names = list(example)
print(sentiment_names)

['neg', 'neu', 'pos', 'compound']


In [None]:

from pandarallel import pandarallel
pandarallel.initialize(progress_bar=True, nb_workers=10)

# Make the cell safe to re-run: remove sentiment columns left by a previous
# execution so the concat below never produces duplicate column names.
texts = texts.drop(columns=sentiment_names, errors='ignore')

# One dict of scores (keys == sentiment_names) per text.
result = texts['text'].astype(str).parallel_apply(nltk_analyzer.polarity_scores)
# Build the score frame on texts' own index so the column-wise concat pairs
# rows correctly even when texts is not on a default RangeIndex.
result = pd.DataFrame(result.tolist(), index=texts.index)

texts = pd.concat([texts, result], axis=1)
texts.to_csv(f"data/scores_of_time_slices_texts{name_suffix}.csv", index=False)
texts.info()

INFO: Pandarallel will run on 10 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=692), Label(value='0 / 692'))), HB…

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6911 entries, 0 to 6910
Data columns (total 7 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   text                     6911 non-null   object 
 1   perspective_api_results  6887 non-null   object 
 2   toxicity                 6887 non-null   float64
 3   neg                      6911 non-null   float64
 4   neu                      6911 non-null   float64
 5   pos                      6911 non-null   float64
 6   compound                 6911 non-null   float64
dtypes: float64(5), object(2)
memory usage: 378.1+ KB


## Summarize daily data

In [None]:

# Index the scored texts by the text itself so rows can be looked up by it.
texts.index = texts['text']
cols = sentiment_names + ['toxicity']
# Mapping: text -> Series holding that text's value for every name in cols.
score_dict = dict(texts[cols].iterrows())

def mapping_texts(x):
    """Return the score Series stored in ``score_dict`` for text ``x``.

    A text with no entry yields a Series holding ``None`` for every name in
    ``cols``, so callers always get the same set of labels back.
    """
    if x not in score_dict:
        return pd.Series(dict.fromkeys(cols))
    return score_dict[x]

# Reload the saved slices and add one column per score by looking up each
# row's cleaned text in score_dict (via mapping_texts).
df_merge_slices = pd.read_csv(f"data/time_slices{name_suffix}.csv")
df_merge_slices[cols] = df_merge_slices['text_simply_cleaned'].apply(mapping_texts)
# Overwrites the slices file that was just read, now including the score columns.
df_merge_slices.to_csv(f"data/time_slices{name_suffix}.csv", index=False)
# NOTE(review): second copy goes to a hard-coded absolute path; this line fails
# on machines where that directory does not exist.
df_merge_slices.to_csv(f"/mnt/data/shared/time_slice_data[sentiment=nltk]/time_slices{name_suffix}.csv", index=False)
df_merge_slices.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 99679 entries, 0 to 99678
Data columns (total 9 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   text                 99679 non-null  object 
 1   date                 99679 non-null  object 
 2   author.username      99679 non-null  object 
 3   text_simply_cleaned  99679 non-null  object 
 4   neg                  99679 non-null  float64
 5   neu                  99679 non-null  float64
 6   pos                  99679 non-null  float64
 7   compound             99679 non-null  float64
 8   toxicity             99593 non-null  float64
dtypes: float64(5), object(4)
memory usage: 6.8+ MB


In [None]:
def calculate_statistics(df:pd.DataFrame, scores:'list[str]'):
    """Summarise the given score columns per date.

    Parameters
    ----------
    df : pd.DataFrame
        Needs the columns 'date', 'author.username' and every name in
        ``scores``. Its 'date' column is converted to datetime in place, so
        the caller's frame is modified.
    scores : list[str]
        Names of the numeric score columns to summarise.

    Returns
    -------
    pd.DataFrame
        One row per date with the columns 'date', 'user_count', and for each
        score: '<score>_none_count', '<score>_mean', '<score>_media' (the
        median; the suffix is spelled exactly like that) and
        '<score>_mean_no_extreme'. The none-count columns are date-indexed
        Series, so the result is indexed by date as well.
    """
    df['date'] = pd.to_datetime(df['date'])
    by_day = df.groupby('date')

    users_per_day = by_day['author.username'].nunique()
    missing_per_day = by_day[scores].apply(lambda block: block.isnull().sum())
    mean_per_day = by_day[scores].mean()
    median_per_day = by_day[scores].median()

    def mean_no_extreme(df:pd.DataFrame):
        """Per-score mean of one day's rows, ignoring values outside the 1.5*IQR fences."""
        values = df[scores]
        q1 = values.quantile(0.25)
        q3 = values.quantile(0.75)
        iqr = q3 - q1
        is_extreme = values.lt(q1 - 1.5 * iqr) | values.gt(q3 + 1.5 * iqr)
        # Extreme cells become NaN and are then skipped by mean().
        return values.mask(is_extreme).mean()

    trimmed_mean_per_day = by_day.apply(mean_no_extreme)

    # Column order: date, user_count, all none-counts, all means, all medians,
    # all trimmed means.
    columns = {
        'date': users_per_day.index,
        'user_count': users_per_day.values,
    }
    columns.update({score + '_none_count': missing_per_day[score] for score in scores})
    columns.update({score + '_mean': mean_per_day[score].values for score in scores})
    columns.update({score + '_media': median_per_day[score].values for score in scores})
    columns.update({score + '_mean_no_extreme': trimmed_mean_per_day[score].values for score in scores})

    return pd.DataFrame(columns)

In [7]:
# data = df_merge_slices
# Reload the scored slices from disk and summarise them per day.
data = pd.read_csv(f"data/time_slices{name_suffix}.csv")
# sentiment_names is defined in the sentiment-analyzer cell, which must have run first.
scores = sentiment_names + ['toxicity']
daily_statistics = calculate_statistics(data, scores=scores)
daily_statistics.to_csv(f"data/daily_statistics{name_suffix}.csv", index=False)
# NOTE(review): hard-coded absolute path; this line fails where the directory is missing.
daily_statistics.to_csv(f"/mnt/data/shared/time_slice_data[sentiment=nltk]/daily_statistics{name_suffix}.csv", index=False)
daily_statistics.head(5)

Unnamed: 0_level_0,date,user_count,neg_none_count,neu_none_count,pos_none_count,compound_none_count,toxicity_none_count,neg_mean,neu_mean,pos_mean,compound_mean,toxicity_mean,neg_media,neu_media,pos_media,compound_media,toxicity_media,neg_mean_no_extreme,neu_mean_no_extreme,pos_mean_no_extreme,compound_mean_no_extreme,toxicity_mean_no_extreme
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
2020-04-01,2020-04-01,73,0,0,0,0,0,0.273521,0.718863,0.007616,-0.794732,0.352163,0.289,0.711,0.0,-0.8317,0.338998,0.312964,0.687036,0.0,-0.842908,0.347779
2020-04-02,2020-04-02,30,0,0,0,0,0,0.209867,0.7682,0.0219,-0.707317,0.307885,0.235,0.7225,0.0355,-0.8176,0.349975,0.209867,0.760207,0.0219,-0.833765,0.307885
2020-04-03,2020-04-03,29,0,0,0,0,0,0.172897,0.821862,0.005241,-0.628976,0.281776,0.133,0.867,0.0,-0.6662,0.307163,0.150571,0.821862,0.0,-0.733588,0.303564
2020-04-04,2020-04-04,76,0,0,0,0,0,0.168592,0.78875,0.042763,-0.605432,0.317561,0.133,0.867,0.0,-0.6662,0.307163,0.168592,0.78875,0.042763,-0.605432,0.317561
2020-04-05,2020-04-05,73,0,0,0,0,0,0.240205,0.751589,0.008192,-0.808037,0.464101,0.268,0.732,0.0,-0.9075,0.548233,0.268,0.732,0.0,-0.9075,0.548233


In [8]:
# Overview of the daily table: index range, column dtypes and non-null counts.
daily_statistics.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 384 entries, 2020-04-01 to 2021-04-30
Data columns (total 22 columns):
 #   Column                    Non-Null Count  Dtype         
---  ------                    --------------  -----         
 0   date                      384 non-null    datetime64[ns]
 1   user_count                384 non-null    int64         
 2   neg_none_count            384 non-null    int64         
 3   neu_none_count            384 non-null    int64         
 4   pos_none_count            384 non-null    int64         
 5   compound_none_count       384 non-null    int64         
 6   toxicity_none_count       384 non-null    int64         
 7   neg_mean                  384 non-null    float64       
 8   neu_mean                  384 non-null    float64       
 9   pos_mean                  384 non-null    float64       
 10  compound_mean             384 non-null    float64       
 11  toxicity_mean             384 non-null    float64       
 12  neg

In [9]:
# Calculate days from the earliest date to the latest (both ends included).
# min()/max() are used instead of ['date'][0] / ['date'][-1]: integer keys on a
# Series whose index is not integer-based only work through a deprecated
# positional fallback, and min()/max() do not depend on the row order either.
# The span can be larger than the number of rows when some days have no data.
delta = daily_statistics['date'].max() - daily_statistics['date'].min()
print(delta.days + 1)

395
