# Time slice analysis

python=3.7
|topic      |platform   |language   |
|-----------|-----------|-----------|
|COVID-19   |Reddit     |en         |

In [1]:
import pandas as pd
import numpy as np
import string
import re
from joblib import dump, load
import warnings
warnings.filterwarnings('ignore')

# 输出DataFrame时显示所有的列
pd.set_option('display.max_columns', None)
# 输出DataFrame时每行显示完整的内容
pd.set_option('display.max_colwidth', None)

name_suffix = "[debunking=keywords][lang=en][topic=POTUS2016][platform=Reddit]"

## Load debunking community (debunking dataset)

This step is solely for extracting the debunking dataset; if you already have a saved debunking dataset file, there's no need to go through the cumbersome process as shown below.

In [3]:
df_debunk = pd.read_csv(f"data/debunking_comments{name_suffix}.csv")  
df_debunk.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 157917 entries, 0 to 157916
Data columns (total 9 columns):
 #   Column        Non-Null Count   Dtype 
---  ------        --------------   ----- 
 0   created_utc   157917 non-null  object
 1   author        157917 non-null  object
 2   subreddit     157917 non-null  object
 3   body          157917 non-null  object
 4   parent_id     157915 non-null  object
 5   subreddit_id  157915 non-null  object
 6   id            157915 non-null  object
 7   lang          157917 non-null  object
 8   body_cleaned  156896 non-null  object
dtypes: object(9)
memory usage: 10.8+ MB


## Group by date

In [7]:
# Convert 'created_at' column to datetime
df_debunk['created_utc'] = pd.to_datetime(df_debunk['created_utc'])
# Extract date from 'created_at' column
df_debunk['date'] = df_debunk['created_utc'].dt.date
# Group by date
df_debunk['body_cleaned'] = df_debunk['body_cleaned'].astype(str) 
grouped_df = df_debunk.groupby('date')

In [8]:
# In each time slice, aggregate texts for each user
time_slices = dict()
for name, group in grouped_df:
    time_slices[name] = group.groupby(by='author').agg(text=("body_cleaned", lambda x: ' '.join(set(x))))

In [10]:
# Convert time_slices from dict into DataFrame, then save it as csv
for k, v in time_slices.items():
    v['date'] = k

df_merge_slices = pd.concat(time_slices.values())
df_merge_slices.sort_values(by='date', inplace=True)
df_merge_slices.to_csv(f"data/time_slices{name_suffix}.csv")
len(df_merge_slices)

134326

## Toxicity detection

In [None]:
# This is just an example.
data_path = f"data/time_slices{name_suffix}.csv"
result_path = f"data/toxicity_of_time_slices{name_suffix}.csv"
cmd = f"python Perspective.py --data={data_path} --result={result_path} --max_workers=" 
print(cmd)
! {cmd} 

In [None]:
perspective_path = f"data/toxicity_of_time_slices{name_suffix}.csv"
perspective_res = pd.read_csv(perspective_path)
perspective_res.info()

def get_score_from_json(x):
    if pd.isna(x):
        return None
    s = re.search("'score': {'value': (.+?),", x)
    return float(s.group(1))

perspective_res['toxicity'] = perspective_res['perspective_api_results'].apply(get_score_from_json)
perspective_res.to_csv(perspective_path, index=False)

## Sentiments detection

In [3]:
# Read the LIWC dictionary.
import liwc
liwcPath = r'data/LIWC2015_English.dic'
parse, category_names = liwc.load_token_parser(liwcPath)

# Analyze each user using LIWC.
from sklearn.feature_extraction.text import TfidfVectorizer

def liwc_analyse_ver2(text, categories=['positive','negative','affect']):
    corpus = []
    words = []

    review = re.sub('[^a-zA-Z0-9]', ' ', text)
    review = review.split()
    review = list(category for token in review for category in parse(token))
    statements = ' '.join(review)
    corpus.append(statements)
    words.append(review)
    
    # TF-IDF
    try:
        vectorizer = TfidfVectorizer(max_features=5000)
        X_fit = vectorizer.fit(corpus)
        X_transformed = X_fit.transform(corpus)

        features = vectorizer.get_feature_names()
        df = pd.DataFrame(X_transformed.toarray(),columns=features)
        result = {col: df.get(col) for col in categories}
        result_df = pd.DataFrame(result)
    except:
        result_df = pd.DataFrame({k:[None] for k in categories})

    return result_df.T[0]

In [None]:
from pandarallel import pandarallel
pandarallel.initialize(progress_bar=True, nb_workers=10)
selected_categories = ['positive','negative','affect']
perspective_res.loc[:, selected_categories] = perspective_res['text'].astype(str).parallel_apply(liwc_analyse_ver2)
perspective_res.to_csv(f"data/time_slices{name_suffix}.csv")

## Summarize daily datas

In [8]:
def calculate_statistics(df:pd.DataFrame):
    df['date'] = pd.to_datetime(df['date'])
    scores = ['positive', 'negative', 'affect', 'toxicity']
    df_groupby_date = df.groupby('date')

    daily_user_count = df_groupby_date['author'].nunique()
    score_none_count = df_groupby_date[scores].apply(lambda x: x.isnull().sum())
    daily_mean = df_groupby_date[scores].mean()
    daily_median = df_groupby_date[scores].median()

    def mean_no_extreme(df:pd.DataFrame):
        q1 = df[scores].quantile(0.25)
        q3 = df[scores].quantile(0.75)
        iqr = q3 - q1
        df_no_extreme = df[~((df[scores] < (q1 - 1.5 * iqr)) | (df[scores] > (q3 + 1.5 * iqr)))]
        return df_no_extreme[scores].mean()
    
    daily_mean_no_extreme = df_groupby_date.apply(mean_no_extreme)

    daily_data = pd.DataFrame({
        'date': daily_user_count.index,
        'user_count': daily_user_count.values,
        'positive_none_count': score_none_count['positive'],
        'negative_none_count': score_none_count['negative'],
        'affect_none_count': score_none_count['affect'],
        'toxicity_none_count': score_none_count['toxicity'],
        'positive_mean': daily_mean['positive'].values,
        'negative_mean': daily_mean['negative'].values,
        'affect_mean': daily_mean['affect'].values,
        'toxicity_mean': daily_mean['toxicity'].values,
        'positive_median': daily_median['positive'].values,
        'negative_median': daily_median['negative'].values,
        'affect_median': daily_median['affect'].values,
        'toxicity_median': daily_median['toxicity'].values,
        'positive_mean_no_extreme': daily_mean_no_extreme['positive'].values,
        'negative_mean_no_extreme': daily_mean_no_extreme['negative'].values,
        'affect_mean_no_extreme': daily_mean_no_extreme['affect'].values,
        'toxicity_mean_no_extreme': daily_mean_no_extreme['toxicity'].values,
    })

    return daily_data

In [9]:
data = pd.read_csv("data/time_slices[topic=COVID19][platform=Reddit][lang=en][debunking=keywords].csv")
daily_statistics = calculate_statistics(data)
daily_statistics.to_csv("data/daily_statistics[topic=COVID19][platform=Reddit][lang=en][debunking=keywords].csv", index=False)
daily_statistics.head(5)

Unnamed: 0_level_0,date,user_count,positive_none_count,negative_none_count,affect_none_count,toxicity_none_count,positive_mean,negative_mean,affect_mean,toxicity_mean,positive_median,negative_median,affect_median,toxicity_median,positive_mean_no_extreme,negative_mean_no_extreme,affect_mean_no_extreme,toxicity_mean_no_extreme
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
2020-03-01,2020-03-01,181,40,33,12,1,0.063185,0.091573,0.268284,0.29149,0.053529,0.081781,0.247226,0.23776,0.056634,0.084943,0.258914,0.244027
2020-03-02,2020-03-02,259,80,46,26,0,0.065183,0.08439,0.256636,0.26162,0.055258,0.069285,0.239808,0.207059,0.061969,0.079073,0.246584,0.22108
2020-03-03,2020-03-03,313,85,61,32,2,0.070109,0.085433,0.269941,0.273372,0.064541,0.074881,0.261262,0.205721,0.064323,0.080113,0.268339,0.240581
2020-03-04,2020-03-04,294,69,62,26,1,0.070976,0.082456,0.263715,0.247207,0.064167,0.072071,0.24667,0.198934,0.06637,0.074573,0.257765,0.206991
2020-03-05,2020-03-05,267,76,66,36,5,0.063178,0.083012,0.250254,0.244104,0.052007,0.076492,0.240719,0.188392,0.057321,0.078387,0.247099,0.215128


In [10]:
daily_statistics.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 731 entries, 2020-03-01 to 2022-03-01
Data columns (total 18 columns):
 #   Column                    Non-Null Count  Dtype         
---  ------                    --------------  -----         
 0   date                      731 non-null    datetime64[ns]
 1   user_count                731 non-null    int64         
 2   positive_none_count       731 non-null    int64         
 3   negative_none_count       731 non-null    int64         
 4   affect_none_count         731 non-null    int64         
 5   toxicity_none_count       731 non-null    int64         
 6   positive_mean             731 non-null    float64       
 7   negative_mean             731 non-null    float64       
 8   affect_mean               731 non-null    float64       
 9   toxicity_mean             731 non-null    float64       
 10  positive_median           731 non-null    float64       
 11  negative_median           731 non-null    float64       
 12  aff

In [11]:
# Calculate days from the earliest date to the latest
delta = daily_statistics['date'][-1] - daily_statistics['date'][0]
print(delta.days + 1)

731
