# Time slice analysis

python=3.7
|topic      |platform   |language   |
|-----------|-----------|-----------|
|QAnon      |Reddit     |en         |

In [None]:
import pandas as pd
import numpy as np
import string
import re
from joblib import dump, load
import warnings
# Suppress every warning category for the rest of the session.
warnings.filterwarnings('ignore')


# Display all columns and do not truncate cell contents when printing DataFrames.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_colwidth', None)

## Load debunking community (debunking dataset)

In [2]:
# Load the debunking comments (keyword=QAnon, lang=en) and show (rows, columns).
df_debunk = pd.read_csv("data/debunking_comments[keyword=QAnon][lang=en].csv")
df_debunk.shape

(14637, 8)

## Text Cleaning

In [None]:

def wordopt(text):
    """Lower-case ``text`` and strip markup, URLs, punctuation and digit tokens.

    The substitutions run in the fixed order below; the order matters because
    the ``\\W`` step replaces every non-word character with a space.

    Parameters
    ----------
    text : str
        Raw comment text.

    Returns
    -------
    str
        The cleaned text. Whitespace is NOT collapsed, so removed spans can
        leave runs of spaces behind.
    """
    text = text.lower()
    # Literal backslash followed by "n" (an escaped newline stored as text).
    text = re.sub(r'\\n', '', text)
    # Anything in square brackets, brackets included (non-greedy).
    text = re.sub(r'\[.*?\]', '', text)
    # http(s):// and www. URLs, up to the next whitespace.
    text = re.sub(r'https?://\S+|www\.\S+', '', text)
    # Every non-word character (punctuation, whitespace, newlines) -> one space.
    text = re.sub(r'\W', ' ', text)
    # NOTE(review): "<" and ">" were already replaced by the \W step, so this
    # HTML-tag pattern cannot match any more; kept to preserve the pipeline.
    text = re.sub(r'<.*?>+', '', text)
    # After the \W step the only string.punctuation character that can remain
    # is "_" (it counts as a word character), so this effectively drops "_".
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    # NOTE(review): newlines were already turned into spaces by the \W step.
    text = re.sub('\n', '', text)
    # Drop every token that contains at least one digit.
    text = re.sub(r'\w*\d\w*', '', text)
    return text


import nltk
# nltk.download('stopwords')
# nltk.download('punkt')
# NLTK's English stopword list (requires the 'stopwords' corpus to be downloaded).
eng_stopwords = nltk.corpus.stopwords.words("english")
def remove_eng_stopwords(text):
    """Tokenize ``text`` with NLTK and drop tokens found in ``eng_stopwords``.

    The surviving tokens are joined back with single spaces. Matching is
    exact (no case folding is done here).
    """
    kept_tokens = []
    for token in nltk.word_tokenize(text):
        if token in eng_stopwords:
            continue
        kept_tokens.append(token)
    return ' '.join(kept_tokens)


# nltk.download('wordnet')
# nltk.download('omw-1.4')
from nltk.stem import WordNetLemmatizer
# Shared WordNet lemmatizer instance used by word_lemmatizer.
lemm = WordNetLemmatizer()
def word_lemmatizer(text):
    """Tokenize ``text`` with NLTK, lemmatize each token and re-join with spaces."""
    lemmas = map(lemm.lemmatize, nltk.word_tokenize(text))
    return ' '.join(lemmas)


from nltk.corpus import stopwords
# Extra stopwords merged into `stop` below.
# NOTE(review): most entries look like Albanian function words, plus a few
# residue tokens ("nbsp", "jr") and English words ("to", "the", "here", "do");
# confirm this list is intended for an English-language corpus. "t" is listed
# twice, which is harmless because the list ends up in a set.
Word_STOPWORDS = ["e", "te", "i", "me", "qe", "ne", "nje", "a", "per", "sh", "nga", "ka", "u", "eshte", "dhe", "shih", "nuk",
             "m", "dicka", "ose", "si", "shume", "etj", "se", "pa", "sipas", "s", "t", "dikujt", "dike", "mire", "vet",
             "bej", "ai", "vend", "prej", "ja", "duke", "tjeter", "kur", "ia", "ku", "ta", "keq", "dy", "ben", "bere",
             "behet", "dickaje", "edhe", "madhe", "la", "sa", "gjate", "zakonisht", "pas", "veta", "mbi", "disa", "iu",
             "mos", "c", "para", "dikush", "gje", "be", "pak", "tek", "fare", "beri", "po", "bie", "k", "do", "gjithe",
             "vete", "mund", "kam", "le", "jo", "beje", "tij", "kane", "ishte", "jane", "vjen", "ate", "kete", "neper",
             "cdo", "na", "marre", "merr", "mori", "rri", "deri", "b", "kishte", "mban", "perpara", "tyre", "marr",
             "gjitha", "as", "vetem", "nen", "here", "tjera", "tjeret", "drejt", "qenet", "ndonje", "nese", "jap",
             "merret", "rreth", "lloj", "dot", "saj", "nder", "ndersa", "cila", "veten", "ma", "ndaj", "mes", "ajo",
             "cilen", "por", "ndermjet", "prapa", "mi", "tere", "jam", "ashtu", "kesaj", "tille", "behem", "cilat",
             "kjo", "menjehere", "ca", "je", "aq", "aty", "prane", "ato", "pasur", "qene", "cilin", "teper", "njera",
             "tej", "krejt", "kush", "bejne", "ti", "bene", "midis", "cili", "ende", "keto", "kemi", "sic", "kryer",
             "cilit", "atij", "gjithnje", "andej", "siper", "sikur", "ketej", "ciles", "ky", "papritur", "ua",
             "kryesisht", "gjithcka", "pasi", "kryhet", "mjaft", "ketij", "perbashket", "ata", "atje", "vazhdimisht",
             "kurre", "tone", "keshtu", "une", "sapo", "rralle", "vetes", "ishin", "afert", "tjetren", "ketu", "cfare",
             "to", "anes", "jemi", "asaj", "secila", "kundrejt", "ketyre", "pse", "tilla", "mua", "nepermjet", "cilet",
             "ndryshe", "kishin", "ju", "tani", "atyre", "dic", "yne", "kudo", "sone", "sepse", "cilave", "kem", "ty",
             "t'i", "nbsp", "tha", "re", "the", "jr", "t", "n"]
# `stop` = NLTK English stopwords + every ASCII punctuation character
# + the extra words above. It is the lookup set used by remove_stopwords().
stop = set(stopwords.words('english'))
punctuation = list(string.punctuation)
stop.update(punctuation)
text_unknows= Word_STOPWORDS
stop.update(text_unknows)



from bs4 import BeautifulSoup
def strip_html(text):
    '''Parse text with BeautifulSoup's "html.parser" and return only its text content'''
    return BeautifulSoup(text, "html.parser").get_text()

def remove_between_square_brackets(text):
    '''Removing every "[...]" span, brackets included'''
    return re.sub(r'\[[^]]*\]', '', text)

def remove_urls(text):
    '''Removing URLs: every run of non-whitespace characters starting with "http"'''
    # This logic used to live in a second function that was also named
    # remove_between_square_brackets, which silently replaced the bracket
    # remover defined just above it.
    return re.sub(r'http\S+', '', text)

def remove_stopwords(text):
    '''Removing the stopwords from text

    Splits on whitespace and keeps the tokens whose lower-cased form is not
    in the module-level `stop` set; the kept tokens retain their original case.
    '''
    return " ".join(
        token for token in text.split() if token.lower() not in stop
    )

def denoise_text(text):
    '''Removing the noisy text: HTML markup, "[...]" spans, URLs and stopwords'''
    text = strip_html(text)
    text = remove_between_square_brackets(text)
    text = remove_urls(text)
    text = remove_stopwords(text)
    return text


def punctuation_removal(text):
    """Return ``text`` with every ``string.punctuation`` character removed."""
    return ''.join(filter(lambda ch: ch not in string.punctuation, text))

In [4]:
def text_cleaning(text):
    """Run the full cleaning pipeline on one comment and return the result.

    Steps, in order: wordopt, remove_eng_stopwords, word_lemmatizer,
    denoise_text, punctuation_removal.
    """
    pipeline = (
        wordopt,
        remove_eng_stopwords,
        word_lemmatizer,
        denoise_text,
        punctuation_removal,
    )
    for step in pipeline:
        text = step(text)
    return text

# NOTE: df_txt is an alias of df_debunk (no copy is made), so replacing 'body'
# with the cleaned text also overwrites the raw text in df_debunk.
df_txt = df_debunk
df_txt['body'] = df_txt['body'].apply(text_cleaning)

## Group by date

In [7]:
# Convert 'created_utc' column to datetime
df_txt['created_utc'] = pd.to_datetime(df_txt['created_utc'])
# Extract the calendar date from 'created_utc' into a new 'date' column
df_txt['date'] = df_txt['created_utc'].dt.date
# Group by date: each group is one daily time slice
grouped_df = df_txt.groupby('date')

In [9]:
# In each time slice, aggregate texts for each user
def _join_unique(texts):
    """Join the distinct texts with single spaces, keeping first-seen order.

    dict.fromkeys() de-duplicates while preserving insertion order, so the
    result is reproducible; joining a set() produced an order that could
    differ from one interpreter run to the next.
    """
    return ' '.join(dict.fromkeys(texts))

# Maps each date to a DataFrame indexed by author with a single 'text' column.
time_slices = dict()
for name, group in grouped_df:
    time_slices[name] = group.groupby(by='author').agg(text=("body", _join_unique))

In [10]:
# Convert time_slices from dict into DataFrame, then save it as csv
for k, v in time_slices.items():
    # Tag every per-day frame with the date it belongs to (modifies it in place).
    v['date'] = k

# Stack all daily frames and order the rows chronologically.
df_merge_slices = pd.concat(time_slices.values())
df_merge_slices.sort_values(by='date', inplace=True)
# The index is written too (no index=False), so it becomes the first CSV column.
df_merge_slices.to_csv("data/time_slices[topic=QAnon][platform=Reddit][lang=en][debunking=keywords].csv")

## Toxicity detection

In [None]:
# CSV holding the time-slice rows together with a 'perspective_api_results' column.
# NOTE(review): this file is not produced anywhere in the visible notebook;
# presumably it comes from a separate Perspective API step - confirm.
perspective_path = "data/toxicity_of_time_slices[topic=QAnon][platform=Reddit][lang=en][debunking=keywords].csv"
perspective_res = pd.read_csv(perspective_path)
perspective_res.info()

def get_score_from_json(x):
    """Extract the first ``'score': {'value': ...}`` number from a result string.

    Parameters
    ----------
    x : str or missing value
        Stringified API result (Python-dict repr with single quotes).

    Returns
    -------
    float or None
        The first score value found, or None when ``x`` is missing or holds
        no score entry (e.g. an error payload).
    """
    if pd.isna(x):
        return None
    # NOTE(review): this captures the FIRST 'score' entry in the string; if the
    # payload can contain several scored spans, confirm that is the intended one.
    match = re.search("'score': {'value': (.+?),", x)
    if match is None:
        # No score in the payload: report it as missing instead of raising
        # AttributeError on `None.group`.
        return None
    return float(match.group(1))

# Parse the toxicity score out of every stored API result, then overwrite the
# source CSV so it also carries the new 'toxicity' column.
perspective_res['toxicity'] = perspective_res['perspective_api_results'].apply(get_score_from_json)
perspective_res.to_csv(perspective_path, index=False)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13514 entries, 0 to 13513
Data columns (total 4 columns):
 #   Column                   Non-Null Count  Dtype 
---  ------                   --------------  ----- 
 0   author                   13514 non-null  object
 1   text                     13482 non-null  object
 2   date                     13514 non-null  object
 3   perspective_api_results  13455 non-null  object
dtypes: object(4)
memory usage: 422.4+ KB


## Sentiment detection

In [None]:

import liwc
# Location of the LIWC 2015 English dictionary file.
liwcPath = r'data/LIWC2015_English.dic'
# `parse` is called with one token and iterated for its categories below.
# NOTE(review): `category_names` is presumably the list of all categories in the
# dictionary; it is not used in the visible part of this notebook.
parse, category_names = liwc.load_token_parser(liwcPath)


from sklearn.feature_extraction.text import TfidfVectorizer

def liwc_analyse_ver2(text, categories=['positive','negative','affect']):
    """Score one text on the requested LIWC categories via single-document TF-IDF.

    Every non-alphanumeric character in ``text`` is replaced by a space, the
    text is split on whitespace, each token is mapped to its LIWC categories
    with the module-level ``parse`` and the category labels are joined into
    one pseudo-document. A ``TfidfVectorizer`` is fitted on that single
    document and the weights of the requested categories are returned.

    Parameters
    ----------
    text : str
        Text to analyse.
    categories : list of str
        Category labels to report. The default list is only read, never
        mutated. NOTE(review): labels are looked up in the vectorizer's
        vocabulary, so they presumably have to match its lower-cased word
        tokens - confirm against the dictionary's category names.

    Returns
    -------
    pandas.Series
        Indexed by ``categories``; a value is None when that category is
        absent, and all values are None when vectorisation fails (e.g. no
        token matched any category).
    """
    tokens = re.sub('[^a-zA-Z0-9]', ' ', text).split()
    matched = [category for token in tokens for category in parse(token)]
    corpus = [' '.join(matched)]

    # TF-IDF
    try:
        vectorizer = TfidfVectorizer(max_features=5000)
        X_fit = vectorizer.fit(corpus)
        X_transformed = X_fit.transform(corpus)

        # get_feature_names() was removed in scikit-learn 1.2; prefer the
        # replacement when it exists so newer versions do not silently fall
        # into the all-None fallback below.
        if hasattr(vectorizer, 'get_feature_names_out'):
            features = vectorizer.get_feature_names_out()
        else:
            features = vectorizer.get_feature_names()
        df = pd.DataFrame(X_transformed.toarray(), columns=features)
        result = {col: df.get(col) for col in categories}
        result_df = pd.DataFrame(result)
    except Exception:
        # Best-effort scoring: an empty vocabulary or no requested category
        # being present ends up here. `Exception` (not a bare except) lets
        # KeyboardInterrupt / SystemExit propagate.
        result_df = pd.DataFrame({k: [None] for k in categories})

    return result_df.T[0]

In [None]:

from pandarallel import pandarallel
# Run the LIWC scoring on 10 worker processes with a progress bar.
pandarallel.initialize(progress_bar=True, nb_workers=10)
selected_categories = ['positive','negative','affect']
# NOTE(review): astype(str) turns missing texts into the literal string "nan",
# which is then scored like any other text - confirm this is acceptable.
perspective_res.loc[:, selected_categories] = perspective_res['text'].astype(str).parallel_apply(liwc_analyse_ver2)
# NOTE(review): this is the same path the merged time-slice CSV was written to,
# so that file is overwritten; the index is written too (no index=False).
perspective_res.to_csv("data/time_slices[topic=QAnon][platform=Reddit][lang=en][debunking=keywords].csv")

INFO: Pandarallel will run on 10 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=1352), Label(value='0 / 1352'))), …

## Summarize daily data

In [None]:
def calculate_statistics(df:pd.DataFrame):
    """Summarise the per-user scores of every day into one row per date.

    ``df`` needs the columns 'date', 'author', 'positive', 'negative',
    'affect' and 'toxicity'. Its 'date' column is converted to datetime
    in place.

    Returns a DataFrame indexed by date that also carries a 'date' column,
    the number of distinct authors ('user_count') and, for each score, the
    count of missing values, the mean, the median and the mean after
    masking IQR outliers (outside ``[q1 - 1.5*iqr, q3 + 1.5*iqr]``).
    """
    df['date'] = pd.to_datetime(df['date'])
    scores = ['positive', 'negative', 'affect', 'toxicity']

    per_day = df.groupby('date')
    users_per_day = per_day['author'].nunique()

    def mean_no_extreme(day_df:pd.DataFrame):
        # Per-column mean of one day's scores with IQR outliers blanked out.
        values = day_df[scores]
        q1 = values.quantile(0.25)
        q3 = values.quantile(0.75)
        iqr = q3 - q1
        is_extreme = (values < (q1 - 1.5 * iqr)) | (values > (q3 + 1.5 * iqr))
        return values.where(~is_extreme).mean()

    missing_per_day = per_day[scores].apply(lambda frame: frame.isnull().sum())
    aggregates = (
        ('mean', per_day[scores].mean()),
        ('median', per_day[scores].median()),
        ('mean_no_extreme', per_day[scores].apply(mean_no_extreme)),
    )

    columns = {
        'date': users_per_day.index,
        'user_count': users_per_day.values,
    }
    # The missing-value counts stay date-indexed Series, which is what gives
    # the resulting frame its date index; everything else is a plain array.
    for score in scores:
        columns[f'{score}_none_count'] = missing_per_day[score]
    for suffix, table in aggregates:
        for score in scores:
            columns[f'{score}_{suffix}'] = table[score].values

    return pd.DataFrame(columns)

In [8]:
# Read the scored time slices, reduce them to one row per day, save and preview.
data = pd.read_csv("data/time_slices[topic=QAnon][platform=Reddit][lang=en][debunking=keywords].csv")
# calculate_statistics also converts data['date'] to datetime in place.
daily_statistics = calculate_statistics(data)
daily_statistics.to_csv("data/daily_statistics[topic=QAnon][platform=Reddit][lang=en][debunking=keywords].csv", index=False)
daily_statistics.head(5)

Unnamed: 0_level_0,date,user_count,positive_none_count,negative_none_count,affect_none_count,toxicity_none_count,positive_mean,negative_mean,affect_mean,toxicity_mean,positive_median,negative_median,affect_median,toxicity_median,positive_mean_no_extreme,negative_mean_no_extreme,affect_mean_no_extreme,toxicity_mean_no_extreme
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
2020-04-01,2020-04-01,3,1,0,0,0,0.042459,0.099086,0.265188,0.568845,0.042459,0.041772,0.226762,0.698991,0.042459,0.099086,0.265188,0.568845
2020-04-02,2020-04-02,8,2,3,1,0,0.122699,0.085547,0.332551,0.304634,0.090224,0.100261,0.280731,0.284514,0.122699,0.085547,0.332551,0.226146
2020-04-03,2020-04-03,7,0,3,0,0,0.07781,0.076313,0.243922,0.269872,0.074261,0.080224,0.235702,0.282002,0.07781,0.076313,0.243922,0.169905
2020-04-04,2020-04-04,13,2,2,0,0,0.071862,0.081377,0.261261,0.35119,0.064778,0.053082,0.265408,0.375766,0.071862,0.064317,0.241037,0.35119
2020-04-05,2020-04-05,6,1,0,0,0,0.053813,0.077854,0.248516,0.378525,0.04679,0.07431,0.217437,0.412433,0.053813,0.077854,0.248516,0.378525


In [9]:
# Print index type, column dtypes and non-null counts of the daily summary.
daily_statistics.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 396 entries, 2020-04-01 to 2021-05-01
Data columns (total 18 columns):
 #   Column                    Non-Null Count  Dtype         
---  ------                    --------------  -----         
 0   date                      396 non-null    datetime64[ns]
 1   user_count                396 non-null    int64         
 2   positive_none_count       396 non-null    int64         
 3   negative_none_count       396 non-null    int64         
 4   affect_none_count         396 non-null    int64         
 5   toxicity_none_count       396 non-null    int64         
 6   positive_mean             396 non-null    float64       
 7   negative_mean             396 non-null    float64       
 8   affect_mean               396 non-null    float64       
 9   toxicity_mean             396 non-null    float64       
 10  positive_median           396 non-null    float64       
 11  negative_median           396 non-null    float64       
 12  aff

In [10]:
# Calculate days from the earliest date to the latest
# daily_statistics is indexed by date, so plain [0] / [-1] would be label
# lookups that only work through pandas' deprecated positional fallback;
# .iloc selects the first and last rows explicitly.
delta = daily_statistics['date'].iloc[-1] - daily_statistics['date'].iloc[0]
# +1 so that both the first and the last day are counted.
print(delta.days + 1)

396
