# Check processed corpus

## Check short posts in the white supremacist corpus to see if they express the ideology

In [None]:
# Read the pickled white supremacist training corpus and print a column summary
import pandas as pd

path = '../tmp/white_supremacist_train_corpus.pkl'
data = pd.read_pickle(filepath_or_buffer=path)
data.info()

In [None]:
# View short posts across domains
from IPython.display import display

# Show full post text instead of truncating long cells.
pd.set_option('display.max_colwidth', None)

# For each domain, show up to 10 random posts of at most 10
# whitespace-separated tokens so they can be read by eye.
for domain in data.domain.unique():
    print(domain)
    domain_data = data.query('domain==@domain')
    short = domain_data[domain_data.text.str.split().str.len() <= 10]
    # Cap the sample size: DataFrame.sample raises ValueError when asked for
    # more rows than exist, which would abort the loop on a sparse domain.
    # A fixed random_state keeps the displayed posts stable across re-runs.
    display(short.sample(min(10, len(short)), random_state=0))

In [None]:
# View distribution of post length overall
import plotly.express as px
pd.set_option('plotting.backend', 'plotly')

# Post length = number of whitespace-separated tokens in each post's text
lengths = data['text'].str.split().str.len()

In [None]:
# Store the per-post token count on the frame (mutates `data` in place) so
# later cells can filter and group on `length`.
data['length'] = lengths

In [None]:
# remove outliers
# Plot from a random sample to keep the figure light. The sample size is
# capped at the population size because Series.sample raises ValueError when
# n exceeds the number of rows (with replace=False); random_state makes the
# figure reproducible.
sampled = lengths.sample(min(len(lengths), int(1e6)), random_state=0)
# Only posts shorter than 200 tokens are drawn so the long tail does not
# flatten the histogram.
sampled[sampled < 200].hist()

In [None]:
# Number of posts per domain among posts shorter than 100 tokens.
# Computed directly from `data`: this cell previously read `s`, which is only
# assigned in a later cell and therefore raised NameError on a fresh kernel.
data.loc[data.length < 100, 'domain'].value_counts()

In [None]:
# Plot lengths across domains
# Restrict to posts under 100 tokens, then draw from a random sample.
short_posts = data[data.length < 100]
# Cap the sample at the number of available rows (DataFrame.sample raises
# ValueError otherwise) and seed it so the figure is reproducible.
s = short_posts.sample(min(len(short_posts), int(1e6)), random_state=0)
px.histogram(s, x='length', color='domain', barmode='overlay')

In [None]:
# Length histogram for every domain except forum, posts under 100 tokens only
is_short_non_forum = (data['length'] < 100) & (data['domain'] != "forum")
s = data[is_short_non_forum]
px.histogram(data_frame=s, x='length', color='domain', barmode='overlay')

In [None]:
# Empirical CDF of post length: running count of posts at or below each
# observed length, divided by the total number of posts.
posts_per_length = lengths.value_counts().sort_index()
cdf = posts_per_length.cumsum() / len(lengths)
cdf

In [None]:
# Plot only the start of the CDF.
# NOTE(review): `[:100]` takes the first 100 entries of the sorted CDF, which
# is not necessarily "lengths <= 100" — confirm which was intended.
cdf_head = cdf[:100]
px.line(cdf_head)

In [None]:
# Per-domain empirical CDF of post length. Each domain is normalised by its
# own post count so the curves are comparable across domains of different size.
# Result columns: `value` (post length), `length` (cumulative share of the
# domain's posts at or below that length), `domain`.
domain_cdfs = []
for domain, domain_lengths in data.groupby('domain', sort=False)['length']:
    domain_cdf = domain_lengths.value_counts().sort_index().cumsum() / len(domain_lengths)
    domain_cdfs.append(
        domain_cdf
        # Name the column explicitly: pandas >= 2.0 names the value_counts()
        # result 'count' instead of the source column, and the plotting cell
        # reads this column as 'length'.
        .rename('length')
        .rename_axis('value')
        .reset_index()
        .assign(domain=domain)
    )
cdfs = pd.concat(domain_cdfs)
cdfs.info()
cdfs[cdfs.value <= 100]

In [None]:
# One CDF curve per domain, restricted to post lengths of at most 100
cdfs_head = cdfs.loc[cdfs['value'] <= 100]
px.line(cdfs_head, x='value', y='length', color='domain')

# Corpus stats

In [10]:
# Load the white supremacist training corpus and summarise its columns
import pandas as pd

# Alternative inputs kept for reference:
#   path = '../tmp/antiracist_train_corpus.pkl'
#   path = '../tmp/domain_test_corpus.pkl'
#   path = '../data/corpora/white_supremacist_corpus.json'
#       (read with: data = pd.read_json(path, orient='table'))
path = '../tmp/white_supremacist_train_corpus.pkl'
ws_data = pd.read_pickle(filepath_or_buffer=path)
ws_data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 229305 entries, calderon2021_0 to stormfront_99972
Data columns (total 7 columns):
 #   Column      Non-Null Count   Dtype              
---  ------      --------------   -----              
 0   text        229305 non-null  object             
 1   word_count  229305 non-null  int64              
 2   dataset     229305 non-null  object             
 3   source      229305 non-null  object             
 4   domain      229305 non-null  object             
 5   timestamp   229051 non-null  datetime64[ns, UTC]
 6   num_words   229305 non-null  int64              
dtypes: datetime64[ns, UTC](1), int64(2), object(4)
memory usage: 14.0+ MB


In [4]:
# Post and word counts for the white supremacist corpus, per domain and in total.
# Reads `ws_data` (loaded in the cell above) rather than the generic `data`
# name, which is bound to different frames elsewhere in this notebook and
# would silently change what this table describes.
# `num_words` is already a column of the loaded frame; it can be recomputed
# with: ws_data.text.str.split().str.len()
stats = ws_data.groupby('domain').agg({'num_words': ['count', 'sum', 'mean']})
stats.columns = ['post_count', 'word_count', 'avg_post_length']

# Single summary row over the whole corpus, labelled 'total'.
total = pd.DataFrame(
    {
        'post_count': len(ws_data),
        'word_count': ws_data.num_words.sum(),
        'avg_post_length': ws_data.num_words.mean(),
    },
    index=['total'],
)

# Per-domain rows followed by the total row.
pd.concat([stats, total])

Unnamed: 0,post_count,word_count,avg_post_length
chat,18739,425654,22.714873
forum,100000,6511295,65.11295
long-form,38036,11420987,300.267825
tweet,72530,1268597,17.490652
total,229305,19626533,85.591387


# Compare corpora

In [5]:
# Load the neutral comparison corpus
path = '../tmp/neutral_train_corpus.pkl'
neutral_data = pd.read_pickle(path)
neutral_data.info()

# Per-domain post count, word count and mean post length
stats = neutral_data.groupby('domain').agg(
    post_count=('num_words', 'count'),
    word_count=('num_words', 'sum'),
    avg_post_length=('num_words', 'mean'),
)

# Same three figures over the whole corpus, as a one-row frame
neutral_words = neutral_data['num_words']
total = pd.DataFrame(
    {
        'post_count': len(neutral_data),
        'word_count': neutral_words.sum(),
        'avg_post_length': neutral_words.mean(),
    },
    index=['total'],
)

pd.concat([stats, total])

<class 'pandas.core.frame.DataFrame'>
Index: 298412 entries, discord_match_99825 to twitter_match_89106
Data columns (total 7 columns):
 #   Column      Non-Null Count   Dtype              
---  ------      --------------   -----              
 0   text        298412 non-null  object             
 1   word_count  298412 non-null  int64              
 2   dataset     298412 non-null  object             
 3   source      298412 non-null  object             
 4   domain      298412 non-null  object             
 5   timestamp   210015 non-null  datetime64[ns, UTC]
 6   num_words   298412 non-null  int64              
dtypes: datetime64[ns, UTC](1), int64(2), object(4)
memory usage: 18.2+ MB


Unnamed: 0,post_count,word_count,avg_post_length
chat,88397,466080,5.272577
forum,84271,3708288,44.004319
long-form,36639,32119081,876.636398
tweet,89105,1667438,18.713181
total,298412,37960887,127.209653


In [9]:
# Load the antiracist comparison corpus
path = '../tmp/antiracist_train_corpus.pkl'
antiracist_data = pd.read_pickle(path)
antiracist_data.info()

# Per-domain post count, word count and mean post length
stats = antiracist_data.groupby('domain').agg(
    post_count=('num_words', 'count'),
    word_count=('num_words', 'sum'),
    avg_post_length=('num_words', 'mean'),
)

# Same three figures over the whole corpus, as a one-row frame
antiracist_words = antiracist_data['num_words']
total = pd.DataFrame(
    {
        'post_count': len(antiracist_data),
        'word_count': antiracist_words.sum(),
        'avg_post_length': antiracist_words.mean(),
    },
    index=['total'],
)

pd.concat([stats, total])

<class 'pandas.core.frame.DataFrame'>
Index: 200681 entries, reddit_antiracist_0 to medium_antiracist_929
Data columns (total 7 columns):
 #   Column      Non-Null Count   Dtype              
---  ------      --------------   -----              
 0   text        200681 non-null  object             
 1   word_count  200681 non-null  int64              
 2   dataset     200681 non-null  object             
 3   source      200681 non-null  object             
 4   domain      200681 non-null  object             
 5   timestamp   200680 non-null  datetime64[ns, UTC]
 6   num_words   200681 non-null  int64              
dtypes: datetime64[ns, UTC](1), int64(2), object(4)
memory usage: 12.2+ MB


Unnamed: 0,post_count,word_count,avg_post_length
forum,90731,4002747,44.116642
long-form,38036,31965244,840.394468
tweet,71914,1224819,17.031718
total,200681,37192810,185.332991


In [14]:
# Stack the three corpora into one frame; the mapping keys become the values
# of a new `corpus` column while the original post ids stay as the index.
corpora = {
    'white_supremacist': ws_data,
    'neutral': neutral_data,
    'antiracist': antiracist_data,
}
stacked = pd.concat(corpora, names=['corpus', 'id'])
data = stacked.reset_index(level='corpus')
data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 728398 entries, calderon2021_0 to medium_antiracist_929
Data columns (total 8 columns):
 #   Column      Non-Null Count   Dtype              
---  ------      --------------   -----              
 0   corpus      728398 non-null  object             
 1   text        728398 non-null  object             
 2   word_count  728398 non-null  int64              
 3   dataset     728398 non-null  object             
 4   source      728398 non-null  object             
 5   domain      728398 non-null  object             
 6   timestamp   639746 non-null  datetime64[ns, UTC]
 7   num_words   728398 non-null  int64              
dtypes: datetime64[ns, UTC](1), int64(2), object(5)
memory usage: 50.0+ MB


In [None]:
import plotly.express as px

# Box plot of per-post word counts by corpus, limited to posts under 100
# words; individual points are hidden.
under_100_words = data.loc[data['word_count'] < 100]
fig = px.box(under_100_words, x='corpus', y='word_count')
fig.update_traces(boxpoints=False)

In [None]:
# Histogram of word counts for neutral forum posts under 200 words
pd.set_option('plotting.backend', 'plotly')
selected = neutral_data[neutral_data['domain'] == "forum"]
selected.loc[selected['word_count'] < 200].plot.hist(x='word_count')

In [11]:
# Number of posts contributed by each dataset
data['dataset'].value_counts()

papasavva2020       1901408
stormfront           706232
jokubausaite2020     415921
ironmarch            148003
qian2018              66294
calderon2021          25685
patriotfront          18739
pruden2022            12351
elsherief2021          4489
Name: dataset, dtype: int64

In [None]:
# Post count for every (domain, dataset) pair
data.groupby(by=['domain', 'dataset'])['text'].count()

In [None]:
# Post count per (domain, dataset), kept as a MultiIndexed Series.
# Append .reset_index() to get a flat frame instead.
by_domain_dataset = data.groupby(['domain', 'dataset'])
counts = by_domain_dataset['text'].count()
counts

In [None]:
# Post count for every (domain, dataset, source) combination
data.groupby(by=['domain', 'dataset', 'source'])['text'].count()

In [None]:
# Can't figure out how to sort within domains
# counts.sort_values(['domain', 'dataset'], ascending=False).groupby('domain').groups()
# counts.sort_values(['domain', 'dataset'], ascending=False).groupby('domain')['text'].sum()

In [None]:
# Num of posts from each dataset (maybe should do log scale)
pd.set_option('plotting.backend', 'plotly')
vc = data['dataset'].value_counts()
print(vc)
# Label each bar with its raw post count
vc.plot.bar(text=vc.values)

In [None]:
# Num of posts from each dataset, on a log scale.
# This cell was an exact copy of the previous one; it now draws the log-scale
# variant so datasets with few posts remain visible next to the largest ones.
pd.options.plotting.backend = 'plotly'
vc = data.dataset.value_counts()
print(vc)
# With the plotly backend, extra keyword arguments are forwarded to
# plotly express, so log_y switches the y axis to logarithmic.
vc.plot.bar(text=vc.values, log_y=True)

In [None]:
# Number of words
# (Re)compute the per-post token count from the text, overwriting any
# existing `num_words` column on `data`.
data['num_words'] = data['text'].str.split().str.len()

# Total words contributed by each dataset
word_count = data.groupby('dataset')['num_words'].sum()
display(word_count)
words_descending = word_count.sort_values(ascending=False)
words_descending.plot.bar().show()

# Total posts and words
n_posts = len(data)
n_words = word_count.sum()
print(f'{n_posts} posts')
print(f'{n_words} words')

In [None]:
import plotly.express as px

# By domain
# Hand-written mapping from dataset name to a coarse domain label.
dataset_info = pd.DataFrame([
    {'dataset': 'qian2018', 'domain': 'tweet/short propaganda'},
    {'dataset': 'elsherief2021', 'domain': 'tweet/short propaganda'},
    {'dataset': 'degibert2019', 'domain': 'forum'},
    {'dataset': 'patriotfront', 'domain': 'chat'},
    {'dataset': 'alatawi2021', 'domain': 'tweet/short propaganda'},
    {'dataset': 'adl_heatmap', 'domain': 'tweet/short propaganda'},
    {'dataset': 'ironmarch', 'domain': 'forum'},
    {'dataset': '4chan', 'domain': 'forum'},
    {'dataset': 'stormfront', 'domain': 'forum'},
    {'dataset': 'calderon2021', 'domain': 'long-form'},
    {'dataset': 'pruden2022', 'domain': 'long-form'},
]).set_index('dataset')

# Drop any `domain` column already present on `data` before joining:
# DataFrame.join raises ValueError on overlapping column names when no suffix
# is given. errors='ignore' keeps this working when the column is absent.
# The join is a left join, so datasets missing from `dataset_info` get a NaN
# domain and are left out of value_counts().
vc = (
    data.drop(columns='domain', errors='ignore')
    .join(dataset_info, on='dataset')
    .domain.value_counts()
)

# Bar labels show the raw count and its share of all counted posts.
total_posts = vc.sum()
fig = px.bar(vc, title="Number of posts by domain", 
       labels=dict(index='domain', value='number of posts'),
        text = [f'{val}<br>{val/total_posts: .1%}' for val in vc.values],
    )
fig.update_layout(showlegend=False)

In [None]:
# Total number of words per domain, using the dataset -> domain mapping in
# `dataset_info`, largest first.
# Drop any `domain` column already present on `data` before joining:
# DataFrame.join raises ValueError on overlapping column names when no suffix
# is given. errors='ignore' keeps this working when the column is absent.
num_words = (
    data.drop(columns='domain', errors='ignore')
    .join(dataset_info, on='dataset')
    .groupby('domain')['num_words']
    .sum()
    .sort_values(ascending=False)
)

# Bar labels show the raw word count and its share of all counted words.
total_words = num_words.sum()
fig = px.bar(num_words,
    title="Number of words by domain", 
    labels=dict(index='domain', value='number of words'),
    text = [f'{val}<br>{val/total_words: .1%}' for val in num_words.values],
       # log_y=True
    )
fig.update_layout(showlegend=False)

In [None]:
# Investigate date ranges: total posts vs. posts with a non-null timestamp
n_posts = len(data)
n_with_timestamp = data['timestamp'].count()
print(n_posts)
print(n_with_timestamp)

In [None]:
# Histogram of post timestamps, drawn from a random sample of posts.
pd.options.plotting.backend = 'plotly'
# Cap the sample at the number of rows: Series.sample raises ValueError when
# n exceeds the population (with replace=False). random_state makes the
# figure reproducible.
timestamps = data.timestamp
timestamps.sample(min(len(timestamps), int(1e6)), random_state=0).hist()
# data.timestamp.plot.hist()

In [None]:
# Histogram of timestamps after 1 Jan 2000 (UTC), from a 100k-post random sample
start_date = pd.Timestamp(year=2000, month=1, day=1, tz='utc')
recent_timestamps = data.query('timestamp > @start_date')['timestamp']
recent_timestamps.sample(int(1e5)).hist()

In [None]:
# Bin or group by month
import plotly.express as px
# pd.options.plotting.backend = 'plotly'

# Keep only posts dated after 1 Jan 2000 (UTC).
start_date = pd.Timestamp(year=2000, month=1, day=1, tz='utc')
# Drop any `domain` column already present on `data` before joining with the
# dataset -> domain mapping: DataFrame.join raises ValueError on overlapping
# column names when no suffix is given.
recent = (
    data.drop(columns='domain', errors='ignore')
    .join(dataset_info, on='dataset')
    .query('timestamp > @start_date')
)
# Cap the sample at the number of available rows (DataFrame.sample raises
# ValueError otherwise) and seed it so the figure is reproducible.
merged = recent.sample(min(len(recent), int(1e6)), random_state=0)
# The original call ended in an unterminated string (barmode='), which is a
# SyntaxError; 'overlay' matches the other histogram cells in this notebook.
px.histogram(merged, x='timestamp', color='domain', barmode='overlay')

In [None]:
# Overlaid per-domain histograms of post timestamps (uses `merged` from the
# previous cell)
histogram_options = dict(x='timestamp', color='domain', barmode='overlay')
px.histogram(merged, **histogram_options)

In [None]:
import plotly.express as px

# Timestamp histogram for a 100k-post random sample of forum posts, one
# overlaid series per dataset
forum_sample = data.query('domain=="forum"').sample(int(1e5))
px.histogram(forum_sample, x='timestamp', color='dataset', barmode='overlay')