# Find distribution of cel/pill variants

In [2]:
# Load processed data (to get usernames)
import pandas as pd

# NOTE(review): unpickling can execute code embedded in the file; only load this
# pickle if it was produced by a trusted pipeline.
path = '../../data/incels/processed_comments.pkl'
data = pd.read_pickle(path)
# Print the column/dtype summary of the loaded frame (later cells read the
# `username` and `content` columns).
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6248230 entries, 0 to 6248229
Data columns (total 11 columns):
 #   Column                            Dtype         
---  ------                            -----         
 0   type                              object        
 1   forum                             object        
 2   thread                            object        
 3   username                          object        
 4   date                              object        
 5   content                           object        
 6   parsed_date                       datetime64[ns]
 7   content_orig                      object        
 8   netmapper_identity_matches        object        
 9   netmapper_identity_matches_spans  object        
 10  actions_attributes                object        
dtypes: datetime64[ns](1), object(10)
memory usage: 524.4+ MB


In [3]:
# Distinct forum usernames, lowercased before de-duplication so that
# differently-cased spellings of the same name collapse into one entry.
usernames = data['username'].str.lower().unique().tolist()
len(usernames)

8467

In [5]:
# Load incels vocab and term counts from Gensim model
from gensim.models import Word2Vec

# `path` is reused here (it previously pointed at the processed-comments pickle).
path = '../models/emb/incels.model'
# Later cells read the vocabulary via `model.wv.key_to_index` and per-word
# frequencies via `model.wv.get_vecattr(word, 'count')`.
model = Word2Vec.load(path)

In [4]:
import json

# Manually curated terms to leave out of the cel-variant counts.
# JSON is UTF-8 by specification, so the encoding is pinned rather than left to
# the platform's locale default.
with open('../resources/cel_exclude.json', 'r', encoding='utf-8') as f:
    extra = json.load(f)
len(extra)

62

In [16]:
# cel varieties

from collections import Counter
import pandas as pd

# Vocabulary entries to skip: forum usernames, @-mentions of those usernames,
# and the curated list loaded from cel_exclude.json. A set keeps the membership
# test constant-time while the whole vocabulary is scanned below.
exclude = set(usernames) | {f'@{name}' for name in usernames} | set(extra)

# Frequency (as stored on the Word2Vec vocabulary) of every remaining word that
# contains the substring 'cel'.
cel_counts = Counter({
    wd: model.wv.get_vecattr(wd, 'count')
    for wd in model.wv.key_to_index
    if 'cel' in wd and wd not in exclude
})

# Spelling, plural and verb-form variants that are reported as a single term.
# Key = label shown in the table; value = vocabulary entries merged into it.
combos = {'fakecel/s': ['fakecel', 'fakecels'],
          'truecel/s': ['truecel', 'truecels', 'trucel', 'trucels'],
          'volcel/s': ['volcel', 'volcels'],
          'ricecel/s': ['ricecel', 'ricecels'],
          'greycel/s': ['graycel', 'greycel', 'graycels', 'greycels'],
          'mentalcel/s': ['mentalcel', 'mentalcels'],
          'escortcel/ling': ['escortcel', 'escortcels', 'escortcelling', 'escortceling'],
          'framecel/s': ['framecel', 'framecels'],
          'fatcel/s': ['fatcel', 'fatcels'],
          'oldcel/s': ['oldcel', 'oldcels'],
          'blackcel/s': ['blackcel', 'blackcels'],
          'gymcel/ling': ['gymcel', 'gymcels', 'gymcelling', 'gymceling'],
          'brocel/s': ['brocel', 'brocels'],
          'ethnicel/s': ['ethnicel', 'ethnicels', 'ethniccels', 'ethniccel'],
          'currycel/s': ['currycel', 'currycels', 'curriecel', 'curriecels'],
          'femcel/s': ['femcel', 'femcels'],
          'whitecel/s': ['whitecel', 'whitecels'],
          'youngcel/s': ['youngcel', 'youngcels'],
          'itcel/s': ['itcel', 'itcels'],
          'nearcel/s': ['nearcel', 'nearcels'],
         }

# Fold each group into its combined label: the individual variants are removed
# and their counts summed. Variants absent from the vocabulary contribute 0.
for combo, terms in combos.items():
    cel_counts[combo] = sum(cel_counts.pop(term, 0) for term in terms)

# Top 40 terms after merging, as a frame with columns 'term' and 'count'.
cel_variants = pd.DataFrame(cel_counts.most_common(40), columns=['term', 'count'])
cel_variants

Unnamed: 0,term,count
0,truecel/s,57619
1,fakecel/s,52716
2,volcel/s,35784
3,greycel/s,23175
4,escortcel/ling,14846
5,gymcel/ling,12338
6,ricecel/s,11437
7,mentalcel/s,5868
8,currycel/s,5366
9,fatcel/s,5152


In [None]:
import plotly.express as px

# Bar chart of term frequencies; x tick labels are rotated 45 degrees.
# update_xaxes returns the figure it was called on, so `fig` is the styled chart.
fig = px.bar(cel_variants, x='term', y='count').update_xaxes(tickangle=45)
fig

In [17]:
# Table for the paper
# Rows labelled 0..20 (label slicing is inclusive) of the 'term' column, with the
# combined labels spelled out: 'x/s' -> 'xs' and 'cel/ling' -> 'celling'.
top_terms = cel_variants.loc[:20, 'term']
', '.join(top_terms.str.replace('/s', 's').str.replace('l/l', 'll'))

'truecels, fakecels, volcels, greycels, escortcelling, gymcelling, ricecels, mentalcels, currycels, fatcels, femcels, whitecels, framecels, youngcels, oldcels, blackcels, ethnicels, brocels, itcels, incelistan, nearcels'

In [22]:
# Table for the paper (old, with counts)
from IPython.display import display

# Build the table as a separate frame instead of overwriting `cel_variants`:
# the other cells that read `cel_variants` rely on its 'term' and 'count'
# columns, so renaming them in place made those cells fail when re-run.
cel_table = cel_variants.loc[:20].rename(
    columns={'term': '\\textbf{Term}', 'count': '\\textbf{Count}'}
)
display(cel_table)
# LaTeX source for the paper; the index column is hidden and booktabs rules are used.
print(cel_table.style.hide(axis='index').to_latex(hrules=True, convert_css=True))

Unnamed: 0,\textbf{Term},\textbf{Count}
0,fakecel,36490
1,volcel,35784
2,truecel,29301
3,truecels,20009
4,fakecels,16226
5,ricecels,11437
6,greycel,11016
7,gymcelling,7173
8,trucel,6063
9,graycel,6041


\begin{tabular}{lr}
\toprule
\textbf{Term} & \textbf{Count} \\
\midrule
fakecel & 36490 \\
volcel & 35784 \\
truecel & 29301 \\
truecels & 20009 \\
fakecels & 16226 \\
ricecels & 11437 \\
greycel & 11016 \\
gymcelling & 7173 \\
trucel & 6063 \\
graycel & 6041 \\
mentalcels & 5868 \\
escortcel & 5545 \\
currycels & 5366 \\
femcels & 4901 \\
whitecels & 4745 \\
greycels & 4567 \\
youngcels & 4280 \\
escortcelling & 4190 \\
framecel & 3960 \\
escortcels & 3550 \\
fatcel & 3502 \\
\bottomrule
\end{tabular}



In [None]:
# pill varieties

from collections import Counter
import pandas as pd
import plotly.express as px
from IPython.display import display

# Vocabulary entries to skip: forum usernames, @-mentions of those usernames,
# and a hand-picked list of words containing 'pill' that should not be counted.
# A set keeps the membership test constant-time while the vocabulary is scanned.
exclude = set(usernames) | {f'@{name}' for name in usernames} | {
    'pilled', 'pills', 'pillers', 'spill', 'theredpill', 'piller', 'pilling', 'spilling', 'spilled', 'pillars', 'pillar', 'horsepill', 'horsepills', 'pillage', 'pillages', 'spillage',
    'pillows', 'pillow', 'blackpillpres'
}

# Frequency (as stored on the Word2Vec vocabulary) of every remaining word that
# contains the substring 'pill'.
pill_counts = Counter({
    wd: model.wv.get_vecattr(wd, 'count')
    for wd in model.wv.key_to_index
    if 'pill' in wd and wd not in exclude
})

# Top 40 terms as a frame with columns 'term' and 'count'.
pill_variants = pd.DataFrame(pill_counts.most_common(40), columns=['term', 'count'])
# Only the last expression of a cell is rendered, so the table is shown explicitly.
display(pill_variants)

# Bar chart of the same frame; x tick labels are rotated 45 degrees.
fig = px.bar(pill_variants, x='term', y='count')
fig.update_xaxes(tickangle=45)

In [None]:
# Look into use of these terms
import re

pd.set_option('display.max_colwidth', None)

term = 'baldcel'
# Spelling variants to search together; extend the list to inspect a whole group,
# e.g. ['fakecel', 'fakecels', 'fake cel', 'fake cels'],
#      ['truecel', 'truecels', 'true cel', 'true cels', 'trucel', 'trucels'],
#      ['suicel', 'suicels'].
terms = [term]

# Whole-word match for any of the variants. Terms are escaped so that characters
# with a regex meaning are matched literally.
pattern = '|'.join(rf'\b{re.escape(t)}\b' for t in terms)

# na=False treats rows with missing content as non-matches; without it a missing
# value would put NaN into the mask and break boolean indexing.
matches = data.loc[data.content.str.contains(pattern, na=False), ['content', 'username']]

# Show up to 10 random examples; capping at the number of matches avoids the
# error `sample` raises when asked for more rows than exist.
matches.sample(min(10, len(matches)))