Find central users from the user x user (shared thread) network and characterize their behavior

# Load central users

In [1]:
# Load central users (calculated from ORA)
import pandas as pd

path = '../output/incels_is_centrality_measures.csv'
central = pd.read_csv(path, index_col=0).drop(columns=['Input networks', 'Input nodesets', 'Input parameters']).transpose()
central.info()

central.head()

<class 'pandas.core.frame.DataFrame'>
Index: 6819 entries, Transcended Trucel to rot099
Data columns (total 2 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   eigenvectorCentrality  6819 non-null   float64
 1   totalDegreeCentrality  6819 non-null   float64
dtypes: float64(2)
memory usage: 159.8+ KB


Measure,eigenvectorCentrality,totalDegreeCentrality
Transcended Trucel,0.000396,2.79722e-06
ItsNotADream,9e-06,1.959646e-07
AAAAAAAAAAAcel,0.107865,0.0001865163
Amerihiki,2e-06,6.461526e-08
Deleted member 7448,2e-05,8.474116e-07


In [4]:
print(len(central))
top = int(len(central)/20)
print(top)
highest_eigen = central.sort_values('eigenvectorCentrality', ascending=False).iloc[:top]
highest_eigen.head(10)

highest_eigen.tail(2)

6819
340


Measure,eigenvectorCentrality,totalDegreeCentrality
Liszt,0.000135,1.965215e-06
squirrelsonfire2,0.000133,9.35318e-07


In [5]:
rest = central[~central.index.isin(highest_eigen.index)]
len(rest)

6479

In [None]:
# Graph distributions of centrality measures
import plotly.express as px

selected = central.sort_values('totalDegreeCentrality', ascending=False).head(100)
px.bar(selected, x=selected.index, y='totalDegreeCentrality')

In [None]:
# Graph distributions of centrality measures
import plotly.express as px

selected = central.sort_values('eigenvectorCentrality', ascending=False).head(100)
px.bar(selected, x=selected.index, y='eigenvectorCentrality')

# Compare cel variants in usernames

In [15]:
# Extract cel variants from top users vs others
topcel = [name for name in highest_eigen.index if 'cel' in name]
print(f'{len(topcel)} ({len(topcel)/len(highest_eigen): .2%}) cel names in top')
bottomcel = [name for name in rest.index if 'cel' in name]
print(f'{len(bottomcel)} ({len(bottomcel)/len(rest): .2%}) cel names in bottom')

54 ( 15.88%) cel names in top
1048 ( 16.18%) cel names in bottom


In [16]:
topcel

['Amphetaminecel',
 'Cowcel',
 'SillyTruecel',
 'AAAAAAAAAAAcel',
 'Animecel2D',
 'Based-nearcel',
 'Ghoulcel',
 'Diocel',
 'Zettacel',
 'Incellectual',
 'Lookscel',
 'ItsOver4cel',
 'soymonkcel',
 'Sadandangrycel',
 'Damo the incel',
 'Ritalincel',
 'Idlevillagercel',
 'Cheesecel',
 'Caesercel',
 'Uglychincel',
 'JosefMengelecel',
 'PPEcel',
 'Tiredpoorcel',
 'angrycurrycel',
 'croatincel',
 'SuperSaiyanGymcel',
 'Daydreamincel',
 'gymletethnicel',
 'Templarcel421',
 'BraincelsRefugee',
 'SergeantIncel',
 'JohnDcel',
 'turbocuckcel_7000',
 'DominicanDancecel91',
 'gigacel123',
 'Gyros_Pretcel',
 'Transcended Trucel',
 'speedtypingincel',
 'HighTGymcel',
 'IncelCream',
 'Legendarywristcel',
 'singleplayercel',
 'stuttercel',
 'kikecel',
 'littlemanhikicel',
 'SkinnyBaldcel',
 'lonelycel69',
 'EthnicelNL',
 'Gymcelled',
 'rightfulcel',
 'crestfallencel',
 'carticel',
 'Bleachcel',
 'Blackcel rigth wing']

In [17]:
bottomcel[:50]

['ChronicPaincel',
 'Blackpincel',
 'Wizcel',
 'dirtykombatcel',
 'Massimo The Lonecel',
 'Chileancel',
 'Rambocel',
 'Mentally lost cel',
 'Hyperwristcel',
 'Arabcel9',
 'MScel',
 'Legallyblindcel',
 'guaucel',
 'Wagiecel',
 'TheIncredibleIncel',
 'ghettocel',
 'Cafecel',
 'TheUltimateMarkcel',
 'Mulattocel',
 'codingcel',
 'Currycel25',
 'Incellio',
 'Mountainbikecel',
 'ThirdWorldcel',
 'Arrogantcel',
 'Philosophycel',
 'andinocel',
 'acnescarcel',
 'Jockcel',
 'UKhapacel',
 'facepulling_incel',
 'DBcel',
 'startcel',
 'Timecel',
 'Limerencel',
 'Gremlincel',
 'Bagelcel',
 'CopingGymcel',
 'presidencel',
 'TheMostAncientcel',
 'DENSA_IQcel',
 'toyotacel',
 'Quasimodocel',
 'Subhuman Currycel',
 'fuckupcel',
 'GermaniaIncelia',
 'Greyandoldcel',
 'Eschewcel',
 'eurocel',
 'TheRealChincel']

# Compare identity group use

In [2]:
# Load data with extracted actions and attributes
import pandas as pd

path = '../../data/incels/processed_comments.pkl'
data = pd.read_pickle(path)
data.info()

exp = data.explode('netmapper_identity_matches')
# exp.info()

# Group identities
import json

identity_groups_fpath = '../resources/identity_groups.json'
with open(identity_groups_fpath, 'r') as f:
    identity_groups = json.load(f)
print(len(identity_groups))

exp['identity_group'] = exp.netmapper_identity_matches.map(lambda x: identity_groups.get(x, x))
exploded = exp.explode('identity_group') # Count intersectional mentions as a mention in each of their categories
exploded.info()

# samp = exploded.sample(int(1e6))
# gped = samp.groupby('identity_group')
gped = exploded.groupby('identity_group')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6248230 entries, 0 to 6248229
Data columns (total 11 columns):
 #   Column                            Dtype         
---  ------                            -----         
 0   type                              object        
 1   forum                             object        
 2   thread                            object        
 3   username                          object        
 4   date                              object        
 5   content                           object        
 6   parsed_date                       datetime64[ns]
 7   content_orig                      object        
 8   netmapper_identity_matches        object        
 9   netmapper_identity_matches_spans  object        
 10  actions_attributes                object        
dtypes: datetime64[ns](1), object(10)
memory usage: 524.4+ MB
513
<class 'pandas.core.frame.DataFrame'>
Int64Index: 13578886 entries, 0 to 6248229
Data columns (total 12 columns):
 #   

In [6]:
top_data = data[data.username.isin(highest_eigen.index)]
rest_data = data[data.username.isin(rest.index)]
print(len(top_data))
print(len(rest_data))

top_exploded = exploded[exploded['username'].isin(highest_eigen.index)]
len(top_exploded)

rest_exploded = exploded[exploded['username'].isin(rest.index)]
len(rest_exploded)

3052056
3184199


7850586

## Plot for paper

In [None]:
gp_counts = gped['content'].count().sort_values(ascending=False)
selected_gps = [gp for gp in gp_counts.index[:12].tolist() if gp not in ['women_girls_derogatory', 'men_boys_address', 'youth']] + ['truecels', 'fakecels']
selected_gps

# Distributions of identity group mentions
top_counts = top_exploded.groupby('identity_group')['content'].count()
rest_counts = rest_exploded.groupby('identity_group')['content'].count()
# top_counts = top_counts[top_counts.index.isin(selected_gps)]/len(top_data)
# rest_counts = rest_counts[rest_counts.index.isin(selected_gps)]/len(rest_data)
top_counts = top_counts[top_counts.index.isin(selected_gps)]/top_counts.sum()
rest_counts = rest_counts[rest_counts.index.isin(selected_gps)]/rest_counts.sum()
top_counts

counts = top_counts.to_frame(name='top')
counts['rest'] = rest_counts
# long = counts.reset_index().melt(id_vars='identity_group', value_vars=['top', 'rest'], var_name='user_group', value_name='mentions_per_post')
long = counts.reset_index().melt(id_vars='identity_group', value_vars=['top', 'rest'], var_name='user_group', value_name='proportion_mentions')
long

import numpy as np

long['log_proportion_mentions'] = long['proportion_mentions'].map(lambda x: np.log(x))
long

custom_order = dict(reversed(el) for el in enumerate([
    'women_girls', 'men_boys', 'asian_people', 'black_people', 'white_people', 'jews', 'lgbtq_people', 
    'mental_disabilities', 'incels', 'truecels', 'fakecels']))
long = long.sort_values(['identity_group'], key=lambda x: x.map(custom_order))
column_map = {'identity_group': 'Identity group', 'user_group': 'User group', 'proportion_mentions': 'Proportion of mentions',
             'log_proportion_mentions': 'Log proportion of mentions'}
formatted = long.rename(columns=column_map).replace({
    'women_girls': 'Women',
    'men_boys': 'Men',
    'youth': 'Youth',
    'mental_disabilities': 'Neurodiverse',
    'lgbtq_people': 'LGBTQ+',
    'asian_people': 'Asian',
    'black_people': 'Black',
    'white_people': 'White',
    'jews': 'Jews',
    'incels': 'Incels',
    'truecels': 'Truecels',
    'fakecels': 'Fakecels',
})
formatted

import plotly.express as px

fig = px.bar(formatted, x=column_map['identity_group'], y=column_map['proportion_mentions'], color=column_map['user_group'], barmode='group',
            log_y=True, width=600, height=400)
# fig = px.bar(formatted, x=column_map['identity_group'], y=column_map['log_proportion_mentions'], color=column_map['user_group'], barmode='group')
fig.update_layout(xaxis_title=None)
fig.write_image('../output/top5percent_eigen_identity_group_mentions.pdf')
fig.show()

# Compare text with PMI
Using top 5% eigenvector centrality as a label

In [56]:
highest_eigen.index

Index(['nihility', 'unsettling', 'Amphetaminecel', 'mNFwTJ3wz9', 'Cowcel',
       'Idotms', 'SillyTruecel', 'schrodingercoper', 'Deleted member 29001',
       'AAAAAAAAAAAcel',
       ...
       'Lolimancer', 'seija', 'System Restore', 'ReconElement',
       'tooth monster', 'Reprobus', 'Anger', 'heroinfather', 'Liszt',
       'squirrelsonfire2'],
      dtype='object', length=340)

In [60]:
# Load data with extracted actions and attributes
data['top5percent_eigen_user'] = data.username.isin(highest_eigen.index)
data.top5percent_eigen_user.value_counts().to_dict()

{False: 3196174, True: 3052056}

In [97]:
data.drop(columns=['top5_percent_eigen_user'], inplace=True)
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6248230 entries, 0 to 6248229
Data columns (total 12 columns):
 #   Column                            Dtype         
---  ------                            -----         
 0   type                              object        
 1   forum                             object        
 2   thread                            object        
 3   username                          object        
 4   date                              object        
 5   content                           object        
 6   parsed_date                       datetime64[ns]
 7   content_orig                      object        
 8   netmapper_identity_matches        object        
 9   netmapper_identity_matches_spans  object        
 10  actions_attributes                object        
 11  top5percent_eigen_user            bool          
dtypes: bool(1), datetime64[ns](1), object(10)
memory usage: 530.3+ MB


In [99]:
# Save out for STM analysis
outpath = '../../data/incels/processed_comments_user_info.jsonl'
data[['username', 'top5percent_eigen_user', 'parsed_date', 'content']].to_json(outpath, orient='records', lines=True)

In [51]:
# Count needed totals
from collections import Counter
from tqdm.auto import tqdm
tqdm.pandas()

cooccurrences = {} # relation: {(user_group, word): n_times_co-occurs, ...}
word_freqs = {} # relation: {word: n_times_occurs_anywhere}
total_combinations = Counter() # total # label-word matches (total words)

# First pass to get total word counts
freqs = Counter()
data.content.str.split().progress_apply(freqs.update)

  0%|          | 0/6248230 [00:00<?, ?it/s]

0          None
1          None
2          None
3          None
4          None
           ... 
6248225    None
6248226    None
6248227    None
6248228    None
6248229    None
Name: content, Length: 6248230, dtype: object

In [76]:
import string
import nltk

stops = list(string.punctuation) + nltk.corpus.stopwords.words('english')
stops

['!',
 '"',
 '#',
 '$',
 '%',
 '&',
 "'",
 '(',
 ')',
 '*',
 '+',
 ',',
 '-',
 '.',
 '/',
 ':',
 ';',
 '<',
 '=',
 '>',
 '?',
 '@',
 '[',
 '\\',
 ']',
 '^',
 '_',
 '`',
 '{',
 '|',
 '}',
 '~',
 'i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 "you're",
 "you've",
 "you'll",
 "you'd",
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 "she's",
 'her',
 'hers',
 'herself',
 'it',
 "it's",
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 "that'll",
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from

In [80]:
# Filter by 
# freq_threshold = 1000
freq_threshold = 200
word_freqs = {term: count for term, count in freqs.items() if count >= freq_threshold and term not in stops}
print(len(word_freqs))
label_freqs = data.top5percent_eigen_user.value_counts().to_dict()
label_freqs

20848


{False: 3196174, True: 3052056}

In [65]:
len(data)

6248230

In [81]:
# Second pass to count cooccurrences (and filter by frequency)
cooccurrences = Counter()

def build_cooccurrences(row):
    cooccurrences.update({(row['top5percent_eigen_user'], wd): count for wd, count in Counter(row['content'].lower().split()).items() if wd in word_freqs})

data.progress_apply(build_cooccurrences, axis=1) 

  0%|          | 0/6248230 [00:00<?, ?it/s]

0          None
1          None
2          None
3          None
4          None
           ... 
6248225    None
6248226    None
6248227    None
6248228    None
6248229    None
Length: 6248230, dtype: object

In [82]:
# Save out cooccurrences since it took awhile
import pickle

path = '../tmp/top5percent_eigen_user_cooccurrences.pkl'
with open(path, 'wb') as f:
    pickle.dump(cooccurrences, f)

In [83]:
total_combinations = sum(word_freqs.values()) # total #words
    
from operator import itemgetter
import math
import pdb

def pmi(words, label_freqs, word_freqs, cooccurrences, n):
    """ Args:
            words: query tuple of (label, word)
            label_freqs: dict of label: count
            word_freqs: dict of word: count
            cooccurrences: dict of (label, word): count
            n: number of possible occurrences (number of words or bigrams in the doc)
    """
    numerator = n * cooccurrences[words]
    if numerator == 0:
        return 0
    denominator = label_freqs[words[0]] * word_freqs[words[1]]
    return math.log(numerator/denominator, 2)

def npmi(words, label_freqs, word_freqs, cooccurrences, n):
    """ Normalized pointwise mutual information
        Args:
            words: query tuple of (label, word)
            label_freqs: dict of label: count
            word_freqs: dict of word: count
            cooccurrences: dict of (label, word): count
            n: number of possible occurrences (number of words or bigrams in the doc)
    """
    numerator = pmi(words, label_freqs, word_freqs, cooccurrences, n)
    denominator = -1 * math.log(cooccurrences[words]/n, 2)
    return numerator/denominator

def pmi2(words, label_freqs, word_freqs, cooccurrences, n):
    return pmik(words, label_freqs, word_freqs, cooccurrences, n, 2)

def pmi3(words, label_freqs, word_freqs, cooccurrences, n):
    return pmik(words, label_freqs, word_freqs, cooccurrences, n, 3)

def pmik(words, label_freqs, word_freqs, cooccurrences, n, k):
    """ Args:
            words: query tuple of (label, word)
            label_freqs: dict of label: count
            word_freqs: dict of word: count
            cooccurrences: dict of (label, word): count
            n: number of possible occurrences (number of words or bigrams in the doc)
            k: type of pmik/exponent to use (for example 2 for pmi2 or 3 for pmi3)
    """
    numerator = (cooccurrences[words]/n) ** k
    denominator = (label_freqs[words[0]]/n) * (word_freqs[words[1]]/n)
    return math.log(numerator/denominator, 2)

def top_pmi(word, label_freqs, word_freqs, cooccurrences, n, calculation='pmi'):
    """ 
            words: query tuple of (label, word)
            label_freqs: dict of label: count
            word_freqs: dict of word: count
            cooccurrences: dict of (label, word): count
            n: number of possible occurrences (number of words or bigrams in the doc)
            calculation: type of pmi to run out of {'pmi', 'npmi', 'pmi2', 'pmi3'}
    """
    # Returns top co-occurring words with a specified word based on PMI
    fn = globals()[calculation]
    
    cooccurring_words = []
    
    pairs = [pair for pair in cooccurrences.keys() if word in pair and pair != (word, word)]  # all words that co-occur
    
    for pair in pairs:
        other_word = [w for w in pair if w != word][0]
        cooccurring_words.append((other_word, fn(pair, label_freqs, word_freqs, cooccurrences, n)))
        
    return sorted(cooccurring_words, key=itemgetter(1), reverse=True)

In [95]:
# Look at PMI3 for identity group labels (freq threshold 100)
calculation = 'pmi3'

more_stops = ['still', 'could', 'go', 'also', "n't", "'s", 'like', 'would', 'get', 'nt', 'even', 'people', 'one', "'m", "'re", "ca", '...', 'nan', "'ve"] # also copy above

for top in [True, False]:
    print(top)
    outstring = ', '.join([el[0] for el in top_pmi(top, label_freqs, word_freqs, cooccurrences, total_combinations, calculation=calculation)[:50] if el[0] not in more_stops])
    print(f'{top}: {outstring}')
    print()

True
True: tbh, view, women, think, fuck, good, shit, want, know, attachment, never, men, time, fucking, see, man, incel, chad, life, really, make, foids, much, sex, white, say, jfl, way, got, bro, us, cope, yes

False
False: women, think, men, good, chad, want, fuck, shit, life, know, fucking, never, incel, time, white, much, make, see, look, really, sex, incels, us, ugly, way, foids, guy, got, man, looks, say, guys

