In [93]:
import os
import json
from collections import defaultdict, Counter
import csv
from helpers import get_sr_cats
from nltk.corpus import reuters, brown
import pandas as pd
import networkx as nx

In [2]:
ROOT = '/mnt/data0/lucy/manosphere/'
DATA = ROOT + 'data/'
LOGS = ROOT + 'logs/'

In [3]:
with open(LOGS + 'overall_word_births.json', 'r') as infile: 
    word_births = json.load(infile)

In [4]:
categories = get_sr_cats()
forums = ['FORUM_the_attraction', 'FORUM_pua_forum', 'FORUM_incels', 
          'FORUM_red_pill_talk', 'FORUM_avfm', 'FORUM_rooshv', 'FORUM_red_pill_talk']
mano_communities = set(categories.keys()) | set(forums)
print(mano_communities)

{'philosophyofrape', 'pickup', 'blackpillscience', 'suicidewatch', 'seduction', 'mensrightslinks', 'truefemcels', 'thanktrp', 'mensrights', 'FORUM_pua_forum', 'redpillparenting', 'mgtowbooks', 'femaledatingstrategy', 'FORUM_rooshv', 'incelspurgatory', 'askfds', 'thebluepill', '1ncels', 'mrref', 'malecels', 'foreveralonelondon', 'braincels', 'geotrp', 'supportcel', 'becomeaman', 'incelselfies', 'theglowup', 'pinkpillfeminism', 'pornfreerelationships', 'foreveraloneteens', 'lonelynonviolentmen', 'mgtowmusic', 'fpua', 'alttrp', 'rsd', 'askanincel', 'intactivists', 'inceltears', 'asktrufemcels', 'truecels', 'trufemcels', 'asktrp', 'masculism', 'FORUM_incels', 'foreveralone', 'trpofftopic', 'mgtow', 'FORUM_the_attraction', 'puascience', 'gold_digger', 'ladymras', 'fdssuperfans', 'marriedredpill', 'redpillwomen', 'mensrightslaw', 'gymcels', 'depression', 'gaycel', 'redpillwives', 'foreverunwanted', 'FORUM_red_pill_talk', 'pussypass', 'mractivism', 'inceldense', 'foreveralonedating', 'FORUM_a

# Get lexical innovation vocabulary

First, we find a suitable cutoff for words that we can ensure they are actually new (and not just new because they are part of general language and some communities are earlier in the dataset than others. 

We remove bigrams whose second token also appears in our vocabulary, since those tend to be innovative phrases less often. This removes terms such as "most girls" yet retains many innovative phrase such as "carousel riders". 

We use the reuters corpus (created in 2000) to filter out non-innovative words. 

In [5]:
reuters_words = set([w.lower() for w in reuters.words()])
brown_words = set([w.lower() for w in brown.words()])
print(list(reuters_words)[:10])

['electricians', 'repairs', 'bonus', 'kassenobligation', 'unavoidable', 'black', 'ipla', 'contemplated', 'friedhelm', 'vlissingen']


In [6]:
common_sources = Counter()
yearly_innovations = defaultdict(list) # {year: [list of innovations]}
for w in word_births: 
    if w in reuters_words or w in brown_words: continue # not an innovation
    if w.endswith('s'): 
        if w[:-1] in reuters_words or w[:-1] in brown_words: continue # not an innovation
    if ' ' in w: # bigram
        toks = w.split(' ')
        if toks[-1] in word_births or toks[-1] in reuters_words or toks[-1] in brown_words: continue
    content = word_births[w]
    date = content[0]
    year = date.split('-')[0]
    sources = content[1]
    # uncomment below if you want to find words that only manosphere communities pioneeered
    #if set(sources) & mano_communities: 
    yearly_innovations[year].append(w)
    common_sources.update(sources)

In [7]:
common_sources.most_common()

[('FORUM_the_attraction', 577),
 ('reddit.com', 343),
 ('FORUM_red_pill_talk', 39),
 ('politics', 12),
 ('askreddit', 11),
 ('mensrights', 11),
 ('incels', 10),
 ('programming', 9),
 ('FORUM_pua_forum', 8),
 ('funny', 5),
 ('pics', 4),
 ('thebluepill', 4),
 ('science', 4),
 ('wtf', 4),
 ('drama', 4),
 ('shitredditsays', 4),
 ('the_donald', 3),
 ('atheism', 3),
 ('seduction', 3),
 ('entertainment', 3),
 ('theredpill', 3),
 ('business', 3),
 ('news', 2),
 ('FORUM_rooshv', 2),
 ('4chan4trump', 2),
 ('mgtow', 2),
 ('purplepilldebate', 2),
 ('askmen', 2),
 ('videos', 2),
 ('worldnews', 2),
 ('antipozi', 1),
 ('kotakuinaction', 1),
 ('puahate', 1),
 ('waluigi', 1),
 ('rupaulsdragrace', 1),
 ('wikipedia', 1),
 ('oney', 1),
 ('features', 1),
 ('badhistory', 1),
 ('netsec', 1),
 ('gadgets', 1),
 ('gatekeeping', 1),
 ('anarchism', 1),
 ('askwomen', 1),
 ('cringeanarchy', 1),
 ('bestof', 1),
 ('justneckbeardthings', 1),
 ('vegan', 1),
 ('penis', 1),
 ('bacon', 1),
 ('equality', 1),
 ('egalitarian

In [8]:
for year in range(2005, 2010): 
    print(year, yearly_innovations[str(year)])
    print()

2005 ['gf', 'assholes', 'eachother', 'chodes', 'roomate', 'amogs', 'hypnotist', 'goofball', 'geek', 'cunt', 'fuckers', 'cunts', 'hippies', 'masseuse', 'enablers', 'oddball', 'newb', 'toastmasters', 'joker', 'narcissist', 'chubby chasers', 'freelancers', 'shaman', 'wackos', 'boyz', 'infj', 'server', 'keyboard jockeys', 'bloggers', 'noobie', 'prude', 'rapist', 'exes', 'shorty', 'gamer', 'brat', 'perverts', 'womanizer', 'dudes', 'girlfriends', 'smartass', 'hb2', 'stalker', 'stalkers', 'sluts', 'douche', 'noob', 'hb10', 'lifeguard', 'boyfriends', 'nerd', 'hotties', 'dumbass', 'weirdos', 'jerkoff', 'mods', 'moron', 'niceguy', 'barmaid', 'morons', 'jedi', 'wanker', 'selector', 'hb4', 'girlfriend', 'dude', 'slut', 'surfers', 'druggies', 'caveman', 'pornstar', 'hb7', 'bouncers', 'baristas', 'airhead', 'hb5', 'hb6', 'puas', 'misfits', 'hacker', 'toddler', 'paedophile', 'filmmaker', 'commenters', 'pua', 'hb9', 'wingmen', 'dork', 'nerds', 'doofus', 'cheerleader', 'pervert', 'amog', 'babysitter', 

In [9]:
innov_vocab = set()
for year in range(2007, 2020): 
    print(year, yearly_innovations[str(year)])
    innov_vocab.update(yearly_innovations[str(year)])
    print()

2007 ['millennials', 'normies', 'hikikomori', 'aspies', 'sahm', 'autists', 'wimmenz', 'beckys', 'wymyn', 'fucktoy', 'randos', 'larpers', 'neurotypicals', 'stacie', 'landwhale', 'hambeasts', 'weeaboo', 'womyns', 'people irl', 'incel', 'fembot', 'camwhore', 'sex bots', 'cosplayers', 'cum dumpsters', 'mgtow', 'globalists', 'old hags', 'masculist', 'equalist', 'sex bot', 'dravidian', 'friendo', 'radfems', 'terf', 'foid', 'camgirl', 'femnazis', 'xw', 'infiltrator', 'wageslave', 'cuckolds', 'femnazi', 'intactivists', 'homebody', 'antifa', 'aspie', 'hambeast', 'female supremacist', 'batterers', 'cum dumpster', 'mra', 'transpeople', 'filipinas', 'stepdaughter', 'assaulter', 'transwoman', 'sloot', 'sexbot', 'sexbots', 'baby daddies', 'manlet', 'influencer', 'larper', 'feminsts', 'mras', 'prepper', 'honey badgers', 'tyrones', 'transwomen', 'cuckold', 'womenz', 'batterer', 'camgirls', 'masculinist', 'cumdumpster', 'femails', 'faer']

2008 ['women irl', 'weebs', 'suffragists', 'rape apologists', '

## Analyze source and destinations of lexical innovations

In [10]:
print(len(innov_vocab))
with open(LOGS + 'lexical_innovations.txt', 'w') as outfile: 
    for w in innov_vocab: 
        outfile.write(w + '\n')

254


In [53]:
common_sources = Counter()
mano_source_words = set()
outside_source_words = set()
for w in word_births: 
    if w not in innov_vocab: continue
    content = word_births[w]
    date = content[0]
    year = date.split('-')[0]
    if set(sources) & mano_communities: 
        mano_source_words.add(w)
    else: 
        outside_source_words.add(w)
    sources = content[1]
    common_sources.update(sources)

In [54]:
common_sources.most_common(30)

[('reddit.com', 65),
 ('FORUM_red_pill_talk', 39),
 ('FORUM_the_attraction', 16),
 ('politics', 12),
 ('askreddit', 11),
 ('mensrights', 11),
 ('incels', 10),
 ('FORUM_pua_forum', 7),
 ('funny', 5),
 ('pics', 4),
 ('thebluepill', 4),
 ('science', 4),
 ('wtf', 4),
 ('drama', 4),
 ('programming', 4),
 ('shitredditsays', 4),
 ('the_donald', 3),
 ('atheism', 3),
 ('seduction', 3),
 ('entertainment', 3),
 ('theredpill', 3),
 ('business', 3),
 ('news', 2),
 ('FORUM_rooshv', 2),
 ('4chan4trump', 2),
 ('mgtow', 2),
 ('purplepilldebate', 2),
 ('askmen', 2),
 ('videos', 2),
 ('worldnews', 2)]

In [15]:
top_n_subreddits = set()
with open(DATA + 'all_reddit_post_counts/top_subreddits.txt', 'r') as infile: 
    line_count = 0
    for line in infile: 
        top_n_subreddits.add(line.strip().split(' ')[0])
        line_count += 1
        if line_count == 500: break
print(len(top_n_subreddits))
for source in common_sources: 
    if source.startswith('FORUM_'): continue
    if source not in top_n_subreddits and source not in mano_communities: 
        print(source)

500
antipozi
kotakuinaction
puahate
waluigi
wikipedia
oney
4chan4trump
badhistory
netsec
gatekeeping
anarchism
purplepilldebate
justneckbeardthings
drama
bacon
equality
egalitarian
cooking
computebazaar
buildapcsales
shitredditsays
transgender
subredditdrama
diablo
tumblrinaction
cuckold
dirtykik
feminisms
soccerspirits
nofapjuly
pfjerk
4chan
celebrities
meetup
gay
sidehugs
gaybros
fitnesscirclejerk
pets
cortexcommand
theoryofreddit


In [14]:
print(mano_source_words)
print()
print(outside_source_words)

{'normie scum', 'hambeast', 'transwomen', 'blackcel', 'faer', 'oldcel', 'foid', 'escortcels', 'brigaders', 'influencer', 'gymcel', 'chadpreet', 'fuckboy', 'poorcel', 'volcel', 'redpiller', 'looksmatches', 'larpers', 'chadcel', 'norwood reaper', 'roasties', 'cuckqueers', 'sexbot', 'chaddam', 'feminsts', 'suffragists', 'red piller', 'chadlet', 'locationcel', 'ricecels', 'itcels', 'femails', 'oldcels', 'mentalcel', 'cuckservatives', 'tradcuck', 'sub8s', 'beta bux', 'ethnicel', 'camwhore', 'heightcel', 'fakecels', 'intactivist', 'manlet', 'chadlite', 'wymyn', 'friendo', 'rapefugees', 'stepdaughter', 'manlets', 'richcel', 'turbomanlet', 'bluepiller', 'mentalcels', 'aspies', 'landwhale', 'manginas', 'edgelord', 'trper', 'looksmatch', 'srsers', 'sluthaters', 'cuckcel', 'femcel', 'youngcels', 'wahmyn', 'gigachads', 'youngcel', 'khhv', 'numales', 'landwhales', 'shitlord', 'manchildren', 'wristcel', 'noodlewhore', 'wahmen', 'weebs', 'terfs', 'sex bot', 'intactivists', 'sedditors', 'women irl', '

In [55]:
plural = set()
singular = set()
for w in innov_vocab: 
    if w.endswith('s'): 
        plural.add(w)
    else: 
        singular.add(w)
print(plural)
print()
print(singular)

{'camgirls', 'fhos', 'tyrones', 'shitskins', 'mgtowers', 'moids', 'antifeminists', 'randos', 'foids', 'cucktears', 'escortcels', 'brigaders', 'redpillers', 'bpers', 'mgtows', 'femnazis', 'looksmatches', 'stacies', 'larpers', 'womyns', 'normies', 'beckys', 'rvfers', 'jailbaits', 'roasties', 'mras', 'fuckboys', 'currycels', 'femoids', 'cuckqueers', 'feminsts', 'suffragists', 'ethniks', 'soyboys', 'female protagonists', 'tradthots', 'wagecucks', 'filipinas', 'ethnicels', 'ricecels', 'chadlites', 'itcels', 'blue pillers', 'besties', 'femails', 'oldcels', 'dark triads', 'shitlords', 'sub8s', 'cuckservatives', 'faers', 'cucks', 'noodlewhores', 'gymcels', 'female supremacists', 'fakecels', 'autists', 'incels', 'rapefugees', 'normtards', 'manlets', 'mentalcels', 'punjabis', 'aspies', 'femcels', 'rape apologists', 'manginas', 'neurotypicals', 'bronies', 'honey badgers', 'cosplayers', 'baby daddies', 'old hags', 'srsers', 'sluthaters', 'hambeasts', 'sex bots', 'youngcels', 'gigachads', 'trpers',

### Sustained periods in mainstream

Run `mainstream_sustained_periods()` in `gram_counting.py`.

In [83]:
with open(LOGS + 'sustained_mainstream.json', 'r') as infile: 
    mainstream_sp = json.load(infile)

In [84]:
print(len(mainstream_sp))

107


Run `manosphere_sustained_periods()` in `gram_counting.py`.

In [85]:
with open(LOGS + 'sustained_manosphere.json', 'r') as infile: 
    mano_sp = json.load(infile)

In [100]:
network_edges = [] # [(source, destination)]
manosphere_lead_words = []
manosphere_com = set()
mainstream_com = set()
for w in mainstream_sp: 
    min_month = 'z'
    for sr in mainstream_sp[w]: 
        mainstream_com.add(sr)
        start = mainstream_sp[w][sr][0]
        min_month = min(start, min_month)
    if w not in mano_sp: continue
    print(w, len(mainstream_sp[w]), list(mainstream_sp[w].keys()))
    print('MAINSTREAM:', min_month)
    main_min_month = min_month
    min_month = 'z'
    for sr in mano_sp[w]: 
        manosphere_com.add(sr)
        start = mano_sp[w][sr][0]
        min_month = min(start, min_month)
    print(w, len(mano_sp[w]), list(mano_sp[w].keys()))
    print("MANOSPHERE:", min_month)
    mano_min_month = min_month
    if mano_min_month < main_min_month: 
        print("****MANOSPHERE LEAD")
        manosphere_lead_words.append(w)
        leads = []
        for sr in mano_sp[w]: 
            start = mano_sp[w][sr][0]
            if start == mano_min_month: 
                leads.append(sr)
        dest = mainstream_sp[w]
        for l in leads: 
            for d in dest: 
                network_edges.append((l, d))
    print()

millennials 92 ['Bitcoin', 'PoliticalHumor', 'baseball', 'relationships', 'politics', 'dankmemes', 'im14andthisisdeep', 'technology', 'atheism', 'AskMen', 'pics', 'ukpolitics', 'AdviceAnimals', 'Entrepreneur', 'gaming', 'CFB', 'RoastMe', 'facepalm', 'Music', 'PewdiepieSubmissions', 'Philippines', 'canada', 'aww', 'personalfinance', 'exmormon', 'europe', 'nba', 'OldSchoolCool', 'mildlyinteresting', 'gifs', 'australia', 'OutOfTheLoop', 'tifu', 'funny', 'FortNiteBR', 'HistoryMemes', 'news', 'Christianity', 'videos', 'worldpolitics', 'AskWomen', 'The_Donald', 'nottheonion', 'books', 'modernwarfare', 'CryptoCurrency', 'science', 'DestinyTheGame', 'Libertarian', 'cars', 'Conservative', 'trashy', 'AskReddit', 'me_irl', 'ComedyCemetery', 'Showerthoughts', 'POLITIC', 'BlackPeopleTwitter', 'LifeProTips', 'Economics', 'movies', 'worldnews', 'SandersForPresident', 'mildlyinfuriating', 'cringepics', 'unitedkingdom', 'wallstreetbets', 'india', 'wholesomememes', 'television', 'rupaulsdragrace', 'nfl'

In [89]:
print(len(manosphere_lead_words))
print(manosphere_lead_words)

26
['cuck', 'incels', 'cucks', 'incel', 'mgtow', 'femoid', 'femoids', 'femcel', 'mra', 'mras', 'autists', 'normie scum', 'transwomen', 'menz', 'manlets', 'transwoman', 'cuckolds', 'landwhale', 'sexbot', 'neurotypicals', 'radfems', 'aspies', 'volcel', 'volcels', 'femcels', 'mgtows']


In [110]:
network_weights = Counter(network_edges)
G = nx.DiGraph()
edge_names = {'FORUM_red_pill_talk': 'Red Pill Talk', 'FORUM_mgtow': 'MGTOW'}
for edge in network_weights: 
    start = edge[0]
    if start.startswith('FORUM_'): 
        start = edge_names[start]
    G.add_edge(start, edge[1], weight=network_weights[edge])
for n in G.nodes: 
    if n in set(edge_names.values()): 
        G.nodes[n]['dataset'] = 'forum'
    elif n in manosphere_com: 
        G.nodes[n]['dataset'] = 'mano'
    else: 
        G.nodes[n]['dataset'] = 'main'
nx.write_gexf(G, LOGS + "word_spread.gexf")

In [111]:
leads = Counter()
dest = Counter()
for tup in network_weights: 
    leads[tup[0]] += 1
    dest[tup[1]] += 1
print(leads.most_common(10))
print(dest.most_common(10))

[('FORUM_red_pill_talk', 133), ('theredpill', 115), ('mensrights', 47), ('incels', 3), ('braincels', 2), ('FORUM_mgtow', 2), ('mgtow', 1)]
[('teenagersnew', 4), ('dankmemes', 4), ('The_Donald', 4), ('AskReddit', 4), ('ChapoTrapHouse', 4), ('unpopularopinion', 4), ('politics', 3), ('AskMen', 3), ('pics', 3), ('gaming', 3)]


The following are month ranges where we should be sampling up to 50 instances of each word. 

The union of these ranges, we should also sample up to 50 instances of each word in the manosphere for that same time period. 

In [128]:
for w in manosphere_lead_words: 
    print(w, len(mainstream_sp[w]))
#     for sr in mainstream_sp[w]: 
#         print(w, sr, mainstream_sp[w][sr])

cuck 99
incels 89
cucks 37
incel 115
mgtow 27
femoid 1
femoids 1
femcel 2
mra 29
mras 15
autists 14
normie scum 1
transwomen 10
menz 1
manlets 2
transwoman 9
cuckolds 3
landwhale 2
sexbot 1
neurotypicals 2
radfems 1
aspies 1
volcel 1
volcels 1
femcels 1
mgtows 1


### Lexical variables for "women"

From examining examples of these in use, it seems like the spelling variations are for mocking statements made by feminists and other people about "strong independent \[women variant\]" or "empowered \[women variant\]". 

In [138]:
ANN_FILE = ROOT + 'data/ann_sig_entities.csv'

In [119]:
words = []
with open(ANN_FILE, 'r') as csvfile:
    reader = csv.DictReader(csvfile)
    for row in reader: 
        if row['keep'] == 'Y': 
            words.append(row['entity'])

In [123]:
w_words = [w for w in words if w.startswith('w') and ' ' not in w]
print(w_words)

['women', 'woman', 'wife', 'whore', 'whores', 'wives', 'whites', 'writer', 'wingman', 'winner', 'waitress', 'writers', 'weirdo', 'witnesses', 'winners', 'warriors', 'wizard', 'waiter', 'westerners', 'waitresses', 'wingmen', 'weirdos', 'wifey', 'wuss', 'wymyn', 'widows', 'wimp', 'wifes', 'wahmen', 'witches', 'whitey', 'wimminz', 'wrestler', 'waiters', 'westerner', 'weakling', 'womanizer', 'wanker', 'wimmin', 'wingwoman', 'wizards', 'weebs', 'workaholic', 'wimps', 'weaklings', 'warlord', 'welder', 'wannabes', 'whiner', 'wench', 'wasps', 'wamen', 'whamen', 'womenz', 'wrestlers', 'workhorse', 'wahmyn', 'weemins', 'wallflower', 'womankind', 'wks', 'warlords', 'widower', 'wenches', 'welders', 'wimmenz', 'warden', 'wierdo', 'weeaboo', 'womyns', 'workmates', 'wearer', 'wive', 'womanizers', 'wagecucks', 'wimmins', 'weenie', 'womynz', 'wingwomen', 'wackos', 'wussy', 'wageslave', 'wristcel']


In [129]:
women_vars = ['women', 'wymyn', 'wahmen', 'wimminz', 'wimmin', 'wamen', 'whamen', 'womenz', 'wahmyn', 
              'weemins', 'wimmenz', 'womyns', 'wimmins', 'womynz', 'femoids', 'foids']
print(len(women_vars))
print(', '.join(women_vars))

16
women, wymyn, wahmen, wimminz, wimmin, wamen, whamen, womenz, wahmyn, weemins, wimmenz, womyns, wimmins, womynz, femoids, foids


In [136]:
with open(LOGS + 'sustained_women.json', 'r') as infile: 
    women_sp = json.load(infile)

In [137]:
for w in women_sp: 
    for sr in women_sp[w]: 
        print(w, sr, women_sp[w][sr])

women maleforeveralone ['2017-12', '2018-05']
women foreverunwanted ['2016-02', '2017-02']
women askanincel ['2018-11', '2019-10']
women ladymras ['2012-04', '2012-12']
women inceldense ['2018-06', '2018-09']
women FORUM_red_pill_talk ['2014-05', '2015-06']
women FORUM_incels ['2017-11', '2019-06']
women egalitarianism ['2011-04', '2011-07']
women supportcel ['2017-10', '2018-01']
women FORUM_avfm ['2012-09', '2019-03']
women braincels ['2017-11', '2019-08']
women asktrp ['2013-08', '2019-12']
women redpillwomen ['2013-06', '2019-12']
women asktrufemcels ['2018-10', '2019-12']
women askseddit ['2011-02', '2019-12']
women incelselfies ['2018-08', '2019-06']
women intactivists ['2013-07', '2013-11']
women femaledatingstrategy ['2019-05', '2019-12']
women incelswithouthate ['2017-08', '2019-12']
women pickup ['2017-06', '2017-08']
women foreveralone ['2011-01', '2019-12']
women truecels ['2016-01', '2016-05']
women blackpillscience ['2018-04', '2019-12']
women FORUM_pua_forum ['2006-04', 