In [108]:
import os
import json
from collections import defaultdict, Counter
import csv
from helpers import get_sr_cats
from nltk.corpus import reuters

In [109]:
ROOT = '/mnt/data0/lucy/manosphere/'
DATA = ROOT + 'data/'
LOGS = ROOT + 'logs/'

In [110]:
with open(LOGS + 'overall_word_births.json', 'r') as infile: 
    word_births = json.load(infile)

In [111]:
categories = get_sr_cats()
forums = ['FORUM_the_attraction', 'FORUM_pua_forum', 'FORUM_incels', 
          'FORUM_red_pill_talk', 'FORUM_avfm', 'FORUM_rooshv', 'FORUM_red_pill_talk']
mano_communities = set(categories.keys()) | set(forums)
print(mano_communities)

{'asktrp', 'pua', 'theredpill', 'FORUM_rooshv', 'mensrights', 'mensrightslinks', 'thanktrp', 'marriedredpill', 'femalelevelupstrategy', 'incels', 'redpillparenting', 'alttrp', 'fpua', 'malecels', 'asktrufemcels', 'braincels', 'incelbrotherhood', 'askanincel', 'redpillwives', 'suicidewatch', 'pickup', 'askfds', 'redpillwomen', 'askseddit', 'FORUM_red_pill_talk', 'seduction', 'blackpillscience', 'exredpill', 'FORUM_the_attraction', 'intactivists', 'puascience', 'egalitarianism', 'incelspurgatory', 'mgtow', 'fdssuperfans', 'maleforeveralone', 'becomeaman', 'rsd', 'mgtowbooks', 'pinkpillfeminism', '1ncels', 'femaledatingstrategy', 'mractivism', 'inceltears', 'gaycel', 'supportcel', 'truecels', 'foreveralone', 'ladymras', 'theglowup', 'mrref', 'incelswithouthate', 'truefemcels', 'inceldense', 'FORUM_pua_forum', 'trpofftopic', 'depression', 'pussypass', 'gold_digger', 'lonelynonviolentmen', 'mensrightslaw', 'gymcels', 'FORUM_incels', 'socialanxiety', 'foreverunwanted', 'FORUM_avfm', 'philoso

## Get lexical innovation vocabulary

First, we find a suitable cutoff for words that we can ensure they are actually new (and not just new because they are part of general language and some communities are earlier in the dataset than others. 

We remove bigrams whose second token also appears in our vocabulary, since those tend to be innovative phrases less often. This removes terms such as "most girls" yet retains many innovative phrase such as "carousel riders". 

We use the reuters corpus (created in 2000) to filter out non-innovative words. 

In [112]:
reuters_words = set([w.lower() for w in reuters.words()])
print(list(reuters_words)[:10])

['worries', 'created', 'bindley', '239', 'judge', 'matsuya', 'refers', 'rainfalls', '257', 'austec']


In [113]:
common_sources = Counter()
yearly_innovations = defaultdict(list) # {year: [list of innovations]}
for w in word_births: 
    if w in reuters_words: continue # not an innovation
    if ' ' in w: # bigram
        toks = w.split(' ')
        if toks[-1] in word_births or toks[-1] in reuters_words: continue
    content = word_births[w]
    date = content[0]
    year = date.split('-')[0]
    sources = content[1]
    # uncomment below if you wnat to find words that only manosphere communities pioneeered
    #if set(sources) & mano_communities: 
    yearly_innovations[year].append(w)
    common_sources.update(sources)

In [122]:
common_sources.most_common()

[('reddit.com', 78),
 ('FORUM_red_pill_talk', 39),
 ('FORUM_the_attraction', 19),
 ('politics', 12),
 ('mensrights', 12),
 ('askreddit', 11),
 ('incels', 10),
 ('FORUM_pua_forum', 8),
 ('programming', 5),
 ('funny', 5),
 ('pics', 4),
 ('thebluepill', 4),
 ('science', 4),
 ('wtf', 4),
 ('drama', 4),
 ('shitredditsays', 4),
 ('the_donald', 3),
 ('FORUM_rooshv', 3),
 ('atheism', 3),
 ('seduction', 3),
 ('entertainment', 3),
 ('theredpill', 3),
 ('business', 3),
 ('news', 2),
 ('4chan4trump', 2),
 ('mgtow', 2),
 ('purplepilldebate', 2),
 ('askmen', 2),
 ('videos', 2),
 ('worldnews', 2),
 ('antipozi', 1),
 ('kotakuinaction', 1),
 ('puahate', 1),
 ('waluigi', 1),
 ('rupaulsdragrace', 1),
 ('wikipedia', 1),
 ('oney', 1),
 ('fffffffuuuuuuuuuuuu', 1),
 ('orangecounty', 1),
 ('losangeles', 1),
 ('badhistory', 1),
 ('netsec', 1),
 ('gadgets', 1),
 ('gatekeeping', 1),
 ('anarchism', 1),
 ('askwomen', 1),
 ('cringeanarchy', 1),
 ('bestof', 1),
 ('justneckbeardthings', 1),
 ('vegan', 1),
 ('penis', 

In [115]:
for year in range(2005, 2010): 
    print(year, yearly_innovations[str(year)])
    print()

2005 ['dads', 'males', 'babies', 'gf', 'seducer', 'strangers', 'assholes', 'eachother', 'feminists', 'soldier', 'chodes', 'masses', 'roomate', 'shooter', 'amogs', 'hypnotist', 'goofball', 'rockers', 'parents', 'geek', 'cunt', 'diva', 'moms', 'wuss', 'fuckers', 'cunts', 'hippies', 'masseuse', 'snobs', 'thugs', 'lunatic', 'idols', 'seducers', 'moderator', 'enablers', 'future generations', 'oddball', 'rangers', 'newb', 'professors', 'pals', 'realtor', 'doc', 'hackers', 'chics', 'intellectuals', 'toastmasters', 'assassin', 'zealots', 'poor sap', 'assailants', 'partisans', 'lovers', 'gays', 'bartenders', 'joker', 'pastor', 'narcissist', 'hosts', 'gypsy', 'mentors', 'chubby chasers', 'freelancers', 'shaman', 'citizenry', 'buddies', 'poet', 'asians', 'grandmother', 'lady', 'male gender', 'idiots', 'spectator', 'chick magnet', 'college grad', 'wackos', 'boyz', 'infj', 'texan', 'teenager', 'server', 'siblings', 'enemies', 'adults', 'cheerleaders', 'serial killers', 'bunny', 'bullies', 'twins', 

In [116]:
innov_vocab = set()
for year in range(2007, 2020): 
    print(year, yearly_innovations[str(year)])
    innov_vocab.update(yearly_innovations[str(year)])
    print()

2007 ['millennials', 'trps', 'normies', 'cads', 'hikikomori', 'aspies', 'sahm', 'autists', 'wimmenz', 'beckys', 'wymyn', 'fucktoy', 'randos', 'larpers', 'neurotypicals', 'stacie', 'landwhale', 'hambeasts', 'weeaboo', 'womyns', 'school shooters', 'people irl', 'suffragettes', 'incel', 'fembot', 'camwhore', 'sex bots', 'cosplayers', 'trollops', 'cum dumpsters', 'mgtow', 'sex robot', 'globalists', 'bps', 'old hags', 'masculist', 'equalist', 'sex bot', 'dravidian', 'friendo', 'radfems', 'terf', 'foid', 'camgirl', 'femnazis', 'xw', 'infiltrator', 'wageslave', 'young pussy', 'cuckolds', 'femnazi', 'intactivists', 'rape apologist', 'homebody', 'antifa', 'aspie', 'basement dweller', 'white knights', 'hambeast', 'female supremacist', 'batterers', 'cum dumpster', 'mra', 'transpeople', 'filipinas', 'nonwhites', 'stepdaughter', 'assaulter', 'transwoman', 'sloot', 'sexbot', 'sexbots', 'baby daddies', 'manlet', 'influencer', 'larper', 'feminsts', 'nordics', 'mras', 'prepper', 'honey badgers', 'tyron

## Analyze source and destinations of lexical innovations

Only 212 words were born during/after 2008 that fit our innovation criteria, but if we filter down to only those that are born in reddit\_rel or forum\_rel, we get only 92 terms. Since it doesn't cost that much to track 20 additional terms, I included them. This means the network is not necessarily bipartite. 

After filtering out words and last tokens that are in Reuters, then we get 186 terms.  

In [125]:
print(len(innov_vocab))
with open(LOGS + 'lexical_innovations.txt', 'w') as outfile: 
    for w in innov_vocab: 
        outfile.write(w + '\n')

278


In [126]:
common_sources = Counter()
mano_source_words = set()
outside_source_words = set()
for w in word_births: 
    if w not in innov_vocab: continue
    content = word_births[w]
    date = content[0]
    year = date.split('-')[0]
    if set(sources) & mano_communities: 
        mano_source_words.add(w)
    else: 
        outside_source_words.add(w)
    sources = content[1]
    common_sources.update(sources)

In [127]:
common_sources.most_common(30)

[('reddit.com', 78),
 ('FORUM_red_pill_talk', 39),
 ('FORUM_the_attraction', 19),
 ('politics', 12),
 ('mensrights', 12),
 ('askreddit', 11),
 ('incels', 10),
 ('FORUM_pua_forum', 8),
 ('programming', 5),
 ('funny', 5),
 ('pics', 4),
 ('thebluepill', 4),
 ('science', 4),
 ('wtf', 4),
 ('drama', 4),
 ('shitredditsays', 4),
 ('the_donald', 3),
 ('FORUM_rooshv', 3),
 ('atheism', 3),
 ('seduction', 3),
 ('entertainment', 3),
 ('theredpill', 3),
 ('business', 3),
 ('news', 2),
 ('4chan4trump', 2),
 ('mgtow', 2),
 ('purplepilldebate', 2),
 ('askmen', 2),
 ('videos', 2),
 ('worldnews', 2)]

In [133]:
top_n_subreddits = set()
with open(DATA + 'all_reddit_post_counts/top_subreddits.txt', 'r') as infile: 
    line_count = 0
    for line in infile: 
        top_n_subreddits.add(line.strip().split(' ')[0])
        line_count += 1
        if line_count == 500: break
for source in common_sources: 
    if source.startswith('FORUM_'): continue
    if source not in top_n_subreddits and source not in mano_communities: 
        print(source)

antipozi
kotakuinaction
puahate
waluigi
wikipedia
oney
orangecounty
losangeles
4chan4trump
badhistory
netsec
gatekeeping
anarchism
purplepilldebate
justneckbeardthings
drama
bacon
equality
egalitarian
cooking
computebazaar
buildapcsales
shitredditsays
transgender
subredditdrama
diablo
tumblrinaction
cuckold
dirtykik
feminisms
soccerspirits
timetravel
nofapjuly
pfjerk
4chan
celebrities
meetup
gay
sidehugs
gaybros
fitnesscirclejerk
pets
cortexcommand
theoryofreddit


In [120]:
print(mano_source_words)
print()
print(outside_source_words)

{'chadlite', 'chadlet', 'women irl', 'femcel', 'intactivist', 'rationalization hamster', 'friendo', 'wahmen', 'landwhale', 'fakecel', 'edgelord', 'female supremacists', 'manginas', 'itcels', 'sedditors', 'incels', 'fuckboy', 'sex bot', 'femails', 'manlet', 'sub8s', 'bluepiller', 'faer', 'cuckcel', 'rapefugees', 'camwhore', 'trper', 'chaddam', 'beta bux', 'senpai', 'turbomanlet', 'bps', 'terfs', 'batterer', 'red piller', 'wahmyn', 'heightcel', 'truecels', 'redpiller', 'roasties', 'femoid', 'catfishman', 'srsers', 'ricecels', 'locationcel', 'wristcel', 'incel', 'tradcuck', 'chadpreet', 'norwood reaper', 'cuckservatives', 'sexbot', 'transwomen', 'larpers', 'numales', 'influencer', 'foid', 'ethnicel', 'sluthaters', 'hambeast', 'mentalcel', 'sluthater', 'mentalcels', 'escortcels', 'landwhales', 'oldcels', 'brigaders', 'weebs', 'cuckqueers', 'gigachads', 'shitlord', 'poorcel', 'khhv', 'manlets', 'fakecels', 'aspies', 'intactivists', 'looksmatches', 'gymcel', 'millennials', 'blackcels', 'wome