In [1]:
import os
import json
from collections import defaultdict, Counter
import csv
from helpers import get_sr_cats
from nltk.corpus import reuters, brown

In [2]:
ROOT = '/mnt/data0/lucy/manosphere/'
DATA = ROOT + 'data/'
LOGS = ROOT + 'logs/'

In [3]:
with open(LOGS + 'overall_word_births.json', 'r') as infile: 
    word_births = json.load(infile)

In [4]:
categories = get_sr_cats()
forums = ['FORUM_the_attraction', 'FORUM_pua_forum', 'FORUM_incels', 
          'FORUM_red_pill_talk', 'FORUM_avfm', 'FORUM_rooshv', 'FORUM_red_pill_talk']
mano_communities = set(categories.keys()) | set(forums)
print(mano_communities)

{'philosophyofrape', 'pickup', 'blackpillscience', 'suicidewatch', 'seduction', 'mensrightslinks', 'truefemcels', 'thanktrp', 'mensrights', 'FORUM_pua_forum', 'redpillparenting', 'mgtowbooks', 'femaledatingstrategy', 'FORUM_rooshv', 'incelspurgatory', 'askfds', 'thebluepill', '1ncels', 'mrref', 'malecels', 'foreveralonelondon', 'braincels', 'geotrp', 'supportcel', 'becomeaman', 'incelselfies', 'theglowup', 'pinkpillfeminism', 'pornfreerelationships', 'foreveraloneteens', 'lonelynonviolentmen', 'mgtowmusic', 'fpua', 'alttrp', 'rsd', 'askanincel', 'intactivists', 'inceltears', 'asktrufemcels', 'truecels', 'trufemcels', 'asktrp', 'masculism', 'FORUM_incels', 'foreveralone', 'trpofftopic', 'mgtow', 'FORUM_the_attraction', 'puascience', 'gold_digger', 'ladymras', 'fdssuperfans', 'marriedredpill', 'redpillwomen', 'mensrightslaw', 'gymcels', 'depression', 'gaycel', 'redpillwives', 'foreverunwanted', 'FORUM_red_pill_talk', 'pussypass', 'mractivism', 'inceldense', 'foreveralonedating', 'FORUM_a

## Get lexical innovation vocabulary

First, we find a suitable cutoff for words that we can ensure they are actually new (and not just new because they are part of general language and some communities are earlier in the dataset than others. 

We remove bigrams whose second token also appears in our vocabulary, since those tend to be innovative phrases less often. This removes terms such as "most girls" yet retains many innovative phrase such as "carousel riders". 

We use the reuters corpus (created in 2000) to filter out non-innovative words. 

In [5]:
reuters_words = set([w.lower() for w in reuters.words()])
brown_words = set([w.lower() for w in brown.words()])
print(list(reuters_words)[:10])

['electricians', 'repairs', 'bonus', 'kassenobligation', 'unavoidable', 'black', 'ipla', 'contemplated', 'friedhelm', 'vlissingen']


In [6]:
common_sources = Counter()
yearly_innovations = defaultdict(list) # {year: [list of innovations]}
for w in word_births: 
    if w in reuters_words or w in brown_words: continue # not an innovation
    if w.endswith('s'): 
        if w[:-1] in reuters_words or w[:-1] in brown_words: continue # not an innovation
    if ' ' in w: # bigram
        toks = w.split(' ')
        if toks[-1] in word_births or toks[-1] in reuters_words or toks[-1] in brown_words: continue
    content = word_births[w]
    date = content[0]
    year = date.split('-')[0]
    sources = content[1]
    # uncomment below if you want to find words that only manosphere communities pioneeered
    #if set(sources) & mano_communities: 
    yearly_innovations[year].append(w)
    common_sources.update(sources)

In [7]:
common_sources.most_common()

[('FORUM_the_attraction', 577),
 ('reddit.com', 343),
 ('FORUM_red_pill_talk', 39),
 ('politics', 12),
 ('askreddit', 11),
 ('mensrights', 11),
 ('incels', 10),
 ('programming', 9),
 ('FORUM_pua_forum', 8),
 ('funny', 5),
 ('pics', 4),
 ('thebluepill', 4),
 ('science', 4),
 ('wtf', 4),
 ('drama', 4),
 ('shitredditsays', 4),
 ('the_donald', 3),
 ('atheism', 3),
 ('seduction', 3),
 ('entertainment', 3),
 ('theredpill', 3),
 ('business', 3),
 ('news', 2),
 ('FORUM_rooshv', 2),
 ('4chan4trump', 2),
 ('mgtow', 2),
 ('purplepilldebate', 2),
 ('askmen', 2),
 ('videos', 2),
 ('worldnews', 2),
 ('antipozi', 1),
 ('kotakuinaction', 1),
 ('puahate', 1),
 ('waluigi', 1),
 ('rupaulsdragrace', 1),
 ('wikipedia', 1),
 ('oney', 1),
 ('features', 1),
 ('badhistory', 1),
 ('netsec', 1),
 ('gadgets', 1),
 ('gatekeeping', 1),
 ('anarchism', 1),
 ('askwomen', 1),
 ('cringeanarchy', 1),
 ('bestof', 1),
 ('justneckbeardthings', 1),
 ('vegan', 1),
 ('penis', 1),
 ('bacon', 1),
 ('equality', 1),
 ('egalitarian

In [8]:
for year in range(2005, 2010): 
    print(year, yearly_innovations[str(year)])
    print()

2005 ['gf', 'assholes', 'eachother', 'chodes', 'roomate', 'amogs', 'hypnotist', 'goofball', 'geek', 'cunt', 'fuckers', 'cunts', 'hippies', 'masseuse', 'enablers', 'oddball', 'newb', 'toastmasters', 'joker', 'narcissist', 'chubby chasers', 'freelancers', 'shaman', 'wackos', 'boyz', 'infj', 'server', 'keyboard jockeys', 'bloggers', 'noobie', 'prude', 'rapist', 'exes', 'shorty', 'gamer', 'brat', 'perverts', 'womanizer', 'dudes', 'girlfriends', 'smartass', 'hb2', 'stalker', 'stalkers', 'sluts', 'douche', 'noob', 'hb10', 'lifeguard', 'boyfriends', 'nerd', 'hotties', 'dumbass', 'weirdos', 'jerkoff', 'mods', 'moron', 'niceguy', 'barmaid', 'morons', 'jedi', 'wanker', 'selector', 'hb4', 'girlfriend', 'dude', 'slut', 'surfers', 'druggies', 'caveman', 'pornstar', 'hb7', 'bouncers', 'baristas', 'airhead', 'hb5', 'hb6', 'puas', 'misfits', 'hacker', 'toddler', 'paedophile', 'filmmaker', 'commenters', 'pua', 'hb9', 'wingmen', 'dork', 'nerds', 'doofus', 'cheerleader', 'pervert', 'amog', 'babysitter', 

In [9]:
innov_vocab = set()
for year in range(2007, 2020): 
    print(year, yearly_innovations[str(year)])
    innov_vocab.update(yearly_innovations[str(year)])
    print()

2007 ['millennials', 'normies', 'hikikomori', 'aspies', 'sahm', 'autists', 'wimmenz', 'beckys', 'wymyn', 'fucktoy', 'randos', 'larpers', 'neurotypicals', 'stacie', 'landwhale', 'hambeasts', 'weeaboo', 'womyns', 'people irl', 'incel', 'fembot', 'camwhore', 'sex bots', 'cosplayers', 'cum dumpsters', 'mgtow', 'globalists', 'old hags', 'masculist', 'equalist', 'sex bot', 'dravidian', 'friendo', 'radfems', 'terf', 'foid', 'camgirl', 'femnazis', 'xw', 'infiltrator', 'wageslave', 'cuckolds', 'femnazi', 'intactivists', 'homebody', 'antifa', 'aspie', 'hambeast', 'female supremacist', 'batterers', 'cum dumpster', 'mra', 'transpeople', 'filipinas', 'stepdaughter', 'assaulter', 'transwoman', 'sloot', 'sexbot', 'sexbots', 'baby daddies', 'manlet', 'influencer', 'larper', 'feminsts', 'mras', 'prepper', 'honey badgers', 'tyrones', 'transwomen', 'cuckold', 'womenz', 'batterer', 'camgirls', 'masculinist', 'cumdumpster', 'femails', 'faer']

2008 ['women irl', 'weebs', 'suffragists', 'rape apologists', '

## Analyze source and destinations of lexical innovations

In [10]:
print(len(innov_vocab))
with open(LOGS + 'lexical_innovations.txt', 'w') as outfile: 
    for w in innov_vocab: 
        outfile.write(w + '\n')

254


In [11]:
common_sources = Counter()
mano_source_words = set()
outside_source_words = set()
for w in word_births: 
    if w not in innov_vocab: continue
    content = word_births[w]
    date = content[0]
    year = date.split('-')[0]
    if set(sources) & mano_communities: 
        mano_source_words.add(w)
    else: 
        outside_source_words.add(w)
    sources = content[1]
    common_sources.update(sources)

In [12]:
common_sources.most_common(30)

[('reddit.com', 65),
 ('FORUM_red_pill_talk', 39),
 ('FORUM_the_attraction', 16),
 ('politics', 12),
 ('askreddit', 11),
 ('mensrights', 11),
 ('incels', 10),
 ('FORUM_pua_forum', 7),
 ('funny', 5),
 ('pics', 4),
 ('thebluepill', 4),
 ('science', 4),
 ('wtf', 4),
 ('drama', 4),
 ('programming', 4),
 ('shitredditsays', 4),
 ('the_donald', 3),
 ('atheism', 3),
 ('seduction', 3),
 ('entertainment', 3),
 ('theredpill', 3),
 ('business', 3),
 ('news', 2),
 ('FORUM_rooshv', 2),
 ('4chan4trump', 2),
 ('mgtow', 2),
 ('purplepilldebate', 2),
 ('askmen', 2),
 ('videos', 2),
 ('worldnews', 2)]

In [15]:
top_n_subreddits = set()
with open(DATA + 'all_reddit_post_counts/top_subreddits.txt', 'r') as infile: 
    line_count = 0
    for line in infile: 
        top_n_subreddits.add(line.strip().split(' ')[0])
        line_count += 1
        if line_count == 500: break
print(len(top_n_subreddits))
for source in common_sources: 
    if source.startswith('FORUM_'): continue
    if source not in top_n_subreddits and source not in mano_communities: 
        print(source)

500
antipozi
kotakuinaction
puahate
waluigi
wikipedia
oney
4chan4trump
badhistory
netsec
gatekeeping
anarchism
purplepilldebate
justneckbeardthings
drama
bacon
equality
egalitarian
cooking
computebazaar
buildapcsales
shitredditsays
transgender
subredditdrama
diablo
tumblrinaction
cuckold
dirtykik
feminisms
soccerspirits
nofapjuly
pfjerk
4chan
celebrities
meetup
gay
sidehugs
gaybros
fitnesscirclejerk
pets
cortexcommand
theoryofreddit


In [14]:
print(mano_source_words)
print()
print(outside_source_words)

{'normie scum', 'hambeast', 'transwomen', 'blackcel', 'faer', 'oldcel', 'foid', 'escortcels', 'brigaders', 'influencer', 'gymcel', 'chadpreet', 'fuckboy', 'poorcel', 'volcel', 'redpiller', 'looksmatches', 'larpers', 'chadcel', 'norwood reaper', 'roasties', 'cuckqueers', 'sexbot', 'chaddam', 'feminsts', 'suffragists', 'red piller', 'chadlet', 'locationcel', 'ricecels', 'itcels', 'femails', 'oldcels', 'mentalcel', 'cuckservatives', 'tradcuck', 'sub8s', 'beta bux', 'ethnicel', 'camwhore', 'heightcel', 'fakecels', 'intactivist', 'manlet', 'chadlite', 'wymyn', 'friendo', 'rapefugees', 'stepdaughter', 'manlets', 'richcel', 'turbomanlet', 'bluepiller', 'mentalcels', 'aspies', 'landwhale', 'manginas', 'edgelord', 'trper', 'looksmatch', 'srsers', 'sluthaters', 'cuckcel', 'femcel', 'youngcels', 'wahmyn', 'gigachads', 'youngcel', 'khhv', 'numales', 'landwhales', 'shitlord', 'manchildren', 'wristcel', 'noodlewhore', 'wahmen', 'weebs', 'terfs', 'sex bot', 'intactivists', 'sedditors', 'women irl', '