In [1]:
import os
import re
import string
from collections import Counter
from collections import defaultdict
from datetime import datetime
from string import punctuation
from urllib.parse import urlparse

import networkx as nx
import nltk
import pandas as pd
import matplotlib
import seaborn as sns
from textblob import TextBlob
from nltk.corpus import stopwords
from nltk.util import ngrams

In [2]:
%matplotlib inline
matplotlib.rcParams['figure.max_open_warning'] = 1000
matplotlib.rcParams['lines.linewidth'] = 1.0

pd.set_option('display.float_format', lambda x: '%.2f' % x)

In [3]:
def my_ngrams(tokens, rg):
    """Return every n-gram of ``tokens`` for each size in an inclusive range.

    Parameters
    ----------
    tokens : iterable
        Token sequence. It is materialised into a list once so that
        one-shot iterables (generators, iterators) yield n-grams for every
        requested size instead of only the first one.
    rg : sequence
        ``rg[0]`` is the smallest n-gram size and ``rg[1]`` the largest;
        both ends are included.

    Returns
    -------
    list of tuple
        All n-grams of size ``rg[0]``, then ``rg[0] + 1``, ... up to
        ``rg[1]``, in that order. Empty when ``rg[0] > rg[1]``.
    """
    # `ngrams` consumes its input, so re-using a generator across sizes
    # would silently produce nothing after the first pass.
    tokens = list(tokens)
    min_n, max_n = rg[0], rg[1]

    ngrms = []
    for n in range(min_n, max_n + 1):
        ngrms.extend(ngrams(tokens, n))
    return ngrms

# Preparing data

## Selected keywords

In [4]:
# Search terms used to select comments. Some entries carry backslash-escaped
# `?` / `+` characters; those are written as raw strings so the backslashes
# are kept verbatim without relying on Python's handling of unrecognised
# escape sequences (which emits a warning on recent interpreters).
original_keywords = [
    "Picard", "Kirk", "Sulu", "Uhura", "Spock", "McCoy", "Bones", "Scotty",
    "Chekhov", "Crusher", "Nurse Chapel", "Sarek", "Nero", "Khan", "Data",
    "Pike", "Star Trek", "Klingon", "Vulcan", "Romulan", "Star fleet",
    "Starship Enterprise", "Delta Vega", "Earth", "Orion", "Romulan Narada",
    "Stardate", "Transporter beam",
    r"Beam me up,\? Scotty",
    "Kobayashi Maru",
    r"Space,\? The Final Frontier",
    "Energize", "Vulcan Salute", "Prime Directive", "Live long and prosper",
    "LLAP",
    r"I'm a doctor,\? not a",
    r"KHAAA\+N",
    r"When you eliminate the impossible,\? whatever remains,\? however improbable,\? must be the truth",
    r"Without followers,\? evil cannot spread",
    "The needs of the many outweigh the needs of the few",
    "Highly illogical",
    "to boldly go where no man has gone before",
    r"I'm giving her all she's got,\? Captain",
    "Nuclear wessels", "Set phasers to stun", "Resistance is futile",
    "I have been and always shall be your friend",
]

In [5]:
# Show the raw keyword list, including the backslash-escaped entries.
print(original_keywords)

['Picard', 'Kirk', 'Sulu', 'Uhura', 'Spock', 'McCoy', 'Bones', 'Scotty', 'Chekhov', 'Crusher', 'Nurse Chapel', 'Sarek', 'Nero', 'Khan', 'Data', 'Pike', 'Star Trek', 'Klingon', 'Vulcan', 'Romulan', 'Star fleet', 'Starship Enterprise', 'Delta Vega', 'Earth', 'Orion', 'Romulan Narada', 'Stardate', 'Transporter beam', 'Beam me up,\\? Scotty', 'Kobayashi Maru', 'Space,\\? The Final Frontier', 'Energize', 'Vulcan Salute', 'Prime Directive', 'Live long and prosper', 'LLAP', "I'm a doctor,\\? not a", 'KHAAA\\+N', 'When you eliminate the impossible,\\? whatever remains,\\? however improbable,\\? must be the truth', 'Without followers,\\? evil cannot spread', 'The needs of the many outweigh the needs of the few', 'Highly illogical', 'to boldly go where no man has gone before', "I'm giving her all she's got,\\? Captain", 'Nuclear wessels', 'Set phasers to stun', 'Resistance is futile', 'I have been and always shall be your friend']


In [6]:
# Keywords that are removed from the original list before it is used.
ambiguous_keywords = ['Bones', 'Khan', 'Data', 'Earth', 'Energize']

In [7]:
# Final keyword list: drop the ambiguous terms, strip backslashes,
# lower-case everything, and sort alphabetically.
keywords = sorted(
    term.replace('\\', '').lower()
    for term in set(original_keywords) - set(ambiguous_keywords)
)

In [8]:
# Show the cleaned, lower-cased, sorted keyword list.
print(keywords)

['beam me up,? scotty', 'chekhov', 'crusher', 'delta vega', 'highly illogical', 'i have been and always shall be your friend', "i'm a doctor,? not a", "i'm giving her all she's got,? captain", 'khaaa+n', 'kirk', 'klingon', 'kobayashi maru', 'live long and prosper', 'llap', 'mccoy', 'nero', 'nuclear wessels', 'nurse chapel', 'orion', 'picard', 'pike', 'prime directive', 'resistance is futile', 'romulan', 'romulan narada', 'sarek', 'scotty', 'set phasers to stun', 'space,? the final frontier', 'spock', 'star fleet', 'star trek', 'stardate', 'starship enterprise', 'sulu', 'the needs of the many outweigh the needs of the few', 'to boldly go where no man has gone before', 'transporter beam', 'uhura', 'vulcan', 'vulcan salute', 'when you eliminate the impossible,? whatever remains,? however improbable,? must be the truth', 'without followers,? evil cannot spread']


## Comments

In [9]:
%%time

dtypes = {
    'author': str,
    'body': str,
    'controversiality': int,
    'created_utc': pd.tslib.Timestamp,
    'distinguished': str,
    'downs': str,
    'gilded': str,
    'id': str,
    'name': str,
    'parent_id': str,
    'score': int,
    'subreddit': str,
    'ups': str,
    'month': pd.tslib.Timestamp,
    'year': pd.tslib.Timestamp,
}

reddit_df = pd.read_csv('user_network.csv', header=0, dtype=dtypes, parse_dates=['created_utc', 'month', 'year'])
reddit_df = reddit_df.fillna(0)

CPU times: user 3.92 s, sys: 95.4 ms, total: 4.01 s
Wall time: 4.01 s


In [10]:
# Number of rows right after loading.
len(reddit_df)

358408

In [11]:
# Preview the first rows of the loaded frame.
reddit_df.head()

Unnamed: 0,author,body,controversiality,created_utc,distinguished,downs,gilded,id,name,parent_id,score,subreddit,ups,month,year
0,KineticSolution,"what does ""ymmv"" stand for? - - regarding the...",0,2013-05-01 14:16:52,0,0.0,0,c9qftjc,t1_c9qftjc,t1_c9qflks,0,Conservative,0,2013-05-01,2013-01-01
1,[deleted],your mileage may vary. - - you don't get it. ...,0,2013-05-01 14:31:20,0,0.0,0,c9qg631,t1_c9qg631,t1_c9qftjc,1,Conservative,1,2013-05-01,2013-01-01
2,gunner1868,where'd you find this teaser? curious to watch...,0,2014-07-07 22:00:34,0,0.0,0,cir7sp2,t1_cir7sp2,t1_cir37o7,1,pike,1,2014-07-01,2014-01-01
3,LostVikingC,https://www.youtube.com/watch?v=ifjsx-7zwkm,0,2014-07-08 10:35:30,0,0.0,0,cirld22,t1_cirld22,t1_cir7sp2,1,pike,1,2014-07-01,2014-01-01
4,jimmysilverrims,in one instance a culture was about to be dest...,0,2013-05-11 01:17:50,0,0.0,0,c9wm86e,t1_c9wm86e,t1_c9wm2pu,1,DaystromInstitute,1,2013-05-01,2013-01-01


# Cleaning data

In [12]:
# List the available columns.
reddit_df.columns

Index(['author', 'body', 'controversiality', 'created_utc', 'distinguished',
       'downs', 'gilded', 'id', 'name', 'parent_id', 'score', 'subreddit',
       'ups', 'month', 'year'],
      dtype='object')

In [13]:
# Remove duplicated rows. The comparison uses every column except
# 'downs', 'score' and 'ups', so rows that differ only in those three
# columns are treated as duplicates.
reddit_df = reddit_df.drop_duplicates(
    subset=[
        'author', 'body', 'controversiality', 'created_utc', 'distinguished',
        'gilded', 'id', 'name', 'parent_id', 'subreddit', 'month', 'year',
    ],
)

In [14]:
# Row count after dropping duplicates.
len(reddit_df)

358344

In [15]:
# Keep only rows whose author is not the '[deleted]' placeholder.
reddit_df = reddit_df.loc[~(reddit_df['author'] == '[deleted]')]

In [16]:
# Row count after removing '[deleted]' authors.
len(reddit_df)

338857

# Analyzing data

In [17]:
# Empty directed graph; the next cell fills it with weighted
# author -> parent-author edges.
graph = nx.DiGraph()

In [18]:
%%time

unique_names = reddit_df['name'].unique()

for i, x in reddit_df.iterrows():
    pid = x['parent_id']
    if pid in unique_names:
        user = x['author']
        aux_df = reddit_df[reddit_df['name'] == pid]
        aux_sr = aux_df.iloc[0]
        p_user = aux_sr['author']
        if user != p_user:
            if not graph.has_edge(user, p_user):
                graph.add_edge(user, p_user, weight=1)
            else:
                graph[user][p_user]['weight'] += 1

CPU times: user 2h 40min 2s, sys: 7.71 s, total: 2h 40min 9s
Wall time: 2h 40min 6s


In [19]:
# Number of distinct authors that appear in at least one edge.
graph.number_of_nodes()

136548

In [20]:
# Number of distinct directed (author -> parent author) pairs.
graph.number_of_edges()

182127

# Saving data

In [21]:
# Write the graph to 'user_graph.gexf' in GEXF format.
nx.write_gexf(graph, 'user_graph.gexf')