# Reddit Analysis

## Import and clean data

In [56]:
# imports
import pandas as pd
pd.options.mode.chained_assignment = None

import numpy as np
import matplotlib as matplot
import networkx as nx
import pickle

In [57]:
# import submissions
submissions_path = "data/initial/submissions_2008/"

submissions_dtype = {"id": "string", "url": "string", "permalink": "string", "author": "string", "created utc": int, "subreddit": "string", "subreddit id": "string", "num comments": int, "score": int, "over 18": bool, "distinguished": "string", "domain": "string", "stickied": bool, "locked": bool, "hide score": bool}
list_subs = []

for i in range(0,12):
    name = submissions_path + f"csv-{i}.csv"
    list_subs.append(pd.read_csv(name, dtype=submissions_dtype, index_col=0))
submissions = pd.concat(list_subs)

submissions

Unnamed: 0,id,url,permalink,author,created_utc,subreddit,subreddit_id,num_comments,score,over_18,distinguished,domain,stickied,locked,hide_score
0,648oo,http://www.ignorancedenied.com/viewthread.php?...,/r/reddit.com/comments/648oo/brain_disease_is_...,DITUS,1199145615,reddit.com,t5_6,1,0,False,,ignorancedenied.com,False,False,False
1,648op,http://www.flascience.org/wp/?p=363,/r/science/comments/648op/three_more_florida_c...,rmuser,1199145634,science,t5_mouw,5,20,False,,flascience.org,False,False,False
2,648oq,http://dlweinreb.wordpress.com/2007/12/31/obje...,/r/programming/comments/648oq/the_engineering_...,[deleted],1199145691,programming,t5_2fwo,0,0,False,,dlweinreb.wordpress.com,False,False,False
3,648or,http://hosted.ap.org/dynamic/stories/O/ODD_SHO...,/r/reddit.com/comments/648or/nude_couple_grapp...,zorno,1199145709,reddit.com,t5_6,1,3,False,,hosted.ap.org,False,False,False
4,648os,http://www.sltrib.com/opinion/ci_7846101?sourc...,/r/politics/comments/648os/apparently_bushs_pr...,rmuser,1199145735,politics,t5_2cneq,2,0,False,,sltrib.com,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
283910,7mq3r,http://www.youtube.com/watch?v=gdQH1CI4LHY&amp...,/r/politics/comments/7mq3r/ron_paul_on_recent_...,middkidd,1230767963,politics,t5_2cneq,3,1,False,,youtube.com,False,False,False
283911,7mq3s,http://xs434.xs.to/xs434/08013/a312.jpg,/r/ass/comments/7mq3s/francesca/,[deleted],1230767963,ass,t5_2qoil,0,9,True,,xs434.xs.to,False,False,False
283912,7mq3t,http://unemployedguide.blogspot.com/2008/12/la...,/r/nsfw/comments/7mq3t/laura_ingrahams_embryo/,[deleted],1230767974,nsfw,t5_vf2,1,0,True,,unemployedguide.blogspot.com,False,False,False
283913,7mq3u,http://www.youtube.com/watch?v=AnGpJ3_MnYQ&amp...,/r/gaming/comments/7mq3u/splinter_cell_double_...,[deleted],1230767984,gaming,t5_2qh03,0,0,False,,youtube.com,False,False,False


In [60]:
# import comments
comments_path = "data/initial/comments_2008/"
comments_dtype = {"id": "string", "author": "string", "link id": "string", "parent id": "string", "created utc": int, "subreddit": "string", "subreddit id": "string", "score": int, "distinguished": "string", "gilded": int, "controversiality": int}
list_comments = []

for i in range(0,12):
    name = comments_path + f"csv-{i}.csv"
    list_comments.append(pd.read_csv(name, dtype=comments_dtype, index_col=0))
comments = pd.concat(list_comments)

comments

Unnamed: 0,id,author,link_id,parent_id,created_utc,subreddit,subreddit_id,score,distinguished,gilded,controversiality
0,c02s9s6,Haven,t3_648oh,t1_c02s9rv,1199145604,reddit.com,t5_6,4,,0,0
1,c02s9s7,[deleted],t3_647ht,t1_c02s8c8,1199145619,reddit.com,t5_6,2,,0,0
2,c02s9s8,lilmiss2,t3_648oh,t1_c02s9rv,1199145620,reddit.com,t5_6,2,,0,0
3,c02s9s9,[deleted],t3_648oo,t3_648oo,1199145623,reddit.com,t5_6,1,,0,0
4,c02s9sa,[deleted],t3_648et,t3_648et,1199145632,reddit.com,t5_6,1,,0,0
...,...,...,...,...,...,...,...,...,...,...,...
850354,c06vwuf,[deleted],t3_7k8lt,t1_c06vwmo,1229579675,Music,t5_2qh1u,12,,0,0
850355,c06vwug,Morgin_Black,t3_7k3w5,t3_7k3w5,1229579679,comics,t5_2qh0s,0,,0,0
850356,c06vwuh,[deleted],t3_7k31f,t1_c06vhkx,1229579680,Economics,t5_2qh1s,1,,0,0
850357,c06vwui,onezerozeroone,t3_7k2bc,t1_c06vrvz,1229579685,atheism,t5_2qh2p,1,,0,0


 ### Clean up
 
 Since the data has been imported, let's clean it up
 
 First let's check if there are any *null values* and are the submission id *unique* 

In [61]:
print("Submissions: ")
print("Shape size: ", submissions.shape[0])
print("ID size: ", submissions["id"].shape[0])
print("ID unique: ", submissions["id"].is_unique)
print("ID has null values: ", submissions["id"].isnull().values.any())
print("ID has missing values: ", submissions["id"].isna().values.any())

Submissions: 
Shape size:  2519853
ID size:  2519853
ID unique:  True
ID has null values:  False
ID has missing values:  False


Great, submissions seem to be clean. Lets check the comments now 

In [62]:
print("Comments: ")
print("Shape size: ", comments.shape[0])
print("ID size: ", comments["id"].shape[0])
print("ID unique: ", comments["id"].is_unique)
print("ID has null values: ", comments["id"].isnull().values.any())
print("ID has missing values: ", comments["id"].isna().values.any())

Comments: 
Shape size:  7242871
ID size:  7242871
ID unique:  True
ID has null values:  True
ID has missing values:  True


In [63]:
print("Number of null values: ", comments[comments["id"].isnull() == True])

Number of null values:           id     author   link_id parent_id  created_utc   subreddit  \
29975  <NA>  fuzzybunn  t3_7kef0    t1_k7p   1229661971  reddit.com   

      subreddit_id  score distinguished  gilded  controversiality  
29975         t5_6      1          <NA>       0                 0  


Since it is only 1/7M, we can drop it

In [64]:
comments.dropna(subset=['id'], inplace = True)

In [65]:
print("Comments: ")
print("Shape size: ", comments.shape[0])
print("ID size: ", comments["id"].shape[0])
print("ID unique: ", comments["id"].is_unique)
print("ID has null values: ", comments["id"].isnull().values.any())
print("ID has missing values: ", comments["id"].isna().values.any())

Comments: 
Shape size:  7242870
ID size:  7242870
ID unique:  True
ID has null values:  False
ID has missing values:  False


### On further inspection!

A good observation is that each subreddit must have a unique url. I will check that as well.

There seem to be authors that have been deleted. Their submissions and comment's should be deleted possibly


1. Unique URLs:

In [66]:
print("Submissions: ")
print("Shape size: ", submissions.shape[0])
print("URL size: ", submissions["url"].shape[0])
print("ID unique: ", submissions["id"].is_unique)
print("ID has null values: ", submissions["id"].isnull().values.any())
print("ID has missing values: ", submissions["id"].isna().values.any())

Submissions: 
Shape size:  2519853
URL size:  2519853
ID unique:  True
ID has null values:  False
ID has missing values:  False


2. Delete submissions and comments with who's author was deleted

In [67]:
submissions = submissions[submissions["author"] != "[deleted]"]
comments = comments[comments["author"] != "[deleted]"]

In [68]:
submissions

Unnamed: 0,id,url,permalink,author,created_utc,subreddit,subreddit_id,num_comments,score,over_18,distinguished,domain,stickied,locked,hide_score
0,648oo,http://www.ignorancedenied.com/viewthread.php?...,/r/reddit.com/comments/648oo/brain_disease_is_...,DITUS,1199145615,reddit.com,t5_6,1,0,False,,ignorancedenied.com,False,False,False
1,648op,http://www.flascience.org/wp/?p=363,/r/science/comments/648op/three_more_florida_c...,rmuser,1199145634,science,t5_mouw,5,20,False,,flascience.org,False,False,False
3,648or,http://hosted.ap.org/dynamic/stories/O/ODD_SHO...,/r/reddit.com/comments/648or/nude_couple_grapp...,zorno,1199145709,reddit.com,t5_6,1,3,False,,hosted.ap.org,False,False,False
4,648os,http://www.sltrib.com/opinion/ci_7846101?sourc...,/r/politics/comments/648os/apparently_bushs_pr...,rmuser,1199145735,politics,t5_2cneq,2,0,False,,sltrib.com,False,False,False
5,648ot,http://hosted.ap.org/dynamic/stories/O/ODD_RAR...,/r/reddit.com/comments/648ot/diners_find_rare_...,zorno,1199145735,reddit.com,t5_6,0,0,False,,hosted.ap.org,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
283906,7mq3n,http://ventaboutsports.blogspot.com/2008/12/so...,/r/funny/comments/7mq3n/some_extremely_corny_j...,themightymidget,1230767909,funny,t5_2qh33,0,1,False,,ventaboutsports.blogspot.com,False,False,False
283907,7mq3o,http://www.pbs.org/mormons/etc/genealogy.html,/r/news/comments/7mq3o/pbs_looks_at_the_massiv...,Tom22,1230767926,news,t5_2qh3l,0,0,False,,pbs.org,False,False,False
283909,7mq3q,http://www.narutogames.biz,/r/reddit.com/comments/7mq3q/naruto_games/,bixiebix,1230767937,reddit.com,t5_6,7,1,False,,narutogames.biz,False,False,False
283910,7mq3r,http://www.youtube.com/watch?v=gdQH1CI4LHY&amp...,/r/politics/comments/7mq3r/ron_paul_on_recent_...,middkidd,1230767963,politics,t5_2cneq,3,1,False,,youtube.com,False,False,False


In [69]:
comments

Unnamed: 0,id,author,link_id,parent_id,created_utc,subreddit,subreddit_id,score,distinguished,gilded,controversiality
0,c02s9s6,Haven,t3_648oh,t1_c02s9rv,1199145604,reddit.com,t5_6,4,,0,0
2,c02s9s8,lilmiss2,t3_648oh,t1_c02s9rv,1199145620,reddit.com,t5_6,2,,0,0
6,c02s9sc,EverybodysAnAsshole,t3_648et,t1_c02s976,1199145644,reddit.com,t5_6,2,,0,0
7,c02s9sd,generalk,t3_647yd,t1_c02s8md,1199145647,programming,t5_2fwo,13,,0,0
8,c02s9se,seeker135,t3_6483n,t3_6483n,1199145650,politics,t5_2cneq,4,,0,0
...,...,...,...,...,...,...,...,...,...,...,...
850352,c06vwud,CommodoreGuff,t3_7k1l5,t1_c06vpzj,1229579674,programming,t5_2fwo,1,,0,0
850353,c06vwue,wolfzero,t3_7k4if,t1_c06vs7l,1229579675,technology,t5_2qh16,4,,0,0
850355,c06vwug,Morgin_Black,t3_7k3w5,t3_7k3w5,1229579679,comics,t5_2qh0s,0,,0,0
850357,c06vwui,onezerozeroone,t3_7k2bc,t1_c06vrvz,1229579685,atheism,t5_2qh2p,1,,0,0


Let's also rename ID for good practice

In [70]:
submissions.rename(columns={"id": "submissions_id"})
submissions.reset_index(drop=True, inplace=True)
submissions["id"] = submissions.index
submissions

Unnamed: 0,id,url,permalink,author,created_utc,subreddit,subreddit_id,num_comments,score,over_18,distinguished,domain,stickied,locked,hide_score
0,0,http://www.ignorancedenied.com/viewthread.php?...,/r/reddit.com/comments/648oo/brain_disease_is_...,DITUS,1199145615,reddit.com,t5_6,1,0,False,,ignorancedenied.com,False,False,False
1,1,http://www.flascience.org/wp/?p=363,/r/science/comments/648op/three_more_florida_c...,rmuser,1199145634,science,t5_mouw,5,20,False,,flascience.org,False,False,False
2,2,http://hosted.ap.org/dynamic/stories/O/ODD_SHO...,/r/reddit.com/comments/648or/nude_couple_grapp...,zorno,1199145709,reddit.com,t5_6,1,3,False,,hosted.ap.org,False,False,False
3,3,http://www.sltrib.com/opinion/ci_7846101?sourc...,/r/politics/comments/648os/apparently_bushs_pr...,rmuser,1199145735,politics,t5_2cneq,2,0,False,,sltrib.com,False,False,False
4,4,http://hosted.ap.org/dynamic/stories/O/ODD_RAR...,/r/reddit.com/comments/648ot/diners_find_rare_...,zorno,1199145735,reddit.com,t5_6,0,0,False,,hosted.ap.org,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2044805,2044805,http://ventaboutsports.blogspot.com/2008/12/so...,/r/funny/comments/7mq3n/some_extremely_corny_j...,themightymidget,1230767909,funny,t5_2qh33,0,1,False,,ventaboutsports.blogspot.com,False,False,False
2044806,2044806,http://www.pbs.org/mormons/etc/genealogy.html,/r/news/comments/7mq3o/pbs_looks_at_the_massiv...,Tom22,1230767926,news,t5_2qh3l,0,0,False,,pbs.org,False,False,False
2044807,2044807,http://www.narutogames.biz,/r/reddit.com/comments/7mq3q/naruto_games/,bixiebix,1230767937,reddit.com,t5_6,7,1,False,,narutogames.biz,False,False,False
2044808,2044808,http://www.youtube.com/watch?v=gdQH1CI4LHY&amp...,/r/politics/comments/7mq3r/ron_paul_on_recent_...,middkidd,1230767963,politics,t5_2cneq,3,1,False,,youtube.com,False,False,False


In [71]:
comments.rename(columns={"id": "comments_id"})
comments.reset_index(drop=True, inplace=True)
comments["id"] = comments.index
comments

Unnamed: 0,id,author,link_id,parent_id,created_utc,subreddit,subreddit_id,score,distinguished,gilded,controversiality
0,0,Haven,t3_648oh,t1_c02s9rv,1199145604,reddit.com,t5_6,4,,0,0
1,1,lilmiss2,t3_648oh,t1_c02s9rv,1199145620,reddit.com,t5_6,2,,0,0
2,2,EverybodysAnAsshole,t3_648et,t1_c02s976,1199145644,reddit.com,t5_6,2,,0,0
3,3,generalk,t3_647yd,t1_c02s8md,1199145647,programming,t5_2fwo,13,,0,0
4,4,seeker135,t3_6483n,t3_6483n,1199145650,politics,t5_2cneq,4,,0,0
...,...,...,...,...,...,...,...,...,...,...,...
4873684,4873684,CommodoreGuff,t3_7k1l5,t1_c06vpzj,1229579674,programming,t5_2fwo,1,,0,0
4873685,4873685,wolfzero,t3_7k4if,t1_c06vs7l,1229579675,technology,t5_2qh16,4,,0,0
4873686,4873686,Morgin_Black,t3_7k3w5,t3_7k3w5,1229579679,comics,t5_2qh0s,0,,0,0
4873687,4873687,onezerozeroone,t3_7k2bc,t1_c06vrvz,1229579685,atheism,t5_2qh2p,1,,0,0


### Now to export cleaned data

In [72]:
with open("data/comments_cleaned", 'wb') as file:
    pickle.dump(comments, file)
    
with open("data/submissions_cleaned", 'wb') as file:
    pickle.dump(submissions, file)

> Check if the files are correct

In [74]:
with open("data/comments_cleaned", 'rb') as file:
    comments = pickle.load(file)
    
with open("data/submissions_cleaned", 'rb') as file:
    submissions = pickle.load(file)

In [75]:
submissions

Unnamed: 0,id,url,permalink,author,created_utc,subreddit,subreddit_id,num_comments,score,over_18,distinguished,domain,stickied,locked,hide_score
0,0,http://www.ignorancedenied.com/viewthread.php?...,/r/reddit.com/comments/648oo/brain_disease_is_...,DITUS,1199145615,reddit.com,t5_6,1,0,False,,ignorancedenied.com,False,False,False
1,1,http://www.flascience.org/wp/?p=363,/r/science/comments/648op/three_more_florida_c...,rmuser,1199145634,science,t5_mouw,5,20,False,,flascience.org,False,False,False
2,2,http://hosted.ap.org/dynamic/stories/O/ODD_SHO...,/r/reddit.com/comments/648or/nude_couple_grapp...,zorno,1199145709,reddit.com,t5_6,1,3,False,,hosted.ap.org,False,False,False
3,3,http://www.sltrib.com/opinion/ci_7846101?sourc...,/r/politics/comments/648os/apparently_bushs_pr...,rmuser,1199145735,politics,t5_2cneq,2,0,False,,sltrib.com,False,False,False
4,4,http://hosted.ap.org/dynamic/stories/O/ODD_RAR...,/r/reddit.com/comments/648ot/diners_find_rare_...,zorno,1199145735,reddit.com,t5_6,0,0,False,,hosted.ap.org,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2044805,2044805,http://ventaboutsports.blogspot.com/2008/12/so...,/r/funny/comments/7mq3n/some_extremely_corny_j...,themightymidget,1230767909,funny,t5_2qh33,0,1,False,,ventaboutsports.blogspot.com,False,False,False
2044806,2044806,http://www.pbs.org/mormons/etc/genealogy.html,/r/news/comments/7mq3o/pbs_looks_at_the_massiv...,Tom22,1230767926,news,t5_2qh3l,0,0,False,,pbs.org,False,False,False
2044807,2044807,http://www.narutogames.biz,/r/reddit.com/comments/7mq3q/naruto_games/,bixiebix,1230767937,reddit.com,t5_6,7,1,False,,narutogames.biz,False,False,False
2044808,2044808,http://www.youtube.com/watch?v=gdQH1CI4LHY&amp...,/r/politics/comments/7mq3r/ron_paul_on_recent_...,middkidd,1230767963,politics,t5_2cneq,3,1,False,,youtube.com,False,False,False


In [76]:
comments

Unnamed: 0,id,author,link_id,parent_id,created_utc,subreddit,subreddit_id,score,distinguished,gilded,controversiality
0,0,Haven,t3_648oh,t1_c02s9rv,1199145604,reddit.com,t5_6,4,,0,0
1,1,lilmiss2,t3_648oh,t1_c02s9rv,1199145620,reddit.com,t5_6,2,,0,0
2,2,EverybodysAnAsshole,t3_648et,t1_c02s976,1199145644,reddit.com,t5_6,2,,0,0
3,3,generalk,t3_647yd,t1_c02s8md,1199145647,programming,t5_2fwo,13,,0,0
4,4,seeker135,t3_6483n,t3_6483n,1199145650,politics,t5_2cneq,4,,0,0
...,...,...,...,...,...,...,...,...,...,...,...
4873684,4873684,CommodoreGuff,t3_7k1l5,t1_c06vpzj,1229579674,programming,t5_2fwo,1,,0,0
4873685,4873685,wolfzero,t3_7k4if,t1_c06vs7l,1229579675,technology,t5_2qh16,4,,0,0
4873686,4873686,Morgin_Black,t3_7k3w5,t3_7k3w5,1229579679,comics,t5_2qh0s,0,,0,0
4873687,4873687,onezerozeroone,t3_7k2bc,t1_c06vrvz,1229579685,atheism,t5_2qh2p,1,,0,0
