In [None]:
# importing libraries for data loading and visualization

import pandas as pd
import numpy as np
import sqlite3
import matplotlib.pyplot as plt

# Let pandas print up to 1000 rows before truncating displayed output.
pd.set_option("display.max_rows", 1000)

In [None]:
# Load the Reddit May 2015 comment data (Kaggle SQLite database): only the
# author, subreddit and body columns of the May2015 table are read.

sql_conn = sqlite3.connect("data/database.sqlite")
try:
    all_comments = pd.read_sql("SELECT author, subreddit, body FROM May2015", sql_conn)
finally:
    # read_sql has already returned the full result as a DataFrame, so the
    # connection is released here even if the query fails.
    sql_conn.close()

In [None]:
# Add a `comment_length` column holding len(body) for every comment.
# Missing bodies count as length 0. The vectorised string accessor replaces a
# row-by-row Python loop over the whole table.

all_comments['comment_length'] = (
    all_comments['body']
    .str.len()        # NaN where the body is missing
    .fillna(0)
    .astype('int64')  # .str.len() yields floats when NaNs are present
)

In [None]:
# Dataset-wide summary: row count, distinct subreddits, distinct authors and
# the mean of the comment_length column.
for label, value in (
    ("Total comments: ", len(all_comments)),
    ("Total subreddits: ", all_comments['subreddit'].nunique()),
    ("Unique authors: ", all_comments['author'].nunique()),
    ("Average comment length: ", all_comments['comment_length'].mean()),
):
    print(label, value)

In [None]:
# Restrict the full comment table to rows whose subreddit is r/nfl.

sub = ['nfl']

in_nfl = all_comments['subreddit'].isin(sub)
nfl_comments = all_comments[in_nfl]

# Headline numbers for the r/nfl subset.
for label, value in (
    ("Total r/nfl comments: ", len(nfl_comments)),
    ("Unique r/nfl authors: ", nfl_comments['author'].nunique()),
    ("Average r/nfl comment length: ", nfl_comments['comment_length'].mean()),
):
    print(label, value)

In [None]:
# Rank r/nfl authors by number of comments and keep the ten most active ones
# for modeling, ignoring the accounts named in exclusion_list.

exclusion_list = ['[deleted]', 'AutoModerator']

nfl_authors = nfl_comments['author']
nfl_author_counts = nfl_authors[~nfl_authors.isin(exclusion_list)].value_counts()

# value_counts() sorts by descending frequency, so the first ten are the top ten.
nfl_top_10_authors = nfl_author_counts.head(10).index.values

print("Top 10 authors: ", nfl_top_10_authors)

In [None]:
# Compare the top-10 r/nfl authors by average comment length and by number of comments.

# Keep only comments written by the top-10 authors; rows repeating the same
# (author, body) pair are collapsed to a single row.
nfl_comments_top_10_all = (
    nfl_comments
    .loc[nfl_comments['author'].isin(nfl_top_10_authors)]
    .drop_duplicates(subset=["author", "body"])
)

# Mean comment_length per author, longest first.
nfl_top_comments_length = (
    nfl_comments_top_10_all
    .groupby('author', as_index=False)['comment_length']
    .mean()
    .sort_values('comment_length', ascending=False)
)
# Number of (de-duplicated) comments per author, most active first.
nfl_top_comments_count = nfl_comments_top_10_all['author'].value_counts()

# Each chart gets its own explicitly created axes. DataFrame.plot opens a new
# figure when no `ax` is given, so a bare plt.figure() before it would only
# leave an empty figure behind.
fig_length, ax_length = plt.subplots(figsize=(20, 10))
nfl_top_comments_length.plot.bar(x='author', y='comment_length', ax=ax_length)
ax_length.set(
    title='r/nfl top 10 authors: average comment length',
    xlabel='author',
    ylabel='average comment length',
)

# The counts are a Series indexed by author; x=/y= column selectors only apply
# to DataFrames, so the axis labels are set explicitly instead.
fig_count, ax_count = plt.subplots(figsize=(20, 10))
nfl_top_comments_count.plot.bar(ax=ax_count)
ax_count.set(
    title='r/nfl top 10 authors: number of comments',
    xlabel='author',
    ylabel='comments',
)
plt.show()

In [None]:
# Sample the same number of comments (1000) from every top-10 r/nfl author so
# that no author is over-represented, then export author/body pairs to CSV.
#
# - random_state pins the draw, so re-running the notebook writes the same file.
# - Groups are iterated explicitly instead of going through groupby.apply, so
#   the 'author' column is always kept in the result (to_csv needs it).
# - DataFrame.sample raises ValueError if an author has fewer than 1000 rows.

nfl_comments_top_10_sampled = pd.concat([
    author_comments.sample(n=1000, random_state=42)
    for _, author_comments in nfl_comments_top_10_all.groupby('author')
])
nfl_comments_top_10_sampled.to_csv('data/nfl_comments_top_10_sampled.csv', columns = ['author','body'], index = False)

In [None]:
# Restrict the full comment table to rows whose subreddit is r/nba.

sub = ['nba']

in_nba = all_comments['subreddit'].isin(sub)
nba_comments = all_comments[in_nba]

# Headline numbers for the r/nba subset.
for label, value in (
    ("Total r/nba comments: ", len(nba_comments)),
    ("Unique r/nba authors: ", nba_comments['author'].nunique()),
    ("Average r/nba comment length: ", nba_comments['comment_length'].mean()),
):
    print(label, value)

In [None]:
# Rank r/nba authors by number of comments and keep the ten most active ones
# for modeling, ignoring the accounts named in exclusion_list.

exclusion_list = ['[deleted]', 'AutoModerator']

nba_authors = nba_comments['author']
nba_author_counts = nba_authors[~nba_authors.isin(exclusion_list)].value_counts()

# value_counts() sorts by descending frequency, so the first ten are the top ten.
nba_top_10_authors = nba_author_counts.head(10).index.values

print("Top 10 authors: ", nba_top_10_authors)

In [None]:
# Compare the top-10 r/nba authors by average comment length and by number of comments.

# Keep only comments written by the top-10 authors; rows repeating the same
# (author, body) pair are collapsed to a single row.
nba_comments_top_10_all = (
    nba_comments
    .loc[nba_comments['author'].isin(nba_top_10_authors)]
    .drop_duplicates(subset=["author", "body"])
)

print(np.shape(nba_comments_top_10_all))

# Drop incomplete ROWS (axis=0). With axis=1 a single missing body would remove
# the entire 'body' column, which the CSV export further down requires.
nba_comments_top_10_all = nba_comments_top_10_all.dropna(axis=0, how='any')

# Shape printed before and after shows how many rows the null filter removed.
print(np.shape(nba_comments_top_10_all))

# Mean comment_length per author, longest first.
nba_top_comments_length = (
    nba_comments_top_10_all
    .groupby('author', as_index=False)['comment_length']
    .mean()
    .sort_values('comment_length', ascending=False)
)
# Number of (de-duplicated) comments per author, most active first.
nba_top_comments_count = nba_comments_top_10_all['author'].value_counts()

# Each chart gets its own explicitly created axes. DataFrame.plot opens a new
# figure when no `ax` is given, so a bare plt.figure() before it would only
# leave an empty figure behind.
fig_length, ax_length = plt.subplots(figsize=(20, 10))
nba_top_comments_length.plot.bar(x='author', y='comment_length', ax=ax_length)
ax_length.set(
    title='r/nba top 10 authors: average comment length',
    xlabel='author',
    ylabel='average comment length',
)

# The counts are a Series indexed by author; x=/y= column selectors only apply
# to DataFrames, so the axis labels are set explicitly instead.
fig_count, ax_count = plt.subplots(figsize=(20, 10))
nba_top_comments_count.plot.bar(ax=ax_count)
ax_count.set(
    title='r/nba top 10 authors: number of comments',
    xlabel='author',
    ylabel='comments',
)
plt.show()

In [None]:
# Sample the same number of comments (1300) from every top-10 r/nba author so
# that no author is over-represented, then export author/body pairs to CSV.
#
# - random_state pins the draw, so re-running the notebook writes the same file.
# - Groups are iterated explicitly instead of going through groupby.apply, so
#   the 'author' column is always kept in the result (to_csv needs it).
# - DataFrame.sample raises ValueError if an author has fewer than 1300 rows.

nba_comments_top_10_sampled = pd.concat([
    author_comments.sample(n=1300, random_state=42)
    for _, author_comments in nba_comments_top_10_all.groupby('author')
])
nba_comments_top_10_sampled.to_csv('data/nba_comments_top_10_sampled.csv', columns = ['author','body'], index = False)