In [3]:
# import libraries
from functools import lru_cache

import nltk
import pandas as pd
from nltk import tokenize, stem, corpus

In [4]:
# read the tab-separated file of reddit posts into a dataframe
df = pd.read_csv("rspct.tsv", sep="\t")

# report how many rows were loaded and how many distinct subreddits (classes) they span
print(
    f"There are {df.shape[0]} data points.",
    f"There are {df['subreddit'].nunique()} classes.",
    sep="\n",
)

There are 1013000 data points.
There are 1013 classes.


In [5]:
# choose how many subreddits to train on
n = 25  # number was decided based on hardware constraints

# names of the first n distinct subreddits, in order of first appearance in the file
n_classes = df['subreddit'].drop_duplicates().head(n).tolist()

# keep only the posts that belong to one of those n subreddits
df = df.loc[df['subreddit'].isin(n_classes)]

# report the size of the reduced dataset and how many classes remain
print(
    f"There are {df.shape[0]} data points.",
    f"There are {df['subreddit'].nunique()} classes.",
    sep="\n",
)

There are 25000 data points.
There are 25 classes.


In [6]:
# separate out training and testing sets

# fixed seed so the split (and every result derived from it) is the same on each run
SPLIT_SEED = 42

# stratified split: draw 80% of the rows of every subreddit for training
df_train = df.groupby('subreddit').sample(frac=0.8, random_state=SPLIT_SEED)

# the test set is every row whose index label was not drawn for training;
# dropping by index keeps the original column dtypes, unlike masking cell by cell
df_test = df.drop(index=df_train.index)

# sanity checks: the two sets do not overlap and together cover the whole dataset
assert df_train.index.intersection(df_test.index).empty
assert len(df_train) + len(df_test) == len(df)

# return number of data points and number of classes for test and training
print(f"There are {df_train.shape[0]} data points in the training set")
print(f"There are {df_train['subreddit'].nunique()} classes in the training set")

print(f"There are {df_test.shape[0]} data points in the test set")
print(f"There are {df_test['subreddit'].nunique()} classes in the test set")

There are 20000 data points in the training set
There are 25 classes in the training set
There are 5000 data points in the test set
There are 25 classes in the test set


In [7]:
# pre-processing the text

# Regular expressions used by clean_text, defined once instead of once per column.
HTML_TAG_PATTERN = r'<.*?>'
# anything starting with "http", or a "www." host, up to the next whitespace
URL_PATTERN = r'http\S+|www\.\S+'
# every character that is not a letter, digit or whitespace (underscore included)
NON_WORD_PATTERN = r'[^\w\s]|_'
# stop words (e.g. the, and, or); apostrophes are already stripped when this
# pattern runs, hence spellings such as "werent" and "wasnt"
STOP_WORDS = (
    'a', 'an', 'and', 'are', 'as', 'at', 'be', 'but', 'by', 'for', 'if', 'in',
    'into', 'is', 'it', 'no', 'not', 'of', 'on', 'or', 'such', 'that', 'the',
    'their', 'then', 'there', 'these', 'they', 'this', 'to', 'was', 'will',
    'with', 'my', 'oh', 'i', 'were', 'werent', 'wasnt', 'do', 'does',
)
STOP_WORD_PATTERN = r'\s*\b(?:' + '|'.join(STOP_WORDS) + r')\b'
# any word containing at least one digit, wherever it sits in the text
DIGIT_WORD_PATTERN = r'\s*\b\w*\d\w*\b'
WHITESPACE_PATTERN = r'\s+'


def clean_text(text: pd.Series, strip_markup: bool = False) -> pd.Series:
    """Normalize a column of raw post text.

    Steps, in order: lowercase, optionally remove HTML tags and URLs, remove
    punctuation/special characters, remove stop words, remove words that
    contain digits, and collapse whitespace.

    Parameters
    ----------
    text : pd.Series
        Column of strings to clean. Missing values are treated as empty strings.
    strip_markup : bool, default False
        When True, HTML tags and URLs are removed as well. This has to happen
        before punctuation is stripped, because both patterns rely on
        characters (``<``, ``>``, ``.``) that the punctuation step deletes.

    Returns
    -------
    pd.Series
        Cleaned strings, aligned with the index of ``text``.
    """
    cleaned = text.fillna('').str.lower()

    if strip_markup:
        # replace tags with a space so the words on either side are not glued together
        cleaned = cleaned.str.replace(HTML_TAG_PATTERN, ' ', regex=True)
        cleaned = cleaned.str.replace(URL_PATTERN, '', regex=True)

    cleaned = cleaned.str.replace(NON_WORD_PATTERN, '', regex=True)
    cleaned = cleaned.str.replace(STOP_WORD_PATTERN, '', regex=True)
    cleaned = cleaned.str.replace(DIGIT_WORD_PATTERN, '', regex=True)

    # collapse the gaps left behind by the removals above
    return cleaned.str.replace(WHITESPACE_PATTERN, ' ', regex=True).str.strip()


# post bodies may carry markup and links; titles only get the plain-text steps
df_train['selftext'] = clean_text(df_train['selftext'], strip_markup=True)
df_test['selftext'] = clean_text(df_test['selftext'], strip_markup=True)
df_train['title'] = clean_text(df_train['title'])
df_test['title'] = clean_text(df_test['title'])

In [8]:
# create the WordNet lemmatizer instance that the cells below reuse
lemmatizer = nltk.stem.WordNetLemmatizer()

# Get the POS Tag for lemmatization (reference: https://medium.com/@yashj302/lemmatization-f134b3089429 )
@lru_cache(maxsize=None)
def get_wordnet_pos(word):
    """Return the WordNet part-of-speech constant to use when lemmatizing ``word``.

    The word is tagged on its own (without sentence context) and the first
    letter of the resulting tag is mapped to a WordNet category: ``j`` to
    adjective, ``n`` to noun, ``v`` to verb and ``r`` to adverb. Any other
    tag falls back to noun.

    Because the word is tagged in isolation, the answer depends only on the
    word itself, so results are memoized: each distinct word is tagged once
    instead of once per occurrence in the corpus.

    Parameters
    ----------
    word : str
        A single token.

    Returns
    -------
    One of ``corpus.wordnet.ADJ``, ``NOUN``, ``VERB`` or ``ADV``.
    """
    # pos_tag returns [(word, tag)]; keep only the first letter of the tag
    tag = nltk.pos_tag([word])[0][1][0].lower()
    tag_dict = {'j': corpus.wordnet.ADJ,
                'n': corpus.wordnet.NOUN,
                'v': corpus.wordnet.VERB,
                'r': corpus.wordnet.ADV}
    return tag_dict.get(tag, corpus.wordnet.NOUN)

# apply lemmatizer on the text and title columns
def lemmatize_text(text):
    """Tokenize ``text``, lemmatize every token and re-join the lemmas with single spaces.

    Each token is lemmatized with the part of speech reported by
    ``get_wordnet_pos`` for that token.
    """
    return ' '.join(
        lemmatizer.lemmatize(token, get_wordnet_pos(token))
        for token in tokenize.word_tokenize(text)
    )


df_train['selftext'] = df_train['selftext'].apply(lemmatize_text)
df_test['selftext'] = df_test['selftext'].apply(lemmatize_text)
df_train['title'] = df_train['title'].apply(lemmatize_text)
df_test['title'] = df_test['title'].apply(lemmatize_text)

In [None]:
# persist the lemmatized training and test sets so later work can start from them;
# index=True (the default) also writes each row's index label as the first column
df_train.to_csv("training_set_milestone.csv", index=True)
df_test.to_csv("test_set_milestone.csv", index=True)