# ***Part 2: predict the hashtag/topic/subreddit of a post based on its textual content***

# Importing data & visualization

In [None]:
# Mount Google Drive at /content/drive so the dataset files read later in this
# notebook (under /content/drive/MyDrive/...) are reachable.
# Requires the google.colab package, i.e. a Google Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import numpy as np
import pandas as pd

# Low-resource switch read by later cells. When True:
#   - join_text truncates every post (title to 100 characters, body to 512),
#   - the TF-IDF vocabulary is capped at 30000 terms instead of 100000,
#   - the chi2 feature-selection step is skipped.
RUNNING_KAGGLE_KERNEL = True 

In [None]:
# Posts table: tab-separated, one self-post per row.
rspct_path = "/content/drive/MyDrive/Senior Project/rspct.tsv"
rspct_df = pd.read_csv(rspct_path, sep='\t')

# Subreddit metadata table: comma-separated, one subreddit per row.
info_path = "/content/drive/MyDrive/Senior Project/subreddit_info.csv"
info_df = pd.read_csv(info_path)

In [None]:
# Preview the first five posts (head() shows 5 rows by default).
rspct_df.head()

Unnamed: 0,id,subreddit,title,selftext
0,6d8knd,talesfromtechsupport,Remember your command line switches...,"Hi there, <lb>The usual. Long time lerker, fi..."
1,58mbft,teenmom,"So what was Matt ""addicted"" to?",Did he ever say what his addiction was or is h...
2,8f73s7,Harley,No Club Colors,Funny story. I went to college in Las Vegas. T...
3,6ti6re,ringdoorbell,"Not door bell, but floodlight mount height.",I know this is a sub for the 'Ring Doorbell' b...
4,77sxto,intel,Worried about my 8700k small fft/data stress r...,"Prime95 (regardless of version) and OCCT both,..."


In [None]:

# Keep only the subreddits flagged as present in the dataset. reset_index()
# renumbers the rows from 0 and stores the previous row labels in a new
# `index` column.
in_data_mask = info_df['in_data']
info_df = info_df.loc[in_data_mask].reset_index()
info_df.head()

Unnamed: 0,index,subreddit,category_1,category_2,category_3,in_data,reason_for_exclusion
0,0,whatsthatbook,advice/question,book,,True,
1,25,theydidthemath,advice/question,calculations,,True,
2,26,datarecovery,advice/question,data recovery,,True,
3,27,declutter,advice/question,declutter,,True,
4,30,productivity,advice/question,discipline,,True,


In [None]:

def join_text(row):
    """Combine a post's title and body into the single string used as model input.

    Parameters
    ----------
    row : mapping with 'title' and 'selftext' entries (e.g. one DataFrame row).

    Returns
    -------
    str
        The title, one space, then the body. When RUNNING_KAGGLE_KERNEL is
        true the title is first cut to 100 characters and the body to 512.
        A value that is not a string (such as the float NaN pandas stores for
        a missing cell) contributes empty text instead of raising TypeError.
    """
    title = row['title'] if isinstance(row['title'], str) else ""
    body = row['selftext'] if isinstance(row['selftext'], str) else ""

    if RUNNING_KAGGLE_KERNEL:
        # Truncate to keep the downstream feature extraction affordable.
        title, body = title[:100], body[:512]

    return title + " " + body

# Row-wise: build the `text` column from each post's title and body.
rspct_df['text'] = rspct_df[['title', 'selftext']].apply(join_text, axis=1)

# Train-Test Split

In [None]:


# Positional 80/20 split: the first 80% of rows train the model and the
# remaining rows are held out for evaluation. Rows are not shuffled here.
train_split_index = int(len(rspct_df) * 0.8)

train_df = rspct_df.iloc[:train_split_index]
test_df = rspct_df.iloc[train_split_index:]

# Inputs are the joined text; targets are the subreddit names.
X_train, y_train = train_df['text'], train_df['subreddit']
X_test, y_test = test_df['text'], test_df['subreddit']

# Bag of Words (TF-IDF features)

In [None]:
from sklearn.preprocessing import LabelEncoder

# Encode subreddit names as integer class ids.
# The encoder is fitted on the training labels read straight from train_df
# (not from y_train, which this cell overwrites), so re-running the cell
# always refits on the subreddit names and le.classes_ keeps the
# id -> subreddit mapping.
le = LabelEncoder()
y_train = le.fit_transform(train_df.subreddit)
# transform() only accepts labels seen during fit, so every test subreddit
# must also occur in the training split.
y_test = le.transform(test_df.subreddit)

y_train[:5]

array([920, 931, 161, 827, 669])

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# extract features from text using bag-of-words (single words + bigrams)

print('this cell will take about 10 minutes to run')

# Vocabulary size: the low-resource setting keeps 30000 terms directly; the
# full setting keeps 100000 and then narrows them with chi2 selection below.
NUM_FEATURES = 30000 if RUNNING_KAGGLE_KERNEL else 100000

tf_idf_vectorizer = TfidfVectorizer(
    max_features=NUM_FEATURES,
    min_df=5,                        # drop terms found in fewer than 5 posts
    ngram_range=(1, 2),              # unigrams and bigrams
    stop_words=None,
    token_pattern='(?u)\\b\\w+\\b',  # also keeps one-character tokens
)

# Vectorize from the raw text columns of train_df / test_df rather than from
# X_train / X_test: this cell overwrites those two names with sparse matrices,
# so reading them as input would make a second run of the cell fail.
X_train = tf_idf_vectorizer.fit_transform(train_df.text)
X_test = tf_idf_vectorizer.transform(test_df.text)

from sklearn.feature_selection import chi2, SelectKBest

# if we have more memory, extract the top 100000 features and keep the 30000
# that score best on the chi2 test against the training labels
if not RUNNING_KAGGLE_KERNEL:
    # `k` must be passed by keyword: it is keyword-only in current scikit-learn.
    chi2_selector = SelectKBest(chi2, k=30000)

    chi2_selector.fit(X_train, y_train)

    X_train = chi2_selector.transform(X_train)
    X_test = chi2_selector.transform(X_test)

X_train.shape, X_test.shape

this cell will take about 10 minutes to run


((810400, 30000), (202600, 30000))

# Naive Bayes Classifier

In [None]:
from sklearn.naive_bayes import MultinomialNB

# train a naive bayes model, get predictions

nb_model = MultinomialNB(alpha=0.1)
nb_model.fit(X_train, y_train)

# One row per test post, one column per class known to the model.
y_pred_proba = nb_model.predict_proba(X_test)

# argmax yields a column position; column j belongs to nb_model.classes_[j],
# so map positions back to class labels instead of assuming they coincide.
y_pred = nb_model.classes_[np.argmax(y_pred_proba, axis=1)]

In [None]:
# we use precision-at-k metrics to evaluate performance

def precision_at_k(y_true, y_pred, k=5):
    """Fraction of samples whose true label is among the k highest-scored classes.

    Parameters
    ----------
    y_true : array-like of shape (n_samples,)
        True labels, expressed as column indices of ``y_pred``.
    y_pred : array-like of shape (n_samples, n_classes)
        Per-class scores; a larger value means a more likely class.
    k : int, default 5
        Number of top-scored classes that count as a hit.

    Returns
    -------
    float
        Share of rows in which the true label is one of the k top-scored columns.

    Raises
    ------
    ValueError
        If ``y_pred`` is not 2-D, or ``y_true`` is not 1-D with one entry per
        row of ``y_pred`` (previously a length mismatch was silently truncated).
    """
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)
    if y_pred.ndim != 2 or y_true.shape != (y_pred.shape[0],):
        raise ValueError(
            "y_pred must be 2-D (n_samples, n_classes) and y_true 1-D with "
            "n_samples entries; got shapes %s and %s" % (y_pred.shape, y_true.shape)
        )

    # Column indices sorted by descending score, cut to the k best per row.
    top_k = np.argsort(y_pred, axis=1)[:, ::-1][:, :k]
    # A row is a hit when any of its k best columns equals its true label.
    hits = (top_k == y_true[:, None]).any(axis=1)
    return hits.mean()

# Report how often the true subreddit is the top prediction, and how often it
# appears among the 3 and 5 highest-scored classes.
print('precision@1 =', (y_test == y_pred).mean())
print('precision@3 =', precision_at_k(y_test, y_pred_proba, k=3))
print('precision@5 =', precision_at_k(y_test, y_pred_proba, k=5))

precision@1 = 0.615187561697927
precision@3 = 0.7615399802566634
precision@5 = 0.8105972359328727
