In [1]:
import numpy as np
import pandas as pd
import os
import sys
import spacy
import time

from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import Pipeline
from sklearn.metrics import precision_recall_fscore_support
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.metrics import classification_report
from sklearn.feature_selection import VarianceThreshold
from sklearn.feature_selection import chi2, SelectKBest
from sklearn.metrics import balanced_accuracy_score
from textblob import TextBlob



In [2]:
# Load the preprocessed posts. Later cells read the columns subreddit, title,
# title_pos, selftext and selftext_pos from this frame.
nostopwords_df = pd.read_csv(filepath_or_buffer="./data/preprocessed_stop_pos.csv")

In [3]:
# Change rows with no info to UNK or NAN for text and pos respectively.
#
# read_csv yields float NaN for empty cells, so "is a float" is the test for
# "missing". Rows are de-duplicated in a dict (insertion-ordered) rather than a
# set: set iteration order depends on per-process string hash randomization,
# which made the row order of the resulting frame differ between kernel runs.
def _fill_missing(text, pos):
    """Return (text, pos) with placeholders substituted for missing values.

    A missing text becomes ("UNK", "NAN"). A present text whose POS string is
    missing keeps the text and gets "NAN", so downstream `.split()` calls never
    receive a float.
    """
    if isinstance(text, float):
        return "UNK", "NAN"
    if isinstance(pos, float):
        return text, "NAN"
    return text, pos


_unique_rows = {}
for row in nostopwords_df.itertuples():
    title, title_pos = _fill_missing(row.title, row.title_pos)
    selftext, selftext_pos = _fill_missing(row.selftext, row.selftext_pos)
    _unique_rows[(row.subreddit, title, title_pos, selftext, selftext_pos)] = None

# Unique rows in first-seen order; consumed by pd.DataFrame in the next cell.
new_df = list(_unique_rows)

In [4]:
# Turn the de-duplicated row tuples into a frame with named columns.
scrubbed = pd.DataFrame(
    data=new_df,
    columns=["subreddit", "title", "title_pos", "selftext", "selftext_pos"],
)

<h1> Featurizer </h1>

In [43]:
# Accumulates the unique (subreddit, feature_string) pairs produced by cuf().
cuf_set = set()

# Optional feature groups, listed in the order their tokens are appended after
# the baseline tokens. Token spellings are kept exactly as they were, including
# the historical misnomers (e.g. "less_than_five_wordsbody" fires below 50
# words, "subjectivityseventyfivepercent" fires at >= 0.80).
CUF_FEATURES = (
    "avg_len_title",
    "avg_len_body",
    "pos_trigram_title",
    "symbol",
    "noun_verb_ratio",
    "selftext_len",
    "title_len",
    "subjectivity",
    "polarity",
    "punct",
)

# (threshold, token) tables, checked top-down with `value >= threshold`.
_AVG_LEN_TITLE_BUCKETS = (
    (7, "avg_wordlentitleseven"),
    (6, "avg_wordlentitlesix"),
    (5, "avg_wordlentitlefive"),
    (4, "avg_wordlentitlefour"),
    (3, "avg_wordlentitlethree"),
    (2, "avg_wordlentitletwo"),
)
_AVG_LEN_BODY_BUCKETS = (
    (7, "avg_wordlenbodyeseven"),
    (6, "avg_wordlenbodyesix"),
    (5, "avg_wordlenbodyfive"),
    (4, "avg_wordlenbodyfour"),
    (3, "avg_wordlenbodythree"),
    (2, "avg_wordlenbodytwo"),
)
_NOUN_VERB_BUCKETS = (
    (4, "noun_verb_ratio_four"),
    (3, "noun_verb_ratio_three"),
    (1, "noun_verb_ratio_one"),
)
_SELFTEXT_LEN_BUCKETS = (
    (700, "sevenhundred_selftext_len"),
    (500, "fivehundred_selftext_len"),
    (300, "threehundred_selftext_len"),
    (100, "hundred_selftext_len"),
    (50, "fifty_selftext_len"),
)


def _first_bucket(value, buckets):
    """Return the token of the first (threshold, token) pair with value >= threshold, else None."""
    for threshold, token in buckets:
        if value >= threshold:
            return token
    return None


def _optional_tokens(row, features):
    """Return the tokens of the requested optional feature groups for one row.

    Every group tolerates empty text or POS strings: it simply contributes no
    token instead of raising IndexError / ZeroDivisionError.
    """
    title_words = row.title.split()
    body_words = row.selftext.split()
    title_pos = row.title_pos.split()
    body_pos = row.selftext_pos.split()
    tokens = []

    # Average word length, bucketed to whole characters.
    if "avg_len_title" in features and title_words:
        avg = sum(len(word) for word in title_words) / len(title_words)
        tokens.append(_first_bucket(avg, _AVG_LEN_TITLE_BUCKETS))
    if "avg_len_body" in features and body_words:
        avg = sum(len(word) for word in body_words) / len(body_words)
        tokens.append(_first_bucket(avg, _AVG_LEN_BODY_BUCKETS))

    # POS trigrams over the title; fewer than three tags yields no trigram.
    if "pos_trigram_title" in features:
        prefixed = ["title_pos_" + tag for tag in title_pos]
        tokens.extend("_".join(prefixed[i:i + 3]) for i in range(len(prefixed) - 2))

    # Many symbols in the title / many SPACE tags in the body.
    if "symbol" in features:
        if title_pos.count("SYM") >= 4:
            tokens.append("SYMBOL_EXIST")
        if body_pos.count("SPACE") >= 4:
            tokens.append("SPACE_EXIST")

    # "Nominalkvot": nominal tags (NOUN, ADP) over verbal tags (VERB, ADV, PRON).
    if "noun_verb_ratio" in features:
        verbs = sum(tag in ("VERB", "ADV", "PRON") for tag in body_pos)
        nouns = sum(tag in ("NOUN", "ADP") for tag in body_pos)
        measure = nouns / verbs if verbs > 0 else 0.5
        if measure < 0.5:
            tokens.append("noun_verb_ratio_lessthanzeropointfive")
        else:
            # Ratios in [0.5, 1) deliberately produce no token.
            tokens.append(_first_bucket(measure, _NOUN_VERB_BUCKETS))

    # Length (in whitespace-separated words) of body and title.
    if "selftext_len" in features:
        tokens.append(
            _first_bucket(len(body_words), _SELFTEXT_LEN_BUCKETS)
            or "less_than_five_wordsbody"
        )
    if "title_len" in features:
        n_title = len(title_words)
        if n_title > 28:
            tokens.append("title_morethantwentyeight")
        elif n_title > 15:
            tokens.append("title_morethanfifteen")
        elif n_title < 5:
            tokens.append("title_lessthanfive")

    # TextBlob sentiment of the body. Slow, so only computed on request.
    if "subjectivity" in features or "polarity" in features:
        sentiment = TextBlob(" ".join(body_words)).sentiment
        if "subjectivity" in features:
            if sentiment.subjectivity >= 0.80:
                tokens.append("subjectivityseventyfivepercent")
            elif sentiment.subjectivity <= 0.20:
                tokens.append("subjectivitylesthantwentyfivepercent")
        if "polarity" in features:
            if sentiment.polarity >= 0.65:
                tokens.append("polaritysixtyfivepercent")
            elif sentiment.polarity <= -0.60:
                tokens.append("polarityreallylow")

    # Title starts with punctuation.
    if "punct" in features and title_pos and title_pos[0] == "PUNCT":
        tokens.append("firstcharispunct")

    return [token for token in tokens if token]


def cuf(df, features=()):
    """Build one feature string per row and add it to the global `cuf_set`.

    The baseline feature string is every title word prefixed with "title_"
    followed by every selftext word prefixed with "body_", joined by single
    spaces. Optional feature groups append extra tokens after the baseline.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide the columns subreddit, title, title_pos, selftext and
        selftext_pos; the four text columns must hold strings.
    features : iterable of str, optional
        Names from `CUF_FEATURES` to append. The default (empty) reproduces
        the baseline-only output. A single name may be passed as a string.

    Returns
    -------
    list of (subreddit, feature_string) tuples
        The unique pairs produced by this call, in first-seen row order. The
        same pairs are also added to the module-level `cuf_set`.

    Raises
    ------
    ValueError
        If `features` contains a name that is not in `CUF_FEATURES`.
    """
    if isinstance(features, str):
        features = (features,)
    requested = frozenset(features)
    unknown = requested - frozenset(CUF_FEATURES)
    if unknown:
        raise ValueError(
            "unknown feature(s): " + ", ".join(sorted(unknown))
            + "; expected names from CUF_FEATURES"
        )

    # dict keys give order-preserving de-duplication.
    unique_rows = {}
    for row in df.itertuples():
        tokens = ["title_" + word for word in row.title.split()]
        tokens += ["body_" + word for word in row.selftext.split()]
        if requested:
            tokens += _optional_tokens(row, requested)
        unique_rows[(row.subreddit, " ".join(tokens))] = None

    cuf_set.update(unique_rows)
    return list(unique_rows)


In [44]:
# Start from an empty accumulator so re-running this cell never mixes in
# feature strings left over from an earlier cuf() call.
cuf_set = set()
cuf(scrubbed)

# Set iteration order depends on per-process string hash randomization, so the
# rows are sorted to give the frame (and therefore the seeded train/test split
# below) the same order on every kernel run.
cuf_df = (
    pd.DataFrame(list(cuf_set), columns=["subreddit", "body"])
    .sort_values(["subreddit", "body"])
    .reset_index(drop=True)
)

# Release the (large) accumulator; the data now lives in cuf_df.
cuf_set = set()

In [53]:
# Shuffled 80/20 hold-out split with a fixed seed.
train_cut, test_cut = train_test_split(
    cuf_df,
    test_size=0.2,
    shuffle=True,
    random_state=12,
)

In [55]:
# Cross Unigram (CU): TF-IDF unigrams -> chi2 top-k feature selection ->
# multinomial naive Bayes. The wall-clock time covers construction plus fit.
start = time.time()
cu_pipe = Pipeline(steps=[
    ("tf", TfidfVectorizer(ngram_range=(1, 1), min_df=1, max_features=None)),
    ("sel", SelectKBest(score_func=chi2, k=100000)),
    ("mnb", MultinomialNB()),
])
cu_pipe.fit(train_cut["body"], train_cut["subreddit"])
end = time.time()
print(f"{end - start} seconds")

682.8519630432129 seconds


In [56]:
# Predict the held-out subreddits and report the wall-clock time taken.
start = time.time()
pred = cu_pipe.predict(test_cut["body"])
end = time.time()
print(f"{end - start} seconds")

27.382012128829956 seconds


In [57]:
# Support-weighted precision / recall / F1 over the held-out set.
score = precision_recall_fscore_support(
    y_true=test_cut["subreddit"],
    y_pred=pred,
    average="weighted",
)
print(score)

# -- Precision @X ---
# Run the fitted steps one at a time to obtain per-class probabilities.
# (Without the chi2 step, feed tf_idf_transform straight to the classifier.)
tf_idf_transform = cu_pipe.named_steps["tf"].transform(test_cut["body"])
k_best = cu_pipe.named_steps["sel"].transform(tf_idf_transform)
test_pred_proba = cu_pipe.named_steps["mnb"].predict_proba(k_best)

(0.7826107602749512, 0.7425370187561698, 0.7467698708575711, None)


In [58]:
def precision_at_k(y_true, y_pred, k=5, classes=None):
    """Fraction of samples whose true label is among the k highest-scored classes.

    (This is the quantity usually called top-k accuracy / hit rate.)

    Parameters
    ----------
    y_true : array-like of shape (n_samples,)
        True class labels.
    y_pred : array-like of shape (n_samples, n_classes)
        Per-class scores; column j scores the class `classes[j]`.
    k : int, default 5
        Number of top-scored classes to consider. If k exceeds the number of
        classes, all classes are considered.
    classes : array-like of shape (n_classes,), optional
        Label for each column of `y_pred`. Defaults to the classes of the
        fitted classifier in the global `cu_pipe` (index 2; index 1 when the
        pipeline has no chi2 step).

    Returns
    -------
    float
        Hits divided by the number of samples.

    Raises
    ------
    ValueError
        If k < 1, `y_true` is empty, or `y_pred` does not have one row per
        sample.
    """
    # k == 0 would slice `[:, -0:]`, i.e. select every class and report 1.0.
    if k < 1:
        raise ValueError("k must be a positive integer")

    y_true = np.asarray(y_true).ravel()
    y_pred = np.asarray(y_pred)
    if y_true.shape[0] == 0:
        raise ValueError("y_true is empty")
    if y_pred.ndim != 2 or y_pred.shape[0] != y_true.shape[0]:
        raise ValueError("y_pred must have shape (len(y_true), n_classes)")

    if classes is None:
        classes = cu_pipe[2].classes_  # index 1 without chi2
    classes = np.asarray(classes)

    # argsort is ascending, so the last k columns are the k best classes.
    top_k_labels = classes[np.argsort(y_pred, axis=1)[:, -k:]]
    # Compare as objects so mismatched label dtypes yield "no hit" per element
    # instead of a failed array comparison.
    hits = (top_k_labels.astype(object) == y_true.astype(object)[:, None]).any(axis=1)
    return int(hits.sum()) / len(y_true)

In [59]:
# Report the top-k hit rate for a few cut-offs.
for top_k in (1, 3, 5):
    print(f"precision@{top_k} =", precision_at_k(test_cut.subreddit, test_pred_proba, top_k))


precision@1 = 0.7425370187561698
precision@3 = 0.8626999012833169
precision@5 = 0.8957453109575518


In [None]:
# Per-class precision / recall / F1 on the held-out set.
print(classification_report(y_true=test_cut["subreddit"], y_pred=pred))