In [1]:
import numpy as np
import pandas as pd
import pickle

from gensim.models.doc2vec import Doc2Vec
from gensim.models.doc2vec import TaggedDocument

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

from utils import *

data = load_csv()
data.head()

Unnamed: 0_level_0,url,title,body,comments,descriptor,flair
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
g1zi21,https://www.reddit.com/r/india/comments/g1zi21...,Coronavirus (COVID-19) Megathread - News and U...,###[Covid-19 Fundraisers & Donation Links](htt...,###[Covid-19 Fundraisers & Donation Links](ht...,Coronavirus (COVID-19) Megathread - News and U...,Coronavirus
g4d2ix,https://www.reddit.com/r/india/comments/g4d2ix...,"[Monthly Happiness Thread] Randians, please sh...",<3 \n \nLinks: ...,Working from home since past one month. For s...,"[Monthly Happiness Thread] Randians, please sh...",Other
g79auq,https://i.redd.it/5b2bepewzru41.jpg,Greetings from Saudi Arabia. During this lock ...,,"Not much of a expert in this, but looks good!...",Greetings from Saudi Arabia. During this lock ...,Food
g7b3un,https://www.reddit.com/r/india/comments/g7b3un...,Covid19 hit me. I just got fired.,Just one call. We are having trouble with mone...,If you were doing good for yourself it wont b...,Covid19 hit me. I just got fired. Just one cal...,Coronavirus
g77k1z,https://www.ndtv.com/india-news/coronavirus-pm...,PM CARES fund won’t be checked by government’s...,,No transparency at all from a government whic...,PM CARES fund won’t be checked by government’s...,Politics


In [2]:
processed = preprocess_docs(data['descriptor'])
print('Before processsing: ', data['descriptor'][0][:80])
print('After processsing: ', processed[0][:8])

Before processsing:  Coronavirus (COVID-19) Megathread - News and Updates - 4 ###[Covid-19 Fundraiser
After processsing:  ['coronavirus', 'covid', 'megathread', 'news', 'and', 'updates', 'covid', 'fundraisers']


In [3]:
corpus = [TaggedDocument(tokens, [i]) for (i, tokens) in enumerate(processed)]

embedding_model = Doc2Vec(
    corpus,
    vector_size=EMBEDDING_SIZE,
    min_count=1,
    workers=4
)

embedding_model.train(corpus, total_examples=len(corpus), epochs=100)
embedding_model.save('models/embedding_model.pkl')

embeddings = docs_into_vectors(processed, embedding_model)
print(embeddings.shape)

(2962, 150)


In [4]:
labels = data['flair']

In [5]:
x_train, x_test, y_train, y_test = train_test_split(
    embeddings, labels,
    test_size=0.2,
    stratify=labels,
    random_state=7
)

In [8]:
model = LogisticRegression(
    C=0.1,
    max_iter=500,
    n_jobs=-1
)

model.fit(x_train, y_train)

with open('models/classification_model.pkl', 'wb') as handle:
    pickle.dump(model, handle)

print('Train Accuracy:', accuracy_score(y_train, model.predict(x_train)))
print('Val Accuracy:', accuracy_score(y_test, model.predict(x_test)))

Train Accuracy: 0.6133389615871676
Val Accuracy: 0.5868465430016864


In [9]:
# Sanity check
with open('models/classification_model.pkl', 'rb') as handle:
    model = pickle.load(handle)

print('Val Accuracy:', accuracy_score(y_test, model.predict(x_test)))

embedding_model = Doc2Vec.load('models/embedding_model.pkl')
sample_doc = ['hi', 'i', 'am', 'vasu'] # Includes out-of-vocab words
assert doc_into_vector(sample_doc, embedding_model).shape[0] == EMBEDDING_SIZE

Val Accuracy: 0.5868465430016864
