In [1]:
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sn
import re
import gensim
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize , sent_tokenize
from scipy.spatial.distance import cosine, euclidean
import warnings
warnings.filterwarnings('ignore')

In [2]:
data = pd.read_csv('final-data.csv')
data.drop(labels = ['Unnamed: 0'] , axis=1,inplace = True)

In [4]:
data.head().T

Unnamed: 0,0,1,2,3,4
score_hidden,False,False,False,False,False
name,t1_cnas90l,t1_cnas91a,t1_cnas91b,t1_cnas92b,t1_cnas92q
link_id,t3_2qy8r4,t3_2qxfa5,t3_2qykcw,t3_2qwm98,t3_2qxogl
body,"Roofers, the only people on a job site more sa...",Have You Tried Turning It Off And On Again?,My Great to the power of 6 ..or 7 grandfather ...,/r/firstworldavjafmasdlfja,We were both born and raised in the Philippine...
downs,0,0,0,0,0
created_utc,1420070401,1420070403,1420070403,1420070405,1420070406
score,2,1,1,1,1
author,Movepeck,A_french_chinese_man,TomHanksDied,mwagner26,pizzaleftovers
id,cnas90l,cnas91a,cnas91b,cnas92b,cnas92q
parent_id,t1_cnaqrf6,t3_2qxfa5,t3_2qykcw,t1_cna9pi6,t3_2qxogl


In [5]:
df = pd.DataFrame({'clean': data['body']})
texts = [row for row in df['clean']]
texts

['Roofers, the only people on a job site more savage than the rock people.',
 'Have You Tried Turning It Off And On Again?',
 'My Great to the power of 6 ..or 7 grandfather came over here from Germany around 1776. On the ship ride over his mother dies. \n\nTwo weeks off the boat his father dies. He was only around 9 years old. \n\nHe then was sold as an "indigent servant" he earned his independence by fighting in the revolutionary war. \n\nHe moved from ohio to near Pittsburgh, lived to be 101 years old.  ',
 '/r/firstworldavjafmasdlfja',
 'We were both born and raised in the Philippines. She moved to Germany in 2009 (her dad was German and wanted to go back). Weekly sleepovers turned to monthly Skype calls (and the occasional Facebook message), the 7-hour difference was hard for us. Then just last year (2013), I moved to WA (mom got remarried here). The 9-hour difference was tougher. She had a boyfriend and new friends, I had a boyfriend and new friends. We grew into different interes

In [6]:
tagged_data = [TaggedDocument(words =word_tokenize(d.lower()) , tags = [str(i)]) for i, d in enumerate(texts)]
tagged_data

[TaggedDocument(words=['roofers', ',', 'the', 'only', 'people', 'on', 'a', 'job', 'site', 'more', 'savage', 'than', 'the', 'rock', 'people', '.'], tags=['0']),
 TaggedDocument(words=['have', 'you', 'tried', 'turning', 'it', 'off', 'and', 'on', 'again', '?'], tags=['1']),
 TaggedDocument(words=['my', 'great', 'to', 'the', 'power', 'of', '6', '..', 'or', '7', 'grandfather', 'came', 'over', 'here', 'from', 'germany', 'around', '1776.', 'on', 'the', 'ship', 'ride', 'over', 'his', 'mother', 'dies', '.', 'two', 'weeks', 'off', 'the', 'boat', 'his', 'father', 'dies', '.', 'he', 'was', 'only', 'around', '9', 'years', 'old', '.', 'he', 'then', 'was', 'sold', 'as', 'an', '``', 'indigent', 'servant', "''", 'he', 'earned', 'his', 'independence', 'by', 'fighting', 'in', 'the', 'revolutionary', 'war', '.', 'he', 'moved', 'from', 'ohio', 'to', 'near', 'pittsburgh', ',', 'lived', 'to', 'be', '101', 'years', 'old', '.'], tags=['2']),
 TaggedDocument(words=['/r/firstworldavjafmasdlfja'], tags=['3']),
 T

In [16]:
max_epochs = 50
vec_size = 300
alpha = 0.025
model = Doc2Vec(size = vec_size , alpha = alpha, workers = 11, min_alpha = 0.025)
model.build_vocab(tagged_data)
for epoch in range(max_epochs):
    print('Epoch-> {0}'.format(epoch))
    model.train(tagged_data, total_examples = model.corpus_count , epochs = model.iter)
    model.alpha -= 0.0002
    model.min_alpha = model.alpha
    
model.save("doc2v.model")
print("Model Trained")


Epoch-> 0
Epoch-> 1
Epoch-> 2
Epoch-> 3
Epoch-> 4
Epoch-> 5
Epoch-> 6
Epoch-> 7
Epoch-> 8
Epoch-> 9
Epoch-> 10
Epoch-> 11
Epoch-> 12
Epoch-> 13
Epoch-> 14
Epoch-> 15
Epoch-> 16
Epoch-> 17
Epoch-> 18
Epoch-> 19
Epoch-> 20
Epoch-> 21
Epoch-> 22
Epoch-> 23
Epoch-> 24
Epoch-> 25
Epoch-> 26
Epoch-> 27
Epoch-> 28
Epoch-> 29
Epoch-> 30
Epoch-> 31
Epoch-> 32
Epoch-> 33
Epoch-> 34
Epoch-> 35
Epoch-> 36
Epoch-> 37
Epoch-> 38
Epoch-> 39
Epoch-> 40
Epoch-> 41
Epoch-> 42
Epoch-> 43
Epoch-> 44
Epoch-> 45
Epoch-> 46
Epoch-> 47
Epoch-> 48
Epoch-> 49
Model Trained


In [7]:
model = Doc2Vec.load("doc2vec.model")


In [8]:
def preprocess(text):
    text = [text.lower()] 
    text = re.sub(r"[^a-zA-Z0-9]", " ", str(text)) 
    text = text.split()
    words = [w for w in text if w not in stopwords.words("english")]
    return words

data['cleaned'] = data.body.apply(preprocess)

In [9]:
def get_similarity(name):
    try:
        child = comment_dict[name]
        parent = comment_dict[id_dict[name]]
    except KeyError:
        return None
    if child == None:
        return None
    return get_angle(child, parent)

def get_angle(child, parent):
    child = model.infer_vector(child)
    parent = model.infer_vector(parent)
    cos = cosine(child , parent) 
    euc = euclidean(child , parent)
    return cos,euc
    

comment_dict = pd.Series(data['cleaned'].values , index=data['name']).to_dict()
id_dict = pd.Series(data['parent_id'].values, index=data['name']).to_dict()       

data['cos_euc_similarity'] = data.name.apply(lambda x: get_similarity(x))

data['parent_cosine'] = data.cos_euc_similarity.apply(lambda x: x[0] if x is not None else None)
data['parent_euc'] = data.cos_euc_similarity.apply(lambda x: x[1] if x is not None else None)

In [10]:
data.head().T

Unnamed: 0,0,1,2,3,4
score_hidden,False,False,False,False,False
name,t1_cnas90l,t1_cnas91a,t1_cnas91b,t1_cnas92b,t1_cnas92q
link_id,t3_2qy8r4,t3_2qxfa5,t3_2qykcw,t3_2qwm98,t3_2qxogl
body,"Roofers, the only people on a job site more sa...",Have You Tried Turning It Off And On Again?,My Great to the power of 6 ..or 7 grandfather ...,/r/firstworldavjafmasdlfja,We were both born and raised in the Philippine...
downs,0,0,0,0,0
created_utc,1420070401,1420070403,1420070403,1420070405,1420070406
score,2,1,1,1,1
author,Movepeck,A_french_chinese_man,TomHanksDied,mwagner26,pizzaleftovers
id,cnas90l,cnas91a,cnas91b,cnas92b,cnas92q
parent_id,t1_cnaqrf6,t3_2qxfa5,t3_2qykcw,t1_cna9pi6,t3_2qxogl


In [11]:
from nltk.sentiment.vader import SentimentIntensityAnalyzer

analyser = SentimentIntensityAnalyzer() 
 
def get_sentiment(x):
    return analyser.polarity_scores(x)

sentiment = data.body.apply(get_sentiment)

data['senti_neg'] = sentiment.apply(lambda x: x['neg'])
data['senti_neu'] = sentiment.apply(lambda x: x['neu'])
data['senti_pos'] = sentiment.apply(lambda x: x['pos'])
data['senti_comp'] = sentiment.apply(lambda x: x['compound'])

In [12]:
data.head().T

Unnamed: 0,0,1,2,3,4
score_hidden,False,False,False,False,False
name,t1_cnas90l,t1_cnas91a,t1_cnas91b,t1_cnas92b,t1_cnas92q
link_id,t3_2qy8r4,t3_2qxfa5,t3_2qykcw,t3_2qwm98,t3_2qxogl
body,"Roofers, the only people on a job site more sa...",Have You Tried Turning It Off And On Again?,My Great to the power of 6 ..or 7 grandfather ...,/r/firstworldavjafmasdlfja,We were both born and raised in the Philippine...
downs,0,0,0,0,0
created_utc,1420070401,1420070403,1420070403,1420070405,1420070406
score,2,1,1,1,1
author,Movepeck,A_french_chinese_man,TomHanksDied,mwagner26,pizzaleftovers
id,cnas90l,cnas91a,cnas91b,cnas92b,cnas92q
parent_id,t1_cnaqrf6,t3_2qxfa5,t3_2qykcw,t1_cna9pi6,t3_2qxogl


In [13]:
data.to_csv('processed-data.csv')