In [31]:
import re
import string
from collections import defaultdict

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import psycopg2
import scipy.stats as scs
from sklearn.externals import joblib
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Apply the "ggplot" style sheet to every plot created after this point.
plt.style.use("ggplot")

# Importing data from DB

**Import explore file**

In [12]:
from explore import *

**Make mh dfs and dictionary**

In [3]:
# Build the "mh" anxiety post DataFrame, the "mh" user DataFrame and the `md`
# dictionary. make_mh_df is not defined in this notebook — presumably it comes
# from `from explore import *` (TODO confirm).

mh_anx_post, mh_anx_user, md = make_mh_df()

**Make pc dfs and update dictionary**

In [4]:
# Build the "pc" post and user DataFrames; `md` is passed in and the returned
# dictionary is bound back to `md` (presumably extended with the pc data — verify
# in explore.make_pc_df).
pc_anx_post, pc_anx_user, md = make_pc_df(md)

**Merge dfs**

In [5]:
# Combine the mh and pc frames into one post table and one user table.
# NOTE(review): how merge_df joins/deduplicates is not visible here — confirm.
anx_post, anx_user = merge_df(mh_anx_post, pc_anx_post, mh_anx_user, pc_anx_user)

**Check out dfs**

In [6]:
# Preview the first rows of the merged post table.
anx_post.head()

Unnamed: 0,pid,user,post_title,post,post_type,mood,thread_title,forum_name
0,1592723,Andy1963,Sudden Onset Anxiety - Please help!,hi ive never suffered any sort of anxiety befo...,author,,Sudden Onset Anxiety - Please help!,Generalized Anxiety Disorder
1,1592736,Zardos,,i can sympathize i get crippling anxiety when ...,responder,,Sudden Onset Anxiety - Please help!,Generalized Anxiety Disorder
2,1594183,rachelangelo,,im sorry youre dealing with this i dont know i...,responder,,Sudden Onset Anxiety - Please help!,Generalized Anxiety Disorder
3,1598245,akash,,apart from medical help there are many tricks ...,responder,,Sudden Onset Anxiety - Please help!,Generalized Anxiety Disorder
4,585119,Mayfair,,great tune i had my beatles number one hits i...,responder,,didnt know where to post this,Social Anxiety


In [7]:
# Preview the first rows of the merged user table.
anx_user.head()

Unnamed: 0,user,member_since
0,steviep43,Jan 2018
1,Chalmers333,Jan 2018
2,JenIAm,Jan 2018
3,Aw3092,Jan 2018
4,srussells,Jan 2017


# Basic NLP for Demo

In [13]:
# Build the list of users and the matching list of documents from `md`.
# NOTE(review): presumably docs[i] belongs to users[i]; verify in
# explore.make_docs_labels.

users, docs = make_docs_labels(md)

In [27]:
# Build a TF-IDF vectorizer that drops English stop words, then learn the
# vocabulary from `docs` and vectorize them in one pass.

vectorizer = TfidfVectorizer(stop_words="english")
vect = vectorizer.fit_transform(docs)

In [43]:
# Persist the fitted vectorizer object to vectorizer.pkl with joblib.
joblib.dump(vectorizer, "vectorizer.pkl")

['vectorizer.pkl']

In [32]:
# Persist `vect` (the vectorized documents) to vect.pkl with joblib.
joblib.dump(vect, "vect.pkl")



['vect.pkl']

In [34]:
# Reload the object stored in vect.pkl with joblib. This file holds the
# vectorized documents written by joblib.dump(vect, "vect.pkl"), not a model.

test = joblib.load("vect.pkl")

In [36]:
# Persist the `md` dictionary to md.pkl with joblib.
joblib.dump(md, "md.pkl")

['md.pkl']

In [37]:
# Reload the dictionary stored in md.pkl into `d`.
d = joblib.load("md.pkl")

In [44]:
# One entry per key of `md`: the key itself, the value stored under it, and that
# value's pieces joined into a single space-separated document string.
users = list(md)
documents = [md[name] for name in users]
docs = [" ".join(pieces) for pieces in documents]

In [45]:
# Single free-text query, wrapped in a list because the vectorizer expects an
# iterable of documents.
query = ["i have a lot of anxiety, especially at night"]

In [46]:
# Fresh TF-IDF vectorizer (English stop words removed) for the query demo.
vectorizer = TfidfVectorizer(stop_words="english")

In [58]:
# Fit the vectorizer on the per-user documents and vectorize them.
# The result is deliberately kept as the sparse matrix fit_transform returns:
# a dense copy of a documents x vocabulary TF-IDF matrix is mostly zeros, and
# linear_kernel accepts sparse input while still returning a dense array.
vect = vectorizer.fit_transform(docs)

In [60]:
# Project the query into the vocabulary learned from `docs` (transform only, no
# refit) and densify the single resulting row.
query_vect = vectorizer.transform(query).toarray()

In [61]:
# Dot product of every document vector with the query vector; one row per
# document, one column for the single query.
# NOTE(review): this equals cosine similarity only because TfidfVectorizer
# L2-normalizes rows by default — confirm `norm` is never overridden.
cos_sim = linear_kernel(vect, query_vect)

In [82]:
# Flatten and sort the similarity scores ascending, flip to descending, and keep
# the positions of the three best-matching documents.
top_sims = np.argsort(cos_sim, axis=None)[::-1][:3]

In [83]:
# Show the positions of the three most similar documents, best match first.
top_sims

array([1416, 4122, 1364])

# Building a classifier

- Using a Naive Bayes classifier to help identify which responder posts are personal

### get data ready for Naive Bayes

- make assumption about what is personal and not personal

In [24]:
# Silence every Python warning from this point on.
# NOTE(review): this blanket filter also hides pandas/sklearn warnings raised by
# later cells; consider restricting it to specific warning categories.
import warnings
warnings.filterwarnings("ignore")

**load pkl file**

In [21]:
# Load the anx_post DataFrame that carries manually assigned labels.
# NOTE(review): read_pickle unpickles arbitrary objects — only load trusted files.
df = pd.read_pickle("df_man.pkl")

**split into auth and responder**

In [22]:
# Partition the labelled posts by who wrote them: thread authors vs. responders.
auth = df.loc[df['post_type'].eq('author')]
res = df.loc[df['post_type'].eq('responder')]

**Split the responder df into two dfs: 1) posts that match the personal regex, 2) posts that do not match it**

**relabel personal responders as 0**

In [25]:
# Responder posts containing one of these first-person phrases are assumed to be
# personal. The group is non-capturing: str.contains only needs a yes/no answer,
# and a capturing group makes pandas emit a "pattern has match groups" warning.
# NOTE(review): this is a substring match, so e.g. "hi feel" also matches
# "i feel"; add \b word boundaries if that is not intended.
res_per = res[
    res['post'].str.contains(
        r"(?:i feel|im feeling|im worried|my anxiety|i have felt)", regex=True
    )
].copy()  # explicit copy so the label assignment below never writes to a view of `res`

# Everything that did not match stays in the "not obviously personal" frame.
res_no = res.drop(res_per.index)

# Label 0 marks a personal post.
res_per['label'] = 0

**merge author df and the responder df that contains personal posts** 

**relabel other df of responders that does not contain personal regex string** 

In [27]:
# Posts whose type we are confident about: all author posts plus the responder
# posts that matched the personal regex. ignore_index gives a fresh 0..n-1 index.
df_know = pd.concat([auth, res_per], ignore_index=True)

# Responder posts that did not match the regex. reset_index returns a new frame,
# so res_no itself is left untouched instead of being mutated through an alias.
df_res = res_no.reset_index(drop=True)

**Filter responder df so that it will only contain impersonal posts**

**Create a new df, called df_predict, of responder posts whose content I'm uncertain about**

In [28]:
# Keep in df_res only the responder posts we are willing to treat as impersonal;
# every other row is moved to df_predict so the classifier can label it later.
# A row is moved when the post is empty, or when it is not labelled 0 and uses
# first-person words ("i", "im") more often than second-person words.
indices_to_remove = []
for i, p in enumerate(df_res['post']):
    words = p.split()  # tokenise once instead of re-splitting for every count
    len_doc = len(words)
    if len_doc == 0:
        # Nothing to judge in an empty post: treat it as uncertain.
        indices_to_remove.append(i)
        continue

    numi = words.count("i") + words.count("im")
    numyou = (
        words.count("you")
        + words.count("your")
        + words.count("youre")
        + words.count("u")
    )
    i_rate = numi / len_doc
    you_rate = numyou / len_doc

    # `i` is a position from enumerate, so read the label positionally as well.
    if df_res['label'].iloc[i] == 0:
        continue
    # NOTE(review): the original had a second, identical `i_rate > you_rate`
    # branch that could never run. If a different condition was intended (for
    # example a tie between the two rates), add it here deliberately.
    if i_rate > you_rate:
        indices_to_remove.append(i)

# Collected values are positions within df_res, so select and drop through
# df_res's own index rather than the index of the unrelated full frame `df`.
df_predict = df_res.iloc[indices_to_remove].copy()
df_res = df_res.drop(df_res.index[indices_to_remove]).reset_index(drop=True)

**Filter out authors who don't have any 1st person pronouns and add to df_predict (uncertain)**

**Create df_know (have a good idea of content of post - personal or not)**

**df_know - used to train Naive Bayes**

**df_predict - used to predict other responses**


In [29]:
# Move every df_know post that contains neither "i" nor "im" into df_predict:
# without a first-person word we are not confident the post is personal.
# NOTE(review): the loop covers all rows of df_know, not only author posts.
author_to_remove = []
for i, p in enumerate(df_know['post']):
    words = p.split()
    if words.count("i") + words.count("im") == 0:
        author_to_remove.append(i)

# Positions -> rows for df_predict, positions -> index labels for the drop.
# A flat list is used for index lookup; a nested list is not valid Index indexing.
df_predict = pd.concat(
    [df_predict, df_know.iloc[author_to_remove]], ignore_index=True
)
df_know = df_know.drop(df_know.index[author_to_remove])

# Training set = confident personal posts + confident impersonal responders.
df_know = pd.concat([df_know, df_res], ignore_index=True)

**Look at value counts of my labels in df_know**

In [30]:
# Class balance of the training frame: number of rows per label value.
df_know['label'].value_counts()

1    19853
0    11395
Name: label, dtype: int64

**create docs and find word counts for NB**

In [31]:
# One document per row of df_know, in row order.
docs = df_know['post'].tolist()

# Vocabulary = every distinct whitespace-separated token in the training docs.
tokens = set()
for doc in docs:
    tokens.update(doc.split())

# Sort before numbering: iterating a set of strings yields a different order on
# each interpreter run, which would shuffle the feature columns between runs.
tokens_list = sorted(tokens)

# token -> column index in the count matrix
vocab_dict = {word: i for i, word in enumerate(tokens_list)}

# Dense documents x vocabulary matrix of raw term counts.
# NOTE(review): this allocates len(docs) * len(tokens) float64 values; a sparse
# matrix would be far smaller if memory becomes a problem.
word_counts = np.zeros((len(docs), len(tokens)))
for doc_id, words in enumerate(docs):
    for word in words.split():
        word_counts[doc_id, vocab_dict[word]] += 1

## Train Naive Bayes Model

**create Naive Bayes instance**

In [33]:
# Multinomial Naive Bayes with default hyperparameters.
clf = MultinomialNB()

**Train,Test split**

In [34]:
# Features are the raw word-count matrix; targets are the labels from df_know.
X, y = word_counts, np.array(df_know['label'])

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.20,
    random_state=38,
)

**Fit model**

In [35]:
# Train the classifier on the training portion of the split.
clf.fit(X_train, y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

**Test model accuracy (based on assumptions i already made)**

In [36]:
# Score the classifier on the held-out rows. The "true" labels here come from
# the heuristics applied earlier in this notebook, not from independent ground truth.
clf.score(X_test,y_test)

0.89695999999999998

## Predict with NB

- Predict on uncertain responders

**create docs**

In [37]:
# Documents to classify: the post text of every row in df_predict.
# Note this rebinds `docs`, which previously held the training documents.
docs = df_predict['post']

**get word counts**

In [38]:
# Count matrix for the documents to classify, using the column layout fixed by
# vocab_dict at training time. The width is taken from vocab_dict because that
# is the mapping that assigns column indices.
# Note: this rebinds `word_counts`, replacing the training matrix of the same name.
word_counts = np.zeros((len(docs), len(vocab_dict)))
for doc_id, words in enumerate(docs):
    for word in words.split():
        word_id = vocab_dict.get(word)
        # Words never seen during training have no column and are skipped.
        if word_id is not None:
            word_counts[doc_id, word_id] += 1

**soft classification**

In [39]:
# Rebind X to the prediction count matrix (it previously held the training features).
X = word_counts

# Per-class probabilities for every row of X; columns follow clf.classes_.
labels_proba = clf.predict_proba(X)

**hard classification**

In [40]:
# Most likely class for every row of X.
labels = clf.predict(X)


**Drop the old labels, which are just the placeholder value 9 for uncertain posts**

In [41]:
# Remove the existing "label" column in place so the predictions below replace it.
# Re-running this cell raises a KeyError because the column is already gone.
df_predict.drop("label", axis = 1, inplace = True)

**Add predicted soft and hard labels**

In [43]:
# 'hard' holds the predicted class for each post.
df_predict['hard'] = labels
# 'soft' holds the first probability column, i.e. the probability of
# clf.classes_[0] — presumably label 0 (personal); confirm against clf.classes_.
df_predict['soft'] = labels_proba[:,0]
df_predict.reset_index(drop = True, inplace = True)

**Check prediction**

In [45]:
# Widen text columns to 200 characters so more of each post is visible, then
# preview the first ten predictions.
# NOTE(review): the rendered output contains forum usernames and post text —
# review for PII before sharing the notebook.
pd.set_option('max_colwidth' , 200)
df_predict.head(10)

Unnamed: 0,pid,user,post_title,post,post_type,mood,thread_title,forum_name,hard,soft
0,1592736,Zardos,,i can sympathize i get crippling anxiety when i go to bed and first thing on a morning it gets better during the day when im busy and then at night i dont want to get into bed has anything chang...,responder,,Sudden Onset Anxiety - Please help!,Generalized Anxiety Disorder,0,1.0
1,585119,Mayfair,,great tune i had my beatles number one hits in my car last week they are genius with chords thats their secret but only weirdos like me recognise it my car got broke into last week and they empti...,responder,,didnt know where to post this,Social Anxiety,0,0.953017
2,586791,Ainsworth11,,why did i post this in social anxiety ah well never mind i got some sleepers from boots yesterday slept for 4 hours but felt i had been hit round the head with a baseball bat when i woke up as y...,responder,,didnt know where to post this,Social Anxiety,0,0.999999
3,1599105,Macka,,hi ark i also love writing something in me since i can remember when i was in the beginnings of my breakdown i wrote 30000 words in ten days my pc permanently crashed and so did my brain when...,responder,Inspired,"Is this GAD? My mind keeps creating associations between things and anxiety, and they're ruining my life!",Generalized Anxiety Disorder,0,1.0
4,586857,wendolene26,,the beatles help album was the first cd album i got given when my parents bought my a cd player hifi i was learning guitar and playing a lot of the beatles tunes as they have easy chords so i sup...,responder,Paranoid,didnt know where to post this,Social Anxiety,0,0.701172
5,589807,Ainsworth11,,seroqueltis all looking for understanding and i get a drug,responder,,didnt know where to post this,Social Anxiety,0,0.513875
6,1598695,blacksmoke,,hello janey1966 hey janey yeah the second half of life stinks i am really seeing my mother for who she really is fooled myself for the first half really sorry for what you are going through yeah ...,responder,Sad,Feeling More Anxious The Older I Get. I Am 51.,Generalized Anxiety Disorder,0,0.908417
7,1598189,frogsplash,,hi i would recommend watching some youtube videos regarding dealing with anxiety and worrying less,responder,,feeling anxious,Generalized Anxiety Disorder,1,0.361373
8,536359,yesican,,i wish it was just one beer one beer doesnt do anything for me it starts with a beer then two then three and so on i dont get drunk but i definitely do drink too much its not good for my health me...,responder,Blah,social anxiety... causing me to drink too much,Social Anxiety,0,1.0
9,536365,yesican,,no i am sure that we havent spoken before this is my first time talking about this on a forum,responder,Blah,social anxiety... causing me to drink too much,Social Anxiety,0,0.90622


# Messing around with NLP

In [124]:
# Toy inputs for experimenting with TF-IDF (three names, a single document).
# NOTE(review): some outputs further down show two rows, so they were produced
# with a different `docs` than the one defined here.
users = ["joe", "mary", "bill"]
docs = ["hey there trying to find my friend friend"]

In [125]:
# Unigram TF-IDF over the toy docs (English stop words removed), as a dense array.
vectorizer1 = TfidfVectorizer(stop_words="english")
vect1 = vectorizer1.fit_transform(docs).toarray()

In [126]:
# Show the dense unigram TF-IDF matrix for the toy docs.
vect1

array([[ 0.81649658,  0.40824829,  0.40824829]])

In [127]:
# Show the learned term -> column-index mapping.
vectorizer1.vocabulary_

{'friend': 0, 'hey': 1, 'trying': 2}

In [122]:
# Pairwise dot products of the unigram TF-IDF rows with themselves.
linear_kernel(vect1, vect1)

array([[ 1.]])

In [123]:
# Same toy docs, but with both unigrams and bigrams as features.
vectorizer2 = TfidfVectorizer(stop_words="english", ngram_range=(1, 2))
vect2 = vectorizer2.fit_transform(docs).toarray()

In [89]:
# Show the dense unigram+bigram TF-IDF matrix.
vect2

array([[ 0.35409974,  0.35409974,  0.35409974,  0.35409974,  0.35409974,
         0.49767483,  0.        ,  0.        ,  0.        ,  0.        ,
         0.35409974,  0.        ],
       [ 0.25096919,  0.25096919,  0.25096919,  0.25096919,  0.25096919,
         0.        ,  0.35272845,  0.35272845,  0.35272845,  0.35272845,
         0.25096919,  0.35272845]])

In [90]:
# Pairwise dot products of the unigram+bigram TF-IDF rows with themselves.
linear_kernel(vect2, vect2)

array([[ 1.        ,  0.53320876],
       [ 0.53320876,  1.        ]])