# Imports and Loading Data

In [1]:
import pandas as pd
import matplotlib
import seaborn as sns
import numpy as np
%matplotlib inline
import matplotlib.pyplot as plt

In [36]:
import gensim
import nltk, re
import sklearn
from gensim.models import word2vec
from sklearn.cluster import KMeans

In [6]:
from nltk import tokenize
import nltk.data
#nltk.download()

In [3]:
# Read the post corpus CSV into a DataFrame.
posts = pd.read_csv('depression_posts_full_corpus_201509.csv', low_memory=False)

In [4]:
# Discard posts whose author field is the '[deleted]' placeholder.
posts = posts[posts['author'].ne('[deleted]')]

In [115]:
# Raw values of the 'selftext' column, one entry per remaining post.
posts_texts = posts['selftext'].values

# Stop Words

In [9]:
from stop_words import get_stop_words

In [10]:
# Retrieve the packaged English stop-word list.
en_stop = get_stop_words('en')

# First-person pronouns (and their contractions) are kept in the text, so they
# are taken out of the stop-word list.
personal_pronouns = ["i", "i'd", "i'll", "i'm", "i've",
                     "me", "my", "myself"]

for pronoun in personal_pronouns:
    # list.remove raises ValueError when the word is absent; that case is
    # simply skipped.
    try:
        en_stop.remove(pronoun)
    except ValueError:
        pass

# Cleaning Post Texts for Word2Vec

In [11]:
# Load NLTK's English punkt tokenizer, used to split a paragraph into sentences.
tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')

In [27]:
#modified from tutorial on kaggle

def sentence_to_wordlist(text, stop_words=None):
    """Convert one sentence into a list of lower-cased word tokens.

    Every character other than an ASCII letter or an apostrophe is replaced
    by a space, the result is lower-cased and split on whitespace, and any
    token found in the stop-word collection is dropped.

    Parameters
    ----------
    text : str
        Raw sentence text.
    stop_words : iterable of str, optional
        Tokens to discard. Defaults to the notebook-level ``en_stop`` list.
        Pass an empty collection to keep every token.

    Returns
    -------
    list of str
        The remaining tokens, in their original order.
    """
    if stop_words is None:
        stop_words = en_stop
    # Build a set once per call so each membership test is O(1) instead of a
    # linear scan over the stop-word list.
    stop_set = set(stop_words)

    # 1. Remove non-letters (apostrophes are kept so contractions such as
    #    "i'm" stay in one piece).
    letters_only = re.sub(r"[^a-zA-Z']", " ", text)

    # 2. Lower-case and split on whitespace.
    # 3. Drop stop words.
    return [w for w in letters_only.lower().split() if w not in stop_set]

def post_to_sentences(post):
    """Split a post into sentences, each represented as a list of words.

    The notebook-level ``tokenizer`` splits the stripped post text into raw
    sentences; every non-empty raw sentence is then converted with
    ``sentence_to_wordlist``. A sentence whose words are all filtered out
    still contributes an (empty) list.

    Returns a list of word lists, one per sentence.
    """
    stripped = post.strip()
    return [
        sentence_to_wordlist(chunk)
        for chunk in tokenizer.tokenize(stripped)
        if len(chunk) > 0
    ]

In [116]:
# Keep only real text entries. Missing values are not str instances, so this
# drops them without comparing against the literal text 'nan', and guarantees
# that everything passed on supports .strip().
# Note: entries are removed here, so positions in posts_texts no longer line
# up with the rows of `posts`.
posts_texts = [text for text in posts_texts if isinstance(text, str)]

In [119]:
# Replace each raw post string with its list of tokenised sentences.
# Not idempotent: a second run would hand lists, not strings, to
# post_to_sentences, which calls .strip() on its argument.
posts_texts = list(map(post_to_sentences, posts_texts))

In [120]:
##important for assigning individual posts their clusters
# Shallow copy taken before posts_texts is shuffled in place, so this list
# keeps the posts in their original order.
unshuffled_posts = list(posts_texts)

# Word2Vec and KMeans on Posts

In [121]:
# Number of posts left after the missing texts were filtered out.
num_texts = len(posts_texts)
print(num_texts)

112645


In [122]:
#shuffle posts_texts for some semblance of randomness
# A dedicated, seeded generator makes the shuffled order (and therefore the
# composition of every 2000-post batch) identical on each run; the seed value
# itself is arbitrary.
np.random.RandomState(0).shuffle(posts_texts)

In [102]:
#creating word2vec model
# Train the first model on the first 2000 shuffled posts. The slice starts at
# index 0 so that no post is skipped: the remaining posts are taken from
# index 2000 onward.
initial_clusters = posts_texts[:2000]
# Flatten posts -> sentences; each sentence is already a list of words.
sentences = [sentence for post in initial_clusters for sentence in post]
model = word2vec.Word2Vec(sentences, min_count=10)
# Keep only the normalised vectors (replace=True discards the originals).
model.init_sims(replace=True)

In [103]:
#forming initial clustering on model
# Word vectors of the trained model, clustered into 15 groups.
word_vectors = model.wv.syn0
kmeans_clustering = KMeans(n_clusters=15)
# idx holds the cluster label K-means assigned to each row of word_vectors.
idx = kmeans_clustering.fit_predict(word_vectors)

In [104]:
# Re-fit on every following batch of 2000 posts, starting each K-means run
# from the centres found on the previous batch.
# NOTE(review): every batch trains a brand-new Word2Vec model, so the centres
# carried over were fitted to a different model's vectors, and the `model` /
# `idx` left behind after the loop describe only the final (possibly short)
# batch. These per-batch models are also not passed through init_sims, unlike
# the first one. Confirm this is intended.
centers = kmeans_clustering.cluster_centers_
rest_of_posts = posts_texts[2000:]
while rest_of_posts:
    current = rest_of_posts[:2000]
    sentences = [sentence for post in current for sentence in post]
    model = word2vec.Word2Vec(sentences, min_count=10)
    # With an explicit array of starting centres only one initialisation is
    # performed; n_init=1 states that explicitly and avoids the warning
    # scikit-learn otherwise emits on every fit.
    kmeans_clustering = KMeans(n_clusters=15, init=centers, n_init=1)
    word_vectors = model.wv.syn0
    idx = kmeans_clustering.fit_predict(word_vectors)
    centers = kmeans_clustering.cluster_centers_
    rest_of_posts = rest_of_posts[2000:]

  n_jobs=self.n_jobs)


In [105]:
# Map every vocabulary word to its cluster label. `model` and `idx` are the
# ones left over from the last iteration of the batch loop.
word_centroid_map = {
    word: cluster for word, cluster in zip(model.wv.index2word, idx)
}

# Classifying Posts by Cluster

In [107]:
## borrowed/modified from kaggle
def create_bag_of_centroids(post, word_centroid_map, num_centroids=None):
    """Count how many of a post's words fall into each word cluster.

    Parameters
    ----------
    post : iterable of iterable of str
        The post as a sequence of sentences, each a sequence of words.
    word_centroid_map : dict
        Maps a vocabulary word to its integer cluster index.
    num_centroids : int, optional
        Length of the returned vector. When omitted it is inferred as the
        highest cluster index in ``word_centroid_map`` plus one, which comes
        out too short if the highest-numbered clusters have no words in the
        map. Pass the real cluster count to get a fixed-length vector and to
        avoid rescanning the map on every call.

    Returns
    -------
    numpy.ndarray
        float32 vector of length ``num_centroids``; entry ``k`` is the number
        of words in ``post`` mapped to cluster ``k``. Words missing from
        ``word_centroid_map`` are ignored.

    Raises
    ------
    ValueError
        If ``num_centroids`` is omitted and ``word_centroid_map`` is empty.
    """
    if num_centroids is None:
        if not word_centroid_map:
            raise ValueError(
                "word_centroid_map is empty; pass num_centroids explicitly"
            )
        num_centroids = max(word_centroid_map.values()) + 1

    # Pre-allocate the count vector, then tally every in-vocabulary word.
    bag_of_centroids = np.zeros(num_centroids, dtype="float32")
    for sentence in post:
        for word in sentence:
            cluster = word_centroid_map.get(word)
            if cluster is not None:
                bag_of_centroids[cluster] += 1
    return bag_of_centroids

In [126]:
#creating bag of centroids for every post
# Iterates over the unshuffled copy, so entry i belongs to post i in the
# original order.
centroids = [
    create_bag_of_centroids(post, word_centroid_map)
    for post in unshuffled_posts
]

In [127]:
## examining distribution of posts by cluster
# A post is attributed to the cluster holding most of its words. np.argmax
# returns the first maximal index, so ties go to the lowest-numbered cluster
# and a post with an all-zero bag is counted under cluster 0.
counter = np.zeros(15)
for bag in centroids:
    counter[np.argmax(bag)] += 1
print(counter)

[  1.89000000e+02   2.42800000e+03   5.89000000e+02   4.82790000e+04
   5.84980000e+04   2.10000000e+01   3.73000000e+02   6.19000000e+02
   2.24000000e+02   0.00000000e+00   1.03300000e+03   1.09000000e+02
   9.50000000e+01   4.10000000e+01   1.47000000e+02]


In [128]:
# Dominant cluster index for every post, in the same order as `centroids`.
post_clusters = [np.argmax(bag) for bag in centroids]

In [131]:
# Show the cluster assigned to the first 50 posts.
print(post_clusters[:50])

[4, 4, 4, 4, 4, 4, 4, 7, 3, 13, 10, 4, 3, 3, 7, 3, 3, 4, 4, 3, 4, 3, 3, 3, 4, 3, 4, 3, 3, 3, 4, 3, 3, 4, 3, 4, 3, 4, 4, 3, 4, 3, 4, 4, 6, 3, 4, 3, 4, 4]


## TODO
Determine usefulness of clusters <br>
Add clusters to features --> On separate notebook; Find better way to do this? <br>
Finish model to predict response rate