In [None]:
# module_directory = os.path.join(os.getcwd().split('DSI_Capstone_Steemit')[0],'DSI_Capstone_Steemit')
# sys.path
# sys.path.insert(1,module_directory)

import os

# Check for directory and if not there, create one
def ensure_directory(directory):
    """Create ``../data/<directory>`` if it does not already exist.

    The resolved path is printed so the notebook output shows which
    directories were checked.

    Parameters
    ----------
    directory : str
        Directory name, relative to ``../data/``.
    """
    path = '../data/' + directory
    # print(...) with a single argument behaves the same on Python 2 and 3;
    # the bare `print path` statement is a SyntaxError on Python 3.
    print(path)
    if not os.path.exists(path):
        os.makedirs(path)

# Check for each directory in the directory list
def ensure_directories(dir_list):
    """Run ``ensure_directory`` on every name in ``dir_list``, in order."""
    for name in dir_list:
        ensure_directory(name)


# Output folders (created under ../data/) that later cells write into.
dir_list = [
    'word2vec_doc_matrix',
    'word2vec_doc_matrix_desc',
    'posts_tfidf',
    'posts_counts',
    'word2vec_doc_matrix_avg',
    'word2vec_doc_matrix_avg_tfidf',
    # Target folder of the dense TF-IDF CSV export; without it that
    # to_csv call fails because the directory does not exist.
    'df_posts_tfidf',
]
ensure_directories(dir_list)

In [None]:
import pymssql
import pandas as pd
import numpy as np
import os
import re
import joblib
import nltk
from nltk.tokenize import word_tokenize
# Show up to 500 rows / columns when a DataFrame is displayed.
for display_option in ('display.max_rows', 'display.max_columns'):
    pd.set_option(display_option, 500)

# Root folder for every input / output file used by this notebook.
data_directory = '../data/'

# CSV holding the sampled posts that the rest of the notebook processes.
posts_path = os.path.join(data_directory, 'sample_29k_pos_values.csv')

### Load Data

In [None]:
# Read the sampled posts CSV into memory.
df_posts = pd.read_csv(filepath_or_buffer=posts_path)

### Text Cleaning

In [None]:
# NOTE: this cell overwrites df_posts['body'] in place. Re-running it without
# reloading df_posts recomputes 'body urls' from text whose links are already
# stripped.
#
# regex=True is passed explicitly to every str.replace call: pandas >= 2.0
# defaults to literal (non-regex) replacement, which would silently leave the
# text uncleaned. The keyword requires pandas >= 0.23.
expression = r'http\S+'

# Extract all Links
df_posts['body urls'] = df_posts['body'].str.findall(expression)

# Remove all Links
df_posts['body'] = df_posts['body'].str.replace(expression, '', regex=True)

# Remove all non alpha numeric
expression = '[^A-Za-z0-9 ]+'
df_posts['body'] = df_posts['body'].str.replace(expression, '', regex=True)


# Remove double spaces (collapse any run of spaces into a single space)
expression = ' +'
df_posts['body'] = df_posts['body'].str.replace(expression, ' ', regex=True)


### Create and save Word Counts, TFIDF

In [None]:
from nltk.stem import PorterStemmer
from nltk import word_tokenize 
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer

# Single stemmer instance shared by every tokenizer created below.
porter = PorterStemmer()


class PorterTokenizer(object):
    """Callable tokenizer: NLTK ``word_tokenize`` followed by Porter stemming.

    Instances are passed as the ``tokenizer`` argument of the vectorizers.
    """

    def __init__(self):
        # Keep a reference to the bound stem method of the shared stemmer.
        self.porter = porter.stem

    def __call__(self, doc):
        """Return the list of stemmed tokens for the document string ``doc``."""
        stem = self.porter
        return list(map(stem, word_tokenize(doc)))

    
    
# Settings common to the count and TF-IDF vectorizers: same encoding, same
# English stop-word list, and no lower-casing by the vectorizer itself.
shared_vectorizer_options = {
    'encoding': 'utf-8',
    'stop_words': stopwords.words('english'),
    'lowercase': False,
}

# Raw term counts per post.
countvect = CountVectorizer(tokenizer=PorterTokenizer(),
                            **shared_vectorizer_options)

# TF-IDF weights per post.
tfidfvect = TfidfVectorizer(tokenizer=PorterTokenizer(),
                            **shared_vectorizer_options)

In [None]:
# Fit each vectorizer on the cleaned post bodies and build its document-term
# matrix in the same step.
post_bodies = df_posts['body']
posts_counts = countvect.fit_transform(post_bodies)
posts_tfidf = tfidfvect.fit_transform(post_bodies)

In [None]:
# Each matrix is stored under its own folder; the matching vocabulary is
# written next to it with a '_feature_names' suffix.
posts_counts_path = os.path.join(data_directory, 'posts_counts', 'posts_counts')
posts_tfidf_path = os.path.join(data_directory, 'posts_tfidf', 'posts_tfidf')

# Word-count matrix and its vocabulary
counts_feature_names = countvect.get_feature_names()
joblib.dump(posts_counts, posts_counts_path)
joblib.dump(counts_feature_names, posts_counts_path + '_feature_names')

# TF-IDF matrix and its vocabulary
tfidf_feature_names = tfidfvect.get_feature_names()
joblib.dump(posts_tfidf, posts_tfidf_path)
joblib.dump(tfidf_feature_names, posts_tfidf_path + '_feature_names')



In [None]:
# df_posts_counts was never defined in this notebook (it only existed as
# leftover kernel state), so a fresh "Restart & Run All" failed here and in
# the Word2Vec section. Build it the same way as the dense TF-IDF table:
# one row per post, one column per vocabulary term of the count vectorizer.
# The index is taken from df_posts so boolean masks computed on this table
# can be applied to df_posts directly.
df_posts_counts = pd.DataFrame(posts_counts.toarray(),
                               index=df_posts.index,
                               columns=countvect.get_feature_names())
df_posts_counts.shape

In [None]:

# Dense TF-IDF table: one row per post, one column per vocabulary term.
# Columns are labelled from the TF-IDF vectorizer that produced the matrix
# (previously the count vectorizer's vocabulary was used, which only lines
# up as long as both vectorizers are configured and fitted identically).
df_posts_tfidf = pd.DataFrame(posts_tfidf.toarray(),
                              columns=tfidfvect.get_feature_names())


df_posts_tfidf_path = os.path.join(data_directory,'df_posts_tfidf', 'df_posts_tfidf.csv')
df_posts_tfidf.to_csv(df_posts_tfidf_path)



### Word2Vec
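You need to download Google's pre-trained Word2Vec model (`GoogleNews-vectors-negative300.bin`) and place it in `../word2vec_models/` before running this section: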

https://docs.google.com/uc?id=0B7XkCwpI5KDYNlNUTTlSS21pQmM&export=download

In [None]:
from gensim.models import Word2Vec

In [None]:
import gensim

# Load Google's pre-trained Word2Vec model.
word2vec_path = '../word2vec_models/GoogleNews-vectors-negative300.bin'

# gensim >= 1.0 moved the word2vec-format loader to KeyedVectors and
# deprecated Word2Vec.load_word2vec_format; older releases only have the
# Word2Vec class method. Use KeyedVectors when available, else fall back.
_word2vec_loader = getattr(gensim.models, 'KeyedVectors', gensim.models.Word2Vec)
model = _word2vec_loader.load_word2vec_format(word2vec_path, binary=True)

In [None]:
# Query the loaded model: words most similar to 'woman' + 'king' - 'man'.
model.most_similar(positive=['woman', 'king'], negative=['man'])


In [None]:
# Determine common vocab between Posts Vocabulary and Word2Vec Vocab

# gensim >= 4.0 exposes the vocabulary as `key_to_index`; earlier releases
# use `vocab`. Both are mappings keyed by word.
_word2vec_vocab = getattr(model, 'key_to_index', None)
if _word2vec_vocab is None:
    _word2vec_vocab = model.vocab

s1 = set(_word2vec_vocab)
s2 = set(countvect.get_feature_names())

common_vocab = set.intersection(s1, s2)

# Keep only the count columns whose term has a Word2Vec vector.
# `.ix` has been removed from pandas and a set is not a valid indexer, so
# select with `.loc` and a sorted list; sorting also makes the column order
# deterministic.
df_posts_word2vec = df_posts_counts.loc[:, sorted(common_vocab)]



In [None]:
# (rows, columns) of the count table restricted to the common vocabulary.
df_posts_word2vec.shape

In [None]:
# Remove any posts that now have word counts of 0
# NOTE: despite its name, idx_post_remove is True for the posts that are KEPT
# (those with at least one word in the common vocabulary).
idx_post_remove = (df_posts_word2vec.sum(axis = 1) != 0)

# `.loc` replaces the removed `.ix` indexer. The boolean mask is aligned on
# the index, so df_posts and df_posts_word2vec are assumed to share the same
# row index - TODO confirm against how df_posts_counts is built.
df_posts_word2vec_desc = df_posts.loc[idx_post_remove,:]
df_posts_word2vec = df_posts_word2vec.loc[idx_post_remove,:]

# Remove any vocab that have counts of 0
# Sum over all remaining posts. The previous code inspected only the row
# labelled 0, which ignores every other post and raises a KeyError when
# post 0 has been filtered out above.
word2vec_vocab = df_posts_word2vec.columns[df_posts_word2vec.sum(axis = 0) > 0]

In [None]:
# (rows, columns) of the post metadata kept after filtering.
df_posts_word2vec_desc.shape

In [None]:
# Build a (posts x vocabulary x 300) array: entry [i, k, :] is the Word2Vec
# vector of vocabulary word k if post i contains that word, else zeros.
#
# WARNING: this array is dense, so memory grows as posts * vocabulary * 300
# floats; it may not fit in RAM for a large vocabulary.
number_of_posts = df_posts_word2vec.shape[0]
vocab_words = list(df_posts_word2vec.columns)

# Look up each word vector once instead of once per (post, word) pair.
word_vectors = np.zeros((len(vocab_words), 300))
for k, vocab in enumerate(vocab_words):
    word_vectors[k, :] = model[vocab]

# True where a post contains the word (non-zero count).
word_present = df_posts_word2vec.values != 0

word2vec_doc_matrix = np.zeros((number_of_posts,
                                len(vocab_words),
                                300))

for i in range(number_of_posts):
    # Use the counts of post i. The previous loop always read the row
    # labelled 0, so every post received the same matrix.
    present = word_present[i]
    word2vec_doc_matrix[i, present, :] = word_vectors[present]

In [None]:
# Expected shape: (number of posts, vocabulary size, 300).
word2vec_doc_matrix.shape

In [None]:
# Persist the per-post Word2Vec array.
word2vec_doc_matrix_path = os.path.join(
    data_directory, 'word2vec_doc_matrix', 'word2vec_doc_matrix')
joblib.dump(word2vec_doc_matrix, word2vec_doc_matrix_path)

# Companion CSV describing the rows of the array: the post data without
# the (large) 'body' text column.
df_posts_word2vec_desc_path = os.path.join(
    data_directory, 'word2vec_doc_matrix_desc', 'df_posts_word2vec_desc.csv')
posts_without_body = df_posts_word2vec_desc.drop('body', axis=1)
posts_without_body.to_csv(df_posts_word2vec_desc_path)




A document vector (Doc2Vec-style) can be approximated by averaging the Word2Vec vectors of a post's words, or by multiplying each word vector by its TF-IDF weight before averaging:

http://stackoverflow.com/questions/29760935/how-to-get-vector-for-a-sentence-from-the-word2vec-of-tokens-in-sentence

In [None]:
# Average over the vocabulary axis to get one vector per post.
word2vec_doc_matrix_avg = np.mean(word2vec_doc_matrix, axis=1)

In [None]:
# The vocabulary axis (axis 1) has been averaged away.
word2vec_doc_matrix_avg.shape

In [None]:
# Persist the averaged per-post vectors.
word2vec_doc_matrix_avg_path = os.path.join(
    data_directory,
    'word2vec_doc_matrix_avg',
    'word2vec_doc_matrix_avg',
)
joblib.dump(word2vec_doc_matrix_avg, word2vec_doc_matrix_avg_path)