In [None]:
import os
import sys
# Portion of the current working directory that precedes the first
# occurrence of 'DSI_Capstone_Steemit' (the whole working directory when the
# name does not occur in it).  Putting it on sys.path is what lets the
# `from DSI_Capstone_Steemit import ...` statement below resolve.
module_directory = os.getcwd().partition('DSI_Capstone_Steemit')[0]
sys.path.insert(1, module_directory)

from DSI_Capstone_Steemit import(
    ensure_directory,
    ensure_directories
)
import os
import csv


# Directory names handed to ensure_directories.  The posts_counts /
# posts_tfidf entries (and their *_desc counterparts) are the folders this
# notebook writes into further down.
dir_list = [
    'word2vec_doc_matrix',
    'word2vec_doc_matrix_desc',
    'posts_tfidf',
    'posts_counts',
    'word2vec_doc_matrix_avg',
    'word2vec_doc_matrix_avg_tfidf',
    'posts_counts_desc',
    'posts_tfidf_desc',
]
ensure_directories(dir_list)

In [None]:
import pymssql
import pandas as pd
import numpy as np
import os
import re
import joblib
import nltk
from nltk.tokenize import word_tokenize
# Let pandas display up to 500 rows and 500 columns before truncating.
pd.set_option('display.max_rows', 500, 'display.max_columns', 500)

# Location of the input data, relative to the notebook's working directory.
data_directory = '../data/'
posts_path = os.path.join(data_directory, 'sample_data.csv')

### Load Data

In [None]:
# Read the posts CSV located at posts_path into a DataFrame.
df_posts = pd.read_csv(filepath_or_buffer=posts_path)

### Combine multiple updates to articles to get one body per post

In [None]:
# Collapse every (author, permlink) group into one row whose 'body' is the
# concatenation of the distinct body versions recorded for that post.
#  * Only the 'body' column is aggregated; it is the only aggregated column
#    kept below, and the string join cannot work on non-text columns.
#  * drop_duplicates() keeps the versions in first-seen order, so the joined
#    text is reproducible (a set has no defined order).
#  * Missing bodies are skipped instead of making ''.join raise.
combined_body = (
    df_posts
    .groupby(['author', 'permlink'])['body']
    .agg(lambda bodies: ''.join(bodies.dropna().drop_duplicates()))
    .reset_index()
)
# .loc replaces the .ix indexer, which no longer exists in pandas >= 1.0.
combined_body = combined_body.loc[:, ['body', 'author', 'permlink']]

### Remove duplicate rows caused by multiple updates, then merge in the combined body text

In [None]:
# Keep the first row of every (author, permlink) pair and discard its
# per-update 'body'; the combined body built above is merged back in instead.
idx_not_duplicates = ~df_posts.duplicated(['author', 'permlink'])

# .loc replaces the .ix indexer (gone in pandas >= 1.0).  The drop is chained
# rather than done in place so the filtered selection is never mutated.
df_posts = df_posts.loc[idx_not_duplicates, :].drop('body', axis=1)
df_posts = pd.merge(df_posts, combined_body, on=['author', 'permlink'])

In [None]:
# Order the posts from the highest total_payout_value to the lowest.
df_posts.sort_values(
    by=['total_payout_value'],
    ascending=[False],
    inplace=True
)

### Text Cleaning

In [None]:
# Every pattern in this cell is a regular expression.  regex=True is passed
# explicitly to str.replace because pandas >= 2.0 treats the pattern as
# literal text by default, which would silently turn these substitutions
# into no-ops.  Patterns containing backslashes are raw strings so Python
# does not warn about invalid escape sequences.

# Extract all links into their own column, then remove them from the body
expression = r'http\S+'
df_posts['body urls'] = df_posts['body'].str.findall(expression)
df_posts['body'] = df_posts['body'].str.replace(expression, '', regex=True)

# Replace every period with a space
expression = r'\.'
df_posts['body'] = df_posts['body'].str.replace(expression, ' ', regex=True)

# Extract the text enclosed by pairs of '0A0A' markers as headers, then
# replace the markers themselves with a space.
# NOTE(review): '0A0A' is presumably a hex-escaped blank line in the data
# export -- confirm against the raw data.
expression = '0A0A(.*?)0A0A'
df_posts['body headers'] = df_posts['body'].str.findall(expression)

expression = '0A0A'
df_posts['body'] = df_posts['body'].str.replace(expression, ' ', regex=True)

# Extract the text enclosed by pairs of '60' markers as code snippets.
# NOTE(review): '60' is presumably the hex code of a backtick -- confirm.
expression = '60(.*?)60'
df_posts['body code'] = df_posts['body'].str.findall(expression)

# Replace every run of digits (the '60' markers included) with a space
expression = r'\d+'
df_posts['body'] = df_posts['body'].str.replace(expression, ' ', regex=True)

# Remove everything that is not an ASCII letter, a digit or a space
expression = '[^A-Za-z0-9 ]+'
df_posts['body'] = df_posts['body'].str.replace(expression, '', regex=True)

# Remove any capital letter standing by itself: A, B, C, D etc
expression = r'\b[A-Z]\b'
df_posts['body'] = df_posts['body'].str.replace(expression, '', regex=True)

# Collapse runs of spaces into a single space
expression = ' +'
df_posts['body'] = df_posts['body'].str.replace(expression, ' ', regex=True)

# Remove purely numerical tokens of five or more digits.
# NOTE(review): all digits were already replaced by the r'\d+' step above, so
# this pattern can no longer match anything; kept to preserve the original
# sequence of steps -- decide whether it should run before the digit removal.
expression = r'\b[0-9]{5,100}\b'
df_posts['body'] = df_posts['body'].str.replace(expression, '', regex=True)


### Create and save the word-count and TF-IDF matrices

In [None]:
from nltk.stem import PorterStemmer
from nltk import word_tokenize 
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer

# Single stemmer instance shared by every PorterTokenizer.
porter = PorterStemmer()


class PorterTokenizer(object):
    """Callable that splits a document into tokens and stems each one.

    Instances are passed as the ``tokenizer`` argument of the vectorizers
    defined below.
    """

    def __init__(self):
        # Bound ``stem`` method of the module-level PorterStemmer.
        self.porter = porter.stem

    def __call__(self, doc):
        """Return the list of stemmed tokens produced from ``doc``."""
        stem = self.porter
        return list(map(stem, word_tokenize(doc)))

def _shared_vectorizer_options():
    """Build the keyword arguments common to both vectorizers.

    A new tokenizer and a new stop-word list are created on every call, so
    the two vectorizers do not share objects.
    """
    # NOTE(review): tokens are stemmed by PorterTokenizer while the stop-word
    # list is not -- confirm stop-word filtering behaves as intended.
    return dict(
        encoding='utf-8',
        tokenizer=PorterTokenizer(),
        stop_words=stopwords.words('english'),
        lowercase=False,
    )


# Raw term counts and TF-IDF weights, configured identically.
countvect = CountVectorizer(**_shared_vectorizer_options())
tfidfvect = TfidfVectorizer(**_shared_vectorizer_options())

In [None]:
# Fit each vectorizer on the cleaned post bodies and keep the resulting
# document-term matrices.
posts_counts = countvect.fit_transform(raw_documents=df_posts['body'])
posts_tfidf = tfidfvect.fit_transform(raw_documents=df_posts['body'])

In [None]:
def _feature_names(vectorizer):
    """Return the fitted vocabulary terms of ``vectorizer`` as a plain list.

    scikit-learn >= 1.0 provides ``get_feature_names_out`` and 1.2 removed
    ``get_feature_names``; older releases only have the latter.  Whichever is
    available is used, and the result is always a list so the saved artifact
    has the same type on every version.
    """
    names_out = getattr(vectorizer, 'get_feature_names_out', None)
    if names_out is not None:
        return list(names_out())
    return vectorizer.get_feature_names()


# Each matrix is saved under <data_directory>/<name>/<name>; its vocabulary
# is saved next to it with a '_feature_names' suffix.
posts_counts_path = os.path.join(data_directory, 'posts_counts', 'posts_counts')
posts_tfidf_path = os.path.join(data_directory, 'posts_tfidf', 'posts_tfidf')

joblib.dump(posts_counts, posts_counts_path)
joblib.dump(_feature_names(countvect), posts_counts_path + '_feature_names')

joblib.dump(posts_tfidf, posts_tfidf_path)
joblib.dump(_feature_names(tfidfvect), posts_tfidf_path + '_feature_names')



In [None]:
# Show both shapes side by side.  Written as a function call with an explicit
# format string so it prints the same text on Python 2 and Python 3 (the bare
# print statement is a SyntaxError on Python 3).
print('{} {}'.format(df_posts.shape, posts_counts.shape))

### Save the post data that accompanies the count and TF-IDF matrices

In [None]:
# The cleaned text is not written to the description files; every other
# column is.  errors='ignore' keeps this cell re-runnable: once 'body' has
# been dropped, a second run would otherwise raise KeyError.
df_posts.drop('body', axis=1, inplace=True, errors='ignore')

posts_counts_desc_path = os.path.join(data_directory,
                                      'posts_counts_desc',
                                      'posts_counts_desc.csv')

posts_tfidf_desc_path = os.path.join(data_directory,
                                     'posts_tfidf_desc',
                                     'posts_tfidf_desc.csv')

# The same table is written next to each matrix, with identical CSV options.
for desc_path in (posts_counts_desc_path, posts_tfidf_desc_path):
    df_posts.to_csv(desc_path,
                    index=False,
                    quoting=csv.QUOTE_ALL,
                    encoding='utf-8')