In [2]:
import pymssql
import pandas as pd
import numpy as np
import os
import re
import joblib
import sys 
import sys  
from scipy import sparse
from scipy.sparse import csr_matrix
# sys.path.append("../utils/")

# from utils import *
data_directory = '../data/'

In [3]:
def load_data_and_description(data_type = 'tfidf'):
    # check_for_dir_python_path()

    if data_type == 'tfidf':
        directory = 'posts_tfidf'
        file_name = 'posts_tfidf.pkl'
        feature_names = 'posts_tfidf_feature_names'
        desc_file = 'posts_tfidf_desc.csv'

    elif data_type == 'word2vec':
        file_name = 'word2vec_doc_matrix_avg'
        feature_names = 'word2vec_doc_matrix_avg_feature_names'
        desc_file = 'word2vec_doc_matrix_avg_desc.csv'
    else:
        directory = 'posts_counts'
        file_name = 'posts_counts.pkl'
        feature_names = 'posts_counts_feature_names'
        desc_file = 'posts_counts_desc.csv'

    data_path = os.path.join(data_directory,directory,file_name)
    feature_path = os.path.join(data_directory,directory,feature_names)
    desc_path = os.path.join(data_directory, directory,desc_file)

    data = joblib.load(data_path)
    data_desc = pd.read_csv(desc_path)

    # Do not have feature names for word2vec matrices
    if data_type == 'word2vec':
        feature_names = None
    else:
        feature_names = joblib.load(feature_path)

    return data,feature_names,data_desc

**load data**

In [6]:
data,feature_names,data_desc = load_data_and_description(data_type='posts_tfidf')

In [None]:
data.shape

(111178, 425384)

**load in **

In [7]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.cross_validation import train_test_split



In [8]:
data

<111178x425384 sparse matrix of type '<type 'numpy.int64'>'
	with 15979141 stored elements in Compressed Sparse Row format>

In [10]:
data.shape

(111178, 425384)

In [11]:
rfc = RandomForestClassifier()

**create labels**

In [12]:
labels = data_desc['total_payout_value']
labels = (data_desc['total_payout_value'] >  np.median(data_desc['total_payout_value'])).astype(int)

In [13]:
#double check to make sure they have the same number of records
l, w = data.shape
len(labels) == l

True

In [14]:
X_train, X_test, y_train, y_test = train_test_split(data, labels, test_size=0.33, random_state=42)

**Run Model**

In [15]:
rfc.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=10, n_jobs=1, oob_score=False, random_state=None,
            verbose=0, warm_start=False)

In [16]:
train_score = rfc.score(X_train, y_train)
test_score = rfc.score(X_test, y_test)

In [17]:
train_score

0.97340546926391813

In [18]:
test_score

0.6301343727002644

In [19]:
weights = rfc.feature_importances_

In [20]:
len(weights)

425384

In [44]:
len(feature_names)

425384

In [57]:
sorted(weights)

[0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0

In [62]:
weights_sorted, feature_sorted = zip(*sorted(zip(weights,feature_names),reverse=True))

In [70]:
for i in xrange(25):
    print feature_sorted[i]

The
like
new
find
world
thi
onli
best
make
We
post
wa
seri
love
go
look
Thi
kind
It
interest
get
time
work
peopl
insid
