## Practice 11 - *Ensembles intro*

*Simple tf-idf + Random forest routine for sentiment analysis*

Kolos Maria BSE-141

In [69]:
import pandas as pd
import numpy as np

#### 1. Reading the training data in order to obtain a list of texts and numpy.array of scores. 

In [70]:
file = open('data/train_feedback.txt')
data=file.readlines()
file.close()

In [71]:
texts=[]
scores=[]
for line in data:
    raw=eval(line)
    texts.append(raw['text'])
    scores.append(raw['score'])
scores=np.array(scores)

#### 2. Obtaining the tf-idf representations via TfidfVectorizer from the sklearn library. Tuning min_df 

In [72]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestRegressor
from sklearn.cross_validation import train_test_split#using python 3.5 and scipy 0.17
from sklearn.metrics import mean_squared_error

train_size=50000
N=len(texts)

def get_samples(min_df):
    vect= TfidfVectorizer(sublinear_tf=True, min_df=min_df, analyzer='word',  stop_words='english')
    matrix=vect.fit_transform(texts)
    train_X, test_X,train_y,test_y=train_test_split(matrix,scores, train_size=train_size/N)
    return train_X, test_X,train_y,test_y

In [None]:
#choosing optimal min_df on default RandomForestRegressor due to computation speed issue
matrix_len=[]
mse_score=[]
frequency=np.arange(0.001, 0.005,0.001)


for freq in frequency:
    train_X, test_X,train_y,test_y=get_samples(freq)
    rfr=RandomForestRegressor(random_state = 42)
    rfr.fit(train_X,train_y)
    res=((mean_squared_error(test_y, rfr.predict(test_X)),freq))
    print(res,matrix.shape[1])
    mse_score.append(res)

#### 3. Randomly splitting the data into training (50000 items) and validation sets. Tuining the model with RandomizedSearchCV

In [26]:
from sklearn.grid_search import RandomizedSearchCV
param_dist = {"max_features": np.linspace(0.01,0.1,10),
              "min_samples_leaf": [2,3,4]}

train_X, test_X,train_y,test_y=train_X, test_X,train_y,test_y=get_samples(0.001)

model = RandomForestRegressor(random_state =42)
n_iter_search = 10
random_search = RandomizedSearchCV(model, param_distributions=param_dist,n_iter=n_iter_search)

random_search.fit(train_X,train_y)


RandomizedSearchCV(cv=None, error_score='raise',
          estimator=RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           n_estimators=10, n_jobs=1, oob_score=False, random_state=42,
           verbose=0, warm_start=False),
          fit_params={}, iid=True, n_iter=10, n_jobs=1,
          param_distributions={'max_features': array([ 0.01,  0.02,  0.03,  0.04,  0.05,  0.06,  0.07,  0.08,  0.09,  0.1 ]), 'min_samples_leaf': [2, 3, 4]},
          pre_dispatch='2*n_jobs', random_state=None, refit=True,
          scoring=None, verbose=0)

Picking up the best parameters (for MSE criterion)

In [35]:
random_search.best_params_ 

{'max_features': 0.070000000000000007, 'min_samples_leaf': 2}

Training the final model with the best parameters and  evaluating the MSE error of the obtained regressor compared to the trivial baseline

In [None]:
train_X, test_X,train_y,test_y=get_samples(0.001)
model = RandomForestRegressor(n_estimators=100,random_state =42,max_features=0.07)
model.fit(train_X,train_y)
print(mean_squared_error(test_y, model.predict(test_X)))

In [44]:
#trival baseline
mean_score=np.mean(train_y)
dummy_predict_y=[mean_score]*len(test_y)

print(mean_squared_error(test_y,dummy_predict_y))

1.43694585811


MSE score reduced nearly twice

#### 4. Checking the interpretability of the obtained regressor. 


In [None]:
importances = model.feature_importances_
std = np.std([tree.feature_importances_ for tree in model.estimators_],
             axis=0)
indices = np.argsort(importances)[::-1]
vocabulary =vect.get_feature_names()

print("10 most important features")

for f in range(10):
    print((vocabulary[indices[f]], importances[indices[f]]))

They seem to be reasonable and they all have strong negative context which is likely to affect the score unambiguously (down)