In [82]:
import numpy as np 
import pandas 
from pattern.en import *
import thinkstats2
import thinkplot
import pattern
from sklearn.linear_model import LogisticRegression as LR
from sklearn.grid_search import GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer as TFIV


%matplotlib inline

In [83]:
import seaborn as sns
# Apply seaborn's default plot styling; color_codes=True remaps matplotlib's
# single-letter color shorthands (e.g. "b", "g") to the seaborn palette.
sns.set(color_codes=True)

In [84]:
# Read the tab-separated training and test sets from the parent directory.
train, test = (
    pandas.read_csv(path, sep='\t')
    for path in ('../train.tsv', '../test.tsv')
)

# Peek at the first five training rows.
train.head()

Unnamed: 0,PhraseId,SentenceId,Phrase,Sentiment
0,1,1,A series of escapades demonstrating the adage ...,1
1,2,1,A series of escapades demonstrating the adage ...,2
2,3,1,A series,2
3,4,1,A,2
4,5,1,series,2


# Use Logistic Regression to train a model.
Inspiration was pulled from this article: https://jessesw.com/NLP-Movie-Reviews/ Logistic Regression is a scikit learn model, documentation for scikit learn can be found here: http://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html

First, clean the data by converting each phrase to lowercase, replacing punctuation marks (periods, commas, slashes, and backslashes) with spaces, and splitting the result into words on whitespace. 

In [85]:
def cleanData(text, punctuation=(".", "/", "\\", ",")):
    """Lowercase ``text``, blank out punctuation, and split it into words.

    Parameters
    ----------
    text : str
        The raw phrase to normalize.
    punctuation : iterable of str, optional
        Substrings that are each replaced by a single space before splitting.
        They are matched against the already-lowercased text. Defaults to
        period, slash, backslash, and comma.

    Returns
    -------
    list of str
        The lowercase, whitespace-separated tokens of ``text``. An empty or
        whitespace-only input yields an empty list.
    """
    # Lowercase once up front; the punctuation replacements never change case.
    lowered = text.lower()
    for mark in punctuation:
        lowered = lowered.replace(mark, " ")
    return lowered.split()

In [86]:
# Ground-truth sentiment labels for the training phrases; this is the target
# the model is fit against.
y_train = train['Sentiment']

In [87]:
# Normalize every phrase with cleanData and re-join its tokens with single
# spaces, so the vectorizer receives one cleaned string per phrase.
traindata = [" ".join(cleanData(phrase)) for phrase in train['Phrase']]
testdata = [" ".join(cleanData(phrase)) for phrase in test['Phrase']]

Next we will use sklearn's TfidfVectorizer class (imported here as TFIV) to vectorize the data, weighting each word by how often it occurs in a phrase and how distinctive it is across the data set. These features will later help the model determine which words are associated with positive reviews and which with negative reviews. Here is the documentation for the TfidfVectorizer class: http://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfVectorizer.html 

As you can see from the documentation, the parameters involved are: 
"min_df": ignores terms whose document frequency (the number of documents they appear in) is below the given threshold. 
"max_features": If not None, build a vocabulary that only consider the top max_features ordered by term frequency across the corpus.
"strip_accents": removes accents and performs other character normalization during preprocessing, using either the 'ascii' or the 'unicode' method. 
"analyzer": could be either words or characters to analyze
"token_pattern": regular expression denoting what constitutes a token; only used when the analyzer is 'word'.
"ngram_range": the lower and upper boundary of the range of n values for the n-grams to extract. An n-gram is a sequence of n consecutive tokens, so instead of analyzing and training only on single words or on entire sentences, the text can also be broken down into short word sequences. More info here: http://www.text-analytics101.com/2014/11/what-are-n-grams.html 
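"use_idf": enables inverse-document-frequency reweighting. The default is True. 
"smooth_idf": smooths the idf weights by adding one to the document frequencies, as if an extra document containing every term in the collection had been seen exactly once. This prevents division by zero. 
"sublinear_tf": applies sublinear term-frequency scaling, i.e. replaces tf with 1 + log(tf).
"stop_words": the words to drop from the tokens before vectorizing; 'english' selects scikit-learn's built-in English stop word list. (The fitted vectorizer's stop_words_ attribute, by contrast, holds the terms that were ignored because they occurred in too many or too few documents, or were cut off by max_features.)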

In [88]:
# TF-IDF vectorizer over word unigrams and bigrams, dropping English stop
# words and any term that appears in fewer than three documents.
# The three flags are passed as real booleans: the API documents them as bool,
# and newer scikit-learn releases validate parameter types instead of
# accepting the integer 1.
tfv = TFIV(
    min_df=3,
    max_features=None,
    strip_accents='unicode',
    analyzer='word',
    token_pattern=r'\w{1,}',
    ngram_range=(1, 2),
    use_idf=True,
    smooth_idf=True,
    sublinear_tf=True,
    stop_words='english',
)

Then we combine the train data and the test data so that the vectorizer is fit on all of the text. After tfv is fit, it transforms the documents into a document-term matrix, which is returned as a sparse matrix. We then separate that matrix back into the training set and the testing set in order to continue with the process.
Finally, we are ready to use Logistic Regression on the vectorized document-term matrix.

In [89]:
# Fit the TF-IDF vocabulary on the training and test phrases together, then
# cut the resulting sparse document-term matrix back into its two parts.
lentrain = len(traindata)
X_all = tfv.fit_transform(traindata + testdata)

X = X_all[:lentrain]       # rows that came from the training phrases
X_test = X_all[lentrain:]  # the remaining rows came from the test phrases

In [90]:
# (rows, columns) of the training document-term matrix.
X.shape

(156060, 89472)

We can see above that the shape of X, the sparse matrix, has more rows than columns.

In [91]:
# Number of training labels; should match the row count of X.
y_train.shape

(156060,)

We can see above that y_train is a 1D array of sentiment values.

In [92]:
# Candidate values for C, the inverse regularization strength. Only one value
# is listed, so the grid search below simply cross-validates that one setting.
grid_values = {'C': [30]}

# GridSearchCV cross-validates the estimator for every parameter combination
# in the grid and, with its default refit behaviour, retrains the best one on
# all of the training data.
# The penalty is spelled 'l2' (lowercase): that is the documented value, and
# current scikit-learn releases reject 'L2'.
# NOTE(review): the scikit-learn docs recommend dual=False when
# n_samples > n_features, which is the case for X here; dual=True is kept so
# the reported results stay comparable.
model_LR = GridSearchCV(LR(penalty='l2', dual=True, random_state=0),
                        grid_values)
model_LR.fit(X, y_train)



GridSearchCV(cv=None, error_score='raise',
       estimator=LogisticRegression(C=1.0, class_weight=None, dual=True, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='L2', random_state=0, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False),
       fit_params={}, iid=True, n_jobs=1, param_grid={'C': [30]},
       pre_dispatch='2*n_jobs', refit=True, scoring=None, verbose=0)

In [93]:
# Cross-validation summary for each parameter combination in the grid:
# mean score, standard deviation of the score, and the parameters used.
# NOTE(review): grid_scores_ was removed in scikit-learn 0.20 in favour of
# cv_results_ -- confirm against the installed version.
model_LR.grid_scores_

[mean: 0.53681, std: 0.00802, params: {'C': 30}]

In [94]:
# The estimator configured with the best-scoring parameter combination from
# the grid search. With a single candidate (C=30) that is the only option.
model_LR.best_estimator_

LogisticRegression(C=30, class_weight=None, dual=True, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='L2', random_state=0, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

Once we have a model, now we can predict the results of the test, put the results in a csv, and submit to kaggle.

In [95]:
# Predict a sentiment class label for every test phrase, pair each label with
# its PhraseId, and write the result to the CSV file submitted to Kaggle.
LR_result = model_LR.predict(X_test)
submission = {"PhraseId": test["PhraseId"], "Sentiment": LR_result}
LR_output = pandas.DataFrame(data=submission)
# quoting=3 is csv.QUOTE_NONE: fields are written without any quoting.
LR_output.to_csv('Logistic_Reg_Proj2.csv', quoting=3, index=False)

After uploading to Kaggle, we discovered that the score was **0.58660**. This is interestingly low, but also close to the mean cross-validation score estimated above. This was just a baseline model using logistic regression with a single value of C. We could perhaps have scored higher if we had explored the `scoring` attribute of GridSearchCV further, for example with scoring = 'roc_auc', which ranks parameter settings by the area under the ROC curve and might have provided a better approximation. Note, however, that plain 'roc_auc' is defined for binary targets, so a five-class problem like this one would need a multiclass variant of that metric. 