In [1]:
# Basic imports
import os
import time
import timeit
import numpy as np
import pandas as pd
import scipy
import sklearn
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

# Machine Learning packages
from sklearn import ensemble
from sklearn.feature_selection import chi2, f_classif, SelectKBest 
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score, GridSearchCV, train_test_split
from sklearn.preprocessing import normalize

# Natural Language processing
import nltk
import re
from collections import Counter
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.datasets import fetch_rcv1
from sklearn.feature_extraction.text import TfidfVectorizer

# Clustering packages
import sklearn.cluster as cluster
from sklearn.cluster import KMeans, MeanShift, estimate_bandwidth, SpectralClustering, AffinityPropagation
from scipy.spatial.distance import cdist

# Plotly packages
import plotly as py
import plotly.figure_factory as ff
import plotly.graph_objs as go
from plotly import tools
import cufflinks as cf
import ipywidgets as widgets
from scipy import special
py.offline.init_notebook_mode(connected=True)

In [2]:
# Import the data
# (We only need the input(phrase) and output data(sentiment))
reviews_train = pd.read_csv("movie_reviews/train.tsv", sep="\t",usecols=['Phrase', 'Sentiment'])
reviews_test = pd.read_csv("movie_reviews/test.tsv", sep="\t",usecols=['Phrase'])

In [3]:
# Print the size of the dataframes
print("Training set size: " + str(reviews_train.shape[0]))
print("Testing set size: " + str(reviews_test.shape[0]))

Training set size: 156060
Testing set size: 66292


In [11]:
# Preview the data
pd.options.display.max_colwidth = 100
reviews_train.head(20)

Unnamed: 0,Phrase,Sentiment
0,A series of escapades demonstrating the adage that what is good for the goose is also good for t...,1
1,A series of escapades demonstrating the adage that what is good for the goose,2
2,A series,2
3,A,2
4,series,2
5,of escapades demonstrating the adage that what is good for the goose,2
6,of,2
7,escapades demonstrating the adage that what is good for the goose,2
8,escapades,2
9,demonstrating the adage that what is good for the goose,2


In [5]:
def text_cleaner(text):
    # Visual inspection identifies a form of punctuation spaCy does not
    # recognize: the double dash '--'.  Better get rid of it now!
    text = re.sub(r'--',' ',text)
    text = re.sub("[\[].*?[\]]", "", text)
    text = ' '.join(text.split())
    return text

In [12]:
# Remove annoying punctuation from the articles
pd.options.display.max_colwidth = 200
reviews_train['Phrase'] = reviews_train.Phrase.map(lambda x: text_cleaner(str(x)))
reviews_train.head()

Unnamed: 0,Phrase,Sentiment
0,"A series of escapades demonstrating the adage that what is good for the goose is also good for the gander , some of which occasionally amuses but none of which amounts to much of a story .",1
1,A series of escapades demonstrating the adage that what is good for the goose,2
2,A series,2
3,A,2
4,series,2


In [15]:
lemmatizer = WordNetLemmatizer()

# Reduce all text to their lemmas
for summary in reviews_train['Phrase']:
    summary = lemmatizer.lemmatize(summary)

In [20]:
# Define input and output variables
X = reviews_train['Phrase']
y = reviews_train['Sentiment']

In [28]:
# Parameters for TF-idf vectorizer
vectorizer = TfidfVectorizer(max_df=0.5,
                             min_df=5, 
                             stop_words='english', 
                             lowercase=True, 
                             use_idf=True,
                             norm=u'l2',
                             smooth_idf=True
                            )

#Applying the vectorizer
X_tfidf=vectorizer.fit_transform(X)
print("Number of features: %d" % X_tfidf.get_shape()[1])

#splitting into training and test sets
X_train_tfidf, X_test_tfidf, y_train, y_test = train_test_split(X_tfidf, y, test_size=0.25, random_state=42)

#Removes all zeros from the matrix
X_train_tfidf_csr = X_train_tfidf.tocsr()

#number of paragraphs
n = X_train_tfidf_csr.shape[0]

#A list of dictionaries, one per paragraph
tfidf_bypara = [{} for _ in range(0,n)]

#List of features
terms = vectorizer.get_feature_names()

#for each paragraph, lists the feature words and their tf-idf scores
for i, j in zip(*X_train_tfidf_csr.nonzero()):
    tfidf_bypara[i][terms[j]] = X_train_tfidf_csr[i, j]

# Keep in mind that the log base 2 of 1 is 0, so a tf-idf score of 0 indicates that the word was present once in that sentence.
print('Original sentence:', X[593])
print('Tf_idf vector:', tfidf_bypara[0])

Number of features: 14324
Original sentence: the gambles
Tf_idf vector: {'ago': 0.7638183797306258, 'years': 0.6454312378446532}


In [32]:
# Normalize the data
X_norm = normalize(X_train_tfidf)

# Convert from tf-idf matrix to dataframe
X_normal  = pd.DataFrame(data=X_norm.toarray())

# Preview the data
X_normal.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,14314,14315,14316,14317,14318,14319,14320,14321,14322,14323
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [36]:
# Instantiating and fitting the 150 best features
kbest = SelectKBest(chi2, k=150)
X2_train = kbest.fit_transform(X_normal, y)

ValueError: Found input variables with inconsistent numbers of samples: [117045, 156060]

In [38]:
X_normal.shape

(117045, 14324)

In [40]:
y.shape

(156060,)