In [5]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

### Read movie_data

In [6]:
movie_data = pd.read_csv('../data/moviereviews_train.tsv',sep='\t')

In [7]:
print("movie_data is:", type(movie_data))
print("movie_data value type is",type(movie_data.values))
print("movie_data has", movie_data.shape[0], "rows and", movie_data.shape[1], "columns", "\n")
print("the data types for each of the columns in movie_data:")
print(movie_data.dtypes, "\n")
print("the first 10 rows in movie_data:")
print(movie_data.head(5))

movie_data is: <class 'pandas.core.frame.DataFrame'>
data value type is <class 'numpy.ndarray'>
movie_data has 25000 rows and 3 columns 

the data types for each of the columns in movie_data:
id           object
sentiment     int64
review       object
dtype: object 

the first 10 rows in movie_data:
       id  sentiment                                             review
0  5814_8          1  With all this stuff going down at the moment w...
1  2381_9          1  \The Classic War of the Worlds\" by Timothy Hi...
2  7759_3          0  The film starts with a manager (Nicholas Bell)...
3  3630_4          0  It must be assumed that those who praised this...
4  9495_8          1  Superbly trashy and wondrously unpretentious 8...


### Create a `ndarray` for `L` for `movie_data`

In [8]:
L = movie_data["sentiment"]
print(type(L))
print(type(L.values))
print(L.shape)

<class 'pandas.core.series.Series'>
<class 'numpy.ndarray'>
(25000,)


### Create a `ndarray` for `X`


In [14]:
movie_data['word_count'] = movie_data['review'].str.split(' ').str.len()
movie_data['punc_count'] = movie_data['review'].str.count("\.")
X = movie_data[['word_count', 'punc_count']]
# print(movie_data.head(), "\n")

print(movie_data.head(10), "\n")
print(type(X))
print(type(X.values))
print(X.shape)

        id  sentiment                                             review  \
0   5814_8          1  With all this stuff going down at the moment w...   
1   2381_9          1  \The Classic War of the Worlds\" by Timothy Hi...   
2   7759_3          0  The film starts with a manager (Nicholas Bell)...   
3   3630_4          0  It must be assumed that those who praised this...   
4   9495_8          1  Superbly trashy and wondrously unpretentious 8...   
5   8196_8          1  I dont know why people think this is such a ba...   
6   7166_2          0  This movie could have been very good, but come...   
7  10633_1          0  I watched this video at a friend's house. I'm ...   
8    319_1          0  A friend of mine bought this film for £1, and ...   
9  8713_10          1  <br /><br />This movie is full of references. ...   

   word_count  punc_count  
0         433          20  
1         158          16  
2         378          20  
3         379           8  
4         367          

### Fit `SGDClassifier` linear classifier using `gradient descent`

In [15]:
from sklearn import linear_model
sgd = linear_model.SGDClassifier(loss="squared_loss")
sgd.fit(X, L)

SGDClassifier(loss='squared_loss')

In [16]:
print(sum(sgd.predict(X) == L.values))
print(sum(sgd.predict(X) == L.values) / L.shape)

12487
[0.49948]


### Test model predictions

In [17]:
import my_measures

sgd_pm = my_measures.BinaryClassificationPerformance(sgd.predict(X), L, 'sgd')
sgd_pm.compute_measures()
print(sgd_pm.performance_measures)

{'Pos': 12500, 'Neg': 12500, 'TP': 28, 'TN': 12459, 'FP': 41, 'FN': 12472, 'Accuracy': 0.49948, 'Precision': 0.4057971014492754, 'Recall': 0.00224, 'desc': 'sgd'}


### Normalization


In [18]:
X.describe()

Unnamed: 0,word_count,punc_count
count,25000.0,25000.0
mean,233.78624,13.08768
std,173.745845,9.811129
min,10.0,0.0
25%,127.0,7.0
50%,174.0,10.0
75%,284.0,16.0
max,2470.0,149.0


In [20]:
from sklearn import preprocessing
X_normalized = preprocessing.normalize(X)
pd.DataFrame(X_normalized).describe()

Unnamed: 0,0,1
count,25000.0,25000.0
mean,0.997652,0.060862
std,0.004993,0.030993
min,0.651214,0.0
25%,0.997459,0.043267
50%,0.99846,0.05547
75%,0.999064,0.071247
max,1.0,0.758895
