In [1]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer

In [2]:
df = pd.read_csv('movie_review.csv')

In [3]:
df.head()

Unnamed: 0,fold_id,cv_tag,html_id,sent_id,text,tag
0,0,cv000,29590,0,films adapted from comic books have had plenty...,pos
1,0,cv000,29590,1,"for starters , it was created by alan moore ( ...",pos
2,0,cv000,29590,2,to say moore and campbell thoroughly researche...,pos
3,0,cv000,29590,3,"the book ( or "" graphic novel , "" if you will ...",pos
4,0,cv000,29590,4,"in other words , don't dismiss this film becau...",pos


In [12]:
X = df['text']

In [13]:
X

0        films adapted from comic books have had plenty...
1        for starters , it was created by alan moore ( ...
2        to say moore and campbell thoroughly researche...
3        the book ( or " graphic novel , " if you will ...
4        in other words , don't dismiss this film becau...
                               ...                        
64715    that lack of inspiration can be traced back to...
64716    like too many of the skits on the current inca...
64717    after watching one of the " roxbury " skits on...
64718     bump unsuspecting women , and . . . that's all .
64719    after watching _a_night_at_the_roxbury_ , you'...
Name: text, Length: 64720, dtype: object

In [14]:
vectorizer = CountVectorizer() # build new vectorizer
vectorizer.fit(X)              # extract all words in corpus and prepare features
X = vectorizer.transform(X)    # count words in each text

In [15]:
y = df['tag']

In [16]:
y

0        pos
1        pos
2        pos
3        pos
4        pos
        ... 
64715    neg
64716    neg
64717    neg
64718    neg
64719    neg
Name: tag, Length: 64720, dtype: object

In [17]:
X_train, X_test, y_train, y_test = train_test_split(X, y)
model = LogisticRegression()

In [18]:
model.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [21]:
p_train = model.predict(X_train)
p_test = model.predict(X_test)

In [23]:
accuracy_score(p_train, y_train)


0.8511742892459827

In [24]:
accuracy_score(p_test,y_test)

0.6930160692212608

In [26]:
vectorizer.get_feature_names()[3000:3050]

['baileys',
 'bailing',
 'bails',
 'baily',
 'baio',
 'baird',
 'bait',
 'bakalian',
 'bake',
 'baked',
 'baker',
 'bakersfield',
 'baking',
 'bakker',
 'bakkers',
 'bakshi',
 'bakula',
 'bala',
 'balaban',
 'balance',
 'balanced',
 'balances',
 'balancing',
 'balasko',
 'balbricker',
 'balcony',
 'bald',
 'balderdash',
 'balding',
 'baldly',
 'baldwin',
 'bale',
 'balinski',
 'balk',
 'balki',
 'ball',
 'ballad',
 'ballads',
 'ballard',
 'ballentine',
 'ballerina',
 'ballet',
 'balletic',
 'ballhaus',
 'ballinagra',
 'balliol',
 'ballisitic',
 'ballistic',
 'ballistics',
 'balloon']

In [27]:
new_X = [
    'I hate this movie, it has bad acting and the plot sucks',
    'Beautiful story, suspence and great movie overall. I love it'
]

In [29]:
new_X_v = vectorizer.transform(new_X)

In [30]:
new_X_v

<2x39659 sparse matrix of type '<class 'numpy.int64'>'
	with 19 stored elements in Compressed Sparse Row format>

In [31]:
model.predict(new_X_v)

array(['neg', 'pos'], dtype=object)

In [32]:
vectorizer.get_feature_names()[3000:3010]

['baileys',
 'bailing',
 'bails',
 'baily',
 'baio',
 'baird',
 'bait',
 'bakalian',
 'bake',
 'baked']

In [35]:
model.coef_[0][3000:3010]

array([ 0.12614393,  0.        ,  0.        ,  0.32099007, -0.21106108,
       -0.49162052, -0.44763678, -0.08218003, -0.29988996,  0.1503814 ])