In [46]:
import pandas as pd
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [47]:
columns = ['sentence', 'sentiment']
df = pd.read_csv('imdb_labelled.txt', sep='\t', header=None)
df.columns = columns
df.head()

Unnamed: 0,sentence,sentiment
0,"A very, very, very slow-moving, aimless movie ...",0
1,Not sure who was more lost - the flat characte...,0
2,Attempting artiness with black & white and cle...,0
3,Very little music or anything to speak of.,0
4,The best scene in the movie was when Gerardo i...,1


In [48]:
tokens = []
for sent in df['sentence']:
    tokens.append(word_tokenize(sent))
    
tokens[0]    

['A',
 'very',
 ',',
 'very',
 ',',
 'very',
 'slow-moving',
 ',',
 'aimless',
 'movie',
 'about',
 'a',
 'distressed',
 ',',
 'drifting',
 'young',
 'man',
 '.']

In [49]:
for i in range(len(tokens)):
    for j in range(len(tokens[i])):
        tokens[i][j] = tokens[i][j].lower()

tokens[0]        

['a',
 'very',
 ',',
 'very',
 ',',
 'very',
 'slow-moving',
 ',',
 'aimless',
 'movie',
 'about',
 'a',
 'distressed',
 ',',
 'drifting',
 'young',
 'man',
 '.']

In [50]:
for i in range(len(tokens)):
    words = []
    for word in tokens[i]:
        if word not in stopwords.words("english"):
            words.append(word)
        tokens[i] = words
tokens[0]        

[',',
 ',',
 'slow-moving',
 ',',
 'aimless',
 'movie',
 'distressed',
 ',',
 'drifting',
 'young',
 'man',
 '.']

In [51]:
wnet = WordNetLemmatizer()
X = df['sentence'].values
y = df['sentiment'].values
X[0],y[0]

('A very, very, very slow-moving, aimless movie about a distressed, drifting young man.  ',
 0)

In [52]:
tfidf = TfidfVectorizer()
X = tfidf.fit_transform(X)


In [53]:
X

X.toarray()

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [54]:
x_train, x_test, y_train, y_test = train_test_split(X,y,test_size = 0.25)

In [55]:
logistic = LogisticRegression()
logistic.fit(x_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)

In [56]:
y_pred = logistic.predict(x_test)

In [57]:
accuracy_score(y_test, y_pred)

0.7754010695187166

In [58]:
review = ["the movie was really boring and i didn't enjoy it",
          "nice movie, worth to watch"]

review_tfidf = tfidf.transform(review)

In [59]:
my_pred = logistic.predict(review_tfidf)

In [61]:
print("0 for negetive review and 1 for positive review ")
my_pred

0 for negetive review and 1 fro positive review 


array([0, 1])