# Import

In [1]:
!pip3 install textblob



In [15]:
import re
import os
import pandas as pd
from string import punctuation
import matplotlib.pyplot as plt
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
import nltk
from nltk.corpus import stopwords
from nltk import word_tokenize
from bs4 import BeautifulSoup
from nltk.stem.snowball import EnglishStemmer

In [3]:
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Gebruiker\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Gebruiker\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Gebruiker\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [4]:
from textblob import Word

In [5]:
df = pd.read_csv ('IMDB-Dataset.csv')

In [6]:
df.head(10)

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive
5,"Probably my all-time favorite movie, a story o...",positive
6,I sure would like to see a resurrection of a u...,positive
7,"This show was an amazing, fresh & innovative i...",negative
8,Encouraged by the positive comments about this...,negative
9,If you like original gut wrenching laughter yo...,positive


# Set Clean-up

In [7]:
def remove_html(text):
    soup = BeautifulSoup(text, "html.parser")
    return soup.get_text()

#Removing the square brackets
def remove_between_square_brackets(text):
    return re.sub('\[[^]]*\]', '', text)

def remove_special_characters(text, remove_digits=True):
    pattern=r'[^a-zA-z0-9\s]'
    text=re.sub(pattern,'',text)
    return text

#Removing the noisy text
def clean_text(text):
    text = remove_html(text)
    text = remove_between_square_brackets(text)
    text = remove_special_characters(text)
    return text

def transformations(df):
    #Punctuation removal
    df['review'] = df['review'].apply(lambda words: [x for x in words if not x in punctuation])
    #Number removal
    df['review'] = df['review'].apply(lambda words: [x for x in words if not x.isdigit()])
    #Stopword removal
    df['review'] = df['review'].apply(lambda words: [x for x in words if x not in stopwords.words('english')])
    #Frequent word removal
    temp = df['review'].apply(lambda words: " ".join(words))
    freq = pd.Series(temp).value_counts()[:10]
    df['review'] = df['review'].apply(lambda words: [x for x in words if x not in freq.keys()])
    #Lemmatization
    df['review'] = df['review'].apply(lambda words: " ".join([Word(x).lemmatize() for x in words]))
    return df

#Apply function on review column
df['review'] = df['review'].apply(clean_text)



In [8]:
stemmer = EnglishStemmer()

def stem_words(tokenizedList):
    stemmedList = []
    for word in tokenizedList:
        stemmedList.append(stemmer.stem(word))
    return stemmedList

def smarter_tokenize_and_preprocess(text):
    tokenizedWords = nltk.word_tokenize(text)
    return stem_words(tokenizedWords)
df['review'] = df['review'].apply(smarter_tokenize_and_preprocess)

In [9]:
df.dropna()

Unnamed: 0,review,sentiment
0,"[one, of, the, other, review, has, mention, th...",positive
1,"[a, wonder, littl, product, the, film, techniq...",positive
2,"[i, thought, this, was, a, wonder, way, to, sp...",positive
3,"[basic, there, a, famili, where, a, littl, boy...",negative
4,"[petter, mattei, love, in, the, time, of, mone...",positive
...,...,...
49995,"[i, thought, this, movi, did, a, down, right, ...",positive
49996,"[bad, plot, bad, dialogu, bad, act, idiot, dir...",negative
49997,"[i, am, a, cathol, taught, in, parochi, elemen...",negative
49998,"[im, go, to, have, to, disagre, with, the, pre...",negative


# Split set

In [10]:
df.head(10)

Unnamed: 0,review,sentiment
0,"[one, of, the, other, review, has, mention, th...",positive
1,"[a, wonder, littl, product, the, film, techniq...",positive
2,"[i, thought, this, was, a, wonder, way, to, sp...",positive
3,"[basic, there, a, famili, where, a, littl, boy...",negative
4,"[petter, mattei, love, in, the, time, of, mone...",positive
5,"[probabl, my, alltim, favorit, movi, a, stori,...",positive
6,"[i, sure, would, like, to, see, a, resurrect, ...",positive
7,"[this, show, was, an, amaz, fresh, innov, idea...",negative
8,"[encourag, by, the, posit, comment, about, thi...",negative
9,"[if, you, like, origin, gut, wrench, laughter,...",positive


In [11]:
#positive = df[df['sentiment'] == 'positive']

In [12]:
#negative = df[df['sentiment'] == 'negative']

In [16]:
X = df.drop(['sentiment'],axis=1).values 
y = df['sentiment'].values

X_train, X_rem, y_train, y_rem = train_test_split(X,y, train_size=0.8)
X_valid, X_test, y_valid, y_test = train_test_split(X_rem,y_rem, test_size=0.5)

In [18]:
#Creating a Pipeline
clf = Pipeline(steps =[
('preprocessing', CountVectorizer()),
('classifier', LogisticRegression(dual=False,max_iter=2000))
])
#Fitting the model
clf.fit(X_train, y_train)

AttributeError: 'numpy.ndarray' object has no attribute 'lower'

In [None]:
clf.score(X_valid, Y_valid)
clf.score(X_test,Y_test)

In [None]:
p = clf.predict(X_test) 

In [None]:
print(f'Number of reviews classified as Poitive: {list(p).count(1)}')
print(f'Number of reviews classified as Negative: {list(p).count(0)}')