# Import

In [1]:
!pip3 install textblob



In [2]:
import re
import os
import numpy as np
import pandas as pd
from string import punctuation
import matplotlib.pyplot as plt
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
import nltk
from nltk.corpus import stopwords
from nltk import word_tokenize
from bs4 import BeautifulSoup
from nltk.stem.snowball import EnglishStemmer

In [None]:
# Download the NLTK data packages this notebook relies on.
# NOTE(review): presumably 'punkt' backs word_tokenize, 'stopwords' backs
# stopwords.words('english') and 'wordnet' backs lemmatization — confirm
# against the installed NLTK/textblob versions.
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

In [None]:
from textblob import Word

In [None]:
# Load the reviews from a CSV file resolved relative to the working directory.
df = pd.read_csv('IMDB-Dataset.csv')

In [None]:
# Preview the first 10 rows of the freshly loaded frame.
df.head(10)

# Dataset clean-up

In [None]:
def remove_html(text):
    """Parse *text* with BeautifulSoup's "html.parser" and return its text content."""
    return BeautifulSoup(text, "html.parser").get_text()

# Removing the square brackets
def remove_between_square_brackets(text):
    """Return *text* with every ``[...]`` span (brackets included) removed.

    A span starts at ``[`` and ends at the first following ``]``; an opening
    bracket with no closing bracket is left untouched.
    """
    # Raw string: '\[' and '\]' are regex escapes, not Python string escapes,
    # and a non-raw literal triggers an invalid-escape-sequence warning.
    return re.sub(r'\[[^]]*\]', '', text)

def remove_special_characters(text, remove_digits=True):
    """Return *text* with everything but ASCII letters, digits and whitespace removed.

    Args:
        text: The string to clean.
        remove_digits: Accepted for backward compatibility but not used;
            digits are always kept.
            NOTE(review): the flag's name and default suggest digits were
            meant to be stripped — confirm before wiring it up, since doing
            so would change the output of every existing call.

    Returns:
        The cleaned string.
    """
    # 'A-Z', not 'A-z': the latter range also covers the ASCII characters
    # between 'Z' and 'a' ('[', '\\', ']', '^', '_', '`'), which would
    # therefore slip through the filter.
    pattern = r'[^a-zA-Z0-9\s]'
    return re.sub(pattern, '', text)

# Removing the noisy text
def clean_text(text):
    """Run *text* through the clean-up steps and return the result.

    The steps run in this order: remove_html, remove_between_square_brackets,
    remove_special_characters (with its default arguments).
    """
    cleaners = (
        remove_html,
        remove_between_square_brackets,
        remove_special_characters,
    )
    for cleaner in cleaners:
        text = cleaner(text)
    return text

def transformations(df, top_n=10):
    """Filter and lemmatize the tokens of every review in ``df['review']``.

    Each entry of the 'review' column is iterated element by element, so it
    is expected to be an iterable of string tokens. The column is overwritten
    in place and ends up holding one space-joined string per review.

    Steps, per review:
      1. drop tokens found in ``string.punctuation``, all-digit tokens and
         English stop words;
      2. drop the ``top_n`` most frequent remaining tokens, counted over the
         whole column;
      3. lemmatize each surviving token with ``Word(...).lemmatize()`` and
         join the results with single spaces.

    Args:
        df: DataFrame with a 'review' column of token sequences.
        top_n: How many of the most frequent tokens to remove (default 10).

    Returns:
        The same DataFrame object, with 'review' replaced.
    """
    # Build the stop-word set once: calling stopwords.words() for every token
    # repeats the same lookup and makes each membership test a list scan.
    stop_words = set(stopwords.words('english'))

    def _keep(token):
        # Punctuation removal, number removal, stopword removal.
        # `token not in punctuation` is a substring test on the punctuation
        # string, exactly as in the original implementation.
        return (
            token not in punctuation
            and not token.isdigit()
            and token not in stop_words
        )

    df['review'] = df['review'].apply(
        lambda words: [x for x in words if _keep(x)]
    )

    # Frequent word removal: count individual tokens across all reviews.
    # Counting the joined review strings instead would rank whole documents,
    # not words, and the filter below would never match a token.
    all_tokens = [x for words in df['review'] for x in words]
    token_counts = pd.Series(all_tokens, dtype=object).value_counts()
    frequent = set(token_counts.head(top_n).index)

    # Lemmatization, then back to one string per review.
    df['review'] = df['review'].apply(
        lambda words: " ".join(
            Word(x).lemmatize() for x in words if x not in frequent
        )
    )
    return df

# Clean every review: clean_text is applied to each entry of the 'review'
# column and the column is overwritten with the results.
df['review'] = df['review'].apply(clean_text)



In [None]:
# One shared English Snowball stemmer, created once and reused for every token.
stemmer = EnglishStemmer()


def stem_words(tokenizedList):
    """Return a new list holding the stem of each token in *tokenizedList*."""
    return [stemmer.stem(token) for token in tokenizedList]

def smarter_tokenize_and_preprocess(text):
    """Tokenize *text* with nltk.word_tokenize and return the stemmed tokens as a list."""
    return stem_words(nltk.word_tokenize(text))


# Replace each review with its list of stemmed tokens.
df['review'] = df['review'].apply(smarter_tokenize_and_preprocess)

In [None]:
# DataFrame.dropna returns a new frame and leaves df untouched, so the result
# has to be assigned back for rows with missing values to actually be dropped.
df = df.dropna()

# Split the dataset

In [None]:
# Preview the first 10 rows again before splitting.
df.head(10)

In [None]:
#positive = df[df['sentiment'] == 'positive']

In [None]:
#negative = df[df['sentiment'] == 'negative']

In [None]:
 # Shuffle once with a fixed seed so the 60/20/20 split is reproducible.
 shuffled = df.sample(frac=1, random_state=42)
 train_end = int(.6 * len(shuffled))
 valid_end = int(.8 * len(shuffled))

 # Plain positional slices give the same three frames as np.split, without
 # going through NumPy's DataFrame handling (deprecated swapaxes path).
 train = shuffled.iloc[:train_end]
 valid = shuffled.iloc[train_end:valid_end]
 test = shuffled.iloc[valid_end:]

 # Features: the vectorizer in the model cell expects one string per document,
 # while the stemming step leaves a list of tokens in each review. Join token
 # lists back into text; reviews that are already strings pass through.
 features = shuffled['review'].apply(
     lambda review: review if isinstance(review, str) else " ".join(review)
 )
 # Labels: 1 for 'positive', 0 otherwise, matching the 1/0 counts reported on
 # the predictions at the end of the notebook.
 labels = (shuffled['sentiment'] == 'positive').astype(int)

 # Names (including the mixed y_/Y_ capitalisation) are exactly the ones the
 # fit / score / predict cells reference.
 X_train, y_train = features.iloc[:train_end], labels.iloc[:train_end]
 X_valid, Y_valid = (features.iloc[train_end:valid_end],
                     labels.iloc[train_end:valid_end])
 X_test, Y_test = features.iloc[valid_end:], labels.iloc[valid_end:]

In [None]:
# Classification pipeline: CountVectorizer token counts fed into a
# logistic-regression classifier.
clf = Pipeline(
    steps=[
        ('preprocessing', CountVectorizer()),
        ('classifier', LogisticRegression(max_iter=2000, dual=False)),
    ]
)
# Fit on the training split; X_train and y_train must already be defined by
# an earlier cell.
clf.fit(X_train, y_train)

In [None]:
# A notebook cell only echoes its last expression, so the validation score
# would be computed and silently discarded; print both scores explicitly.
print(f'Validation accuracy: {clf.score(X_valid, Y_valid)}')
print(f'Test accuracy: {clf.score(X_test, Y_test)}')

In [None]:
# Predicted labels for the test split, kept in p for the summary below.
p = clf.predict(X_test)

In [None]:
# Summarise the predictions; counts assume labels are encoded as 1 (positive)
# and 0 (negative).
print(f'Number of reviews classified as Positive: {list(p).count(1)}')
print(f'Number of reviews classified as Negative: {list(p).count(0)}')