In [None]:
# Import libraries and modules
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.metrics import classification_report,confusion_matrix,accuracy_score
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
from sklearn import tree
%matplotlib inline

In [None]:
# Load the reviews CSV into the `imdb` DataFrame; the path is relative to the
# notebook's working directory.
imdb = pd.read_csv('IMDB_dataset.csv')
# Last expression of the cell, so the (rows, columns) tuple is displayed.
imdb.shape

In [None]:
# Display the last rows of the data (tail() with no argument)
imdb.tail()

In [None]:
# Summarize the columns of `imdb` with DataFrame.describe()
imdb.describe()

#### Check  distribution of data

In [None]:
# Count the reviews in each sentiment class.
class_dist = imdb.groupby('sentiment').size()
# groupby returns its keys in sorted order, so read the wedge labels from the
# result's index instead of hard-coding an order that may not match the counts.
labels = tuple(class_dist.index)
fig, ax = plt.subplots()
ax.set_title('Class Distribution', y=1.08)
ax.pie(class_dist, labels=labels, autopct='%1.1f%%',
       shadow=False, startangle=90)
# Equal aspect ratio so the pie is drawn as a circle.
ax.axis('equal')

In [None]:
# Import label encoder 
from sklearn import preprocessing 

# LabelEncoder converts the string class labels into integer codes.
label_encoder = preprocessing.LabelEncoder() 

# Encode the labels in column 'sentiment', overwriting the column in place.
# NOTE(review): LabelEncoder numbers classes in sorted order, so presumably
# 'negative' -> 0 and 'positive' -> 1; confirm with label_encoder.classes_.
imdb['sentiment']= label_encoder.fit_transform(imdb['sentiment']) 

# Show the distinct encoded values.
imdb['sentiment'].unique() 


#### Movie reviews vary in length. For example, one review may contain 20 words while another contains 500 words. Below is a visualization of review length.

In [None]:
# Measure each review's length in words (whitespace-separated tokens).
# len() on the raw string counted characters, not words.
lengths = [len(review.split()) for review in imdb['review']]
print(f'Max length of sentence: {max(lengths)}')
print(f'Average length of sentence: {np.mean(lengths)}')

# Distribution of review lengths.
# NOTE(review): sns.distplot is deprecated in recent seaborn releases
# (histplot/displot replace it); kept here so older environments still run.
sns.distplot(lengths)

#### Bag of Words
In order to perform machine learning on text documents, we first need to turn the text content into numerical feature vectors. Text preprocessing, tokenizing and filtering of stopwords are all included in CountVectorizer, which builds a dictionary of features and transforms documents to feature vectors

In [None]:
# Bag-of-words vectorizer configured to drop English stop words.
cv = CountVectorizer(stop_words='english') 

# The input is the 'review' column of `imdb` (one string per review).

# Learn the vocabulary and build the document-term count matrix in one step.
data_cv = cv.fit_transform(imdb['review']) 
 
# Print the resulting matrix.
print(data_cv)

CountVectorizer supports counts of N-grams of words or consecutive characters. Once fitted, the vectorizer has built a dictionary of feature indices:

In [None]:
# cv.vocabulary_ maps each term to its column (feature) index in the count matrix.
# Printing the whole dictionary floods the cell output, so report its size and
# show only a small sample of entries.
print(f'Vocabulary size: {len(cv.vocabulary_)}')
print(dict(list(cv.vocabulary_.items())[:20]))

#### TFIDF
Occurrence count is a good start but there is an issue: longer documents will have higher average count values than shorter documents, even though they might talk about the same topics.

To avoid these potential discrepancies it suffices to divide the number of occurrences of each word in a document by the total number of words in the document: these new features are called tf for Term Frequencies.

Another refinement on top of tf is to downscale weights for words that occur in many documents in the corpus and are therefore less informative than those that occur only in a smaller portion of the corpus.

This downscaling is called tf–idf for “Term Frequency times Inverse Document Frequency”.

Both tf and tf–idf can be computed as follows using TfidfTransformer:

In [None]:
# Re-weight the raw counts in data_cv into TF-IDF features.
tfidf_transformer = TfidfTransformer()
# fit_transform learns the weights from data_cv and applies them in one step.
data_tfidf = tfidf_transformer.fit_transform(data_cv)
# Print the resulting matrix.
print(data_tfidf)

In [None]:
# Import train_test_split
from sklearn.model_selection import train_test_split

# Hold out 30% of the reviews for testing. random_state pins the shuffle so the
# split, and every metric computed from it, is the same on each run.
# NOTE(review): data_cv / data_tfidf were fitted on all rows before this split,
# so the held-out rows contributed to the vocabulary and the TF-IDF weights.
X_train, X_test, y_train, y_test = train_test_split(
    data_tfidf, imdb['sentiment'], test_size=0.30, random_state=42)

### Now that we have our features, we can train a classifier to try to predict the sentiment of a review. Let’s start with a naïve Bayes classifier, which provides a nice baseline for this task. scikit-learn includes several variants of this classifier; the one most suitable for word counts is the multinomial variant:

####  Naive Bayes Classification

In [None]:
# Train the multinomial naive Bayes baseline on the training split.
bayes_clf = MultinomialNB()
bayes_clf.fit(X_train, y_train)

# Predict the held-out split and report the share of correct predictions.
bayes_clf_predicted = bayes_clf.predict(X_test)
print('Test accuracy: %.2f%%' % (100 * accuracy_score(y_test, bayes_clf_predicted)))

In [None]:
# Classification report for the Naive Bayes classifier.
# The report rows follow the order given in `labels`, and target_names is matched
# to them position by position. Elsewhere in this notebook label 1 is treated as
# 'positive' and 0 as 'negative', so the names must be listed negative-first;
# the previous ['Positive','Negative'] order swapped the two rows.
bayes_clf_report=classification_report(y_test,bayes_clf_predicted,labels=[0,1],target_names=['Negative','Positive'])
print(bayes_clf_report)

In [None]:
# Confusion matrix with rows = actual class and columns = predicted class,
# both ordered positive (1) then negative (0).
bayes_clf_cm=confusion_matrix(y_test,bayes_clf_predicted,labels=[1,0])
print(bayes_clf_cm)

# Plot Confusion Matrix for Naive Bayes
bayes_clf_dm = pd.DataFrame(bayes_clf_cm, index=['positive', 'negative'],
                            columns=['positive', 'negative'])
plt.figure(figsize = (10,7))
# fmt='d' prints the counts as whole numbers; the default annotation format
# abbreviates large counts to scientific notation.
sns.heatmap(bayes_clf_dm, annot=True, fmt='d', cmap="OrRd").set(xlabel='Predicted', ylabel='Actual');

####  Linear SVC Classification

In [None]:
# Fit a linear support-vector classifier on the training split.
# random_state seeds the estimator's random number generator so repeated runs
# produce the same model and the same reported accuracy.
LinearSVC_clf = LinearSVC(random_state=42).fit(X_train, y_train)
LinearSVC_clf_predicted = LinearSVC_clf.predict(X_test)
# Share of test reviews whose predicted label equals the true label.
print('Test accuracy: %.2f%%' % (np.mean(LinearSVC_clf_predicted == y_test)*100))

In [None]:
# Classification report for the LinearSVC classifier.
# The report rows follow the order given in `labels`, and target_names is matched
# to them position by position. Elsewhere in this notebook label 1 is treated as
# 'positive' and 0 as 'negative', so the names must be listed negative-first;
# the previous ['Positive','Negative'] order swapped the two rows.
LinearSVC_clf_report=classification_report(y_test,LinearSVC_clf_predicted,labels=[0,1],target_names=['Negative','Positive'])
print(LinearSVC_clf_report)

In [None]:
# Confusion matrix with rows = actual class and columns = predicted class,
# both ordered positive (1) then negative (0).
LinearSVC_clf_cm=confusion_matrix(y_test,LinearSVC_clf_predicted,labels=[1,0])
print(LinearSVC_clf_cm)

# Plot Confusion Matrix
LinearSVC_clf_df_cm = pd.DataFrame(LinearSVC_clf_cm, index=['positive', 'negative'],
                                   columns=['positive', 'negative'])
plt.figure(figsize = (10,7))
# fmt='d' prints the counts as whole numbers; the default annotation format
# abbreviates large counts to scientific notation.
sns.heatmap(LinearSVC_clf_df_cm, annot=True, fmt='d', cmap="OrRd").set(xlabel='Predicted', ylabel='Actual');

####  Decision Tree Classification

In [None]:
# Fit a decision tree on the training split.
# random_state seeds the estimator's random number generator so repeated runs
# grow the same tree and report the same accuracy.
DecisionTree_clf = tree.DecisionTreeClassifier(random_state=42).fit(X_train, y_train)
DecisionTree_clf_predicted = DecisionTree_clf.predict(X_test)
# Share of test reviews whose predicted label equals the true label.
print('Test accuracy: %.2f%%' % (np.mean(DecisionTree_clf_predicted == y_test)*100))

In [None]:
# Classification report for the decision tree classifier.
# The report rows follow the order given in `labels`, and target_names is matched
# to them position by position. Elsewhere in this notebook label 1 is treated as
# 'positive' and 0 as 'negative', so the names must be listed negative-first;
# the previous ['Positive','Negative'] order swapped the two rows.
DecisionTree_clf_report=classification_report(y_test,DecisionTree_clf_predicted,labels=[0,1],target_names=['Negative','Positive'])
print(DecisionTree_clf_report)

In [None]:
# Confusion matrix with rows = actual class and columns = predicted class,
# both ordered positive (1) then negative (0).
DecisionTree_clf_cm=confusion_matrix(y_test,DecisionTree_clf_predicted,labels=[1,0])
print(DecisionTree_clf_cm)

# Plot Confusion Matrix
DecisionTree_clf_df_cm = pd.DataFrame(DecisionTree_clf_cm, index=['positive', 'negative'],
                                      columns=['positive', 'negative'])
plt.figure(figsize = (10,7))
# fmt='d' prints the counts as whole numbers; the default annotation format
# abbreviates large counts to scientific notation.
sns.heatmap(DecisionTree_clf_df_cm, annot=True, fmt='d', cmap="OrRd").set(xlabel='Predicted', ylabel='Actual');

#### Word Cloud

In [None]:
from wordcloud import WordCloud,STOPWORDS

# Word cloud built from the text of the single review at row label 1
# (presumably a positive review - confirm against its sentiment value).
positive_text = imdb['review'][1]
plt.figure(figsize=(10, 10))
WC = WordCloud(
    width=1000,
    height=500,
    max_words=500,
    min_font_size=5,
)
positive_words = WC.generate(positive_text)
plt.imshow(positive_words, interpolation='bilinear')

In [None]:
# Word cloud built from the text of the single review at row label 5052
# (presumably a negative review - confirm against its sentiment value).
plt.figure(figsize=(10,10))
negative_text=imdb.review[5052]
WC=WordCloud(width=1000,height=500,max_words=500,min_font_size=5)
negative_words=WC.generate(negative_text)
plt.imshow(negative_words,interpolation='bilinear')
# plt.show has to be called; the bare attribute reference did nothing.
plt.show()

In [None]:
import nltk
from nltk.stem.porter import PorterStemmer
from nltk.tokenize.toktok import ToktokTokenizer
from bs4 import BeautifulSoup
import re,string,unicodedata

In [None]:
# English stopword list from the NLTK corpus
stopword_list = nltk.corpus.stopwords.words('english')
# Toktok tokenizer instance used to split review text into tokens
tokenizer = ToktokTokenizer()

In [None]:
def strip_html(text):
    """Parse `text` with BeautifulSoup's "html.parser" and return its text content."""
    return BeautifulSoup(text, "html.parser").get_text()

def remove_between_square_brackets(text):
    """Return `text` with every `[...]` span (brackets included) removed.

    A span runs from a '[' to the first ']' that follows it.
    """
    # Raw string: in a normal string literal '\[' is an invalid escape sequence,
    # which newer Python versions warn about.
    return re.sub(r'\[[^]]*\]', '', text)

def remove_special_characters(text, remove_digits=True):
    """Return `text` with everything except ASCII letters, digits and whitespace removed.

    NOTE(review): `remove_digits` is accepted for backward compatibility but is
    not consulted; digits are always kept, as they were before this change.
    """
    # The range must be A-Z: the former 'A-z' range also spans the characters
    # between 'Z' and 'a' in ASCII ([ \ ] ^ _ `), so those were never removed.
    pattern = r'[^a-zA-Z0-9\s]'
    return re.sub(pattern, '', text)

def simple_stemmer(text):
    """Stem each whitespace-separated word of `text` with a Porter stemmer.

    The stemmed words are joined back together with single spaces.
    """
    stemmer = PorterStemmer()
    return ' '.join(stemmer.stem(token) for token in text.split())

def denoise_text(text):
    """Run `text` through the cleaning steps in order and return the result.

    Steps: strip HTML, drop bracketed spans, drop special characters, stem.
    """
    pipeline = (
        strip_html,
        remove_between_square_brackets,
        remove_special_characters,
        simple_stemmer,
    )
    for step in pipeline:
        text = step(text)
    return text
# Apply denoise_text to every review, overwriting the 'review' column in place.
# NOTE(review): because the column is overwritten, re-running this cell cleans
# text that has already been cleaned.
imdb['review']=imdb['review'].apply(denoise_text)

In [None]:
from nltk.corpus import stopwords
# Build the set of English stopwords from the NLTK corpus and print it.
# NOTE(review): `stop` is not read by remove_stopwords in this cell, which
# looks tokens up in stopword_list instead.
stop=set(stopwords.words('english'))
print(stop)

def remove_stopwords(text, is_lower_case=False):
    """Return `text` with stopwords removed.

    The text is split with the module-level `tokenizer`, each token is stripped
    of surrounding whitespace, and tokens found in the module-level
    `stopword_list` are dropped. The remaining tokens are joined with spaces.

    Parameters
    ----------
    text : str
        Text to filter.
    is_lower_case : bool, default False
        If True, tokens are compared as-is; otherwise each token is lower-cased
        for the comparison only (kept tokens retain their original case).
    """
    # A set gives constant-time membership tests; scanning the list for every
    # token was the dominant cost of this function.
    stopword_set = set(stopword_list)
    tokens = [token.strip() for token in tokenizer.tokenize(text)]
    if is_lower_case:
        filtered_tokens = [token for token in tokens if token not in stopword_set]
    else:
        filtered_tokens = [token for token in tokens if token.lower() not in stopword_set]
    return ' '.join(filtered_tokens)
# Apply remove_stopwords to every review, overwriting the 'review' column in place.
# NOTE(review): the reviews were already stemmed by denoise_text before this
# step, so stopwords whose stem differs from the original word may no longer
# match stopword_list - confirm the intended order of the two steps.
imdb['review']=imdb['review'].apply(remove_stopwords)

In [None]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing import sequence
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation
from keras.layers import Embedding
from keras.layers import Conv1D, GlobalMaxPooling1D

# set parameters:
# max_features is passed to the Tokenizer below as num_words; the remaining
# values are referenced only by the commented-out model code later in this cell.
max_features = 6000
maxlen = 130
batch_size = 32
embedding_dims = 50
filters = 250
kernel_size = 3
hidden_dims = 250
epochs = 1


# NOTE(review): this rebinds the global `tokenizer`, which earlier held the
# ToktokTokenizer that remove_stopwords calls .tokenize() on. Re-running the
# stopword-removal cell after this one will presumably fail - confirm, and
# consider a distinct name for the Keras tokenizer.
tokenizer = Tokenizer(num_words=max_features)
# Fit the Keras tokenizer on the cleaned review texts.
tokenizer.fit_on_texts(imdb['review'])
# list_tokenized_train = tokenizer.texts_to_sequences(imdb['review'])


# review = pad_sequences(list_tokenized_train, maxlen=maxlen)
# sentiment = imdb.sentiment

# x_train, x_test, y_train, y_test = train_test_split(review, sentiment, test_size=0.30)

# print(len(x_train), 'train sequences')
# print(len(x_test), 'test sequences')

# print('Build model...')
# model = Sequential()

# # we start off with an efficient embedding layer which maps
# # our vocab indices into embedding_dims dimensions
# model.add(Embedding(max_features,
#                     embedding_dims,
#                     input_length=maxlen))
# model.add(Dropout(0.2))

# # we add a Convolution1D, which will learn filters
# # word group filters of size filter_length:
# model.add(Conv1D(filters,
#                  kernel_size,
#                  padding='valid',
#                  activation='relu',
#                  strides=1))
# # we use max pooling:
# model.add(GlobalMaxPooling1D())

# # We add a vanilla hidden layer:
# model.add(Dense(hidden_dims))
# model.add(Dropout(0.2))
# model.add(Activation('relu'))

# # We project onto a single unit output layer, and squash it with a sigmoid:
# model.add(Dense(1))
# model.add(Activation('sigmoid'))

# model.compile(loss='binary_crossentropy',
#               optimizer='adam',
#               metrics=['accuracy'])
# model.fit(x_train, y_train,
#           batch_size=batch_size,
#           epochs=epochs,
#           validation_data=(x_test, y_test))

In [None]:
# Preview the first rows of `imdb` after the cleaning steps in the earlier cells
imdb.head()

In [None]:
# from keras.preprocessing.text import Tokenizer
# from keras.preprocessing.sequence import pad_sequences
# from keras.layers import Dense , Input , LSTM , Embedding, Dropout , Activation, GRU, Flatten
# from keras.layers import Bidirectional, GlobalMaxPool1D
# from keras.models import Model, Sequential
# from keras.layers import Convolution1D
# from keras import initializers, regularizers, constraints, optimizers, layers

# max_features = 5000
# tokenizer = Tokenizer(num_words=max_features)
# tokenizer.fit_on_texts(imdb['review'])
# list_tokenized_train = tokenizer.texts_to_sequences(imdb['review'])

# maxlen = 130
# X_t = pad_sequences(list_tokenized_train, maxlen=maxlen)
# y = imdb['sentiment']

# embed_size = 128
# model = Sequential()
# model.add(Embedding(max_features, embed_size))
# model.add(Bidirectional(LSTM(32, return_sequences = True)))
# model.add(GlobalMaxPool1D())
# model.add(Dense(20, activation="relu"))
# model.add(Dropout(0.05))
# model.add(Dense(1, activation="sigmoid"))
# model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# batch_size = 100
# epochs = 1
# model.fit(X_t,y, batch_size=batch_size, epochs=epochs, validation_split=0.2)