In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import re

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from nltk.corpus import stopwords
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import train_test_split

import warnings; warnings.simplefilter('ignore')

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

# import os
# for dirname, _, filenames in os.walk('/kaggle/input'):
#     for filename in filenames:
#         print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

Load the dataset.
The dataset contains 50k reviews, evenly split between 25k positive and 25k negative.
The first five reviews are shown below.

In [2]:
# Read the IMDB reviews CSV into a DataFrame and preview its first rows.
DATASET_CSV = "/kaggle/input/imdb-dataset-of-50k-movie-reviews/IMDB Dataset.csv"
reviews = pd.read_csv(DATASET_CSV)
reviews.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [3]:
# Split the frame by sentiment label and print each subset's (rows, columns)
# to confirm that the two classes are balanced.
is_positive = reviews['sentiment'] == 'positive'
is_negative = reviews['sentiment'] == 'negative'
positive = reviews[is_positive]
negative = reviews[is_negative]
for subset in (positive, negative):
    print(subset.shape)

(25000, 2)
(25000, 2)


Check if there is any row which has NaN and if yes, remove it

In [4]:
# Keep only the review text of rows where both the text and its label are present.
# reset_index(drop=True) renumbers the surviving rows 0..n-1; without it the
# Series keeps the original row labels, so positional access such as
# features[0] ... features[len(features) - 1] would raise KeyError (or pick the
# wrong row) as soon as a single row is dropped by the filter.
has_text_and_label = reviews['review'].notnull() & reviews['sentiment'].notnull()
features = reviews.loc[has_text_and_label, 'review'].reset_index(drop=True)

In [5]:
def _clean_review(text):
    """Normalise one raw review into lowercase, space-separated word tokens.

    Steps, in order: replace non-word characters with spaces, drop
    single-letter tokens, collapse whitespace runs, strip a leading ``b``
    token, and lowercase the result.

    Args:
        text: Raw review. It is passed through ``str()`` first, so
            non-string values are accepted.

    Returns:
        The cleaned review as a ``str``.
    """
    # Remove all the special characters
    processed_feature = re.sub(r'\W', ' ', str(text))

    # Remove all single characters that are surrounded by whitespace
    processed_feature = re.sub(r'\s+[a-zA-Z]\s+', ' ', processed_feature)

    # Remove a single character at the start of the text. The caret must be
    # the unescaped start-of-string anchor: an escaped \^ matches a literal
    # '^', which the \W substitution above has already replaced.
    processed_feature = re.sub(r'^[a-zA-Z]\s+', ' ', processed_feature)

    # Substituting multiple spaces with single space
    processed_feature = re.sub(r'\s+', ' ', processed_feature)

    # Removing prefixed 'b'
    processed_feature = re.sub(r'^b\s+', '', processed_feature)

    # Converting to Lowercase
    return processed_feature.lower()


# Iterate over the values themselves rather than looking them up by
# range(len(...)), so the result does not depend on the Series index
# being exactly 0..n-1.
processed_features = [_clean_review(review_text) for review_text in features]

In the script above, we start by removing all the special characters from the reviews. The call re.sub(r'\W', ' ', ...) does that by replacing every non-word character with a space.

Next, we remove the single characters left behind by the previous step, using re.sub(r'\s+[a-zA-Z]\s+', ' ', ...). For instance, replacing the apostrophe in Jack's with a space leaves Jack s. The lone s carries no meaning, so we remove it by replacing every single character that is surrounded by whitespace with a space.

However, replacing single characters with spaces creates runs of multiple spaces. Therefore, we collapse each run of whitespace into a single space using re.sub(r'\s+', ' ', ...). Furthermore, if a text string was originally in bytes format, the character b is prefixed to its string form. The script above removes it using re.sub(r'^b\s+', '', ...).

Finally, the text is converted into lowercase using the lower() function.

# Representing Text in Numeric Form

Statistical algorithms use mathematics to train machine learning models. However, mathematics only works with numbers. To make statistical algorithms work with text, we first have to convert the text to numbers. Three main approaches exist for this: Bag of Words, TF-IDF, and Word2Vec. In this section, we will discuss the Bag of Words and TF-IDF schemes.

**Bag of Words**

Bag of words scheme is the simplest way of converting text to numbers.

For instance, you have three documents:

    Doc1 = "I like to play football"
    Doc2 = "It is a good game"
    Doc3 = "I prefer football over rugby"

In the bag of words approach the first step is to create a vocabulary of all the unique words. For the above three documents, our vocabulary will be:

Vocab = [I, like, to, play, football, it, is, a, good, game, prefer, over, rugby]

The next step is to convert each document into a feature vector using the vocabulary. The length of each feature vector is equal to the length of the vocabulary. The frequency of the word in the document will replace the actual word in the vocabulary. If a word in the vocabulary is not found in the corresponding document, the document feature vector will have zero in that place. For instance, for Doc1, the feature vector will look like this:

[1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0]

**TF-IDF**

In the bag of words approach, each word has the same weight. The idea behind the TF-IDF approach is that words that occur rarely across all the documents but frequently within an individual document contribute more towards classification.

TF-IDF is a combination of two terms. Term frequency and Inverse Document frequency. They can be calculated as:

TF  = (Frequency of a word in the document)/(Total words in the document)

IDF = Log((Total number of docs)/(Number of docs containing the word))

TF-IDF using the Scikit-Learn Library

Luckily for us, Python's Scikit-Learn library contains the TfidfVectorizer class that can be used to convert text features into TF-IDF feature vectors. The following script performs this:

In the code below, we set max_features to 3500, which means that only the 3500 most frequently occurring words are used to create the feature vectors. Words that occur less frequently are not very useful for classification.

Similarly, max_df=0.8 specifies that only words occurring in at most 80% of the documents are used. Words that occur in almost all documents are too common to be useful for classification. Likewise, min_df is set to 7, which means that only words occurring in at least 7 documents are included.

In [6]:
# Turn the cleaned reviews into a dense TF-IDF matrix (one row per review).
# NOTE(review): `processed_features` is rebound here from the list of cleaned
# strings to the numeric matrix, so this cell cannot be re-run on its own
# without re-running the cleaning cell first.
# NOTE(review): the vectorizer is fitted on every review before any
# train/test split happens later in the notebook.
english_stop_words = stopwords.words('english')
vectorizer = TfidfVectorizer(
    max_features=3500,
    min_df=7,
    max_df=0.8,
    stop_words=english_stop_words,
)
processed_features = (
    vectorizer
    .fit_transform(processed_features)
    .toarray()
)

# Divide Dataset into Train And Test Set

In the previous section, we converted the data into the numeric form. As the last step before we train our algorithms, we need to divide our data into training and testing sets. The training set will be used to train the algorithm while the test set will be used to evaluate the performance of the machine learning model.

In [7]:
# Boolean target: True for 'positive', False otherwise.
# The labels are taken through the same not-null filter that was used to build
# `features`, so there is exactly one label per row of the feature matrix even
# if that filter dropped some rows.
labelled_rows = reviews['review'].notnull() & reviews['sentiment'].notnull()
labels = reviews.loc[labelled_rows, 'sentiment'] == 'positive'
assert len(labels) == processed_features.shape[0], "features and labels are misaligned"

# Hold out 20% of the rows for testing; random_state pins the shuffle.
X_train, X_test, y_train, y_test = train_test_split(processed_features, labels, test_size=0.2, random_state=0)

# Training the Model
Once data is split into training and test set, machine learning algorithms can be used to learn from the training data. You can use any machine learning algorithm. However, we will use the Random Forest algorithm, owing to its ability to act upon non-normalized data.

The sklearn.ensemble module contains the RandomForestClassifier class that can be used to train the machine learning model using the random forest algorithm. To do so, we need to call the fit method on the RandomForestClassifier class and pass it our training features and labels, as parameters.

In [8]:
# Fit a random forest on the training split. The settings are collected in one
# place so they are easy to tune; random_state pins the forest's randomness.
rf_settings = {'n_estimators': 200, 'random_state': 0}
text_classifier = RandomForestClassifier(**rf_settings)
text_classifier.fit(X_train, y_train)

RandomForestClassifier(n_estimators=200, random_state=0)

# Making Predictions and Evaluating the Model

Once the model has been trained, the last step is to make predictions on the model. To do so, we need to call the predict method on the object of the RandomForestClassifier class that we used for training.

In [9]:
# Predict a label for every row of the held-out test features with the fitted classifier.
predictions = text_classifier.predict(X_test)

Finally, to evaluate the performance of the machine learning models, we can use classification metrics such as a confusion matrix, F1 measure, accuracy, etc.

To find the values for these metrics, we can use the classification_report, confusion_matrix, and accuracy_score utilities from the sklearn.metrics module.

In [10]:
# Evaluate the random forest on the held-out test split.
# classification_report, confusion_matrix and accuracy_score come from
# sklearn.metrics, imported with the other dependencies in the first cell so
# that later cells using accuracy_score do not depend on this cell having run.
print(confusion_matrix(y_test, predictions))
print(classification_report(y_test, predictions))
print(accuracy_score(y_test, predictions))

[[4307  728]
 [ 771 4194]]
              precision    recall  f1-score   support

       False       0.85      0.86      0.85      5035
        True       0.85      0.84      0.85      4965

    accuracy                           0.85     10000
   macro avg       0.85      0.85      0.85     10000
weighted avg       0.85      0.85      0.85     10000

0.8501


# Train a different model : LogisticRegression

Earlier we have trained a random forest classifier.
Here, we'll train a **Logistic Regression Model** as our sentiment classifier.

To do that, we'll first divide our dataset into **Train, Test and Validation sets**.

In [11]:
# First split: 80% (train + validation) vs 20% test.
# Second split: 75% / 25% of that 80%, i.e. 60% train / 20% validation /
# 20% test of the full dataset.
# random_state is fixed on both calls so the shuffles, and therefore every
# accuracy reported below, are reproducible from one run to the next.
X_train, X_test, y_train, y_test = train_test_split(
    processed_features, labels, train_size=0.8, shuffle=True, random_state=0
)

X_train, X_val, y_train, y_val = train_test_split(
    X_train, y_train, train_size=0.75, shuffle=True, random_state=0
)

# Training the Model over Train and Validation Sets
In the previous cell, we divided the dataset into three parts.
Now, using the train and validation sets, we'll train our logistic regression model.

We'll try several values to find the best value for C.
We'll train one model per value, check which one gives the highest validation accuracy, and then use that value of C to train our final model.

In [12]:
# Sweep the inverse regularisation strength C: fit one logistic regression per
# candidate on the training split and print its accuracy on the validation split.
candidate_cs = [0.01, 0.05, 0.25, 0.5, 1]
for c in candidate_cs:
    lr = LogisticRegression(C=c).fit(X_train, y_train)
    val_accuracy = accuracy_score(y_val, lr.predict(X_val))
    print("Accuracy for C=%s: %s" % (c, val_accuracy))

# Accuracies recorded from an earlier run, kept for reference:
# Accuracy for C=0.01: 0.8255
# Accuracy for C=0.05: 0.8527
# Accuracy for C=0.25: 0.8708
# Accuracy for C=0.5: 0.877
# Accuracy for C=1: 0.8777

Accuracy for C=0.01: 0.8311
Accuracy for C=0.05: 0.8606
Accuracy for C=0.25: 0.879
Accuracy for C=0.5: 0.8813
Accuracy for C=1: 0.8851


# Train Final Model and Calculate Accuracy over Test data

* Now, we'll train the model over best value of C i.e. 1

In [13]:
# Refit logistic regression with the chosen C on the training split and report
# its accuracy on the test split, which the C sweep did not use.
c = 1
lr = LogisticRegression(C=c)
lr.fit(X_train, y_train)
test_accuracy = accuracy_score(y_test, lr.predict(X_test))
print("Accuracy for C=%s: %s" % (c, test_accuracy))
    

Accuracy for C=1: 0.8893
