### Load Movie Reviews Data

In [0]:
# Mount Google Drive at /gdrive (Colab-only helper) so the dataset path
# under '/gdrive/My Drive/...' used when loading the data can be resolved.
from google.colab import drive
drive.mount('/gdrive')

In [0]:
# Read the labeled reviews into a pandas DataFrame. The path below is an
# absolute path into the mounted Drive (not a relative one) and points at the
# zipped TSV; change it to wherever your copy of the file lives.
import pandas as pd
movies_df = pd.read_table('/gdrive/My Drive/Great Learning/Statistical NLP/Notebooks/data/labeledTrainData.tsv.zip')

In [0]:
# (rows, columns) of the DataFrame; the row count is the number of reviews
movies_df.shape

In [0]:
# Show 5 randomly chosen reviews. random_state pins the draw so the same rows
# are displayed on every Restart & Run All.
movies_df.sample(n=5, random_state=42)

In [0]:
# Class balance: for each value of 'sentiment', the non-null count of every
# other column.
movies_df.groupby('sentiment').count()

#### Install and import NLTK

In [0]:
!pip install nltk --quiet

In [0]:
import nltk

In [0]:
# Download NLTK's 'punkt' tokenizer data.
# NOTE(review): the stemming/lemmatization helpers in this notebook tokenize
# with str.split() and do not call an NLTK tokenizer - confirm whether this
# download is still needed.
nltk.download('punkt')

### Stemming

In [0]:
from nltk.stem import PorterStemmer

In [0]:
#Function to Stem words
def get_stemmed_text(corpus):
    """Stem every whitespace-separated token of each review.

    Args:
        corpus: Iterable of review strings.

    Returns:
        A list with one string per review, in input order. Each review is
        split with ``str.split()``, every token is passed through
        ``PorterStemmer.stem``, and the stems are re-joined with single
        spaces (so the original spacing is not preserved).
    """
    porter = PorterStemmer()
    stemmed_reviews = []
    for text in corpus:
        stemmed_tokens = map(porter.stem, text.split())
        stemmed_reviews.append(' '.join(stemmed_tokens))
    return stemmed_reviews

Let's apply stemming to the first review

In [0]:
# Raw text of the review at index label 0, shown before stemming for comparison
movies_df.loc[0, 'review']

In [0]:
# Stem only the first review; it is wrapped in a list because
# get_stemmed_text iterates over a collection of reviews.
get_stemmed_text([movies_df.loc[0, 'review']])

Stemming all reviews

In [0]:
# Create a new column holding the stemmed version of every entry in 'review'
# (the original 'review' column is left unchanged).
movies_df['stemmed_review'] = get_stemmed_text(movies_df['review'].tolist())

In [0]:
# Preview the first rows, including the 'stemmed_review' column
movies_df.head()

### Lemmatization

In [0]:
# Download the WordNet corpus needed by WordNetLemmatizer.
# NOTE(review): some NLTK releases may additionally need
# nltk.download('omw-1.4') - confirm against the installed version.
nltk.download('wordnet')

In [0]:
from nltk.stem import WordNetLemmatizer

In [0]:
def get_lemmatized_text(corpus, pos='n'):
    """Lemmatize every whitespace-separated token of each review.

    Args:
        corpus: Iterable of review strings.
        pos: WordNet part-of-speech tag forwarded to
            ``WordNetLemmatizer.lemmatize`` for every token. The default
            ``'n'`` (noun) is the lemmatizer's own default, so existing
            one-argument calls behave exactly as before. Pass e.g. ``'v'``
            to lemmatize verbs or ``'a'`` for adjectives.

    Returns:
        A list with one string per review, in input order. Each review is
        split with ``str.split()``, every token is lemmatized, and the
        lemmas are re-joined with single spaces.
    """
    lemmatizer = WordNetLemmatizer()
    return [
        ' '.join(lemmatizer.lemmatize(word, pos=pos) for word in review.split())
        for review in corpus
    ]

In [0]:
# Lemmatize only the first review; it is wrapped in a list because
# get_lemmatized_text iterates over a collection of reviews.
get_lemmatized_text([movies_df.loc[0, 'review']])

In [0]:
# Create a new column holding the lemmatized version of every entry in
# 'review' (the original 'review' column is left unchanged).
movies_df['lemmatized_review'] = get_lemmatized_text(movies_df['review'].tolist())

In [0]:
# Preview the first rows, including the 'lemmatized_review' column
movies_df.head()

We can use either the lemmatized or the stemmed text for vectorization (instead of the original text).

### Vectorization

In [0]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [0]:
#Using it with regular text
vect = TfidfVectorizer()
vect.fit(movies_df['review'].tolist())
# Vocabulary size. vocabulary_ maps each term to its feature index, so its
# length is the number of features. It is used here instead of
# get_feature_names(), which was removed in scikit-learn 1.2.
len(vect.vocabulary_)

In [0]:
#Using it with Lemmatized text
vect = TfidfVectorizer()
vect.fit(movies_df['lemmatized_review'].tolist())
# Vocabulary size. vocabulary_ maps each term to its feature index, so its
# length is the number of features. It is used here instead of
# get_feature_names(), which was removed in scikit-learn 1.2.
len(vect.vocabulary_)

In [0]:
#Using it with Stemmed text
vect = TfidfVectorizer()
vect.fit(movies_df['stemmed_review'].tolist())
# Vocabulary size. vocabulary_ maps each term to its feature index, so its
# length is the number of features. It is used here instead of
# get_feature_names(), which was removed in scikit-learn 1.2.
len(vect.vocabulary_)