In [40]:
import os
import re
import nltk
import string
import pandas as pd
from collections import Counter
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from sklearn.model_selection import train_test_split

In [3]:
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Pallavi.Saxena\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Pallavi.Saxena\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\Pallavi.Saxena\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [4]:
input_path="data\IMDBDataset.csv"

In [14]:
df = pd.read_csv(input_path)
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [6]:
df.shape

(50000, 2)

In [7]:
df.sentiment.value_counts()

positive    25000
negative    25000
Name: sentiment, dtype: int64

In [15]:
train_df, test_df = train_test_split(df, test_size=0.2, random_state=125)
train_df.shape,test_df.shape

((40000, 2), (10000, 2))

In [16]:
train_df.sentiment.value_counts()

positive    20007
negative    19993
Name: sentiment, dtype: int64

In [8]:
label_map={'positive': 1, 'negative': 0}

In [17]:
train_df['sentiment'] = train_df['sentiment'].map(label_map)
train_df['sentiment'].value_counts()

1    20007
0    19993
Name: sentiment, dtype: int64

In [18]:
train_df.head()

Unnamed: 0,review,sentiment
2451,I have to agree with the previous author's com...,1
29907,Despite an overall pleasing plot and expensive...,0
14936,'Fame' (1980) is brilliant. It's got all these...,1
25058,This is a delightful film. Elizabeth Taylor do...,1
10401,"I believe there are two angles to the story, f...",1


In [20]:
train_df['review'] = train_df['review'].apply(lambda x: x.lower())

In [22]:
# Remove Html Tags
def remove_tags(text):
    p = re.compile(r'<.*?>')
    return p.sub('', text)
train_df['review'] = train_df['review'].apply(lambda x: remove_tags(x))

In [23]:
# Remove links:
def remove_url(txt):
    return re.sub(r'\s*https?://\S+(\s+|$)', '', txt, flags=re.MULTILINE)
train_df['review'] = train_df['review'].apply(lambda x: remove_url(x))


In [25]:
 # Remove hash tag:
def remove_hashtag(txt):
    return re.sub(r'@[A-Za-z0-9]+', '', txt, flags=re.MULTILINE)
train_df['review'] = train_df['review'].apply(lambda x: remove_hashtag(x))

In [26]:
# Remove punctuation
def remove_punc(txt):
    return txt.translate(str.maketrans('', '', string.punctuation))
train_df['review'] = train_df['review'].apply(lambda x: remove_punc(x))

In [27]:
# Remove Special Char
def remove_spchar(text):
    return re.sub('\W+',' ', text)
train_df['review'] = train_df['review'].apply(lambda x: remove_spchar(x))

In [28]:
#Remove Non- ascii characters
def remove_nonascii(txt):
    encoded_string = txt.encode("ascii", "ignore")
    return encoded_string.decode()
train_df['review'] = train_df['review'].apply(lambda x: remove_nonascii(x))

In [None]:
#Remove Stopwords
def remove_stopwords(txt):
    data = txt.split()
    new = [word for word in data if not word in stopwords.words('english')]
    return " ".join(new)
train_df['review'] = train_df['review'].apply(lambda x: remove_stopwords(x))

In [30]:
# Applying lemmatization
def lematize(text):
    wnl = WordNetLemmatizer()
    return " ".join([wnl.lemmatize(i,pos='v') for i in text.split()])
train_df['review'] = train_df['review'].apply(lambda x: lematize(x))

In [35]:
train_df=train_df[train_df['review'].notna()]

In [36]:
train_df.shape

(40000, 2)

In [38]:
def review_preprocess(review):
    """
    Takes in a string of review, then performs the following:
    1. Remove HTML tag from review
    2. Remove URLs from review
    3. Make entire review lowercase
    4. Split the review in words
    5. Remove all punctuation
    6. Remove empty strings from review
    7. Remove all stopwords
    8. Returns a list of the cleaned review after jioning them back to a sentence
    """
    en_stops = set(stopwords.words('english'))
    
    """
    Removing HTML tag from review
    """
    clean = re.compile('<.*?>')
    review_without_tag = re.sub(clean, '', review) 
    
    
    """
    Removing URLs
    """
    review_without_tag_and_url = re.sub(r"http\S+", "", review_without_tag)
    
    review_without_tag_and_url = re.sub(r"www\S+", "", review_without_tag)
    
    """
    Make entire string lowercase
    """
    review_lowercase = review_without_tag_and_url.lower()
    
    """
    Split string into words
    """
    list_of_words = word_tokenize(review_lowercase)
    
    
    """
    Remove punctuation
    Checking characters to see if they are in punctuation
    """

    list_of_words_without_punctuation=[''.join(this_char for this_char in this_string if (this_char in string.ascii_lowercase))for this_string in list_of_words]
     
    
    """
    Remove empty strings
    """
    list_of_words_without_punctuation = list(filter(None, list_of_words_without_punctuation))
    
    
    """
    Remove any stopwords
    """
  
    filtered_word_list = [w for w in list_of_words_without_punctuation if w not in en_stops] 
    
    """
    Returns a list of the cleaned review after jioning them back to a sentence
    """
    return ' '.join(filtered_word_list)


In [51]:
def remove_tags(text):
    p = re.compile(r'<.*?>')
    return p.sub('', text)

def remove_url(txt):
    return re.sub(r'\s*https?://\S+(\s+|$)', '', txt, flags=re.MULTILINE)

def remove_hashtag(txt):
    return re.sub(r'@[A-Za-z0-9]+', '', txt, flags=re.MULTILINE)

def remove_punc(txt):
    return txt.translate(str.maketrans('', '', string.punctuation))

def remove_spchar(text):
    return re.sub('\W+',' ', text)

def remove_nonascii(txt):
    encoded_string = txt.encode("ascii", "ignore")
    return encoded_string.decode()

def remove_stopwords(txt):
    data = txt.split()
    new = [word for word in data if not word in stopwords.words('english')]
    return " ".join(new)

def lematize(text):
    wnl = WordNetLemmatizer()
    return " ".join([wnl.lemmatize(i,pos='v') for i in text.split()])

def preprocess_df(df):
    print(df.shape)

    # Lower all caps
    df['review'] = df['review'].apply(lambda x: x.lower())

    # Remove Html Tags
    df['review'] = df['review'].apply(lambda x: remove_tags(x))

    # Remove links:
    df['review'] = df['review'].apply(lambda x: remove_url(x))

    # Remove hash tag:
    df['review'] = df['review'].apply(lambda x: remove_hashtag(x))

    # Remove punctuation
    df['review'] = df['review'].apply(lambda x: remove_punc(x))

    # Remove Special Char
    df['review'] = df['review'].apply(lambda x: remove_spchar(x))

    #Remove Non- ascii characters
    df['review'] = df['review'].apply(lambda x: remove_nonascii(x))

    #Remove Stopwords
    #df['review'] = df['review'].apply(lambda x: remove_stopwords(x))
    
    # Applying lemmatization
    df['review'] = df['review'].apply(lambda x: lematize(x))

    df = df[df['review'].notna()]
    print(df.shape)

    return df

In [52]:
train_df_cleaned = preprocess_df(train_df)
train_df_cleaned.head()

(40000, 2)
(40000, 2)


Unnamed: 0,review,sentiment
2451,i have to agree with the previous author comme...,1
29907,despite an overall please plot and expensive p...,0
14936,fame 1980 be brilliant its get all these quali...,1
25058,this be a delightful film elizabeth taylor do ...,1
10401,i believe there be two angle to the story firs...,1


In [54]:
unique = set(train_df_cleaned['review'].str.replace('[^a-zA-Z ]', '').str.lower().str.split(' ').sum())

  unique = set(train_df_cleaned['review'].str.replace('[^a-zA-Z ]', '').str.lower().str.split(' ').sum())


In [None]:
unique

In [None]:
" ".join(train_df_cleaned['review']).split()

In [None]:
list(train_df_cleaned['review'].str.split(' ', expand=True).stack().unique())

In [None]:
vocabulary = {}
for word in distinct_words:
    vocabulary[word] = df[df['Text'].str.contains(word)].index.tolist()

In [44]:
def get_data(df, vocab):
    review_dict={'neg':[],'pos':[]}
    for label_type in [0, 1]: 
        clean_df=review_preprocess(df)
        if label_type == 'neg':
            review_dict['neg'].append(clean_review)
        else:
            review_dict['pos'].append(clean_review)
        # Update counts
        vocab.update(clean_review.split())
                        
    return review_dict

In [48]:
df = pd.read_csv(input_path)
label_map={'positive': 1, 'negative': 0}

train_df, test_df = train_test_split(df, test_size=0.2, random_state=125)
train_df.shape,test_df.shape

((40000, 2), (10000, 2))

In [49]:
train_df['sentiment'] = train_df['sentiment'].map(label_map)
train_df['sentiment'].value_counts()

1    20007
0    19993
Name: sentiment, dtype: int64

In [50]:
test_df['sentiment'] = test_df['sentiment'].map(label_map)
test_df['sentiment'].value_counts()

0    5007
1    4993
Name: sentiment, dtype: int64

In [43]:
train_review = train_df['review'].tolist()
test_review = test_df['review'].tolist()
train_review[0:2]

["I have to agree with the previous author's comments about the excellent performances and plot. Started watching this movie by accident...(lazy Sunday afternoon clicking channels to see if anything good was on)...and was mesmerized by Martin Sheen and Emilio Estevez. Wow! Gut wrenching! Kudos to everyone (have always admired Martin Sheen) but was particularly impressed with Emilio! Excellent job of acting and directing...simply superb! So why have I never heard of this movie before? I'll have to spread the news.",
 'Despite an overall pleasing plot and expensive production one wonders how a director can make so many clumsy cultural mistakes. Where were the Japanese wardrobe and cultural consultants? Not on the payroll apparently. <br /><br />A Japanese friend of mine actually laughed out loud at some of the cultural absurdities she watched unfold before her eyes. In a later conversation she said, "Imagine a Finnish director making a movie in Fnnish about the American Civil War using b

In [45]:
vocab = Counter()
train_review_dict=get_data(train_review, vocab)
test_review_dict=get_data(test_review, vocab)

TypeError: expected string or bytes-like object