Use the following dataset - https://www.kaggle.com/lakshmi25npathi/imdb-dataset-of-50k-movie-reviews

In [109]:
import pandas as pd
import numpy as np
from string import punctuation

In [110]:
# Problem 1

# Apply all the preprocessing techniques that you think are necessary
# loading first 1000 words as doing task on all 50K rows will take alot time
df = pd.read_csv("IMDB Dataset.csv").iloc[:1000,:]

### Pre-processing steps

1. Removing duplicates
2. text lower
3. remove html tags
4. removing urls
5. expanding abbreviations
6. removing special characters
7. spelling corrections
8. removing stopwards
9. removing punctuations


In [111]:
#remove duplicates
print('no. of rows before duplicate removal',df.shape[0])
df.drop_duplicates(subset='review',inplace=True)
print('no. of rows after duplicate removal',df.shape[0])

no. of rows before duplicate removal 1000
no. of rows after duplicate removal 1000


In [112]:
# convert text into lower
df['review'] = df['review'].str.lower()

In [113]:
# removing html tags and urls
df['review'] = df['review'].str.replace(pat=r'<.*?>',repl='',regex=True)

In [114]:
# removing urls
text = ['https://www.monstershub.com','http://www.iplintelligence.com','https://ipl.com']

import re

for url in text:
    print(re.findall(pattern=r'https?://\S+|www\.\S+', string=url))

['https://www.monstershub.com']
['http://www.iplintelligence.com']
['https://ipl.com']


In [115]:
df['review'] = df['review'].str.replace(pat=r'https?://\S+|www\.\S+', repl='', regex=True)

In [116]:
# removing abbreviations

# expanding abbreviations

def remove_abbreviations(data):
    data = re.sub(r"he's", "he is", data)
    data = re.sub(r"there's", "there is", data)
    data = re.sub(r"We're", "We are", data)
    data = re.sub(r"That's", "That is", data)
    data = re.sub(r"won't", "will not", data)
    data = re.sub(r"they're", "they are", data)
    data = re.sub(r"Can't", "Cannot", data)
    data = re.sub(r"wasn't", "was not", data)
    data = re.sub(r"don\x89Ûªt", "do not", data)
    data= re.sub(r"aren't", "are not", data)
    data = re.sub(r"isn't", "is not", data)
    data = re.sub(r"What's", "What is", data)
    data = re.sub(r"haven't", "have not", data)
    data = re.sub(r"hasn't", "has not", data)
    data = re.sub(r"There's", "There is", data)
    data = re.sub(r"He's", "He is", data)
    data = re.sub(r"It's", "It is", data)
    data = re.sub(r"You're", "You are", data)
    data = re.sub(r"I'M", "I am", data)
    data = re.sub(r"shouldn't", "should not", data)
    data = re.sub(r"wouldn't", "would not", data)
    data = re.sub(r"i'm", "I am", data)
    data = re.sub(r"I\x89Ûªm", "I am", data)
    data = re.sub(r"I'm", "I am", data)
    data = re.sub(r"Isn't", "is not", data)
    data = re.sub(r"Here's", "Here is", data)
    data = re.sub(r"you've", "you have", data)
    data = re.sub(r"you\x89Ûªve", "you have", data)
    data = re.sub(r"we're", "we are", data)
    data = re.sub(r"what's", "what is", data)
    data = re.sub(r"couldn't", "could not", data)
    data = re.sub(r"we've", "we have", data)
    data = re.sub(r"it\x89Ûªs", "it is", data)
    data = re.sub(r"doesn\x89Ûªt", "does not", data)
    data = re.sub(r"It\x89Ûªs", "It is", data)
    data = re.sub(r"Here\x89Ûªs", "Here is", data)
    data = re.sub(r"who's", "who is", data)
    data = re.sub(r"I\x89Ûªve", "I have", data)
    data = re.sub(r"y'all", "you all", data)
    data = re.sub(r"can\x89Ûªt", "cannot", data)
    data = re.sub(r"would've", "would have", data)
    data = re.sub(r"it'll", "it will", data)
    data = re.sub(r"we'll", "we will", data)
    data = re.sub(r"wouldn\x89Ûªt", "would not", data)
    data = re.sub(r"We've", "We have", data)
    data = re.sub(r"he'll", "he will", data)
    data = re.sub(r"Y'all", "You all", data)
    data = re.sub(r"Weren't", "Were not", data)
    data = re.sub(r"Didn't", "Did not", data)
    data = re.sub(r"they'll", "they will", data)
    data = re.sub(r"they'd", "they would", data)
    data = re.sub(r"DON'T", "DO NOT", data)
    data = re.sub(r"That\x89Ûªs", "That is", data)
    data = re.sub(r"they've", "they have", data)
    data = re.sub(r"i'd", "I would", data)
    data = re.sub(r"should've", "should have", data)
    data = re.sub(r"You\x89Ûªre", "You are", data)
    data = re.sub(r"where's", "where is", data)
    data = re.sub(r"Don\x89Ûªt", "Do not", data)
    data = re.sub(r"we'd", "we would", data)
    data = re.sub(r"i'll", "I will", data)
    data = re.sub(r"weren't", "were not", data)
    data = re.sub(r"They're", "They are", data)
    data = re.sub(r"Can\x89Ûªt", "Cannot", data)
    data = re.sub(r"you\x89Ûªll", "you will", data)
    data = re.sub(r"I\x89Ûªd", "I would", data)
    data = re.sub(r"let's", "let us", data)
    data = re.sub(r"it's", "it is", data)
    data = re.sub(r"can't", "cannot", data)
    data = re.sub(r"don't", "do not", data)
    data = re.sub(r"you're", "you are", data)
    data = re.sub(r"i've", "I have", data)
    data = re.sub(r"that's", "that is", data)
    data = re.sub(r"i'll", "I will", data)
    data = re.sub(r"doesn't", "does not",data)
    data = re.sub(r"i'd", "I would", data)
    data = re.sub(r"didn't", "did not", data)
    data = re.sub(r"ain't", "am not", data)
    data = re.sub(r"you'll", "you will", data)
    data = re.sub(r"I've", "I have", data)
    data = re.sub(r"Don't", "do not", data)
    data = re.sub(r"I'll", "I will", data)
    data = re.sub(r"I'd", "I would", data)
    data = re.sub(r"Let's", "Let us", data)
    data = re.sub(r"you'd", "You would", data)
    data = re.sub(r"It's", "It is", data)
    data = re.sub(r"Ain't", "am not", data)
    data = re.sub(r"Haven't", "Have not", data)
    data = re.sub(r"Could've", "Could have", data)
    data = re.sub(r"youve", "you have", data)  
    data = re.sub(r"donå«t", "do not", data)
    
    return data


In [117]:
df['review'] = df['review'].apply(remove_abbreviations)

In [118]:
#spelling correction
from textblob import TextBlob

def spell_correct(text):
    
    return TextBlob(text).correct().string

In [119]:
# df['review'].apply(spell_correct)
# took more than hour and still in processing, therefore commented it out as of now

In [120]:
# removing punctuations
from string import punctuation
def remove_puctuation(text):
    return text.translate(str.maketrans("","",punctuation)) #this method is 20 times faster then above for loop

In [121]:
df['review'] = df['review'].apply(remove_puctuation)

In [122]:
# remove stopwords
from nltk.corpus import stopwords
def remove_stopwords(text):
    words = []
    
    for word in text.split(' '):
        if word not in stopwords.words('english'):
            words.append(word)
    
    return ' '.join(words)

In [123]:
df['review'].apply(remove_stopwords)

0      one reviewers mentioned watching 1 oz episode ...
1      wonderful little production filming technique ...
2      thought wonderful way spend time hot summer we...
3      basically family little boy jake thinks zombie...
4      petter matteis love time money visually stunni...
                             ...                        
995    nothing sacred ask ernie fosselius days everyb...
996    hated hate selfaware pretentious inanity masqu...
997    usually try professional constructive criticiz...
998    like going see film history class something li...
999    like zoology textbook given depiction animals ...
Name: review, Length: 1000, dtype: object

In [132]:
# Problem 2
# Find out the number of words in the entire corpus and also the total number of unique words(vocabulary) using just python

corpus = pd.read_csv('IMDB Dataset.csv').drop_duplicates(subset='review',keep='first')['review'].to_string()

In [133]:
print('len of corpus : ',len(corpus))
vocabulary = set(corpus.split(' '))
print('vocabulary size : ',len(vocabulary))

len of corpus :  2925337
vocabulary size :  88294


In [136]:
 # Problem 3
# Apply One Hot Encoding
_ = pd.get_dummies(df, columns=['review'],drop_first=True)
print(_)

    sentiment  \
0    positive   
1    positive   
2    positive   
3    negative   
4    positive   
..        ...   
995  positive   
996  negative   
997  negative   
998  negative   
999  negative   

     review_1979s tourist trap is a clever unique b thriller that stands out as one of the best of it is kindtravellers stop at a lonely wax museum where the owners mannequins are a little too lifelike for comfortwhile the film has hints of the texas chainsaw massacre tourist trap is mainly a creepy psychological thriller worthy of the twilight zone director david schmoeller gives this movie an atmosphere of darkness and mystery that reaches nightmarish proportions also schmoelloer adds the occasional touch of comic relief to the bizarre happeningsventeran actor chuck connors is the best of the films decent cast pino donaggios music score is excellent having both lyrical and solemn themes that are perfect to the movie a number of the films sequences are quite memorablefor horror and t

In [141]:
# Problem 4
# Apply bag words and find the vocabulary also find the times each word has occured
from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer()

# Fit and transform the 'review' column
bow_matrix = vectorizer.fit_transform(df['review'])

# Create a DataFrame for Bag of Words matrix
bow_df = pd.DataFrame(bow_matrix.toarray(), columns=vectorizer.get_feature_names_out())

# Print the transformed BoW DataFrame

# If you want to concatenate the BoW matrix with the original DataFrame:
result_df = pd.concat([df, bow_df], axis=1)
result_df

Unnamed: 0,review,sentiment,007,02,0510,10,100,1000,10000,100000,...,zooming,zooms,zp,zu,zucker,zulu,zwick,zzzzzzzzzzzzzzzzzz,élan,ísnt
0,one of the other reviewers has mentioned that ...,positive,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,a wonderful little production the filming tech...,positive,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,i thought this was a wonderful way to spend ti...,positive,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,basically there is a family where a little boy...,negative,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,petter matteis love in the time of money is a ...,positive,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,nothing is sacred just ask ernie fosselius the...,positive,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
996,i hated it i hate selfaware pretentious inanit...,negative,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
997,i usually try to be professional and construct...,negative,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
998,if you like me is going to see this in a film ...,negative,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [138]:
df

Unnamed: 0,review,sentiment
0,one of the other reviewers has mentioned that ...,positive
1,a wonderful little production the filming tech...,positive
2,i thought this was a wonderful way to spend ti...,positive
3,basically there is a family where a little boy...,negative
4,petter matteis love in the time of money is a ...,positive
...,...,...
995,nothing is sacred just ask ernie fosselius the...,positive
996,i hated it i hate selfaware pretentious inanit...,negative
997,i usually try to be professional and construct...,negative
998,if you like me is going to see this in a film ...,negative


In [None]:
# Problem 5

# Apply bag of bi-gram and bag of tri-gram and write down your observation about the dimensionality of the vocabulary

In [None]:
# Problem 6
# Apply tf-idf and find out the idf scores of words, also find out the vocabulary.
from sklearn.feature_extraction.text import TfidfVectorizer

# Initialize TfidfVectorizer
tfidf_vectorizer = TfidfVectorizer()

# Fit and transform the 'review' column
tfidf_matrix = tfidf_vectorizer.fit_transform(df['review'])

# Convert the sparse matrix to a DataFrame
tfidf_df = pd.DataFrame(tfidf_matrix.toarray(), columns=tfidf_vectorizer.get_feature_names_out())

# Print the TF-IDF DataFrame
print(tfidf_df)

# Concatenate the TF-IDF matrix with the original DataFrame (optional)
result_df = pd.concat([df, tfidf_df], axis=1)
result_df

AttributeError: 'TfidfVectorizer' object has no attribute 'toarray'

In [147]:
tf_idf_ = TfidfVectorizer()

tf_idf_.fit(df['review'])

print('idf values : ',tf_idf_.idf_)
print('vocabulary : ',tf_idf_.vocabulary_)

idf values :  [7.2156076 7.2156076 7.2156076 ... 7.2156076 7.2156076 7.2156076]
