# Data Import

In [4]:
import pandas as pd

In [5]:
# Load the IMDb dataset
df = pd.read_csv('/home/minjoo/minjoo/Unstructured-Data-Analysis/Week 7 - Dimensionality Reduction/data/IMDB Dataset.csv')

In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   review     50000 non-null  object
 1   sentiment  50000 non-null  object
dtypes: object(2)
memory usage: 781.4+ KB


In [7]:
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [8]:
df['review'][0]

"One of the other reviewers has mentioned that after watching just 1 Oz episode you'll be hooked. They are right, as this is exactly what happened with me.<br /><br />The first thing that struck me about Oz was its brutality and unflinching scenes of violence, which set in right from the word GO. Trust me, this is not a show for the faint hearted or timid. This show pulls no punches with regards to drugs, sex or violence. Its is hardcore, in the classic use of the word.<br /><br />It is called OZ as that is the nickname given to the Oswald Maximum Security State Penitentary. It focuses mainly on Emerald City, an experimental section of the prison where all the cells have glass fronts and face inwards, so privacy is not high on the agenda. Em City is home to many..Aryans, Muslims, gangstas, Latinos, Christians, Italians, Irish and more....so scuffles, death stares, dodgy dealings and shady agreements are never far away.<br /><br />I would say the main appeal of the show is due to the fa

In [41]:
df = df[:50000]

In [42]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   review     1000 non-null   object
 1   sentiment  1000 non-null   object
dtypes: object(2)
memory usage: 15.8+ KB


In [28]:
# Separate positive and negative reviews
positive_reviews = df[df['sentiment'] == 'positive']['review'].tolist()
negative_reviews = df[df['sentiment'] == 'negative']['review'].tolist()

In [29]:
positive_reviews[:5]

["One of the other reviewers has mentioned that after watching just 1 Oz episode you'll be hooked. They are right, as this is exactly what happened with me.<br /><br />The first thing that struck me about Oz was its brutality and unflinching scenes of violence, which set in right from the word GO. Trust me, this is not a show for the faint hearted or timid. This show pulls no punches with regards to drugs, sex or violence. Its is hardcore, in the classic use of the word.<br /><br />It is called OZ as that is the nickname given to the Oswald Maximum Security State Penitentary. It focuses mainly on Emerald City, an experimental section of the prison where all the cells have glass fronts and face inwards, so privacy is not high on the agenda. Em City is home to many..Aryans, Muslims, gangstas, Latinos, Christians, Italians, Irish and more....so scuffles, death stares, dodgy dealings and shady agreements are never far away.<br /><br />I would say the main appeal of the show is due to the f

In [30]:
negative_reviews[:5]

["Basically there's a family where a little boy (Jake) thinks there's a zombie in his closet & his parents are fighting all the time.<br /><br />This movie is slower than a soap opera... and suddenly, Jake decides to become Rambo and kill the zombie.<br /><br />OK, first of all when you're going to make a film you must Decide if its a thriller or a drama! As a drama the movie is watchable. Parents are divorcing & arguing like in real life. And then we have Jake with his closet which totally ruins all the film! I expected to see a BOOGEYMAN similar movie, and instead i watched a drama with some meaningless thriller spots.<br /><br />3 out of 10 just for the well playing parents & descent dialogs. As for the shots with Jake: just ignore them.",
 "This show was an amazing, fresh & innovative idea in the 70's when it first aired. The first 7 or 8 years were brilliant, but things dropped off after that. By 1990, the show was not really funny anymore, and it's continued its decline further t

# Data Pre-processing

In [31]:
# "<br>" is an HTML tag for line breaks, which we want to remove during text analysis.
# To remove specific words like <br> from HTML tags, we can use regular expressions (Regex).
# example
import re

# Function to remove <br> tags from text
def remove_br_tags(text):
    # Use regex to match <br> and remove it
    return re.sub(r'<br\s*/?>', '', text)

# Example text with <br> tags
example_text = "This is a review.<br> It contains HTML <br> tags.<br/>Let's remove them."

# Apply the function to remove <br> tags
cleaned_text = remove_br_tags(example_text)
print(cleaned_text)

This is a review. It contains HTML  tags.Let's remove them.


In [32]:
# Function to remove <br> tags from text
def remove_br_tags(text):
    return re.sub(r'<br\s*/?>', '', text)

In [36]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

In [37]:
# Download necessary NLTK data
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to /home/minjoo/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /home/minjoo/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /home/minjoo/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [38]:
# Define the detailed text preprocessing function
def detailed_preprocess(text):
    # 1. Remove <br> tags
    text = remove_br_tags(text)

    # 2. Remove special characters and numbers: Keep only alphabetic words
    text = re.sub(r'[^a-zA-Z\s]', '', text)

    # 3. Lowercase the text
    text = text.lower()

    # 4. Tokenization: Split the text into words (tokens)
    tokens = word_tokenize(text)

    # 5. Remove stopwords: Filter out common words that do not add much value
    stop_words = set(stopwords.words('english'))
    tokens = [word for word in tokens if word not in stop_words]

    # 6. Lemmatization: Reduce words to their base form (lemma)
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(word) for word in tokens]

    # Return the cleaned and processed tokens as a single string for TF-IDF Vectorizer
    return ' '.join(tokens)

In [39]:
# Apply the detailed preprocessing function to all positive and negative reviews
preprocessed_pos_reviews = [detailed_preprocess(doc) for doc in positive_reviews]
preprocessed_neg_reviews = [detailed_preprocess(doc) for doc in negative_reviews]

# Text Representation

In [None]:
# TF-IDF Vectorization for positive reviews
tfidf_vectorizer_pos = TfidfVectorizer(max_features=1000)
X_tfidf_pos = tfidf_vectorizer_pos.fit_transform(preprocessed_pos_reviews)

In [None]:
# TF-IDF Vectorization for negative reviews
tfidf_vectorizer_neg = TfidfVectorizer(max_features=1000)
X_tfidf_neg = tfidf_vectorizer_neg.fit_transform(preprocessed_neg_reviews)