In [58]:
import pandas as pd

movies_df= pd.read_csv("MY_omdb_movie_dataset.csv")

In [62]:
movies_df.drop(columns=['Dominant Topic','emotion_scores','emotion_trust_normalized','emotion_joy_normalized','emotion_positive_normalized','emotion_positive_normalized','emotion_anger_normalized','emotion_disgust_normalized','emotion_fear_normalized','emotion_negative_normalized','emotion_sadness_normalized','emotion_anticipation_normalized','emotion_surprise_normalized'], inplace=True)
movies_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1496 entries, 0 to 1495
Data columns (total 14 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   Title               1496 non-null   object 
 1   Year                1496 non-null   object 
 2   Rated               1482 non-null   object 
 3   Released            1494 non-null   object 
 4   Runtime             1490 non-null   object 
 5   Genre               1496 non-null   object 
 6   Director            1481 non-null   object 
 7   Actors              1494 non-null   object 
 8   Plot                1494 non-null   object 
 9   IMDb Rating         1494 non-null   float64
 10  Sentiment           1496 non-null   object 
 11  Sentiment Score     1496 non-null   float64
 12  Extracted_Keywords  1494 non-null   object 
 13  Processed_Plot      1494 non-null   object 
dtypes: float64(2), object(12)
memory usage: 163.8+ KB


### STEP 1 : PREPROCESSING THE TEXT COLUMNS

### Plot, Processed_Plot, Extracted_Keywords have have already been processed while creating the dataser, therefore these columns have been skipped in this step

In [64]:
import pandas as pd
import re

#FUNCTION TO PREPROCESS TEXT
def preprocess_text(text):
    if isinstance(text, str): #CHECKS IF THE INPUT IS A STRING
        # LOWERCASING
        text = text.lower()
        # TRIMMING WHITESPACE
        text = text.strip()
    return text

# APPLYING PREPROCESSING TO THE TEXT COLUMNS
movies_df['Title'] = movies_df['Title'].apply(preprocess_text)
movies_df['Rated'] = movies_df['Rated'].apply(preprocess_text)
movies_df['Genre'] = movies_df['Genre'].apply(preprocess_text)
movies_df['Director'] = movies_df['Director'].apply(preprocess_text)
movies_df['Actors'] = movies_df['Actors'].apply(preprocess_text)
movies_df['Director'] = movies_df['Director'].apply(preprocess_text)
movies_df['Sentiment'] = movies_df['Sentiment'].apply(preprocess_text)

movies_df.to_csv('movies_preprocess.csv', index=False) 
movies_df.head()

Unnamed: 0,Title,Year,Rated,Released,Runtime,Genre,Director,Actors,Plot,IMDb Rating,Sentiment,Sentiment Score,Extracted_Keywords,Processed_Plot
0,the shawshank redemption,1994,r,14 Oct 1994,142 min,drama,frank darabont,"tim robbins, morgan freeman, bob gunton",A banker convicted of uxoricide forms a friend...,9.3,neutral,0.0,"convict, convicted, innocence, uxoricide, banker",banker convicted uxoricide form friendship qua...
1,forrest gump,1994,pg-13,06 Jul 1994,142 min,"drama, romance",robert zemeckis,"tom hanks, robin wright, gary sinise",The history of the United States from the 1950...,8.8,neutral,0.0,"alabama, reunited, iq, state, history",history united state unfolds perspective alaba...
2,schindler's list,1993,r,04 Feb 1994,195 min,"biography, drama, history",steven spielberg,"liam neeson, ralph fiennes, ben kingsley","In German-occupied Poland during World War II,...",9.0,neutral,0.0,"schindler, nazi, oskar, jewish, germanoccupied",germanoccupied poland world war ii industriali...
3,the godfather,1972,r,24 Mar 1972,175 min,"crime, drama",francis ford coppola,"marlon brando, al pacino, james caan",The aging patriarch of an organized crime dyna...,9.2,neutral,0.0,"dynasty, clandestine, empire, patriarch, crime",aging patriarch organized crime dynasty transf...
4,the green mile,1999,r,10 Dec 1999,189 min,"crime, drama, fantasy",frank darabont,"tom hanks, michael clarke duncan, david morse","A tale set on death row, where gentle giant Jo...",8.6,positive,0.066667,"coffey, tale, john, giant, death",tale set death row gentle giant john coffey po...


### STEP 2 : NORMALIZING THE NUMERICAL COLUMNS


In [66]:
import pandas as pd
from sklearn.preprocessing import MinMaxScaler

# INITIALIZE THE MINMAX SCALER
min_max_scaler = MinMaxScaler()

# DEFINING MY NUMERICAL COLUMNS
numerical_columns_movies = [
    "IMDb Rating", "Sentiment Score"
]

# APPLYING THE MIN-MAX SCALER TO THE NUMERICAL COLUMNS
movies_df[numerical_columns_movies] = min_max_scaler.fit_transform(movies_df[numerical_columns_movies])

# Display the normalized columns
movies_df[['IMDb Rating', 'Sentiment Score']].head()

Unnamed: 0,IMDb Rating,Sentiment Score
0,1.0,0.5
1,0.915254,0.5
2,0.949153,0.5
3,0.983051,0.5
4,0.881356,0.533333


### STEP 4 : CHECKING FOR DUPLICATES

In [68]:
# CHECKING THE NUMBER OF DUPLICATE ROWS
num_duplicates = movies_df.duplicated(subset=['Title', 'Year']).sum()

print(f"Number of duplicate rows: {num_duplicates}")

# REMOVING THE DUPLICATES
movies_df.drop_duplicates(inplace=True)
print("Duplicate rows removed.")

Number of duplicate rows: 270
Duplicate rows removed.


### STEP 5 : DROPPING ROWS WITH EMPTY "Plot" 

In [70]:
import pandas as pd

# COUNTING ROWS WITH EMPTY PLOT
empty_plot_count = movies_df[movies_df['Plot'].isna() | (movies_df['Plot'] == '')].shape[0]

# DISPLAYING THE COUNT OF EMPLTY PLOT ROWS
print(f"Number of rows with empty or NaN Plot: {empty_plot_count}")

# TOTAL NUMBER OF ROWS
print(f"Total number of rows: {len(movies_df)}")

# DROP ROWS WHERE 'Plot' IS EMPTY OR NAN
movies_df = movies_df[movies_df['Plot'].notna() & (movies_df['Plot'] != '')]

# TOTAL NUMBER OF ROWS AFTER DROPPING 
print(f"Number of rows after dropping empty plot: {len(movies_df)}")

Number of rows with empty or NaN Plot: 2
Total number of rows: 1226
Number of rows after dropping empty plot: 1224


In [72]:
movies_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1224 entries, 0 to 1495
Data columns (total 14 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   Title               1224 non-null   object 
 1   Year                1224 non-null   object 
 2   Rated               1213 non-null   object 
 3   Released            1223 non-null   object 
 4   Runtime             1220 non-null   object 
 5   Genre               1224 non-null   object 
 6   Director            1211 non-null   object 
 7   Actors              1223 non-null   object 
 8   Plot                1224 non-null   object 
 9   IMDb Rating         1222 non-null   float64
 10  Sentiment           1224 non-null   object 
 11  Sentiment Score     1224 non-null   float64
 12  Extracted_Keywords  1224 non-null   object 
 13  Processed_Plot      1224 non-null   object 
dtypes: float64(2), object(12)
memory usage: 143.4+ KB


In [36]:
movies_df.head()

Unnamed: 0,Title,Year,Rated,Released,Runtime,Genre,Director,Actors,Plot,IMDb Rating,Sentiment,Sentiment Score,Extracted_Keywords,Processed_Plot
0,the shawshank redemption,1994,r,14 Oct 1994,142 min,drama,frank darabont,"tim robbins, morgan freeman, bob gunton",A banker convicted of uxoricide forms a friend...,1.0,neutral,0.5,"convict, convicted, innocence, uxoricide, banker",banker convicted uxoricide form friendship qua...
1,forrest gump,1994,pg-13,06 Jul 1994,142 min,"drama, romance",robert zemeckis,"tom hanks, robin wright, gary sinise",The history of the United States from the 1950...,0.915254,neutral,0.5,"alabama, reunited, iq, state, history",history united state unfolds perspective alaba...
2,schindler's list,1993,r,04 Feb 1994,195 min,"biography, drama, history",steven spielberg,"liam neeson, ralph fiennes, ben kingsley","In German-occupied Poland during World War II,...",0.949153,neutral,0.5,"schindler, nazi, oskar, jewish, germanoccupied",germanoccupied poland world war ii industriali...
3,the godfather,1972,r,24 Mar 1972,175 min,"crime, drama",francis ford coppola,"marlon brando, al pacino, james caan",The aging patriarch of an organized crime dyna...,0.983051,neutral,0.5,"dynasty, clandestine, empire, patriarch, crime",aging patriarch organized crime dynasty transf...
4,the green mile,1999,r,10 Dec 1999,189 min,"crime, drama, fantasy",frank darabont,"tom hanks, michael clarke duncan, david morse","A tale set on death row, where gentle giant Jo...",0.881356,positive,0.533333,"coffey, tale, john, giant, death",tale set death row gentle giant john coffey po...


In [38]:
movies_df.to_csv("MY_preprocessed_omdb_movie_dataset.csv",  index=False)

In [40]:
import pandas as pd

sample = movies_df
# Display the original 'Released' column for reference
print("Original Released Column:")
print(sample['Released'].head())

# Convert the 'Released' column to datetime and extract the year
sample['Released'] = pd.to_datetime(sample['Released'], errors='coerce').dt.year

# Display the updated 'Released' column
print("\nUpdated Released Column with Year:")
print(sample['Released'].head())

Original Released Column:
0    14 Oct 1994
1    06 Jul 1994
2    04 Feb 1994
3    24 Mar 1972
4    10 Dec 1999
Name: Released, dtype: object

Updated Released Column with Year:
0    1994.0
1    1994.0
2    1994.0
3    1972.0
4    1999.0
Name: Released, dtype: float64


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sample['Released'] = pd.to_datetime(sample['Released'], errors='coerce').dt.year


In [42]:
import pandas as pd

my_mo = movies_df
# Display the original 'Released' column for reference
print("Original Released Column:")
print(my_mo['Released'].head())

# Convert the 'Released' column to datetime
my_mo['Released'] = pd.to_datetime(my_mo['Released'], errors='coerce')

# Check for any NaT values after conversion
print("\nNaT values after conversion:")
print(my_mo['Released'].isna().sum())

# Extract the year from the datetime and convert to integer
my_mo['Released'] = my_mo['Released'].dt.year

# Convert to integer format, preserving NaN values if necessary
my_mo['Released'] = my_mo['Released'].astype('Int64')

# Display the updated 'Released' column
print("\nUpdated Released Column with Year as Integer:")
print(my_mo['Released'].head())

Original Released Column:
0    1994.0
1    1994.0
2    1994.0
3    1972.0
4    1999.0
Name: Released, dtype: float64

NaT values after conversion:
1

Updated Released Column with Year as Integer:
0    1970
1    1970
2    1970
3    1970
4    1970
Name: Released, dtype: Int64


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  my_mo['Released'] = pd.to_datetime(my_mo['Released'], errors='coerce')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  my_mo['Released'] = my_mo['Released'].dt.year
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  my_mo['Released'] = my_mo['Released'].astype('Int64')
