In [64]:
from pathlib import Path

import pandas as pd

# Read the raw IMDb metadata table. low_memory=False asks pandas not to
# process the file in chunks while inferring column types.
df2 = pd.read_csv(
    "./original_datasets/imdb.csv",
    low_memory=False,
)

# Report how many rows were loaded
row_count = len(df2)
print("Length of imdb.csv: ", row_count)


Length of imdb.csv:  45466


In [65]:
# Count the rows whose imdb_id is missing
missing_imdb_id = df2['imdb_id'].isnull()
print("Number of null records in imdb_id: ", missing_imdb_id.sum())

# Keep only the rows that carry an imdb_id
df2 = df2.dropna(subset=['imdb_id'])

remaining_rows = len(df2)
print("Length of imdb.csv after remove the null values in imdb_id: ", remaining_rows)

Number of null records in imdb_id:  17
Length of imdb.csv after remove the null values in imdb_id:  45449


In [66]:
# Flag repeated imdb_id values; duplicated() marks every occurrence after the first
repeated_imdb_id = df2['imdb_id'].duplicated()
print("Number of duplicate records in imdb_id: ", len(df2[repeated_imdb_id]))

# Keep the first row seen for each imdb_id and discard the later repeats
df2 = df2.drop_duplicates(subset='imdb_id', keep='first')

remaining_rows = len(df2)
print("Length of imdb.csv after remove the duplicate values in imdb_id: ", remaining_rows)

Number of duplicate records in imdb_id:  32
Length of imdb.csv after remove the duplicate values in imdb_id:  45417


In [67]:
# Print the column labels of df2 to see which fields are available
print(df2.columns)

Index(['adult', 'belongs_to_collection', 'budget', 'genres', 'homepage', 'id',
       'imdb_id', 'original_language', 'original_title', 'overview',
       'popularity', 'poster_path', 'production_companies',
       'production_countries', 'release_date', 'revenue', 'runtime',
       'spoken_languages', 'status', 'tagline', 'title', 'video',
       'vote_average', 'vote_count'],
      dtype='object')


In [68]:
# Extract the year from the 'release_date' column
def extract_year(value):
    """Return the calendar year contained in a release-date value.

    Parameters
    ----------
    value : object
        A bare year such as "2024" or 2024, a date such as "1995-10-30" or
        "8/19/2024", or a missing value (None / NaN / NaT).

    Returns
    -------
    int or None
        The year as a Python int, or None when the value is missing or
        cannot be interpreted as a date.
    """
    if pd.isna(value):  # Missing input: nothing to extract
        return None

    text = str(value).strip()

    # isdecimal() rather than isdigit(): isdigit() is also true for characters
    # such as "²", which int() rejects with a ValueError.
    if text.isdecimal():
        return int(text)

    try:
        parsed = pd.to_datetime(text)
    except Exception:
        # Best-effort parsing: any value pandas cannot read is treated as missing
        return None

    # Strings such as "NaT" parse successfully to NaT, whose .year is NaN
    if pd.isna(parsed):
        return None
    return int(parsed.year)

# Derive a 'year' column by running extract_year over every 'release_date' value.
# NOTE(review): extract_year returns None for unusable dates, so 'year' is
# presumably stored as float with NaN here — confirm if integer years are needed.
release_dates = df2['release_date']
df2['year'] = release_dates.apply(extract_year)

In [69]:
# Count the missing values in the year and original_title columns
required_columns = ['year', 'original_title']
print("No of null values in year column: ", df2['year'].isnull().sum())
print("No of null values in original_title column: ", df2['original_title'].isnull().sum())

# Drop rows missing either value (one call covering both columns)
df2 = df2.dropna(subset=required_columns)

# Row count after the drop
remaining_rows = len(df2)
print("Length of imdb.csv after remove the null values in year and original_title:", remaining_rows)

No of null values in year column:  84
No of null values in original_title column:  0
Length of imdb.csv after remove the null values in year and original_title: 45333


In [70]:
# keep=False flags every member of a duplicate group, so this counts all rows
# that share an (original_title, year) pair with at least one other row
title_year_dupes = df2.duplicated(subset=['original_title', 'year'], keep=False)
print("Duplicate records with the same year and original_title: " ,len(df2[title_year_dupes]))

Duplicate records with the same year and original_title:  42


In [71]:
# Discard every row whose (original_title, year) pair occurs more than once.
# keep=False means no representative of a duplicate group is retained.
is_ambiguous = df2.duplicated(subset=['original_title', 'year'], keep=False)
df2 = df2[~is_ambiguous]

remaining_rows = len(df2)
print("Length ofimdb.csv after remove all duplicates records where year and original_title same: ",remaining_rows)

Length ofimdb.csv after remove all duplicates records where year and original_title same:  45291


In [72]:
# Columns that are not carried into the cleaned dataset
unused_columns = [
    'adult',
    'belongs_to_collection',
    'budget',
    'homepage',
    'id',
    'popularity',
    'production_companies',
    'revenue',
    'runtime',
    'spoken_languages',
    'status',
    'tagline',
    'title',
    'video',
    'vote_average',
    'vote_count',
]
df2 = df2.drop(columns=unused_columns)

# Display the columns that remain
df2.columns

Index(['genres', 'imdb_id', 'original_language', 'original_title', 'overview',
       'poster_path', 'production_countries', 'release_date', 'year'],
      dtype='object')

In [74]:
# Write the cleaned table to disk without the index column. The output
# directory is created first so the write does not fail when it is missing.
output_path = Path("./created_datasets/imdb_clean.csv")
output_path.parent.mkdir(parents=True, exist_ok=True)
df2.to_csv(output_path, index=False)