<a href="https://colab.research.google.com/github/missmisspa/BDA/blob/main/data_cleaning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# --- Colab environment setup ---------------------------------------------
# Attach Google Drive to this runtime. force_remount=True re-mounts even
# when Drive is already attached.
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

# Libraries used by the rest of this cell
import pandas as pd
import numpy as np
from google.colab import files  # provides files.download() for the result

# --- Load the raw dataset ------------------------------------------------
# Location of the raw CSV on the mounted Drive; adjust to your folder layout.
file_path = "/content/drive/MyDrive/Colab Notebooks/dataset/movies.csv"

try:
    df = pd.read_csv(file_path)
except FileNotFoundError:
    # Report the offending path, then re-raise so the cell stops here
    # instead of continuing without a DataFrame.
    print(f"❌ File not found: {file_path}. Please check the path.")
    raise
else:
    print("✅ Dataset loaded successfully!")

def _strip_text(column, newline_to):
    """Return *column* as stripped text with newlines replaced by *newline_to*.

    Entries that are missing in *column* stay missing (NaN) in the result.
    A bare ``astype(str)`` would turn them into the literal string 'nan',
    which no longer counts as missing for later steps.
    """
    as_text = column.astype(str).str.replace('\n', newline_to, regex=False)
    return as_text.str.strip().where(column.notna())


# Standardize column names (lower-case, no surrounding whitespace)
df.columns = df.columns.str.lower().str.strip()

# Clean 'year' column: keep the first run of four digits in each value.
# Casting to str first lets the .str accessor work even on a numeric column.
# expand=False makes extract return a Series rather than a one-column
# DataFrame; values without a four-digit run end up as <NA>.
df['year'] = pd.to_numeric(
    df['year'].astype(str).str.extract(r'(\d{4})', expand=False),
    errors='coerce',
).astype('Int64')

# Clean 'genre' column (drop newline characters and surrounding whitespace)
df['genre'] = _strip_text(df['genre'], newline_to='')

# Convert 'votes' column to a nullable integer: remove thousands separators,
# then coerce anything unparsable (including missing values) to <NA>.
df['votes'] = pd.to_numeric(
    df['votes'].astype(str).str.replace(',', '', regex=False),
    errors='coerce',
).astype('Int64')

# Clean 'gross' column: strip '$' and ',', rewrite the 'M' suffix as the
# exponent 'e6' so that to_numeric reads the value in millions, and coerce
# anything unparsable to NaN.
gross_text = df['gross'].astype(str).str.replace(r'[$,]', '', regex=True)
gross_text = gross_text.str.replace('M', 'e6', regex=False)
df['gross'] = pd.to_numeric(gross_text, errors='coerce')

# Clean 'stars' column (newlines become single spaces, then trim the ends)
df['stars'] = _strip_text(df['stars'], newline_to=' ')

# Remove duplicated rows in place
df.drop_duplicates(inplace=True)

# Fill gaps in 'rating' and 'runtime' with each column's median.
# A column is skipped when it is absent from the frame or when it holds no
# non-missing value to compute a median from.
for numeric_col in ('rating', 'runtime'):
    if numeric_col not in df.columns:
        continue
    if not df[numeric_col].notna().any():
        continue
    df[numeric_col] = df[numeric_col].fillna(df[numeric_col].median())

# Write the cleaned frame to a CSV in the current working directory,
# leaving out the DataFrame index.
cleaned_file_name = "movies_cleaned.csv"
df.to_csv(path_or_buf=cleaned_file_name, index=False)

# Hand the written file to the browser through Colab's download helper
files.download(cleaned_file_name)

print("✅ Cleaning complete! Download your cleaned file from the link above.")


Mounted at /content/drive
✅ Dataset loaded successfully!


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

✅ Cleaning complete! Download your cleaned file from the link above.
