In [19]:
import os
import pandas as pd

# Folder that holds the per-genre CSV files to merge.
# NOTE(review): absolute, machine-specific path — consider making it configurable.
folder = r"C:\Users\User\Desktop\IMDb_Scraper_Project"

# CSV files written by this notebook itself. If the notebook's working
# directory is `folder`, a plain "*.csv" filter would pick them up on a re-run
# and merge the previous results back into the data, duplicating rows.
generated_files = {"merged_imdb_data.csv", "merged_movies_cleaned.csv"}

# os.listdir() returns entries in arbitrary order; sorting keeps the row order
# of the merged file stable between runs.
files = sorted(
    f for f in os.listdir(folder)
    if f.endswith(".csv") and f not in generated_files
)
if not files:
    # pd.concat([]) would fail with a message that does not mention the folder.
    raise ValueError(f"No CSV files to merge were found in {folder!r}")

dataframes = []
for file in files:
    df = pd.read_csv(os.path.join(folder, file))
    # splitext drops only the trailing extension; str.replace would also remove
    # a ".csv" that happens to occur in the middle of a file name.
    df["Genre_Source"] = os.path.splitext(file)[0]
    dataframes.append(df)

# Stack all per-file frames into one table with a fresh 0..n-1 index.
merged_df = pd.concat(dataframes, ignore_index=True)
merged_df.to_csv("merged_imdb_data.csv", index=False, encoding="utf-8")

print("Merging complete! Total movies:", len(merged_df))
print(merged_df.head())


Merging complete! Total movies: 27144
                                               Title   Genre  Rating    Votes  \
0                                  1. Dune: Part Two  Action     8.5   (609K)   
1                           2. Venom: The Last Dance  Action     6.0   (104K)   
2                                    3. Gladiator II  Action     6.6   (213K)   
3                            4. Sonic the Hedgehog 3  Action     6.9    (51K)   
4  5. The Lord of the Rings: The War of the Rohirrim  Action     6.3    (27K)   

  Duration Genre_Source  
0   2h 46m       Action  
1   1h 50m       Action  
2   2h 28m       Action  
3   1h 50m       Action  
4   2h 14m       Action  


In [None]:
import pandas as pd

# Load the merged data without its "Genre_Source" column and store the result
# as the working file used by the following cleaning steps.
df = pd.read_csv("merged_imdb_data.csv").drop(columns=["Genre_Source"])
df.to_csv("merged_movies_cleaned.csv", index=False)

print("'Genre_Source' column removed successfully!")


✅ 'Genre_Source' column removed successfully!


In [None]:
import pandas as pd

# New header for each existing column. "Genre" and "Duration" map to
# themselves, so those two headers stay as they are.
column_names = {
    "Title": "Movie Name",
    "Genre": "Genre",
    "Rating": "Ratings",
    "Votes": "Voting Counts",
    "Duration": "Duration",
}

# Apply the new headers and overwrite the working file.
df = pd.read_csv("merged_movies_cleaned.csv").rename(columns=column_names)
df.to_csv("merged_movies_cleaned.csv", index=False)

print(" Column names updated successfully!")


✅ Column names updated successfully!


In [None]:
import pandas as pd
import re

# Function to convert duration format
def convert_duration(duration):
    """Convert a duration such as "2h 46m" into a total number of minutes.

    Parameters
    ----------
    duration : str
        Text starting with an optional hours part ("<digits>h") and/or an
        optional minutes part ("<digits>m"), e.g. "2h 46m", "2h" or "45m".
        Surrounding whitespace is ignored.

    Returns
    -------
    int or str
        ``hours * 60 + minutes`` when at least one of the two parts is
        present; the string "N/A" when neither part is found or when
        ``duration`` is not a string.
    """
    if not isinstance(duration, str):
        return "N/A"

    match = re.match(r'(?:(\d+)h)?\s*(?:(\d+)m)?', duration.strip())
    # Both parts of the pattern are optional, so the pattern matches ANY text
    # (with an empty match). A real duration needs at least one captured part.
    if match is None or (match.group(1) is None and match.group(2) is None):
        return "N/A"

    hours = int(match.group(1) or 0)
    minutes = int(match.group(2) or 0)
    return hours * 60 + minutes

# Read the working file, convert the "Duration" text to minutes and store the
# converted values under the header "Duration (min)".
# astype(str) makes sure convert_duration always receives text.
df = (
    pd.read_csv("merged_movies_cleaned.csv")
    .assign(Duration=lambda frame: frame["Duration"].astype(str).map(convert_duration))
    .rename(columns={"Duration": "Duration (min)"})
)

# Overwrite the working file with the converted data.
df.to_csv("merged_movies_cleaned.csv", index=False)

print(" Duration converted successfully!")


✅ Duration converted successfully!


In [None]:
import pandas as pd

# Column whose zero values are swapped for the "N/A" placeholder.
duration_col = "Duration (min)"

# Read the working file, replace every 0 in the duration column and write the
# file back to the same path.
df = pd.read_csv("merged_movies_cleaned.csv")
df[duration_col] = df[duration_col].replace({0: "N/A"})
df.to_csv("merged_movies_cleaned.csv", index=False)

print(" Replaced 0 with N/A in Duration (min) column!")


✅ Replaced 0 with N/A in Duration (min) column!


In [8]:
import pandas as pd
import re

# Load the working file; this cell later writes its result back to the same path.
df = pd.read_csv("merged_movies_cleaned.csv")

def convert_votes(vote):
    """Convert a scraped vote count into an integer.

    Parameters
    ----------
    vote : str, int, float or None
        Either text such as "(609K)", "1.2M" or "1,234" (case-insensitive,
        optional parentheses and thousands separators), or a value that is
        already numeric.

    Returns
    -------
    int or str
        The vote count as an integer, with a trailing "k" scaled by 1,000 and
        a trailing "m" by 1,000,000. The string "N/A" is returned for
        missing values (None, NaN, "", "N/A") and for anything that cannot be
        read as a number.
    """
    multipliers = {"k": 1_000, "m": 1_000_000}

    if isinstance(vote, str):
        # Drop thousands separators and the parentheses around the count.
        number = re.sub(r"[()]", "", vote.lower().replace(",", "")).strip()
        multiplier = 1
        if number and number[-1] in multipliers:
            multiplier = multipliers[number[-1]]
            number = number[:-1]
    else:
        # Already-numeric values must pass through unchanged instead of
        # failing on a str-only method such as .strip().
        number, multiplier = vote, 1

    try:
        # round() rather than plain int(): the product of a float and the
        # multiplier can land just below the intended whole number
        # (e.g. 1.001 * 1000), and truncation would lose a vote.
        return int(round(float(number) * multiplier))
    except (TypeError, ValueError, OverflowError):
        # None, NaN, infinity, empty text, "n/a" and other non-numeric text.
        return "N/A"

# Run every vote count through convert_votes, then overwrite the working file.
votes_converted = df["Voting Counts"].map(convert_votes)
df["Voting Counts"] = votes_converted
df.to_csv("merged_movies_cleaned.csv", index=False)

print("✅ Voting Counts cleaned successfully!")



✅ Voting Counts cleaned successfully!


In [None]:
import pandas as pd  
 
df = pd.read_csv("merged_movies_cleaned.csv")


def _duration_as_int(x):
    """Return "N/A" for the placeholder or a missing value, else int(float(x))."""
    if x == "N/A" or not pd.notna(x):
        return "N/A"
    return int(float(x))


# Only touch the column when the file actually has it.
if "Duration (min)" in df.columns:
    df["Duration (min)"] = df["Duration (min)"].apply(_duration_as_int)

df.to_csv("merged_movies_cleaned.csv", index=False)

print("✅ Duration (min) column cleaned successfully!")



✅ Duration (min) column cleaned successfully!


In [None]:
import pandas as pd  

df = pd.read_csv("merged_movies_cleaned.csv")

# Blank text ("" or a single space) and missing values in "Voting Counts" all
# become the "N/A" placeholder; the column is skipped when it does not exist.
votes_col = "Voting Counts"
if votes_col in df.columns:
    without_blanks = df[votes_col].replace(["", " "], "N/A")
    df[votes_col] = without_blanks.fillna("N/A")

df.to_csv("merged_movies_cleaned.csv", index=False)

print("Replaced empty values in Voting Counts with N/A successfully!")


✅ Replaced empty values in Voting Counts with N/A successfully!


In [None]:
import pandas as pd  
 
# Read every column as text, then put "N/A" wherever a value is missing or is
# an empty string, across the whole table.
df = (
    pd.read_csv("merged_movies_cleaned.csv", dtype=str)
    .fillna("N/A")
    .replace("", "N/A")
)

df.to_csv("merged_movies_cleaned.csv", index=False)

print(" Replaced all empty values with N/A successfully!")


✅ Replaced all empty values with N/A successfully!


In [None]:
import pandas as pd  
  
# Read every column as text (dtype=str); the numeric columns are converted below.
df = pd.read_csv("merged_movies_cleaned.csv", dtype=str)  
 
def convert_to_int(value):
    """Convert a numeric value or numeric text to an int, keeping "N/A" as is.

    Parameters
    ----------
    value : object
        Typically text such as "94" or "142.0", or the placeholder "N/A".

    Returns
    -------
    int or str
        ``int(float(value))`` (fractional part truncated), or the string
        "N/A" when ``value`` is the placeholder or cannot be converted.
    """
    if value == "N/A":
        return value
    try:
        return int(float(value))
    except (TypeError, ValueError, OverflowError):
        # ValueError: non-numeric text or NaN; TypeError: None and other
        # non-numeric objects; OverflowError: infinity. None of them should
        # abort the conversion of the whole column.
        return "N/A"
 
# Convert both numeric columns in turn; a column missing from the file is skipped.
for column in ("Duration (min)", "Voting Counts"):
    if column in df.columns:
        df[column] = df[column].apply(convert_to_int)

df.to_csv("merged_movies_cleaned.csv", index=False)

print("✅ Converted 'Duration (min)' and 'Voting Counts' to integers (except N/A) successfully!")


✅ Converted 'Duration (min)' and 'Voting Counts' to integers (except N/A) successfully!


In [None]:
import pandas as pd  

# Read every column as text so ratings are not re-parsed as floats.
df = pd.read_csv("merged_movies_cleaned.csv", dtype=str)

if "Ratings" in df.columns:
    # With its default settings read_csv turns empty cells into NaN, not "",
    # so replacing "" alone never matches anything. fillna covers the cells
    # that are actually empty; the replace is kept for literal empty strings.
    df["Ratings"] = df["Ratings"].replace("", "N/A").fillna("N/A")

df.to_csv("merged_movies_cleaned.csv", index=False)

print(" Replaced empty values in 'Ratings' with 'N/A' successfully!")


✅ Replaced empty values in 'Ratings' with 'N/A' successfully!


In [None]:
import pandas as pd  
import re  
 
# Read every column as text (dtype=str) before cleaning the movie names.
df = pd.read_csv("merged_movies_cleaned.csv", dtype=str)  
 
def clean_movie_name(name):
    """Strip a leading "<digits>." prefix and surrounding whitespace from a title.

    Missing values (as reported by pd.notna) are returned unchanged.
    """
    if pd.isna(name):
        return name
    without_prefix = re.sub(r"^\d+\.\s*", "", name)
    return without_prefix.strip()
 
# Clean every title in place; skipped when the file has no "Movie Name" column.
title_col = "Movie Name"
if title_col in df.columns:
    df[title_col] = df[title_col].map(clean_movie_name)

df.to_csv("merged_movies_cleaned.csv", index=False)

print("Removed leading numbers and dots from 'Movie Name' successfully!")



✅ Removed leading numbers and dots from 'Movie Name' successfully!


In [None]:
import pandas as pd

# NOTE(review): no cell visible in this notebook writes this file — confirm
# where it comes from before relying on this check.
csv_file = "imdb_data_cleaned_2024.csv"

# Best-effort load: any failure is reported as a message instead of a traceback.
try:
    df = pd.read_csv(csv_file)
    print("CSV loaded successfully!")
    preview = df.head()
    print(preview)
except Exception as err:
    print(f"Error reading CSV: {err}")

✅ CSV loaded successfully!
                            Movie Name                      Genre  Ratings  \
0                                1 Fan                     Comedy      6.7   
1                                  AAY                     Comedy      6.4   
2       AMFAD: All My Friends Are Dead  Mystery, Horror, Thriller      4.5   
3  Comicsgate - How to Kill a Movement                Documentary      6.7   
4                            FamilyMan                      Drama      6.7   

   Voting Counts  Duration  
0              0        94  
1            785       142  
2           1200        91  
3              0        94  
4              0        94  
