In [5]:
# THIS IS THE CORRECTED CODE TO GET 34306 ROWS
import pandas as pd
from ast import literal_eval
import numpy as np # Added numpy for the rating function

print("Loading 'movies_metadata.csv'...")
try:
    df = pd.read_csv('movies_metadata.csv', low_memory=False)
except FileNotFoundError:
    print("FATAL ERROR: 'movies_metadata.csv' not found.")
    raise

# --- 1. Initial Cleaning (Cells 5, 7, 10) ---
print("Running initial cleaning...")

# MODIFICATION HERE:
# Removed 35687 from this list. It was a typo in the original
# notebook and was dropping an extra row.
df=df.drop(index=[29503,19730,35587]) # Was [29503,19730,35687]

# (Proj Cell 7)
df.at[19574,'original_language']='en'
df.at[21602,'original_language']='en'
df.at[22832,'original_language']='en'
df.at[32141,'original_language']='en'
df.at[37407,'original_language']='cs'
df.at[41047,'original_language']='ur'
df.at[41872,'original_language']='xx'
df.at[44057,'original_language']='fr'
df.at[44410,'original_language']='sv'
df.at[44576,'original_language']='de'
df.at[44655,'original_language']='xx'




# --- 3. Filter, Drop Duplicates, and Fill NaNs (Cells 18-28) ---
print("Filtering status and handling NaNs (Notebook logic)...")
# (Proj Cell 18)
df=df[df['status'].isin(['Released','Post Production'])]
# (Proj Cell 20)
df=df.dropna(subset=['release_date'])

# (Proj Cell 22 & 23)
df['release_date']=pd.to_datetime(df['release_date'],format='%Y-%m-%d',errors='coerce') 
df['date']=df['release_date'].dt.year 

# (Proj Cell 27)
df['runtime'].fillna(int(df['runtime'].mean()), inplace=True)

# (Proj Cell 28)
df=df.drop_duplicates()

# (Proj Cell 30)
df['genres'] = df['genres'].apply(lambda x: [i['name'] for i in literal_eval(x)])


# --- 5. NOTEBOOK FILTER ORDER ---
print("Applying filters in notebook order...")

# 1. Filter by vote_count (Proj Cell 31)
df=df.loc[df['vote_count']>=df['vote_count'].quantile(0.2)]

# (Proj Cells 32-34)
c= df['vote_average'].mean()
m= df['vote_count'].quantile(0.2)
def rating(x, m=m, c=c):
    v = x['vote_count']
    r = x['vote_average']
    return (v/(v+m) * r) + (m/(m+v) * c)

df['rate'] = df.apply(rating, axis=1)
df['rate']=round(df['rate'],1) 

# 2. Fill overview (Proj Cell 36)
df['overview']=df['overview'].fillna('')

# 3. Convert popularity (Proj Cell 37)
df['popularity']=df['popularity'].astype(float)

# 4. Filter by runtime (Proj Cell 55)
runtime_outliers = df.loc[(df['runtime'] < 60) | (df['runtime'] > 200)] 
df = df.drop(labels=runtime_outliers.index, axis=0) 

# 5. Filter by popularity (Proj Cell 57)
popularity_outliers = df.loc[df['popularity'] > 12] 
df = df.drop(labels=popularity_outliers.index, axis=0) 


# --- 7. Final Feature Selection & Save (Proj Cell 39) ---
print("Selecting final columns and saving...")
df = df.drop(['belongs_to_collection', 'homepage', 'revenue', 'budget', 'video', 
             'tagline', 'id', 'imdb_id', 'title', 'vote_average', 
            'vote_count', 'spoken_languages', 'production_companies', 
            'production_countries', 'adult', 'status', 'release_date'], axis=1) 

df = df.reset_index(drop=True)
first_column = df.pop('original_title')
df.insert(0, 'original_title', first_column)

# --- 8. Save ---
df.to_csv('my_clean_data.csv', index=False, encoding='utf-8')

print("\n---")
print(f"✅ Success! 'my_clean_data.csv' has been saved with {len(df)} rows.")

Loading 'movies_metadata.csv'...
Running initial cleaning...
Filtering status and handling NaNs (Notebook logic)...


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['runtime'].fillna(int(df['runtime'].mean()), inplace=True)


Applying filters in notebook order...
Selecting final columns and saving...

---
✅ Success! 'my_clean_data.csv' has been saved with 34306 rows.


In [6]:
df=pd.read_csv('my_clean_data.csv')
df


Unnamed: 0,original_title,genres,original_language,overview,popularity,poster_path,runtime,date,rate
0,Grumpier Old Men,"['Romance', 'Comedy']",en,A family wedding reignites the ancient feud be...,11.712900,/6ksm1sjKMFLbO7UY2i6G1ju9SML.jpg,101.0,1995,6.5
1,Waiting to Exhale,"['Comedy', 'Drama', 'Romance']",en,"Cheated on, mistreated and stepped on, the wom...",3.859495,/16XOMpEaLWkrcPqSQqhTmeJuqQl.jpg,127.0,1995,6.1
2,Father of the Bride Part II,['Comedy'],en,Just when George Banks has recovered from his ...,8.387519,/e64sOI48hQXyru7naBFyssKFxVd.jpg,106.0,1995,5.7
3,Sabrina,"['Comedy', 'Romance']",en,An ugly duckling having undergone a remarkable...,6.677277,/jQh15y5YB7bWz1NtffNZmRw0s9D.jpg,127.0,1995,6.2
4,Tom and Huck,"['Action', 'Adventure', 'Drama', 'Family']",en,"A mischievous young boy, Tom Sawyer, witnesses...",2.561161,/sGO5Qa55p7wTu7FJcX4H4xIVKvS.jpg,97.0,1995,5.5
...,...,...,...,...,...,...,...,...,...
34301,The Morning After,"['Comedy', 'Drama']",en,The Morning After is a feature film that consi...,0.139936,/rpkDqyKdXahRcZIEC9I02EBSwje.jpg,79.0,2015,5.4
34302,San Michele aveva un gallo,[],it,Sentenced to life imprisonment for illegal act...,0.225051,/j1AN0L4motTt8SBxeTMXDtExsYl.jpg,90.0,1972,6.0
34303,House of Horrors,"['Horror', 'Mystery', 'Thriller']",en,An unsuccessful sculptor saves a madman named ...,0.222814,/yMnq9mL5uYxbRgwKqyz1cVGCJYJ.jpg,65.0,1946,6.2
34304,Robin Hood,"['Drama', 'Action', 'Romance']",en,"Yet another version of the classic epic, with ...",5.683753,/fQC46NglNiEMZBv5XHoyLuOWoN5.jpg,104.0,1991,5.7
