## 1) Setup & Imports

In [None]:
# Cell 1
from __future__ import annotations

from pathlib import Path
import ast

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [None]:
# Cell 2
# Single seed reused by every randomised step in the notebook.
RANDOM_STATE = 42
np.random.seed(RANDOM_STATE)

# Display / plotting defaults for the rest of the notebook.
pd.options.display.max_columns = 50
sns.set_theme(style='whitegrid')

print('Ready.')

## 2) Load Dataset (robust path for VS Code/Windows)

In [None]:
# Cell 3
# The dataset is looked up relative to the directory the notebook runs from.
PROJECT_DIR = Path.cwd().resolve()
CSV_NAME = 'Top_10000_Movies.csv'

candidate = PROJECT_DIR / CSV_NAME
if not candidate.exists():
    # Not next to the notebook: take the first match found anywhere below the
    # project directory, or keep the default path when nothing matches.
    candidate = next(PROJECT_DIR.rglob(CSV_NAME), candidate)

print('Project directory:', PROJECT_DIR)
print('CSV path:', candidate)
print('CSV exists:', candidate.exists())
DATA_PATH = candidate

In [None]:
# Cell 4
# Read the CSV located in the previous cell.
df = pd.read_csv(DATA_PATH)

# Index-like columns left over from an earlier export carry no information;
# drop whichever of them are present.
index_like = [name for name in ('Unnamed: 0', '') if name in df.columns]
if index_like:
    df = df.drop(columns=index_like)

print('Shape:', df.shape)
df.head()

## 3) Quick EDA (academic, minimal)

In [None]:
# Cell 5
# Print the column names, dtypes and non-null counts of the loaded frame.
df.info()

In [None]:
# Cell 6
# Share of missing values per column, as a percentage, largest first.
missing_pct = df.isna().mean().mul(100).sort_values(ascending=False)
missing_pct.head(15)

In [None]:
# Cell 7
# Summary statistics for every column (one row per column), first 25 rows shown.
df.describe(include='all').transpose().head(25)

## 4) Basic Cleaning & Feature Prep

In [None]:
# Cell 8
# Remove exact duplicate rows; .copy() gives an independent frame so the
# column assignments below do not write into a view of the original.
df = df.drop_duplicates().copy()

# Parse release_date (keep year as a numeric feature).
# Guarded like the other optional columns in this notebook: when the column is
# absent, release_year is simply not created and is later filtered out of the
# feature list instead of raising a KeyError here.
if 'release_date' in df.columns:
    # errors='coerce' turns unparseable dates into NaT rather than raising.
    df['release_date'] = pd.to_datetime(df['release_date'], errors='coerce')
    df['release_year'] = df['release_date'].dt.year

# Convert genre list-string into a simple pipe-separated string (e.g., Action|Drama)
def normalize_genre(value: object) -> str:
    """Return a pipe-separated genre string for one raw ``genre`` cell.

    Args:
        value: The raw cell. Handled cases:
            * a string holding a Python list/tuple literal, e.g.
              ``"['Action', 'Drama']"``;
            * any other string, which is returned stripped;
            * an already-parsed ``list``/``tuple`` of genre names;
            * a missing value (``None``, ``NaN``, ...);
            * anything else, which is returned as ``str(value)``.

    Returns:
        Genre names joined with ``'|'``; ``'Unknown'`` for missing values,
        blank strings and sequences with no non-blank entries.
    """
    if isinstance(value, str):
        text = value.strip()
        if not text:
            return 'Unknown'
        try:
            # Only the parse itself is guarded; literal_eval evaluates
            # literals only, never arbitrary expressions.
            parsed = ast.literal_eval(text)
        except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
            # Not a Python literal (e.g. plain "Drama"): keep the text as-is.
            return text
        if not isinstance(parsed, (list, tuple)):
            # A literal that is not a sequence (e.g. "123"): keep the text.
            return text
        value = parsed

    # Sequences are handled before pd.isna(): pd.isna() on a list returns an
    # array, whose truth value is ambiguous and would raise.
    if isinstance(value, (list, tuple)):
        names = [str(item).strip() for item in value]
        return '|'.join(name for name in names if name) or 'Unknown'

    if pd.isna(value):
        return 'Unknown'
    return str(value)

# genre is optional: genre_clean is only created when the raw column exists.
if 'genre' in df.columns:
    df['genre_clean'] = df['genre'].apply(normalize_genre)

# Preview only the columns that actually exist, so the cell does not raise a
# KeyError when genre (and therefore genre_clean) is absent.
preview_cols = ['original_title', 'vote_average', 'vote_count', 'genre', 'genre_clean']
df[[c for c in preview_cols if c in df.columns]].head()

## 5) Define Target (High-Rated Classification)
We create a binary label: `high_rated = 1` if `vote_average` is greater than or equal to the dataset median, and `high_rated = 0` otherwise.

In [None]:
# Cell 9
if 'vote_average' not in df.columns:
    raise ValueError('Expected column vote_average not found in dataset.')

# A row without a rating cannot be labelled: NaN >= median evaluates to False,
# so such rows would silently become high_rated = 0. Drop them instead.
missing_rating = df['vote_average'].isna()
if missing_rating.any():
    print('Dropping rows with missing vote_average:', int(missing_rating.sum()))
    df = df.loc[~missing_rating].copy()

# Threshold for the binary label: ratings at or above the median count as high.
median_rating = df['vote_average'].median()
df['high_rated'] = (df['vote_average'] >= median_rating).astype(int)

print('Median vote_average =', median_rating)
# Class balance of the new label, as fractions of all rows.
df['high_rated'].value_counts(normalize=True).rename('proportion')

## 6) Train/Test Split + Preprocessing

In [None]:
# Cell 10
target_col = 'high_rated'

# Minimal feature set (simple + robust); names missing from the frame are
# skipped rather than raising.
feature_cols = [
    name
    for name in (
        'original_language',
        'popularity',
        'vote_count',
        'revenue',
        'runtime',
        'release_year',
        'genre_clean',
    )
    if name in df.columns
]
print('Using features:', feature_cols)

X = df.loc[:, feature_cols].copy()
y = df[target_col].copy()

# 80/20 hold-out split, stratified on the label and seeded for repeatability.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=RANDOM_STATE,
)

print('Train shape:', X_train.shape)
print('Test shape:', X_test.shape)

In [None]:
# Cell 11
# Route each feature by dtype: numeric columns one way, everything else the other.
numeric_features = list(X.select_dtypes(include=[np.number]).columns)
categorical_features = list(X.select_dtypes(exclude=[np.number]).columns)

print('Numeric:', numeric_features)
print('Categorical:', categorical_features)

# OneHotEncoder API differs slightly by sklearn version: try the
# `sparse_output` keyword first and fall back to `sparse` on TypeError.
try:
    ohe = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
except TypeError:
    ohe = OneHotEncoder(handle_unknown='ignore', sparse=False)

# Numeric columns: fill gaps with the column median.
numeric_transformer = Pipeline([('imputer', SimpleImputer(strategy='median'))])

# Categorical columns: fill gaps with the most frequent value, then one-hot encode.
categorical_transformer = Pipeline(
    [
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('onehot', ohe),
    ]
)

# Columns not listed in either group are dropped.
preprocessor = ColumnTransformer(
    [
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features),
    ],
    remainder='drop',
)
preprocessor

## 7) Model (Only One): RandomForestClassifier

In [None]:
# Cell 12
# The single estimator used in this notebook.
model = RandomForestClassifier(
    n_estimators=300,
    class_weight='balanced_subsample',
    n_jobs=-1,
    random_state=RANDOM_STATE,
)

# Preprocessing and the model are chained so both are fitted together.
clf = Pipeline([('preprocess', preprocessor), ('model', model)])

clf

In [None]:
# Cell 13
# Fit the whole pipeline (preprocessing + model) on the training split only.
clf.fit(X_train, y_train)
print('Model trained.')

## 8) Evaluation

In [None]:
# Cell 14
# Predict on the held-out split and report accuracy rounded to 4 decimals.
y_pred = clf.predict(X_test)
acc = accuracy_score(y_true=y_test, y_pred=y_pred)
print('Accuracy:', round(acc, 4))

In [None]:
# Cell 15
# Text report comparing y_test with y_pred, printed with 4 decimal places.
print(classification_report(y_test, y_pred, digits=4))

In [None]:
# Cell 16
# Confusion matrix of the test-set predictions.
cm = confusion_matrix(y_test, y_pred)

# Draw it as an annotated heatmap on an explicit Axes.
fig, ax = plt.subplots(figsize=(5, 4))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', ax=ax)
ax.set_title('Confusion Matrix')
ax.set_xlabel('Predicted')
ax.set_ylabel('True')
fig.tight_layout()
plt.show()

## 9) Feature Importance (Top 15)

In [None]:
# Cell 17
# Pull the fitted preprocessing step and model back out of the pipeline.
pre = clf.named_steps['preprocess']
rf = clf.named_steps['model']
importances = rf.feature_importances_

# Use the preprocessor's output names when available; otherwise fall back to
# positional placeholders, one per importance value.
try:
    feature_names = pre.get_feature_names_out()
except Exception:
    feature_names = np.array([f'feature_{idx}' for idx in range(len(importances))])

# Keep the 15 largest importances, highest first.
fi = (
    pd.DataFrame({'feature': feature_names, 'importance': importances})
    .sort_values('importance', ascending=False)
    .head(15)
)
fi

In [None]:
# Cell 18
# Horizontal bar chart of the importances selected in the previous cell.
fig, ax = plt.subplots(figsize=(10, 5))
sns.barplot(data=fi, x='importance', y='feature', ax=ax)
ax.set_title('Top 15 Feature Importances (RandomForest)')
fig.tight_layout()
plt.show()