In [None]:
import sys
from pathlib import Path
import pandas as pd
import os

# Make the project's src/ folder importable from this notebook.
project_root = Path("..").resolve()
src_path = project_root / "src"
src_entry = str(src_path)
# Append only once so re-running the cell does not duplicate the entry.
if src_entry not in sys.path:
    sys.path.append(src_entry)

from imports import *

In [None]:
# Location of the processed dataset (a relative path, resolved against the
# current working directory).
dataset_path = "../data/processed/imdb_clean.csv"
# Read the CSV into the DataFrame used by the following cells.
df = pd.read_csv(dataset_path)
# Show the first rows as the cell output.
df.head()


In [12]:
# Column groups, one per preprocessing strategy.
numeric_features = ['Runtime_Min', 'Meta_score', 'Gross_USD', 'No_of_Votes']
categorical_features = ['Certificate']

# Genre indicator columns are selected by their 'Genre_' name prefix.
# NOTE(review): if the processed CSV has no columns with this prefix, this list
# is empty and no genre information reaches the model -- confirm against
# df.columns.
binary_features = [column for column in df.columns if column.startswith('Genre_')]

# Free-text column, kept as a single column name rather than a list.
text_features = 'Overview'

# Regression target.
y = df['IMDB_Rating']

# Hold out 20% of the rows for evaluation; the seed keeps the split repeatable.
feature_columns = [
    *numeric_features,
    *categorical_features,
    *binary_features,
    text_features,
]
X_train, X_test, y_train, y_test = train_test_split(
    df[feature_columns],
    y,
    test_size=0.2,
    random_state=42,
)

In [13]:
# Numeric columns: fill gaps with the median, then standardize.
numeric_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

# Categorical columns: fill gaps with the most frequent value, then one-hot
# encode; handle_unknown='ignore' keeps unseen categories from raising.
categorical_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

# Text column: TF-IDF limited to 5000 terms, English stop words removed.
overview_vectorizer = TfidfVectorizer(max_features=5000, stop_words='english')

# Route each column group to its transformer; genre flags pass through as-is.
preprocessor = ColumnTransformer(transformers=[
    ('num', numeric_pipeline, numeric_features),
    ('cat', categorical_pipeline, categorical_features),
    ('bin', 'passthrough', binary_features),
    ('tfidf', overview_vectorizer, text_features),
])

# Final pipeline: preprocessing followed by the regressor.
model = Pipeline(steps=[
    ('preproc', preprocessor),
    ('regressor', GradientBoostingRegressor(random_state=42)),
])


In [14]:
# Fit the full pipeline (preprocessing + regressor) on the training split.
model.fit(X_train, y_train)

# Evaluate on the held-out split.
y_pred = model.predict(X_test)

# RMSE is computed as the square root of the MSE instead of passing
# `squared=False`: that keyword is deprecated in scikit-learn 1.4 and removed
# in 1.6, while this form works on every version.
rmse = mean_squared_error(y_test, y_pred) ** 0.5
print("RMSE:", rmse)
print("R2:", r2_score(y_test, y_pred))


RMSE: 0.20774026639392196
R2: 0.34257482357233504




In [22]:
import os
import joblib

# Relative path that goes one folder up from the working directory.
model_dir = '../models'
# Build the output path once so the dump and the message cannot disagree.
model_path = os.path.join(model_dir, 'imdb_rating_model.pkl')

# exist_ok=True creates the folder when it is missing and is a no-op when it
# already exists, without a separate check-then-create step.
os.makedirs(model_dir, exist_ok=True)

# Persist the fitted pipeline.
joblib.dump(model, model_path)
print("Modelo salvo em:", model_path)

Modelo salvo em: ../models\imdb_rating_model.pkl


In [23]:
def create_genre_dict():
    """Return a new dict mapping every known genre name to 0.

    The keys keep the order in which the genres are listed below. A fresh
    dict is built on each call, so callers may modify the result freely.
    """
    genre_names = (
        'Drama', 'Comedy', 'Crime', 'Adventure', 'Action', 'Thriller',
        'Romance', 'Biography', 'Mystery', 'Animation', 'Sci-Fi', 'Fantasy',
        'Family', 'History', 'War', 'Music', 'Horror', 'Western',
        'Film-Noir', 'Sport',
    )
    return dict.fromkeys(genre_names, 0)


genre_dict = create_genre_dict()


In [24]:
def predict_imdb_rating(movie_dict, model, genre_cols):
    """Predict the rating of a single movie described by a dict.

    A one-row DataFrame is built with the columns 'Runtime_Min', 'Meta_score',
    'Gross_USD', 'No_of_Votes', 'Certificate', 'Overview' followed by
    ``genre_cols``, and passed to ``model.predict``.

    Parameters
    ----------
    movie_dict : dict
        Movie description. The keys read are 'Runtime_Min', 'Meta_score',
        'Gross_USD', 'No_of_Votes', 'Certificate', 'Overview' and 'Genre';
        any other key is ignored. 'Genre' may be a list of genre names or a
        single string, optionally comma-separated (e.g. 'Crime, Drama').
    model : object
        Any object with a ``predict(frame)`` method returning a sequence whose
        first element is the prediction.
    genre_cols : list of str
        Names of the genre indicator columns to create. Each name may be the
        plain genre ('Drama') or carry the 'Genre_' prefix ('Genre_Drama');
        a genre from ``movie_dict`` is matched against either form, ignoring
        case and surrounding whitespace.

    Returns
    -------
    float
        The first prediction, rounded to 2 decimals.

    Raises
    ------
    ValueError
        If a numeric field holds a value that cannot be parsed as a number.
    """
    numeric_cols = ['Runtime_Min', 'Meta_score', 'Gross_USD', 'No_of_Votes']
    genre_cols = list(genre_cols)
    genre_prefix = 'Genre_'

    # Map a normalized genre name to the column that represents it, so that
    # both 'Drama' and 'Genre_Drama' column names are recognised.
    column_for_genre = {}
    for col in genre_cols:
        bare_name = col[len(genre_prefix):] if col.startswith(genre_prefix) else col
        column_for_genre[bare_name.strip().casefold()] = col

    # Accept a list, a single name, or a comma-separated string of genres.
    genres = movie_dict.get('Genre') or []
    if isinstance(genres, str):
        genres = genres.split(',')

    # Represent a missing certificate as NaN, like the numeric fields end up
    # after conversion below.
    certificate = movie_dict.get('Certificate')
    if certificate is None:
        certificate = float('nan')

    row = {col: movie_dict.get(col) for col in numeric_cols}
    row['Certificate'] = certificate
    # An explicit None overview is replaced by an empty string.
    row['Overview'] = movie_dict.get('Overview') or ''

    # All genre flags start at 0; the movie's own genres are switched to 1.
    row.update(dict.fromkeys(genre_cols, 0))
    for genre in genres:
        col = column_for_genre.get(str(genre).strip().casefold())
        if col is not None:
            row[col] = 1

    features = pd.DataFrame(
        [row],
        columns=numeric_cols + ['Certificate', 'Overview'] + genre_cols,
    )
    # Give numeric columns a numeric dtype (None becomes NaN) instead of the
    # object dtype a cell-by-cell fill would leave behind.
    for col in numeric_cols:
        features[col] = pd.to_numeric(features[col])

    predicted_rating = model.predict(features)[0]
    return round(float(predicted_rating), 2)


In [25]:
# Example input. Only 'Runtime_Min', 'Meta_score', 'Gross_USD', 'No_of_Votes',
# 'Certificate', 'Overview' and 'Genre' are read by predict_imdb_rating; the
# remaining keys are kept for reference.
new_movie = {
    'Series_Title': 'The Shawshank Redemption',
    'Released_Year': 1994,
    'Certificate': 'A',
    'Runtime_Min': 142,
    'Genre': 'Drama',
    'Overview': 'Two imprisoned men bond over a number of years, finding solace and eventual redemption through acts of common decency.',
    'Meta_score': 80.0,
    'Director': 'Frank Darabont',
    'Star1': 'Tim Robbins',
    'Star2': 'Morgan Freeman',
    'Star3': 'Bob Gunton',
    'Star4': 'William Sadler',
    'No_of_Votes': 2343110,
    'Gross_USD': 28341469
}

# The pipeline selects its genre columns by the names in `binary_features`, so
# those names are used when there are any; otherwise fall back to the plain
# genre names from `genre_dict`.
genre_columns = binary_features if binary_features else list(genre_dict.keys())

predicted_rating = predict_imdb_rating(new_movie, model, genre_columns)
print("Predicted IMDB Rating:", predicted_rating)


Predicted IMDB Rating: 8.61


The summary below explains the modelling approach used in this notebook and answers the questions posed in the assignment.

---

**IMDB Rating Prediction Explanation**

To predict the IMDB rating of a movie, I treated the task as a **regression problem** because the target variable (`IMDB_Rating`) is continuous. The goal is to estimate the numeric rating based on various features describing the movie.

**Variables and Transformations**

* **Numeric features:** `Runtime_Min`, `Meta_score`, `Gross_USD`, `No_of_Votes`. Missing values were imputed with the column median, and the features were then standardized with `StandardScaler`.
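* **Categorical features:** `Certificate`. Missing values were filled with the most frequent category, and one-hot encoding was then applied to convert the categories into binary columns suitable for the model; categories not seen during training are ignored at prediction time.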
* **Text feature:** `Overview`. The text was transformed using TF-IDF vectorization (at most 5,000 terms, with English stop words removed) to capture important keywords that might correlate with the movie's rating.
* **Genre:** Already one-hot encoded as multiple binary columns representing each possible genre. This allows the model to account for genre-specific patterns in IMDB ratings.

**Model Choice**

I selected a **Gradient Boosting Regressor** because it handles heterogeneous feature types well, captures nonlinear relationships, and, being tree-based, is largely insensitive to the scale of the input features and to outliers in them.

* **Pros:** Strong predictive power, can model complex interactions, works well with mixed numeric and categorical inputs after preprocessing.
* **Cons:** Computationally more intensive than linear models, harder to interpret, sensitive to hyperparameters if not tuned.

**Performance Metrics**

* **Root Mean Squared Error (RMSE):** 0.208 – the typical size of the prediction error, in rating points, measured on the held-out 20% test split.
* **R² Score:** 0.343 – indicates that approximately 34% of the variance in IMDB ratings is explained by the model.

RMSE was chosen because it penalizes large errors more heavily and is intuitive in the same scale as the ratings. R² complements RMSE by indicating how much variance the model can explain.

**Prediction Example**

For the movie *The Shawshank Redemption* with the following characteristics:

```python
{
 'Series_Title': 'The Shawshank Redemption',
 'Released_Year': 1994,
 'Certificate': 'A',
 'Runtime_Min': 142,
 'Genre': 'Drama',
 'Overview': 'Two imprisoned men bond over a number of years, finding solace and eventual redemption through acts of common decency.',
 'Meta_score': 80.0,
 'Director': 'Frank Darabont',
 'Star1': 'Tim Robbins',
 'Star2': 'Morgan Freeman',
 'No_of_Votes': 2343110,
 'Gross_USD': 28341469
}
```

The model predicts an **IMDB rating of 8.61**, which is consistent with the known high rating of this classic film.

---

Note: a shorter version of this summary may be more suitable for inclusion in the final report.
