# Import Libs

In [36]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MultiLabelBinarizer, StandardScaler
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import roc_auc_score, precision_score, recall_score
from sklearn.utils.class_weight import compute_class_weight

# Read Data

In [37]:
amazon_df = pd.read_csv("..\\data\\raw\\amazon\\data.csv")
apple_df = pd.read_csv("..\\data\\raw\\apple\\data.csv")
hbo_df = pd.read_csv("..\\data\\raw\\hbo\\data.csv")
netflix_df = pd.read_csv("..\\data\\raw\\netflix\\data.csv")

## Display data

In [38]:
amazon_df.head()

Unnamed: 0,title,type,genres,releaseYear,imdbId,imdbAverageRating,imdbNumVotes,availableCountries
0,Ariel,movie,"Comedy, Crime, Romance",1988.0,tt0094675,7.4,9061.0,
1,Four Rooms,movie,Comedy,1995.0,tt0113101,6.7,113746.0,
2,Judgment Night,movie,"Action, Crime, Drama",1993.0,tt0107286,6.6,19770.0,
3,Forrest Gump,movie,"Drama, Romance",1994.0,tt0109830,8.8,2361972.0,
4,Citizen Kane,movie,"Drama, Mystery",1941.0,tt0033467,8.3,478980.0,


In [39]:
apple_df.head()

Unnamed: 0,title,type,genres,releaseYear,imdbId,imdbAverageRating,imdbNumVotes,availableCountries
0,Four Rooms,movie,Comedy,1995.0,tt0113101,6.7,113746.0,
1,Forrest Gump,movie,"Drama, Romance",1994.0,tt0109830,8.8,2361972.0,
2,American Beauty,movie,Drama,1999.0,tt0169547,8.3,1243981.0,
3,Citizen Kane,movie,"Drama, Mystery",1941.0,tt0033467,8.3,478980.0,
4,Metropolis,movie,"Drama, Sci-Fi",1927.0,tt0017136,8.3,193058.0,


In [40]:
hbo_df.head()

Unnamed: 0,title,type,genres,releaseYear,imdbId,imdbAverageRating,imdbNumVotes,availableCountries
0,Jarhead,movie,"Biography, Drama, War",2005.0,tt0418763,7.0,214623.0,
1,Unforgiven,movie,"Drama, Western",1992.0,tt0105695,8.2,451718.0,
2,Eternal Sunshine of the Spotless Mind,movie,"Drama, Romance, Sci-Fi",2004.0,tt0338013,8.3,1128989.0,
3,2001: A Space Odyssey,movie,"Adventure, Sci-Fi",1968.0,tt0062622,8.3,748830.0,
4,Absolute Power,movie,"Action, Crime, Drama",1997.0,tt0118548,6.7,62446.0,


In [41]:
netflix_df.head()

Unnamed: 0,title,type,genres,releaseYear,imdbId,imdbAverageRating,imdbNumVotes,availableCountries
0,Ariel,movie,"Comedy, Crime, Romance",1988.0,tt0094675,7.4,9061.0,
1,Shadows in Paradise,movie,"Comedy, Drama, Music",1986.0,tt0092149,7.4,7862.0,
2,American Beauty,movie,Drama,1999.0,tt0169547,8.3,1243981.0,
3,The Fifth Element,movie,"Action, Adventure, Sci-Fi",1997.0,tt0119116,7.6,524186.0,
4,Kill Bill: Vol. 1,movie,"Action, Crime, Thriller",2003.0,tt0266697,8.2,1242949.0,


# Preprocess Data

In [42]:
amazon_df["platform"] = "amazon"
apple_df["platform"] = "apple"
hbo_df["platform"] = "hbo"
netflix_df["platform"] = "netflix"

In [43]:
joined_df = pd.concat([amazon_df, apple_df, hbo_df, netflix_df], ignore_index=True)

In [44]:
joined_df.head()

Unnamed: 0,title,type,genres,releaseYear,imdbId,imdbAverageRating,imdbNumVotes,availableCountries,platform
0,Ariel,movie,"Comedy, Crime, Romance",1988.0,tt0094675,7.4,9061.0,,amazon
1,Four Rooms,movie,Comedy,1995.0,tt0113101,6.7,113746.0,,amazon
2,Judgment Night,movie,"Action, Crime, Drama",1993.0,tt0107286,6.6,19770.0,,amazon
3,Forrest Gump,movie,"Drama, Romance",1994.0,tt0109830,8.8,2361972.0,,amazon
4,Citizen Kane,movie,"Drama, Mystery",1941.0,tt0033467,8.3,478980.0,,amazon


In [45]:
merged_df = joined_df.groupby("imdbId").agg({
    "title": "first", 
    "type": "first",
    "genres": "first",
    "releaseYear": "first",
    "imdbAverageRating": "first",
    "imdbNumVotes": "first",
    "availableCountries": "first",
    "platform": lambda x: list(sorted(set(x)))  
}).reset_index()

In [46]:
merged_df.head()

Unnamed: 0,imdbId,title,type,genres,releaseYear,imdbAverageRating,imdbNumVotes,availableCountries,platform
0,tt0000417,A Trip to the Moon,movie,"Adventure, Comedy, Fantasy",1902.0,8.1,58167.0,,"[amazon, apple, hbo]"
1,tt0000499,An Impossible Voyage,movie,"Action, Adventure, Family",1904.0,7.5,4235.0,,[hbo]
2,tt0002646,Atlantis,movie,Drama,1913.0,6.5,510.0,,[amazon]
3,tt0003014,Ingeborg Holm,movie,Drama,1913.0,7.0,1513.0,,[netflix]
4,tt0004181,Judith of Bethulia,movie,Drama,1914.0,6.2,1494.0,,[amazon]


In [47]:
merged_df.to_csv("..\\data\\processed\\plataform_data.csv", index= False)

# Features Transformation

## Process data

In [48]:
plataform_df = pd.read_csv("..\\data\\processed\\plataform_data.csv")

In [49]:
plataform_df["genres"] = plataform_df["genres"].fillna("").apply(lambda x: x.split(", "))

In [50]:
plataform_df["platform"] = plataform_df["platform"].fillna("None")

## Feature Engineering

### Label enconding categorical data

In [51]:
mlb = MultiLabelBinarizer()
genres_encoded = mlb.fit_transform(plataform_df["genres"])
genres_df = pd.DataFrame(genres_encoded, columns=mlb.classes_)

### Standard Scaler for numerical features

In [52]:
scaler = StandardScaler()
numerical_features = scaler.fit_transform(plataform_df[["releaseYear", "imdbAverageRating", "imdbNumVotes"]].fillna(0))
numerical_df = pd.DataFrame(numerical_features, columns=["year_scaled", "rating_scaled", "votes_scaled"])

In [53]:
features_df = pd.concat([genres_df, numerical_df], axis=1)

In [54]:
features_df.to_csv("..\\data\\processed\\features.csv", index=False)

In [55]:
features_df = pd.read_csv("..\\data\\processed\\features.csv")

In [56]:
if '' in features_df.columns:
    features_df = features_df.drop(columns=[''])
features_df.columns = [col.strip().replace(" ", "_").replace("&", "and").replace("-", "_") for col in features_df.columns]

# Training Models

## LGMB

In [57]:
favorite_ids = ["tt0126029",
"tt1396484",
"tt5311514",
"tt0209144",
"tt0120338",]
plataform_df["label"] = plataform_df["imdbId"].apply(lambda x: 1 if x in favorite_ids else 0)

In [None]:
from lightgbm import LGBMClassifier
from sklearn.model_selection import train_test_split
X = features_df
y = plataform_df["label"]

X_train, X_val, y_train, y_val = train_test_split(X, y, stratify=y, test_size=0.2, random_state=42)

from sklearn.model_selection import GridSearchCV

param_grid = {
    'learning_rate': [0.01, 0.05],
    'max_depth': [5, 7, 10],
    'num_leaves': [20, 31, 50],
    'min_child_samples': [10, 20],
}

grid = GridSearchCV(
    estimator=LGBMClassifier(class_weight='balanced', random_state=42),
    param_grid=param_grid,
    scoring='roc_auc',
    cv=3,
    verbose=1,
    n_jobs=-1
)

grid.fit(X, y)
print("Melhores parâmetros:", grid.best_params_)

Fitting 3 folds for each of 36 candidates, totalling 108 fits


ValueError: 
All the 108 fits failed.
It is very likely that your model is misconfigured.
You can try to debug the error by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
108 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\pedro\OneDrive\Área de Trabalho\faculdade\nubank_case\nubank_case\nubank\Lib\site-packages\sklearn\model_selection\_validation.py", line 866, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
    ~~~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\pedro\OneDrive\Área de Trabalho\faculdade\nubank_case\nubank_case\nubank\Lib\site-packages\lightgbm\sklearn.py", line 1560, in fit
    super().fit(
    ~~~~~~~~~~~^
        X,
        ^^
    ...<12 lines>...
        init_model=init_model,
        ^^^^^^^^^^^^^^^^^^^^^^
    )
    ^
  File "c:\Users\pedro\OneDrive\Área de Trabalho\faculdade\nubank_case\nubank_case\nubank\Lib\site-packages\lightgbm\sklearn.py", line 1049, in fit
    self._Booster = train(
                    ~~~~~^
        params=params,
        ^^^^^^^^^^^^^^
    ...<6 lines>...
        callbacks=callbacks,
        ^^^^^^^^^^^^^^^^^^^^
    )
    ^
  File "c:\Users\pedro\OneDrive\Área de Trabalho\faculdade\nubank_case\nubank_case\nubank\Lib\site-packages\lightgbm\engine.py", line 297, in train
    booster = Booster(params=params, train_set=train_set)
  File "c:\Users\pedro\OneDrive\Área de Trabalho\faculdade\nubank_case\nubank_case\nubank\Lib\site-packages\lightgbm\basic.py", line 3656, in __init__
    train_set.construct()
    ~~~~~~~~~~~~~~~~~~~^^
  File "c:\Users\pedro\OneDrive\Área de Trabalho\faculdade\nubank_case\nubank_case\nubank\Lib\site-packages\lightgbm\basic.py", line 2590, in construct
    self._lazy_init(
    ~~~~~~~~~~~~~~~^
        data=self.data,
        ^^^^^^^^^^^^^^^
    ...<9 lines>...
        position=self.position,
        ^^^^^^^^^^^^^^^^^^^^^^^
    )
    ^
  File "c:\Users\pedro\OneDrive\Área de Trabalho\faculdade\nubank_case\nubank_case\nubank\Lib\site-packages\lightgbm\basic.py", line 2227, in _lazy_init
    return self.set_feature_name(feature_name)
           ~~~~~~~~~~~~~~~~~~~~~^^^^^^^^^^^^^^
  File "c:\Users\pedro\OneDrive\Área de Trabalho\faculdade\nubank_case\nubank_case\nubank\Lib\site-packages\lightgbm\basic.py", line 3046, in set_feature_name
    _safe_call(
    ~~~~~~~~~~^
        _LIB.LGBM_DatasetSetFeatureNames(
        ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
    ...<3 lines>...
        )
        ^
    )
    ^
  File "c:\Users\pedro\OneDrive\Área de Trabalho\faculdade\nubank_case\nubank_case\nubank\Lib\site-packages\lightgbm\basic.py", line 313, in _safe_call
    raise LightGBMError(_LIB.LGBM_GetLastError().decode("utf-8"))
lightgbm.basic.LightGBMError: Do not support special JSON characters in feature name.


: 

In [22]:
model = grid.best_estimator_

In [23]:
plataform_df["prob_like"] = model.predict_proba(X)[:, 1]

In [None]:
def precision_at_k(y_true, y_scores, k):
    top_k_idx = np.argsort(y_scores)[-k:][::-1]
    top_k_true = np.array(y_true)[top_k_idx]
    return np.sum(top_k_true) / k

def recall_at_k(y_true, y_scores, k):
    top_k_idx = np.argsort(y_scores)[-k:][::-1]
    top_k_true = np.array(y_true)[top_k_idx]
    return np.sum(top_k_true) / np.sum(y_true)

y_proba = model.predict_proba(X)[:, 1]
plataform_df["prob_like"] = y_proba

auc = roc_auc_score(y, y_proba)
p_at_k = precision_at_k(y, y_proba, 30)
r_at_k = recall_at_k(y, y_proba, 30)

print(f"📈 AUC: {auc:.4f}")
print(f"📌 Precision@{30}: {p_at_k:.4f}")
print(f"📌 Recall@{30}: {r_at_k:.4f}")

recommendations = plataform_df[~plataform_df["imdbId"].isin(favorite_ids)].copy()
recommendations = recommendations.sort_values(by="prob_like", ascending=False).head(30)
result = recommendations[["title", "genres", "prob_like", "platform"]]

📈 AUC: 1.0000
📌 Precision@30: 0.1667
📌 Recall@30: 1.0000


In [25]:
recommendations = plataform_df[~plataform_df["imdbId"].isin(favorite_ids)].copy()
recommendations = recommendations.sort_values(by="prob_like", ascending=False).head(10)
final_result = recommendations[["title", "genres", "prob_like", "platform"]]
final_result

Unnamed: 0,title,genres,prob_like,platform
75619,Get Out,"[Horror, Mystery, Thriller]",0.995374,"['amazon', 'apple', 'hbo', 'netflix']"
12622,Princess Mononoke,"[Adventure, Animation, Fantasy]",0.984447,"['hbo', 'netflix']"
20486,Shaun of the Dead,"[Comedy, Horror]",0.964469,"['amazon', 'apple', 'netflix']"
15565,Snatch,"[Comedy, Crime]",0.962053,"['amazon', 'apple', 'hbo', 'netflix']"
12508,Good Will Hunting,"[Drama, Romance]",0.961488,"['amazon', 'apple', 'hbo', 'netflix']"
12401,The Big Lebowski,"[Comedy, Crime]",0.959907,"['amazon', 'apple', 'hbo']"
12822,"Lock, Stock and Two Smoking Barrels","[Comedy, Crime]",0.959907,"['amazon', 'apple']"
82380,A Quiet Place,"[Drama, Horror, Sci-Fi]",0.952862,"['amazon', 'apple', 'hbo', 'netflix']"
19941,Howl's Moving Castle,"[Adventure, Animation, Family]",0.93434,"['hbo', 'netflix']"
16635,Spirited Away,"[Adventure, Animation, Family]",0.917176,"['hbo', 'netflix']"


##  XGBoost

In [26]:
favorite_ids = ["tt0000417", "tt0000499", "tt0002646", "tt0003014", "tt0004181"]
plataform_df["label"] = plataform_df["imdbId"].apply(lambda x: 1 if x in favorite_ids else 0)

In [27]:
X = features_df
y = plataform_df["label"]

In [28]:
model = XGBClassifier(use_label_encoder=False, eval_metric='logloss', scale_pos_weight=(len(y)-sum(y))/sum(y), random_state=42)
model.fit(X, y)

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


In [29]:
plataform_df["prob_like"] = model.predict_proba(X)[:, 1]

In [30]:
recommendations = plataform_df[~plataform_df["imdbId"].isin(favorite_ids)]
recommendations = recommendations.sort_values(by="prob_like", ascending=False).head(10)

In [31]:
final_result = recommendations[["title", "genres", "prob_like", "platform"]]
final_result

Unnamed: 0,title,genres,prob_like,platform
13309,The Infernal Cauldron,"[Horror, Short]",0.208479,['hbo']
13322,Four Heads Are Better Than One,"[Comedy, Short]",0.128902,['hbo']
29156,Charlemagne,[Drama],0.036118,['amazon']
45809,Maybelline Prince,[Drama],0.036118,['amazon']
53099,Ferrari,[Drama],0.036118,['amazon']
62363,Suburbios del Alma,[Drama],0.036118,['amazon']
67982,Untitled One Tree Hill Sequel,[Drama],0.036118,"['amazon', 'hbo']"
16080,Whimsical Illusions,[Short],0.031519,['hbo']
13304,The Hilarious Posters,"[Comedy, Fantasy, Short]",0.031519,['hbo']
13231,The Witch,"[Fantasy, Short]",0.031519,['hbo']


# Random Forest

In [32]:
favorite_ids = ["tt0000417", "tt0000499", "tt0002646", "tt0003014", "tt0004181"]
plataform_df["label"] = plataform_df["imdbId"].apply(lambda x: 1 if x in favorite_ids else 0)

In [33]:
X = features_df
y = plataform_df["label"]

In [34]:
classes = np.unique(y)
class_weights = compute_class_weight(class_weight='balanced', classes=classes, y=y)
class_weight_dict = dict(zip(classes, class_weights))
model = RandomForestClassifier(class_weight=class_weight_dict, random_state=42)
model.fit(X, y)

In [35]:
plataform_df["prob_like"] = model.predict_proba(X)[:, 1]

In [36]:
recommendations = plataform_df[~plataform_df["imdbId"].isin(favorite_ids)]
recommendations = recommendations.sort_values(by="prob_like", ascending=False).head(10)

In [37]:
final_result = recommendations[["title", "genres", "prob_like", "platform"]]
final_result

Unnamed: 0,title,genres,prob_like,platform
13322,Four Heads Are Better Than One,"[Comedy, Short]",0.11,['hbo']
13309,The Infernal Cauldron,"[Horror, Short]",0.09,['hbo']
16080,Whimsical Illusions,[Short],0.07,['hbo']
13568,Salome,"[Drama, History, Short]",0.06,['amazon']
16107,The Magician and the Human Pump,"[Fantasy, Short]",0.06,['hbo']
13304,The Hilarious Posters,"[Comedy, Fantasy, Short]",0.03,['hbo']
16144,An Impossible Balancing Feat,"[Fantasy, Short]",0.02,['hbo']
13231,The Witch,"[Fantasy, Short]",0.02,['hbo']
7,The Cheat,"[Drama, Romance]",0.02,['amazon']
3967,Thunderbirds,"[Action, Adventure, Family]",0.01,['amazon']
