In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score

In [2]:
def train_metric_group(grp, test_size=0.2, random_state=42):
    """Train a random forest on one group of rows and score it on a held-out split.

    Designed to be passed to ``DataFrame.groupby(...).apply`` so that one
    model is fitted and evaluated per group.

    Parameters
    ----------
    grp : pandas.DataFrame
        Rows belonging to a single group. Must contain a ``label`` column,
        which is used as the classification target. The ``id``, ``film_id``,
        ``rate`` and ``label`` columns are removed from the feature matrix
        when present; every remaining column is used as a feature.
    test_size : float, default 0.2
        Fraction of the group's rows held out for evaluation.
    random_state : int, default 42
        Seed used for both the train/test split and the random forest, so
        that re-running the notebook reproduces the same metrics.

    Returns
    -------
    pandas.Series
        ``accuracy``, ``precision``, ``recall`` and ``f1`` computed on the
        held-out rows.

    Notes
    -----
    The split is stratified on ``label``; ``train_test_split`` is expected to
    raise if a class has too few rows in the group to be stratified.
    """
    # errors="ignore": depending on the pandas version, groupby.apply may not
    # pass the grouping column (film_id) to this function, in which case a
    # strict drop would raise KeyError.
    features = grp.drop(
        columns=["id", "film_id", "rate", "label"], errors="ignore"
    )
    X_train, X_test, y_train, y_test = train_test_split(
        features,
        grp.label,
        test_size=test_size,
        stratify=grp.label,
        random_state=random_state,
    )
    # Seed the forest as well as the split; an unseeded forest yields
    # different metrics on every run.
    model = RandomForestClassifier(n_jobs=-1, random_state=random_state)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    return pd.Series(
        {
            "accuracy": accuracy_score(y_test, y_pred),
            "precision": precision_score(y_test, y_pred),
            "recall": recall_score(y_test, y_pred),
            "f1": f1_score(y_test, y_pred),
        }
    )

In [3]:
# Load the dataset; the first CSV column is used as the DataFrame index.
df = pd.read_csv("dataset.csv", index_col=0)

In [4]:
# Number of rows per film_id, sorted from most to least frequent.
films_num_rates = df["film_id"].value_counts()

In [5]:
# Keep only the rows of films that appear at least 500 times in the dataset
# (the threshold is inclusive).
df_over_500_rates = df.loc[
    df["film_id"].isin(
        films_num_rates.loc[films_num_rates >= 500].index
    )
]

In [6]:
# Binary target: 1 when rate >= 7, otherwise 0 (stored as int64).
df_over_500_rates = df_over_500_rates.assign(
    label=lambda frame: (frame["rate"] >= 7).astype("int64")
)

In [7]:
# Call train_metric_group once per film_id group; the Series returned for each
# group becomes one row of `result` (indexed by film_id, one column per metric).
result = df_over_500_rates.groupby("film_id").apply(train_metric_group)

In [8]:
# Save the per-film metrics table to result.csv.
result.to_csv("result.csv")

In [9]:
# Display the per-film metrics table.
result

Unnamed: 0_level_0,accuracy,precision,recall,f1
film_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
111161,0.982456,0.982456,1.000000,0.991150
359950,0.816176,0.822222,0.991071,0.898785
369610,0.729167,0.748201,0.962963,0.842105
451279,0.774194,0.774194,1.000000,0.872727
454876,0.906542,0.906542,1.000000,0.950980
...,...,...,...,...
7286456,0.946128,0.945946,1.000000,0.972222
7653254,0.904762,0.901961,1.000000,0.948454
8367814,0.941176,0.940741,1.000000,0.969466
8579674,0.964516,0.963816,1.000000,0.981575


In [10]:
# Mean of each metric column across all films.
result.mean()

accuracy     0.821888
precision    0.827411
recall       0.951665
f1           0.883314
dtype: float64