In [1]:
import ast
from pathlib import Path

import joblib
import lightgbm as lgb
import numpy as np
import pandas as pd
import polars as pl
from sklearn.base import BaseEstimator, RegressorMixin
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold
from tqdm.auto import tqdm

In [2]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score

In [3]:
# Directory the input CSV files are read from.
DATA_ROOT   = Path('./data')
# Base directory for artefacts produced by this notebook.
WORKING_DIR = Path('./')
# NOTE(review): STUDY_PATH is not referenced by any cell visible here — confirm it is still needed.
STUDY_PATH = WORKING_DIR / 'studies'

In [4]:
# Training features, training labels and test features.
X = pd.read_csv(DATA_ROOT / 'X_train.csv')
y = pd.read_csv(DATA_ROOT / 'train_labels.csv')
X_test = pd.read_csv(DATA_ROOT / 'X_test.csv')

# Every label column except 'imdb_score' is used as a per-genre target
# (imdb_score is presumably the regression target — confirm).
# The comprehension keeps the column order of the labels file; a set
# difference would yield an order that can change from one run to the next.
genre_cols = [col for col in y.columns if col != 'imdb_score']

In [5]:
def to_pandas(df_data, cat_cols=None):
    """Convert ``df_data`` to pandas and cast selected columns to ``category``.

    Args:
        df_data: Any object exposing ``.to_pandas()`` that returns a pandas
            DataFrame (the notebook passes polars DataFrames).
        cat_cols: Column names to cast. When ``None``, every ``object``-dtype
            column of the converted frame is used, and that column index is
            printed first.

    Returns:
        The converted pandas DataFrame with ``cat_cols`` stored as ``category``.
    """
    frame = df_data.to_pandas()

    if cat_cols is None:
        # Auto-detect: fall back to all object-dtype columns and show them.
        object_columns = frame.select_dtypes("object").columns
        print(object_columns)
        cat_cols = list(object_columns)

    frame[cat_cols] = frame[cat_cols].astype("category")
    return frame

## Preprocess data

In [6]:
def handle_types(df):
    """Cast every ``pl.String`` column of a polars DataFrame to ``pl.Categorical``.

    Args:
        df: polars DataFrame to convert.

    Returns:
        A DataFrame with the same columns, where string columns are categorical.
    """
    string_columns = [
        name for name, dtype in zip(df.columns, df.dtypes) if dtype == pl.String
    ]
    return df.with_columns(pl.col(string_columns).cast(pl.Categorical))

In [7]:
class VotingModel(BaseEstimator, RegressorMixin):
    """Ensemble wrapper that averages the outputs of several estimators.

    ``fit`` trains nothing: the estimators are used exactly as passed in.
    """

    def __init__(self, estimators):
        """Store the list of estimators whose outputs will be averaged."""
        super().__init__()
        self.estimators = estimators

    def fit(self, X, y=None):
        """No-op kept for scikit-learn API compatibility; returns ``self``."""
        return self

    def predict(self, X):
        """Return the element-wise mean of every estimator's ``predict(X)``."""
        per_estimator = []
        for member in self.estimators:
            per_estimator.append(member.predict(X))
        return np.mean(per_estimator, axis=0)

    def predict_proba(self, X):
        """Return the element-wise mean of every estimator's ``predict_proba(X)``.

        If collecting ``predict_proba`` raises ``AttributeError``, the mean of
        the estimators' ``predict(X)`` outputs is returned instead.
        """
        try:
            per_estimator = [member.predict_proba(X) for member in self.estimators]
        except AttributeError:
            # Fall back to plain predictions for all estimators.
            per_estimator = [member.predict(X) for member in self.estimators]
        return np.mean(per_estimator, axis=0)

In [8]:
def generate_preds_on_textcol(X_train, X_test, y_train, target_cols, prefix_name=''):
    """Turn one text column into per-target probability features.

    A TF-IDF vocabulary is fitted on the training texts; then, for every column
    in ``target_cols``, a class-balanced logistic regression is trained on the
    training TF-IDF matrix and used to score the test texts.

    Args:
        X_train: Training texts; must expose ``.values`` (e.g. a pandas Series).
        X_test: Test texts to score.
        y_train: Frame (or mapping) holding one label column per target.
        target_cols: Names of the label columns to model.
        prefix_name: String prepended (without separator) to each target name
            to build the output column names.

    Returns:
        DataFrame with one column ``prefix_name + target`` per target holding
        the second column of ``predict_proba``. When ``X_test`` is a pandas
        object the result carries its index, so assigning the result back to
        the originating frame aligns row by row; otherwise a default
        RangeIndex is used.
    """
    # Capture the index before the texts are replaced by a sparse matrix.
    if isinstance(X_test, (pd.Series, pd.DataFrame)):
        test_index = X_test.index
    else:
        test_index = None

    encoder = TfidfVectorizer()
    encoder.fit(X_train.values)

    train_matrix = encoder.transform(X_train)
    test_matrix = encoder.transform(X_test)

    # Collect all columns first and build the frame once.
    predictions = {}
    for col in target_cols:
        # An unweighted loss (no class_weight) could be tried as well.
        model = LogisticRegression(class_weight='balanced').fit(train_matrix, y_train[col])
        predictions[prefix_name + col] = model.predict_proba(test_matrix)[:, 1]

    return pd.DataFrame(predictions, index=test_index)

In [9]:
# Prepare the string columns of the training frame.
# NOTE: this cell is not idempotent — it overwrites 'keywords' with the joined
# text, which no longer parses as a list on a second run.

# plot_keywords is pipe-separated. regex=False states explicitly that "|" is a
# literal character and removes the pandas FutureWarning about the regex default.
X['plot_keywords'] = X['plot_keywords'].str.replace("|", " ", regex=False)

# Each 'keywords' entry is the text of a Python list literal. Parse it with
# ast.literal_eval instead of eval so file content is never executed as code,
# then join the items into one space-separated string.
X['keywords'] = [
    " ".join(ast.literal_eval(raw_keywords))
    for raw_keywords in tqdm(X['keywords'])
]

# Feature that concatenates the three text columns.
X['plot_x_plot_keywords_x_overview'] = (
    X['plot_keywords'] + " " + X['keywords'] + " " + X['overview']
)

  X['plot_keywords'] = X['plot_keywords'].str.replace("|", " ")


  0%|          | 0/2894 [00:00<?, ?it/s]

In [10]:
# Prepare the string columns of the test frame (same steps as for the training frame).
# NOTE: this cell is not idempotent — it overwrites 'keywords' with the joined
# text, which no longer parses as a list on a second run.

# plot_keywords is pipe-separated. regex=False states explicitly that "|" is a
# literal character and removes the pandas FutureWarning about the regex default.
X_test['plot_keywords'] = X_test['plot_keywords'].str.replace("|", " ", regex=False)

# Each 'keywords' entry is the text of a Python list literal. Parse it with
# ast.literal_eval instead of eval so file content is never executed as code,
# then join the items into one space-separated string.
X_test['keywords'] = [
    " ".join(ast.literal_eval(raw_keywords))
    for raw_keywords in tqdm(X_test['keywords'])
]

# Feature that concatenates the three text columns.
X_test['plot_x_plot_keywords_x_overview'] = (
    X_test['plot_keywords'] + " " + X_test['keywords'] + " " + X_test['overview']
)

# For every text column, train the per-genre text models on the training frame
# and append their test-set probabilities to X_test as columns named
# 'p_<text_col><genre>' (no separator between the two parts).
for text_col in ['plot_keywords', 'keywords', 'overview', 'plot_x_plot_keywords_x_overview']:
    preds = generate_preds_on_textcol(X[text_col], X_test[text_col], y, genre_cols, prefix_name=f'p_{text_col}')
    X_test[preds.columns] = preds

  X_test['plot_keywords'] = X_test['plot_keywords'].str.replace("|", " ")


  0%|          | 0/724 [00:00<?, ?it/s]

## Regression

In [11]:
# Load the persisted regression model.
# NOTE(review): joblib.load unpickles the file, which can run arbitrary code — only load trusted artefacts.
# Presumably the pickle holds a VotingModel, so that class must be defined before this cell runs — confirm.
model = joblib.load('./models/regression.pkl')

In [12]:
# Build the regression input: keep only the columns listed in the first
# estimator's feature_name_, round-trip through polars so string columns become
# categorical (handle_types), then return to pandas with object columns stored
# as 'category' (to_pandas).
# NOTE: from here on X refers to test features, no longer the training frame.
regression_features = model.estimators[0].feature_name_
X = to_pandas(handle_types(pl.from_pandas(X_test[regression_features])))

Index([], dtype='object')


In [13]:
# Predict on the prepared test features and convert the result to a plain Python list.
out = model.predict(X).tolist()

In [14]:
# Persist the regression predictions as the textual repr of the Python list.
scores_path = "test_predictions_scores.txt"
with open(scores_path, "w") as scores_file:
    scores_file.write(str(out))

## Classification

In [15]:
# Load the persisted classification models; the result is iterated and indexed like a list below.
# NOTE(review): joblib.load unpickles the file, which can run arbitrary code — only load trusted artefacts.
models = joblib.load('./models/classification.pkl')

In [16]:
# Replace the space in the 'Science Fiction' prediction columns with an underscore.
# Presumably this matches the feature names stored in the loaded classifiers — confirm.
science_fiction_renames = {
    'p_plot_keywordsScience Fiction': 'p_plot_keywordsScience_Fiction',
    'p_keywordsScience Fiction': 'p_keywordsScience_Fiction',
    'p_overviewScience Fiction': 'p_overviewScience_Fiction',
    'p_plot_x_plot_keywords_x_overviewScience Fiction': 'p_plot_x_plot_keywords_x_overviewScience_Fiction',
}
X_test = X_test.rename(columns=science_fiction_renames)

In [17]:
# Build the classification input. reset_index() turns the current index into a
# column before the selection (presumably because the classifiers expect such a
# column — confirm). Only the columns in the first classifier's first
# estimator's feature_name_ are kept, then the same polars/pandas categorical
# round-trip as for the regression input is applied.
classification_features = models[0].estimators[0].feature_name_
X = to_pandas(handle_types(pl.from_pandas(X_test.reset_index()[classification_features])))

Index([], dtype='object')


In [18]:
# One prediction vector per loaded model; transpose so that rows are test
# samples and columns are models.
per_model_predictions = [clf.predict(X) for clf in models]
out = np.array(per_model_predictions).T

In [25]:
# Binarise the predictions: values strictly above 0.5 become 1, everything else 0.
# NOTE(review): 0.5 is a hard-coded threshold — consider a named constant.
out = (out > 0.5).astype(int)

In [28]:
# Sanity check: expect (number of test rows, number of classification models).
out.shape

(724, 19)

In [29]:
# Persist the binarised genre predictions as the textual repr of a nested Python list.
genres_path = "test_predictions_genres.txt"
with open(genres_path, "w") as genres_file:
    genres_file.write(str(out.tolist()))