In [94]:
from pyspark.sql.functions import col,when,lit
from pyspark.sql import SparkSession
import os
import duckdb
import pandas as pd
import numpy as np
import math
import lightgbm as lgb
from sklearn.model_selection import cross_val_score, StratifiedKFold, train_test_split
from sklearn.metrics import auc, accuracy_score, roc_auc_score, roc_curve,log_loss
from sklearn.model_selection import GridSearchCV
import optuna  # pip install optuna
from optuna.integration import LightGBMPruningCallback


In [None]:
#Reduce memory function always useful
def reduce_memory(df, verbose=True):
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    start_mem = df.memory_usage().sum() / 1024**2    
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type in numerics:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                       df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)  
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float32)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)    
    end_mem = df.memory_usage().sum() / 1024**2
    if verbose: print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, 100 * (start_mem - end_mem) / start_mem))
    return df

In [3]:



#Create PySpark SparkSession
spark = SparkSession.builder \
    .master("local[1]") \
    .appName("IMDB") \
    .getOrCreate()


#Connecting with database
con = duckdb.connect(database='my-db.duckdb', read_only=False)


# CSV MANIPULATIONS 

In [4]:
# This should be done automatically by identifying .csv files, then check if table exists
# If table exists then it should append and not create new
#TODO

# Hint: Use something similar to this

# files = [f for f in os.listdir('.') if os.path.isfile(f)]
# for f in files:
#     filename = f.split(".")
#     if filename[1] == "csv" and ("train" in filename[0]):
#         con.execute("CREATE TABLE filename[0] AS SELECT * FROM f");
#         con.execute("SELECT * FROM filename[0]").fetchdf()

con.execute("CREATE TABLE test_hidden AS SELECT * FROM 'test_hidden.csv'");
con.execute("SELECT * FROM test_hidden").fetchdf()

con.execute("CREATE TABLE train_1 AS SELECT * FROM 'train-1.csv'");
con.execute("SELECT * FROM train_1").fetchdf()

con.execute("CREATE TABLE train_2 AS SELECT * FROM 'train-2.csv'");
con.execute("SELECT * FROM train_2").fetchdf()

con.execute("CREATE TABLE train_3 AS SELECT * FROM 'train-3.csv'");
con.execute("SELECT * FROM train_3").fetchdf()

con.execute("CREATE TABLE train_4 AS SELECT * FROM 'train-4.csv'");
con.execute("SELECT * FROM train_4").fetchdf()

con.execute("CREATE TABLE train_5 AS SELECT * FROM 'train-5.csv'");
con.execute("SELECT * FROM train_5").fetchdf()

con.execute("CREATE TABLE train_6 AS SELECT * FROM 'train-6.csv'");
con.execute("SELECT * FROM train_6").fetchdf()

con.execute("CREATE TABLE train_7 AS SELECT * FROM 'train-7.csv'");
con.execute("SELECT * FROM train_7").fetchdf()

con.execute("CREATE TABLE train_8 AS SELECT * FROM 'train-8.csv'");
con.execute("SELECT * FROM train_8").fetchdf()

con.execute("CREATE TABLE validation_hidden AS SELECT * FROM 'validation_hidden.csv'");
con.execute("SELECT * FROM validation_hidden").fetchdf()







RuntimeError: Catalog Error: Table with name "test_hidden" already exists!

# JSON MANIPULATIONS

In [9]:
jsonStr = 'writing.json'
writing_df = pd.read_json(jsonStr)
jsonStr = 'directing.json'
directing_df = pd.read_json(jsonStr)
# Convert JSON to DataFrame Using read_json()
writing_df = pd.read_json(jsonStr)
con.execute("CREATE TABLE writing AS SELECT * FROM 'writing_df'");

jsonStr = 'directing.json'
# Convert JSON to DataFrame Using read_json()
directing_df = pd.read_json(jsonStr)
con.execute("CREATE TABLE directing AS SELECT * FROM 'directing_df'");






RuntimeError: Catalog Error: Table with name "writing" already exists!

In [11]:
#Make sure everyting is here
con.execute("show tables;").fetchdf()

Unnamed: 0,name
0,directing
1,test_hidden
2,train_1
3,train_2
4,train_3
5,train_4
6,train_5
7,train_6
8,train_7
9,train_8


In [11]:
#TODO : This should also be done programmatically and not manually
df01 = con.execute("SELECT * FROM train_1").fetchdf()
df02 = con.execute("SELECT * FROM train_2").fetchdf()
df03 = con.execute("SELECT * FROM train_3").fetchdf()
df04 = con.execute("SELECT * FROM train_4").fetchdf()
df05 = con.execute("SELECT * FROM train_5").fetchdf()
df06 = con.execute("SELECT * FROM train_6").fetchdf()
df07 = con.execute("SELECT * FROM train_7").fetchdf()
df08 = con.execute("SELECT * FROM train_8").fetchdf()
director_df = con.execute("SELECT * FROM directing").fetchdf()
writing_df = con.execute("SELECT * FROM writing").fetchdf()

df0 = pd.concat([df01,df02,df03,df04,df05,df06,df07,df08], axis = 0)

In [12]:

# ----------Runtime Minutes features------------
df0['runtimeMinutes'] = np.where(df0['runtimeMinutes'] == "\\N", np.nan, df0["runtimeMinutes"])
df0["runtimeMinutes"] = pd.to_numeric(df0["runtimeMinutes"],downcast='integer')

# ----------Title features------------
df0['originalTitle'] = np.where(df0['originalTitle'] == df0['primaryTitle'],\
                                np.nan, df0["originalTitle"])
df0["isForeign"] =  np.where(df0['originalTitle'].isna(),\
                                False, True)

# -------------------Year features--------------
df0['startYear'] = np.where(df0['startYear'] == "\\N", df0['endYear'], df0["startYear"])
df0["startYear"] = pd.to_numeric(df0["startYear"],downcast='integer')
df0["yearsSinceRelease"] = 2022 - df0["startYear"]


# Clean NULL, DROP USELESS
df0 = df0.dropna(how = "any",subset=["numVotes","label","runtimeMinutes"])
df0 = df0.dropna(how = "all",subset=["primaryTitle","originalTitle"])
df0 = df0.drop(columns=['endYear',"originalTitle","primaryTitle"])
df0["startYear"] = df0["startYear"].astype(str)
df0= reduce_memory(df0)

df0["startYear"] = df0["startYear"].astype(str)
#TODO Create a trimmed mean removal method for outliers in numVotes and runtimeMinutes
#TODO: This should be done for each df separetely, so we can save them to the db separately and
try:
    con.execute("DROP TABLE clean_data");
    con.execute("CREATE TABLE clean_data AS SELECT * FROM 'df0'");
except:
    con.execute("CREATE TABLE clean_data AS SELECT * FROM 'df0'");
# -------------------Writing features--------------
writing_df["writer"] =  np.where(writing_df['writer'] == "\\N",\
                                np.nan, writing_df["writer"])
writing_df = writing_df.dropna(how = "any",subset=["writer"])
try:
    con.execute("DROP TABLE clean_writers");
    con.execute("CREATE TABLE clean_writers AS SELECT * FROM 'writing_df'");
except:
    con.execute("CREATE TABLE clean_writers AS SELECT * FROM 'writing_df'");
    
# -------------------Director features--------------
directing_df["director"] =  np.where(directing_df['director'] == "\\N",\
                                np.nan, directing_df["director"])
directing_df = directing_df.dropna(how = "any",subset=["director"])
try:
    con.execute("DROP TABLE clean_directors");
    con.execute("CREATE TABLE clean_directors AS SELECT * FROM 'directing_df'");
except:
    con.execute("CREATE TABLE clean_directors AS SELECT * FROM 'directing_df'");
    


Mem. usage decreased to  0.25 Mb (22.9% reduction)


### Everything is saved clean to the DB as of now

#### Now preprocess for model (Should be done in PySpark)

In [13]:
#Pivot the director table
pivot_directors=pd.pivot_table(directing_df,index=["movie"], columns=['director'],
                        aggfunc=len, fill_value=0).reset_index()
#Add suffix to indicate that the person was a director
pivot_directors = pivot_directors.add_suffix('_dir')

#Pivot the writer table
pivot_writer=pd.pivot_table(writing_df,index=["movie"], columns=['writer'],
                        aggfunc=len, fill_value=0).reset_index()
#Add suffix to indicate that the person was a writer
pivot_writer = pivot_writer.add_suffix('_writ')

#Pivot our data table on startYear to make the year hot encoded
pivot_data = pd.pivot_table(df0,\
                              index=["column0",\
                                     "isForeign",\
                                     "label",\
                                     "yearsSinceRelease","numVotes","tconst","runtimeMinutes"]\
                              ,columns=['startYear'],aggfunc=len,fill_value=0).reset_index()
#Merge data with writer
merged_df = pd.merge(pivot_data, pivot_writer, left_on='tconst',right_on="movie_writ", how="inner")
#Merge data with director
merged_df = pd.merge(merged_df, pivot_directors, left_on='tconst',right_on="movie_dir", how="inner")

#Make True And False to 1 and 0, maybe not useful, dunno why did it
merged_df["isForeign"] = np.where(merged_df['isForeign'] == False,\
                                0, 1)
merged_df["label"] = np.where(merged_df['label'] == False,\
                                0, 1)

#TODO: Add feature sameDirectorAndWriter

## Check here if everything is as it should be, merged_df is the final dataframe where it has all features and everything. If anything is wrong in the dataframe you can find it here.

#### Split the train set

In [77]:
X_train = merged_df.drop(['column0','tconst','movie_writ',"movie_dir","sameDirAndWrit"],axis=1)
y_train = merged_df['label']

In [155]:
fake_valid_inds = np.random.choice(X_train.index.values, 1000, replace = False)
train_inds = np.setdiff1d(X_train.index.values, fake_valid_inds)
train_data = lgb.Dataset(X_train.loc[train_inds] , label = y_train.loc[train_inds], 
                          free_raw_data=False)
eval_data = lgb.Dataset(X_train.loc[fake_valid_inds], label = y_train.loc[fake_valid_inds],
                 free_raw_data=False)

#### Hyperparameter Tuning ----SKIP THIS FOR NOW----------

In [90]:
def objective(trial, X, y):
    param_grid = {
        # "device_type": trial.suggest_categorical("device_type", ['gpu']),
        "n_estimators": trial.suggest_categorical("n_estimators", [10000]),
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.3),
        "num_leaves": trial.suggest_int("num_leaves", 20, 3000, step=20),
        "max_depth": trial.suggest_int("max_depth", 3, 12),
        "min_data_in_leaf": trial.suggest_int("min_data_in_leaf", 200, 10000, step=100),
        "lambda_l1": trial.suggest_int("lambda_l1", 0, 100, step=5),
        "lambda_l2": trial.suggest_int("lambda_l2", 0, 100, step=5),
        "min_gain_to_split": trial.suggest_float("min_gain_to_split", 0, 15),
        "bagging_fraction": trial.suggest_float(
            "bagging_fraction", 0.2, 0.95, step=0.1
        ),
        "bagging_freq": trial.suggest_categorical("bagging_freq", [1]),
        "feature_fraction": trial.suggest_float(
            "feature_fraction", 0.2, 0.95, step=0.1
        ),
    }

    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=1121218)

    cv_scores = np.empty(5)
    for idx, (train_idx, test_idx) in enumerate(cv.split(X, y)):
        X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
        y_train, y_test = y[train_idx], y[test_idx]

        model = lgb.LGBMClassifier(objective="binary", **param_grid)
        model.fit(
            X_train,
            y_train,
            eval_set=[(X_test, y_test)],
            eval_metric="binary_logloss",
            early_stopping_rounds=200,
            callbacks=[
                LightGBMPruningCallback(trial, "binary_logloss")
            ],  # Add a pruning callback
        )
        preds = model.predict_proba(X_test)
        cv_scores[idx] = log_loss(y_test, preds)

    return np.mean(cv_scores)

In [95]:
study = optuna.create_study(direction="minimize", study_name="LGBM Classifier")
func = lambda trial: objective(trial, X_train, y_train)
study.optimize(func, n_trials=20)

[32m[I 2022-03-22 19:13:53,981][0m A new study created in memory with name: LGBM Classifier[0m




























[32m[I 2022-03-22 19:14:27,783][0m Trial 0 finished with value: 0.6930053623237464 and parameters: {'n_estimators': 10000, 'learning_rate': 0.22719812422996097, 'num_leaves': 2100, 'max_depth': 10, 'min_data_in_leaf': 3900, 'lambda_l1': 30, 'lambda_l2': 20, 'min_gain_to_split': 11.86868658670658, 'bagging_fraction': 0.5, 'bagging_freq': 1, 'feature_fraction': 0.5}. Best is trial 0 with value: 0.6930053623237464.[0m




























[32m[I 2022-03-22 19:14:58,934][0m Trial 1 finished with value: 0.6930053623237464 and parameters: {'n_estimators': 10000, 'learning_rate': 0.22972199434919077, 'num_leaves': 1260, 'max_depth': 12, 'min_data_in_leaf': 2700, 'lambda_l1': 80, 'lambda_l2': 65, 'min_gain_to_split': 10.237144834104203, 'bagging_fraction': 0.7, 'bagging_freq': 1, 'feature_fraction': 0.4}. Best is trial 0 with value: 0.6930053623237464.[0m




























[32m[I 2022-03-22 19:15:29,574][0m Trial 2 finished with value: 0.6930053623237464 and parameters: {'n_estimators': 10000, 'learning_rate': 0.12502330119384258, 'num_leaves': 760, 'max_depth': 9, 'min_data_in_leaf': 4900, 'lambda_l1': 50, 'lambda_l2': 55, 'min_gain_to_split': 6.576066499802656, 'bagging_fraction': 0.8, 'bagging_freq': 1, 'feature_fraction': 0.2}. Best is trial 0 with value: 0.6930053623237464.[0m




























[32m[I 2022-03-22 19:16:00,844][0m Trial 3 finished with value: 0.6930053623237464 and parameters: {'n_estimators': 10000, 'learning_rate': 0.14194446343993236, 'num_leaves': 960, 'max_depth': 7, 'min_data_in_leaf': 7200, 'lambda_l1': 15, 'lambda_l2': 95, 'min_gain_to_split': 3.5268897732258067, 'bagging_fraction': 0.5, 'bagging_freq': 1, 'feature_fraction': 0.5}. Best is trial 0 with value: 0.6930053623237464.[0m




























[32m[I 2022-03-22 19:16:31,455][0m Trial 4 finished with value: 0.6930053623237464 and parameters: {'n_estimators': 10000, 'learning_rate': 0.26565534389717743, 'num_leaves': 680, 'max_depth': 8, 'min_data_in_leaf': 5400, 'lambda_l1': 15, 'lambda_l2': 75, 'min_gain_to_split': 8.252579283813404, 'bagging_fraction': 0.30000000000000004, 'bagging_freq': 1, 'feature_fraction': 0.30000000000000004}. Best is trial 0 with value: 0.6930053623237464.[0m




























[32m[I 2022-03-22 19:17:02,495][0m Trial 5 finished with value: 0.6930053623237464 and parameters: {'n_estimators': 10000, 'learning_rate': 0.045980663173067274, 'num_leaves': 1840, 'max_depth': 11, 'min_data_in_leaf': 8100, 'lambda_l1': 100, 'lambda_l2': 45, 'min_gain_to_split': 9.04387461900833, 'bagging_fraction': 0.4, 'bagging_freq': 1, 'feature_fraction': 0.6000000000000001}. Best is trial 0 with value: 0.6930053623237464.[0m




























[32m[I 2022-03-22 19:17:34,040][0m Trial 6 finished with value: 0.6930053623237464 and parameters: {'n_estimators': 10000, 'learning_rate': 0.0789730472296185, 'num_leaves': 1820, 'max_depth': 11, 'min_data_in_leaf': 9100, 'lambda_l1': 55, 'lambda_l2': 100, 'min_gain_to_split': 1.9347122605366889, 'bagging_fraction': 0.4, 'bagging_freq': 1, 'feature_fraction': 0.6000000000000001}. Best is trial 0 with value: 0.6930053623237464.[0m




























[32m[I 2022-03-22 19:18:05,937][0m Trial 7 finished with value: 0.6930053623237464 and parameters: {'n_estimators': 10000, 'learning_rate': 0.1198580033785835, 'num_leaves': 1180, 'max_depth': 6, 'min_data_in_leaf': 5400, 'lambda_l1': 0, 'lambda_l2': 15, 'min_gain_to_split': 11.62915763387031, 'bagging_fraction': 0.8, 'bagging_freq': 1, 'feature_fraction': 0.2}. Best is trial 0 with value: 0.6930053623237464.[0m




























[32m[I 2022-03-22 19:18:37,816][0m Trial 8 finished with value: 0.6930053623237464 and parameters: {'n_estimators': 10000, 'learning_rate': 0.11087458587505764, 'num_leaves': 2920, 'max_depth': 11, 'min_data_in_leaf': 6700, 'lambda_l1': 20, 'lambda_l2': 80, 'min_gain_to_split': 12.945322977927159, 'bagging_fraction': 0.9, 'bagging_freq': 1, 'feature_fraction': 0.6000000000000001}. Best is trial 0 with value: 0.6930053623237464.[0m




























[32m[I 2022-03-22 19:19:09,299][0m Trial 9 finished with value: 0.6930053623237464 and parameters: {'n_estimators': 10000, 'learning_rate': 0.29270655149041963, 'num_leaves': 660, 'max_depth': 11, 'min_data_in_leaf': 1100, 'lambda_l1': 30, 'lambda_l2': 5, 'min_gain_to_split': 5.631012849646685, 'bagging_fraction': 0.30000000000000004, 'bagging_freq': 1, 'feature_fraction': 0.8}. Best is trial 0 with value: 0.6930053623237464.[0m




























[32m[I 2022-03-22 19:19:40,497][0m Trial 10 finished with value: 0.6930053623237464 and parameters: {'n_estimators': 10000, 'learning_rate': 0.19982802043208625, 'num_leaves': 2640, 'max_depth': 3, 'min_data_in_leaf': 3000, 'lambda_l1': 55, 'lambda_l2': 30, 'min_gain_to_split': 14.738210813336092, 'bagging_fraction': 0.6000000000000001, 'bagging_freq': 1, 'feature_fraction': 0.8}. Best is trial 0 with value: 0.6930053623237464.[0m
































[32m[I 2022-03-22 19:20:12,032][0m Trial 11 finished with value: 0.6930053623237464 and parameters: {'n_estimators': 10000, 'learning_rate': 0.21656518046514198, 'num_leaves': 2260, 'max_depth': 12, 'min_data_in_leaf': 2500, 'lambda_l1': 85, 'lambda_l2': 55, 'min_gain_to_split': 10.503044274165259, 'bagging_fraction': 0.6000000000000001, 'bagging_freq': 1, 'feature_fraction': 0.4}. Best is trial 0 with value: 0.6930053623237464.[0m




























[32m[I 2022-03-22 19:20:43,689][0m Trial 12 finished with value: 0.6930053623237464 and parameters: {'n_estimators': 10000, 'learning_rate': 0.21204214102788335, 'num_leaves': 60, 'max_depth': 9, 'min_data_in_leaf': 3200, 'lambda_l1': 75, 'lambda_l2': 30, 'min_gain_to_split': 13.032695550574019, 'bagging_fraction': 0.7, 'bagging_freq': 1, 'feature_fraction': 0.4}. Best is trial 0 with value: 0.6930053623237464.[0m




























[32m[I 2022-03-22 19:21:15,086][0m Trial 13 finished with value: 0.04552442652144265 and parameters: {'n_estimators': 10000, 'learning_rate': 0.2493369214914636, 'num_leaves': 1500, 'max_depth': 12, 'min_data_in_leaf': 300, 'lambda_l1': 40, 'lambda_l2': 70, 'min_gain_to_split': 9.926607550575667, 'bagging_fraction': 0.5, 'bagging_freq': 1, 'feature_fraction': 0.4}. Best is trial 13 with value: 0.04552442652144265.[0m




[32m[I 2022-03-22 19:21:20,562][0m Trial 14 pruned. Trial was pruned at iteration 201.[0m
































[32m[I 2022-03-22 19:21:52,423][0m Trial 15 finished with value: 0.038942312668164204 and parameters: {'n_estimators': 10000, 'learning_rate': 0.2565164454199309, 'num_leaves': 2460, 'max_depth': 10, 'min_data_in_leaf': 1200, 'lambda_l1': 40, 'lambda_l2': 0, 'min_gain_to_split': 11.774245573221547, 'bagging_fraction': 0.5, 'bagging_freq': 1, 'feature_fraction': 0.5}. Best is trial 15 with value: 0.038942312668164204.[0m






























[32m[I 2022-03-22 19:22:24,245][0m Trial 16 finished with value: 0.6930053623237464 and parameters: {'n_estimators': 10000, 'learning_rate': 0.2653331589518366, 'num_leaves': 2480, 'max_depth': 4, 'min_data_in_leaf': 1200, 'lambda_l1': 45, 'lambda_l2': 5, 'min_gain_to_split': 4.937060854952728, 'bagging_fraction': 0.4, 'bagging_freq': 1, 'feature_fraction': 0.9}. Best is trial 15 with value: 0.038942312668164204.[0m






























[32m[I 2022-03-22 19:22:56,097][0m Trial 17 finished with value: 0.6930053623237464 and parameters: {'n_estimators': 10000, 'learning_rate': 0.2572442444771891, 'num_leaves': 1520, 'max_depth': 6, 'min_data_in_leaf': 1500, 'lambda_l1': 65, 'lambda_l2': 80, 'min_gain_to_split': 9.12303400589229, 'bagging_fraction': 0.5, 'bagging_freq': 1, 'feature_fraction': 0.30000000000000004}. Best is trial 15 with value: 0.038942312668164204.[0m
































[32m[I 2022-03-22 19:23:28,152][0m Trial 18 finished with value: 0.03207554026766316 and parameters: {'n_estimators': 10000, 'learning_rate': 0.29638451603062255, 'num_leaves': 2840, 'max_depth': 10, 'min_data_in_leaf': 300, 'lambda_l1': 35, 'lambda_l2': 65, 'min_gain_to_split': 7.394615335964813, 'bagging_fraction': 0.6000000000000001, 'bagging_freq': 1, 'feature_fraction': 0.30000000000000004}. Best is trial 18 with value: 0.03207554026766316.[0m




























[32m[I 2022-03-22 19:24:00,779][0m Trial 19 finished with value: 0.6930053623237464 and parameters: {'n_estimators': 10000, 'learning_rate': 0.29878294729467864, 'num_leaves': 2940, 'max_depth': 8, 'min_data_in_leaf': 2000, 'lambda_l1': 0, 'lambda_l2': 45, 'min_gain_to_split': 0.3779365773304795, 'bagging_fraction': 0.7, 'bagging_freq': 1, 'feature_fraction': 0.30000000000000004}. Best is trial 18 with value: 0.03207554026766316.[0m


In [96]:
print(f"\tBest value (rmse): {study.best_value:.5f}")
print(f"\tBest params:")

for key, value in study.best_params.items():
    print(f"\t\t{key}: {value}")

	Best value (rmse): 0.03208
	Best params:
		n_estimators: 10000
		learning_rate: 0.29638451603062255
		num_leaves: 2840
		max_depth: 10
		min_data_in_leaf: 300
		lambda_l1: 35
		lambda_l2: 65
		min_gain_to_split: 7.394615335964813
		bagging_fraction: 0.6000000000000001
		bagging_freq: 1
		feature_fraction: 0.30000000000000004


#### Finally train the model

In [163]:
lgb_params = {
                    'objective': 'binary',
                    'metric': 'binary_logloss',
    'n_estimators':10000,
    'learning_rate':0.3,
    'num_leaves':2840,
    'max_depth':10,
    'min_data_in_leaf': 300,
'lambda_l1': 35,
'lambda_l2': 65,
'min_gain_to_split': 7.394615335964813,
'bagging_fraction': 0.6,
'bagging_freq': 1,
'feature_fraction': 0.3
    
                } 

#Antonis way 
x_train,x_test,y_train,y_test = train_test_split(X_train,y_train,test_size=0.10,random_state=42)
model = lgb.LGBMClassifier(learning_rate=0.09,max_depth=-5,random_state=42)
model.fit(x_train,y_train,eval_set=[(x_test,y_test),(x_train,y_train)],
          verbose=20,eval_metric='f1_score')


# m_lgb = lgb.train(lgb_params, train_data, valid_sets = [eval_data], verbose_eval=20) 






[20]	training's binary_logloss: 0.0837317	valid_0's binary_logloss: 0.0837333
[40]	training's binary_logloss: 0.0133227	valid_0's binary_logloss: 0.0133229
[60]	training's binary_logloss: 0.00218889	valid_0's binary_logloss: 0.00218893
[80]	training's binary_logloss: 0.000361461	valid_0's binary_logloss: 0.000361468
[100]	training's binary_logloss: 5.97393e-05	valid_0's binary_logloss: 5.97404e-05


LGBMClassifier(learning_rate=0.09, max_depth=-5, random_state=42)

## We need to remake the test and hidden csv the same format as the one we used to train our model, so we use the same preprocessing to them

# There is chance that the prediction dataframes are shit also, so check these as well

In [145]:
def clean_and_add_features(df0):
# ----------Runtime Minutes features------------
    df0['runtimeMinutes'] = np.where(df0['runtimeMinutes'] == "\\N", np.nan, df0["runtimeMinutes"])
    df0["runtimeMinutes"] = pd.to_numeric(df0["runtimeMinutes"],downcast='integer')

    # ----------Title features------------
    df0['originalTitle'] = np.where(df0['originalTitle'] == df0['primaryTitle'],\
                                    np.nan, df0["originalTitle"])
    df0["isForeign"] =  np.where(df0['originalTitle'].isna(),\
                                    False, True)

    # -------------------Year features--------------
    df0['startYear'] = np.where(df0['startYear'] == "\\N", df0['endYear'], df0["startYear"])
    df0["startYear"] = pd.to_numeric(df0["startYear"],downcast='integer')
    df0["yearsSinceRelease"] = 2022 - df0["startYear"]


    # Clean NULL, DROP USELESS
    df0 = df0.drop(columns=['endYear',"originalTitle","primaryTitle"])
    df0["startYear"] = df0["startYear"].astype(str)
    df0= reduce_memory(df0)

    df0["startYear"] = df0["startYear"].astype(str)
    #TODO Create a trimmed mean removal method for outliers in numVotes and runtimeMinutes
    #TODO: This should be done for each df separetely, so we can save them to the db separately and
    df0.fillna(df0.mean(), inplace=True)
    pivoted = pd.pivot_table(df0,\
                              index=["column0",\
                                     "isForeign",\
                                     "yearsSinceRelease","numVotes","tconst","runtimeMinutes"]\
                              ,columns=['startYear'],aggfunc=len,fill_value=0).reset_index()
    merged_df = pd.merge(pivoted, pivot_writer, left_on='tconst',right_on="movie_writ", how="inner")
    merged_df = pd.merge(merged_df, pivot_directors, left_on='tconst',right_on="movie_dir", how="inner")
    merged_df["isForeign"] = np.where(merged_df['isForeign'] == False,\
                              0, 1)
    merged_df.drop(["column0","tconst","movie_dir","movie_writ"], inplace=True, axis = 1)
    return merged_df

In [146]:
test = con.execute("SELECT * FROM test_hidden").fetchdf()
test = clean_and_add_features(test)
validation = con.execute("SELECT * FROM validation_hidden").fetchdf()
validation = clean_and_add_features(validation)




Mem. usage decreased to  0.03 Mb (28.1% reduction)
Mem. usage decreased to  0.03 Mb (28.1% reduction)


## Now they should be the same format so we can find the predictions

In [178]:
for col in X_train.columns:
    if col not in test:
        test[col] = 0
    if col not in validation:
        validation[col] = 0

In [189]:
# validation["movie_dir"]
validation.drop(["1914","1917"], inplace= True, axis = 1)

In [193]:
model.predict(test)

array([1, 1, 1, ..., 1, 1, 1])