In [1]:
from pyspark.sql.functions import col,when,lit
from pyspark.sql import SparkSession
import os
import duckdb
import pandas as pd
import numpy as np
import math
import lightgbm as lgb
from sklearn.model_selection import cross_val_score, StratifiedKFold, train_test_split
from sklearn.metrics import auc, accuracy_score, roc_auc_score, roc_curve,log_loss
from sklearn.model_selection import GridSearchCV
import optuna  # pip install optuna
from optuna.integration import LightGBMPruningCallback


In [2]:
#Reduce memory function always useful
def reduce_memory(df, verbose=True):
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    start_mem = df.memory_usage().sum() / 1024**2    
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type in numerics:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                       df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)  
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float32)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)    
    end_mem = df.memory_usage().sum() / 1024**2
    if verbose: print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, 100 * (start_mem - end_mem) / start_mem))
    return df



In [3]:



#Create PySpark SparkSession
spark = SparkSession.builder \
    .master("local[1]") \
    .appName("IMDB") \
    .getOrCreate()


#Connecting with database
con = duckdb.connect(database='my-db.duckdb', read_only=False)


# CSV MANIPULATIONS 

In [4]:
# This should be done automatically by identifying .csv files, then check if table exists
# If table exists then it should append and not create new
#TODO

# Hint: Use something similar to this

# files = [f for f in os.listdir('.') if os.path.isfile(f)]
# for f in files:
#     filename = f.split(".")
#     if filename[1] == "csv" and ("train" in filename[0]):
#         con.execute("CREATE TABLE filename[0] AS SELECT * FROM f");
#         con.execute("SELECT * FROM filename[0]").fetchdf()
con.execute("CREATE TABLE extra_test AS SELECT * FROM 'movies_info_test.csv'");
con.execute("SELECT * FROM extra_test").fetchdf()

con.execute("CREATE TABLE extra_val AS SELECT * FROM 'movies_info_val.csv'");
con.execute("SELECT * FROM extra_val").fetchdf()

con.execute("CREATE TABLE extra AS SELECT * FROM 'movies_info.csv'");
con.execute("SELECT * FROM extra").fetchdf()

con.execute("CREATE TABLE test_hidden AS SELECT * FROM 'test_hidden.csv'");
con.execute("SELECT * FROM test_hidden").fetchdf()

con.execute("CREATE TABLE train_1 AS SELECT * FROM 'train-1.csv'");
con.execute("SELECT * FROM train_1").fetchdf()

con.execute("CREATE TABLE train_2 AS SELECT * FROM 'train-2.csv'");
con.execute("SELECT * FROM train_2").fetchdf()

con.execute("CREATE TABLE train_3 AS SELECT * FROM 'train-3.csv'");
con.execute("SELECT * FROM train_3").fetchdf()

con.execute("CREATE TABLE train_4 AS SELECT * FROM 'train-4.csv'");
con.execute("SELECT * FROM train_4").fetchdf()

con.execute("CREATE TABLE train_5 AS SELECT * FROM 'train-5.csv'");
con.execute("SELECT * FROM train_5").fetchdf()

con.execute("CREATE TABLE train_6 AS SELECT * FROM 'train-6.csv'");
con.execute("SELECT * FROM train_6").fetchdf()

con.execute("CREATE TABLE train_7 AS SELECT * FROM 'train-7.csv'");
con.execute("SELECT * FROM train_7").fetchdf()

con.execute("CREATE TABLE train_8 AS SELECT * FROM 'train-8.csv'");
con.execute("SELECT * FROM train_8").fetchdf()

con.execute("CREATE TABLE validation_hidden AS SELECT * FROM 'validation_hidden.csv'");
con.execute("SELECT * FROM validation_hidden").fetchdf()







RuntimeError: Catalog Error: Table with name "extra_test" already exists!

# JSON MANIPULATIONS

In [8]:
jsonStr = 'writing.json'
writing_df = pd.read_json(jsonStr)
jsonStr = 'directing.json'
directing_df = pd.read_json(jsonStr)
# Convert JSON to DataFrame Using read_json()
writing_df = pd.read_json(jsonStr)
con.execute("CREATE TABLE writing AS SELECT * FROM 'writing_df'");

jsonStr = 'directing.json'
# Convert JSON to DataFrame Using read_json()
directing_df = pd.read_json(jsonStr)
con.execute("CREATE TABLE directing AS SELECT * FROM 'directing_df'");






RuntimeError: Catalog Error: Table with name "writing" already exists!

In [34]:
#TODO : This should also be done programmatically and not manually
columns_to_drop = ["vote_average","title","tagline", "id",\
                              "belongs_to_collection","original_title","release_date",\
                              "overview","runtime","popularity","production_companies","production_list"]

df01 = con.execute("SELECT * FROM train_1").fetchdf()
df02 = con.execute("SELECT * FROM train_2").fetchdf()
df03 = con.execute("SELECT * FROM train_3").fetchdf()
df04 = con.execute("SELECT * FROM train_4").fetchdf()
df05 = con.execute("SELECT * FROM train_5").fetchdf()
df06 = con.execute("SELECT * FROM train_6").fetchdf()
df07 = con.execute("SELECT * FROM train_7").fetchdf()
df08 = con.execute("SELECT * FROM train_8").fetchdf()
director_df = con.execute("SELECT * FROM directing").fetchdf()
writing_df = con.execute("SELECT * FROM writing").fetchdf()
extra_df = con.execute("SELECT * FROM extra").fetchdf()
extra_test_df = con.execute("SELECT * FROM extra_test").fetchdf()
extra_val_df = con.execute("SELECT * FROM extra_val").fetchdf()

df0 = pd.concat([df01,df02,df03,df04,df05,df06,df07,df08], axis = 0)
extra_df = extra_df.drop(columns=columns_to_drop)
extra_test_df = extra_test_df.drop(columns=columns_to_drop)
extra_val_df = extra_val_df.drop(columns=columns_to_drop)

In [35]:

# ----------Runtime Minutes features------------
df0['runtimeMinutes'] = np.where(df0['runtimeMinutes'] == "\\N", np.nan, df0["runtimeMinutes"])
df0["runtimeMinutes"] = pd.to_numeric(df0["runtimeMinutes"],downcast='integer')

# ----------Title features------------
df0['originalTitle'] = np.where(df0['originalTitle'] == df0['primaryTitle'],\
                                np.nan, df0["originalTitle"])
df0["isForeign"] =  np.where(df0['originalTitle'].isna(),\
                                False, True)

# -------------------Year features--------------
df0['startYear'] = np.where(df0['startYear'] == "\\N", df0['endYear'], df0["startYear"])
df0["startYear"] = pd.to_numeric(df0["startYear"],downcast='integer')
df0["yearsSinceRelease"] = 2022 - df0["startYear"]


# Clean NULL, DROP USELESS
df0 = df0.dropna(how = "any",subset=["numVotes","label","runtimeMinutes"])
df0 = df0.dropna(how = "all",subset=["primaryTitle","originalTitle"])
df0 = df0.drop(columns=['endYear',"originalTitle","primaryTitle"])
df0= reduce_memory(df0)

#TODO Create a trimmed mean removal method for outliers in numVotes and runtimeMinutes
#TODO: This should be done for each df separetely, so we can save them to the db separately and
try:
    con.execute("DROP TABLE clean_data");
    con.execute("CREATE TABLE clean_data AS SELECT * FROM 'df0'");
except:
    con.execute("CREATE TABLE clean_data AS SELECT * FROM 'df0'");
# -------------------Writing features--------------
writing_df["writer"] =  np.where(writing_df['writer'] == "\\N",\
                                np.nan, writing_df["writer"])
writing_df = writing_df.dropna(how = "any",subset=["writer"])
try:
    con.execute("DROP TABLE clean_writers");
    con.execute("CREATE TABLE clean_writers AS SELECT * FROM 'writing_df'");
except:
    con.execute("CREATE TABLE clean_writers AS SELECT * FROM 'writing_df'");
    
# -------------------Director features--------------
directing_df["director"] =  np.where(directing_df['director'] == "\\N",\
                                np.nan, directing_df["director"])
directing_df = directing_df.dropna(how = "any",subset=["director"])
try:
    con.execute("DROP TABLE clean_directors");
    con.execute("CREATE TABLE clean_directors AS SELECT * FROM 'directing_df'");
except:
    con.execute("CREATE TABLE clean_directors AS SELECT * FROM 'directing_df'");
    


Mem. usage decreased to  0.21 Mb (26.2% reduction)


### Everything is saved clean to the DB as of now

#### Now preprocess for model (Should be done in PySpark)

In [36]:
writer1h = pd.pivot_table(writing_df,\
                              index=["movie"]\
                              ,columns=['writer'],aggfunc=len,fill_value=0).reset_index()
director1h =  pd.pivot_table(directing_df,\
                              index=["movie"]\
                              ,columns=['director'],aggfunc=len,fill_value=0).reset_index()
director1h = director1h.add_suffix("_dir")

merged_df = pd.merge(df0, writer1h, left_on='tconst',right_on="movie", how="inner")
merged_df = pd.merge(merged_df, director1h, left_on='tconst',right_on="movie_dir", how="inner")
merged_df = merged_df.dropna(how = "any",subset=["tconst","numVotes","label"])


In [12]:
from sklearn.preprocessing import MultiLabelBinarizer

def one_hot_extras(extra_df):
    df = extra_df
    mlb = MultiLabelBinarizer()

#     df["production_countr_list"] = df["production_countr_list"].str.\
#         strip('[]').str.replace(' ','').str.replace("'",'').str.split(',')
    df["genre_list"] = df["genre_list"].str.\
        strip('[]').str.replace(' ','').str.replace("'",'').str.split(',')
#     df["spoken_language_list"] = df["spoken_language_list"].str.\
#         strip('[]').str.replace(' ','').str.replace("'",'').str.split(',')
#     df = df.join(
#                 pd.DataFrame.sparse.from_spmatrix(
#                     mlb.fit_transform(df.pop('production_countr_list')),
#                     index=df.index,
#                     columns=mlb.classes_),rsuffix="_drod_count_list")
#     df = df.join(
#                 pd.DataFrame.sparse.from_spmatrix(
#                     mlb.fit_transform(df.pop('spoken_language_list')),
#                     index=df.index,
#                     columns=mlb.classes_),rsuffix="_sp_lan_list")

    df['genre_list'] = df['genre_list'].apply(lambda x: set(x))
    df = pd.DataFrame(mlb.fit_transform(df['genre_list']),columns=mlb.classes_)
    df= extra_df.join(df)
    df = df.drop(columns = ["genre_list"],  axis = 1)
    
#     df['production_countr_list'] = df['production_countr_list'].apply(lambda x: set(x))
#     df = pd.DataFrame(mlb.fit_transform(df['production_countr_list']),columns=mlb.classes_)
#     df= extra_df.join(df,rsuffix = "_prcount")
#     df =df.drop(columns =["production_countr_list"],  axis = 1)
    
    
#     df['spoken_language_list'] = df['spoken_language_list'].apply(lambda x: set(x))
#     df = pd.DataFrame(mlb.fit_transform(df['spoken_language_list']),columns=mlb.classes_)
#     df= extra_df.join(df,rsuffix = "_spknlng")
#     df =df.drop(columns =["spoken_language_list"],  axis = 1)
    
    return df

In [37]:
all_extras = pd.concat([extra_df, extra_test_df,extra_val_df])
ex1h = one_hot_extras(all_extras)
ex1h.drop(columns=["production_countr_list","spoken_language_list",\
                   "original_language"], axis=1, inplace=True)
# 9952 rows × 1815 columns

In [38]:
merged_df_ex = merged_df.merge(ex1h, left_on="tconst", right_on="imdb_id", how="left")
merged_df_ex["adult"] = merged_df_ex["adult"].astype(bool)
merged_df_ex["video"] = merged_df_ex["video"].astype(bool)
merged_df = merged_df_ex 


In [57]:

merged_df.drop("", axis = 1 , inplace = True)

## Check here if everything is as it should be, merged_df is the final dataframe where it has all features and everything. If anything is wrong in the dataframe you can find it here.

#### Split the train set

In [86]:

Y_train = merged_df['label']
X_train = merged_df.drop(['column0',"label","tconst","movie","movie_dir","imdb_id"],axis=1)
x_train,x_test,y_train,y_test = train_test_split(X_train,Y_train,test_size=0.20,random_state=42)
d_train=lgb.Dataset(x_train, label=y_train)
lgb_params = {
                    'objective': 'binary',
                    'metric': 'binary_logloss',
    'n_estimators':1000,
    'learning_rate':0.3,
    'num_leaves':2840,
    'max_depth':10,
    'min_data_in_leaf': 300,
'lambda_l1': 35,
'lambda_l2': 65,
'min_gain_to_split': 7.394615335964813,
'bagging_fraction': 0.6,
'bagging_freq': 1,
'feature_fraction': 0.3
    
                } 
clf=lgb.train(lgb_params,d_train)



[LightGBM] [Info] Number of positive: 2726, number of negative: 2830
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1382
[LightGBM] [Info] Number of data points in the train set: 5556, number of used features: 20
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.490641 -> initscore=-0.037441
[LightGBM] [Info] Start training from score -0.037441


























































































In [87]:


from sklearn.metrics import mean_squared_error,roc_auc_score,precision_score, accuracy_score
#roc_auc_score metric



#prediction on the test set
y_pred=clf.predict(x_test)
#rounding the values
y_pred=y_pred.round(0)
#converting from float to integer
y_pred=y_pred.astype(int)
accuracy_score(y_pred,y_test.values.astype(int))



0.7789776817854571

#### Hyperparameter Tuning ----SKIP THIS FOR NOW----------

In [96]:
def objective(trial, X, y):
    param_grid = {
        # "device_type": trial.suggest_categorical("device_type", ['gpu']),
        "boosting" : trial.suggest_categorical('boosting', ["gbdt","rf","dart"]),
        "n_estimators": trial.suggest_categorical("n_estimators", [1000]),
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.3),
        "num_leaves": trial.suggest_int("num_leaves", 20, 3000, step=20),
        "max_depth": trial.suggest_int("max_depth", 3, 12),
        "min_data_in_leaf": trial.suggest_int("min_data_in_leaf", 20, 10000, step=10),
        "lambda_l1": trial.suggest_int("lambda_l1", 0, 100, step=5),
        "lambda_l2": trial.suggest_int("lambda_l2", 0, 100, step=5),
        "min_gain_to_split": trial.suggest_float("min_gain_to_split", 0, 15),
        "max_bin": trial.suggest_int("max_bin", 10, 400, step =20),
        "bagging_fraction": trial.suggest_float(
            "bagging_fraction", 0.2, 0.95, step=0.1
        ),
        "bagging_freq": trial.suggest_categorical("bagging_freq", [1]),
        "feature_fraction": trial.suggest_float(
            "feature_fraction", 0.2, 0.95, step=0.1
        ),
    }

    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=1121218)

    cv_scores = np.empty(5)
    for idx, (train_idx, test_idx) in enumerate(cv.split(X, y)):
        X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
        y_train, y_test = y[train_idx], y[test_idx]

        model = lgb.LGBMClassifier(objective="binary", **param_grid)
        model.fit(
            X_train,
            y_train,
            eval_set=[(X_test, y_test)],
            eval_metric="binary_logloss",
            early_stopping_rounds=200,
            callbacks=[
                LightGBMPruningCallback(trial, "binary_logloss")
            ],  # Add a pruning callback
        )
        preds = model.predict_proba(X_test)
        cv_scores[idx] = log_loss(y_test, preds)

    return np.mean(cv_scores)

In [None]:
study = optuna.create_study(direction="minimize", study_name="LGBM Classifier")
func = lambda trial: objective(trial, X_train, Y_train)
study.optimize(func, n_trials=20)

[32m[I 2022-03-24 03:48:01,159][0m A new study created in memory with name: LGBM Classifier[0m
























[32m[I 2022-03-24 03:48:57,560][0m Trial 0 finished with value: 0.6930053623237463 and parameters: {'boosting': 'gbdt', 'n_estimators': 1000, 'learning_rate': 0.053711873998260114, 'num_leaves': 800, 'max_depth': 3, 'min_data_in_leaf': 2870, 'lambda_l1': 80, 'lambda_l2': 50, 'min_gain_to_split': 4.54722837961938, 'max_bin': 390, 'bagging_fraction': 0.5, 'bagging_freq': 1, 'feature_fraction': 0.5}. Best is trial 0 with value: 0.6930053623237463.[0m






















In [90]:
print(f"\tBest value (rmse): {study.best_value:.5f}")
print(f"\tBest params:")

for key, value in study.best_params.items():
    print(f"\t\t{key}: {value}")

	Best value (rmse): 0.41878
	Best params:
		n_estimators: 1000
		learning_rate: 0.013317631515790214
		num_leaves: 2200
		max_depth: 12
		min_data_in_leaf: 400
		lambda_l1: 5
		lambda_l2: 35
		min_gain_to_split: 4.534372799685827
		bagging_fraction: 0.8
		bagging_freq: 1
		feature_fraction: 0.9


## We need to remake the test and hidden csv the same format as the one we used to train our model, so we use the same preprocessing to them

In [63]:
def clean_and_add_features(df0):
    df0['runtimeMinutes'] = np.where(df0['runtimeMinutes'] == "\\N", np.nan, df0["runtimeMinutes"])
    df0["runtimeMinutes"] = pd.to_numeric(df0["runtimeMinutes"],downcast='integer')

    # ----------Title features------------
    df0['originalTitle'] = np.where(df0['originalTitle'] == df0['primaryTitle'],\
                                    np.nan, df0["originalTitle"])
    df0["isForeign"] =  np.where(df0['originalTitle'].isna(),\
                                    False, True)

    # -------------------Year features--------------
    df0['startYear'] = np.where(df0['startYear'] == "\\N", df0['endYear'], df0["startYear"])
    df0["startYear"] = pd.to_numeric(df0["startYear"],downcast='integer')
    df0["yearsSinceRelease"] = 2022 - df0["startYear"]


    # Clean NULL, DROP USELESS
    df0 = df0.fillna(value=df0.mean())
    df0 = df0.dropna(how = "all",subset=["primaryTitle","originalTitle"])
    df0 = df0.drop(columns=['endYear',"originalTitle","primaryTitle"])
    df0= reduce_memory(df0)
    merged_df = pd.merge(df0, writer1h, left_on='tconst',right_on="movie", how="left")
    merged_df = pd.merge(merged_df, director1h, left_on='tconst',right_on="movie_dir", how="left")
    merged_df = merged_df.dropna(how = "any",subset=["tconst","numVotes"])
    merged_df = merged_df.merge(ex1h, left_on="tconst", right_on="imdb_id", how="left")
    merged_df["adult"] = merged_df["adult"].astype(bool)
    merged_df["video"] = merged_df["video"].astype(bool)

    return merged_df

### Create the tables and then the predictions. After that, save them

In [78]:
test = con.execute("SELECT * FROM test_hidden").fetchdf()
test = clean_and_add_features(test)
test = test.drop(['column0',"tconst","movie","movie_dir","imdb_id",""],axis=1)

test_pred=clf.predict(test)
test_pred=test_pred.round(0)


validation = con.execute("SELECT * FROM validation_hidden").fetchdf()
validation = clean_and_add_features(validation)

validation = validation.drop(['column0',"tconst","movie","movie_dir","imdb_id",""],axis=1)
val_pred=clf.predict(validation)
val_pred=val_pred.round(0)




np.savetxt("test111.csv",test_pred.astype(bool), fmt='%s')
np.savetxt("val111.csv",val_pred.astype(bool), fmt='%s')


Mem. usage decreased to  0.03 Mb (26.8% reduction)
      startYear  runtimeMinutes      numVotes  isForeign  yearsSinceRelease  \
0          1924            95.0   3654.000000      False                 98   
1          1924           150.0   2136.000000      False                 98   
2          1924           129.0   4341.000000      False                 98   
3          1925            59.0   1724.000000      False                 97   
4          1925            69.0   4188.000000      False                 97   
...         ...             ...           ...        ...                ...   
1081       2019           101.0   5109.000000      False                  3   
1082       2020           106.0   5780.000000      False                  2   
1083       2019           149.0   2331.000000       True                  3   
1084       2020           110.0  30276.921875      False                  2   
1085       2019           140.0   5271.000000       True                  3   



In [73]:
merged_df.drop(columns =["column0","tconst","label",'movie'], axis =1)

Unnamed: 0,startYear,runtimeMinutes,numVotes,isForeign,yearsSinceRelease,nm0000005,nm0000019,nm0000027,nm0000033,nm0000036,...,History,Horror,Music,Mystery,Romance,ScienceFiction,TVMovie,Thriller,War,Western
0,1919,66.0,1898.0,True,103,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1920,145.0,5376.0,False,102,0,0,0,0,0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
2,1921,97.0,5842.0,True,101,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
3,1924,59.0,9652.0,False,98,0,0,0,0,0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
4,1925,93.0,17887.0,False,97,0,0,0,0,0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6940,2019,87.0,12951.0,False,3,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
6941,2020,77.0,2464.0,False,2,0,0,0,0,0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6942,2020,101.0,1719.0,False,2,0,0,0,0,0,...,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0
6943,2020,111.0,4144.0,False,2,0,0,0,0,0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [76]:
merged_df.columns.symmetric_difference(test.columns)

Index(['', 'column0', 'imdb_id', 'label', 'movie', 'movie_dir', 'tconst'], dtype='object')