In [1]:
import os

import duckdb
import numpy as np
import pandas as pd
import pyspark
from pyspark.shell import spark
from pyspark.sql import SparkSession
from pyspark.sql import functions as func
from pyspark.sql.types import ArrayType, FloatType, DecimalType, StringType, IntegerType
from pyspark.sql.functions import size
from pyspark.storagelevel import StorageLevel
from pyspark import SparkContext
from pyspark.sql.functions import col,when
from pyspark.sql import SparkSession
from pyspark.pandas.spark import functions as SF

Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
22/03/23 23:32:16 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


Welcome to
      ____              __
     / __/__  ___ _____/ /__
    _\ \/ _ \/ _ `/ __/  '_/
   /__ / .__/\_,_/_/ /_/\_\   version 3.2.0
      /_/

Using Python version 3.9.7 (default, Oct 10 2021 15:08:54)
Spark context Web UI available at http://1eee0feb924b:4040
Spark context available as 'sc' (master = local[*], app id = local-1648078349103).
SparkSession available as 'spark'.




## Connect to duckDB and import data

In [2]:
# Open an in-memory DuckDB database; every table created below lives on this connection.
con = duckdb.connect(database=':memory:')

In [3]:
# Create the (empty) table structures for the three splits.
# All splits share the same columns; only the train set also carries the label.
base_columns = '''num INT, tconst VARCHAR, primaryTitle VARCHAR, originalTitle VARCHAR, startYear varchar,
        endYear varchar, runtimeMinutes VARCHAR, numVotes FLOAT'''

for table in ['train', 'test', 'validation']:
    # IF EXISTS keeps the cell re-runnable without hiding unrelated errors
    # (the previous bare `except: pass` swallowed every failure).
    con.execute('DROP TABLE IF EXISTS ' + table)

    if table == 'train':
        columns = base_columns + ', label BOOL'
    else:
        columns = base_columns
    con.execute('CREATE TABLE ' + table + '(' + columns + ');')

In [4]:
#With copy the CSVs are appended to one table
import os
from os.path import isfile, join

path = os.getcwd() + "/data/"
# Sort the listing: os.listdir returns entries in arbitrary order, and the order in
# which the CSVs are appended decides the row order of the resulting tables.
files = sorted(f for f in os.listdir(path) if isfile(join(path, f)))

# Filename marker -> destination table. A file is copied into every table whose
# marker occurs in its name (same rule as the three separate `if`s it replaces).
targets = (('train-', 'train'), ('test_', 'test'), ('validation_', 'validation'))

for f in files:
    file = 'data/'+f
    for marker, table in targets:
        if marker in f:
            print('Reading ' + table + ' file...', f)
            # COPY appends the rows of the CSV to the existing table.
            con.execute("COPY " + table + " FROM '"+file+"' (AUTO_DETECT TRUE)")

Reading test file... test_hidden.csv
Reading train file... train-1.csv
Reading train file... train-2.csv
Reading train file... train-3.csv
Reading train file... train-4.csv
Reading train file... train-5.csv
Reading train file... train-6.csv
Reading train file... train-7.csv
Reading train file... train-8.csv
Reading validation file... validation_hidden.csv


In [5]:
# Preview the loaded train table as a pandas DataFrame.
con.execute(''' SELECT * FROM train''').fetch_df()

Unnamed: 0,num,tconst,primarytitle,originaltitle,startyear,endyear,runtimeminutes,numvotes,label
0,4,tt0010600,The Doll,Die Puppe,1919,\N,66,1898.0,True
1,7,tt0011841,Way Down East,Way Down East,1920,\N,145,5376.0,True
2,9,tt0012494,Déstiny,Der müde Tod,1921,\N,97,5842.0,True
3,25,tt0015163,The Navigator,The Navigator,1924,\N,59,9652.0,True
4,38,tt0016220,The Phantom of the Opera,The Phantom of the Opera,1925,\N,93,17887.0,True
...,...,...,...,...,...,...,...,...,...
7954,9966,tt9625664,Trauma Center,,2019,\N,87,12951.0,False
7955,9981,tt9741310,Slaxx,Slaxx,2020,\N,77,2464.0,False
7956,9982,tt9742392,Kindred,Kindred,2020,\N,101,1719.0,False
7957,9996,tt9850386,The Bee Gees: How Can You Mend a Broken Heart,,2020,\N,111,4144.0,True


Get the JSONs into different tables

In [6]:
jsonStr = 'writing.json'
# Read the JSON file into a DataFrame. Look next to the notebook first and fall back
# to the data/ folder. Choosing the path explicitly (instead of a bare `except:`)
# lets real problems such as malformed JSON surface as errors.
writing_path = jsonStr if os.path.isfile(jsonStr) else 'data/'+jsonStr
df = pd.read_json(writing_path)
# The query reads from the pandas DataFrame named `df`.
# Dropping the table first keeps the cell re-runnable.
con.execute("DROP TABLE IF EXISTS writing")
con.execute("CREATE TABLE writing AS SELECT * FROM 'df'");

In [7]:

jsonStr2 = 'directing.json'
# Read the JSON file into a DataFrame. Look next to the notebook first and fall back
# to the data/ folder. Choosing the path explicitly (instead of a bare `except:`)
# lets real problems such as malformed JSON surface as errors.
directing_path = jsonStr2 if os.path.isfile(jsonStr2) else 'data/'+jsonStr2
df2 = pd.read_json(directing_path)

# The query reads from the pandas DataFrame named `df2`.
# Dropping the table first keeps the cell re-runnable.
con.execute("DROP TABLE IF EXISTS directing")
con.execute("CREATE TABLE directing AS SELECT * FROM 'df2'");

## From DB to Spark

In [8]:
# Build (or reuse) the Spark session used for the rest of the notebook.
# NOTE(review): `pyspark.shell` is imported in the first cell and may already have
# started a session; getOrCreate() hands back an existing session when there is one,
# so the master requested here may not take effect - confirm.
spark = (
    SparkSession.builder
    .master("local[1]")
    .appName("IMDB")
    .getOrCreate()
)

In [9]:
# Get the duckDB tables in spark

def _duckdb_table_to_spark(table):
    """Load a whole DuckDB table into a Spark DataFrame.

    The table is fetched once as a pandas DataFrame (the previous code ran and
    fetched every query twice). Cells that pandas considers null are replaced
    with None before the frame is handed to Spark.
    """
    pdf = con.execute("SELECT * FROM " + table).fetchdf()
    return spark.createDataFrame(pdf.where(pd.notnull(pdf), None))


train_df = _duckdb_table_to_spark("train")
test_df = _duckdb_table_to_spark("test")
validation_df = _duckdb_table_to_spark("validation")


## Preprocessing

In [10]:
# Drop null values from the desired columns
def drop_nulls(df, cols):
    """Return ``df`` without the rows that have a null in any of ``cols``."""
    return df.dropna(subset=cols)

In [11]:
# Fill the missing start/end years and add the YearSinceRealease feature
from pyspark.sql.functions import when
def set_years(df):
    """Fill the "\\N" placeholders in the year columns and add ``YearSinceRealease``.

    Steps, applied in this order:
      1. ``endyear`` equal to "\\N" becomes "2022".
      2. ``startyear`` equal to "\\N" takes the (already filled) ``endyear``.
      3. ``endyear`` equal to ``startyear`` becomes "2022".
      4. ``YearSinceRealease`` is set to ``endyear - startyear``.

    Returns the frame with the two updated columns and the new one.
    """
    placeholder = "\\N"

    # Each expression is built from the frame produced by the previous step.
    end_filled = when(df["endyear"] == placeholder, "2022").otherwise(df["endyear"])
    df = df.withColumn("endyear", end_filled)

    start_filled = when(df["startyear"] == placeholder, df["endyear"]).otherwise(df["startyear"])
    df = df.withColumn("startyear", start_filled)

    end_adjusted = when(df["endyear"] == df["startyear"], "2022").otherwise(df["endyear"])
    df = df.withColumn("endyear", end_adjusted)

    return df.withColumn('YearSinceRealease', df['endyear'] - df['startyear'])

In [12]:
# Keep only the rows whose runtime is known
def runtime_nulls(df):
    """Return ``df`` restricted to rows whose ``runtimeminutes`` is not the "\\N" placeholder."""
    # The filtered frame used to be assigned to a local and never returned,
    # so every caller received None.
    return df.where(df.runtimeminutes != '\\N')

In [13]:
# Fill with mean in empty rows:
from pyspark.sql.functions import avg
def fill_with_mean(df, cols):
    """Replace "\\N" placeholders and nulls in ``cols`` with the mean of each column.

    Args:
        df: Spark DataFrame to process.
        cols: names of the columns to fill.

    Returns:
        The frame with the listed columns filled.
    """
    # First pass: turn the non-numeric placeholder into a real null.
    for name in cols:
        current = df[name]
        df = df.withColumn(name, when(current == "\\N", None).otherwise(current))

    # Second pass: compute one mean per column, then fill all nulls in one call.
    means = {}
    for name in cols:
        means[name] = df.agg({name: "mean"}).first()[0]
    return df.na.fill(means)

In [14]:
# EXECUTE THE PREPROCESSING:

def preprocessing(df):
    """Run the preprocessing steps on one split and return the resulting frame."""
    # Dropping rows with null runtimeminutes/numvotes is currently disabled:
    # df = drop_nulls(df, ["runtimeminutes","numvotes"])
    # Fill the year placeholders and add the YearSinceRealease feature.
    with_years = set_years(df)
    # Replace unknown runtimes with the column mean.
    return fill_with_mean(with_years, ["runtimeminutes"])


train_proc, test_proc, val_proc = (
    preprocessing(split) for split in (train_df, test_df, validation_df)
)

                                                                                

## Additional data

#### Save in DuckDB

In [15]:
# Load the extra movie information into DuckDB, one table per split.
# Each CSV is looked up next to the notebook first and in data/ otherwise.
# Resolving the path per file (instead of retrying all three statements inside a
# bare `except:`) avoids a "table already exists" failure when only some of the
# files are found in the first location, and no longer hides unrelated errors.
additional_tables = {
    "additional_train": "movies_info_train.csv",
    "additional_test": "movies_info_test.csv",
    "additional_val": "movies_info_val.csv",
}
for table_name, csv_name in additional_tables.items():
    csv_path = csv_name if os.path.isfile(csv_name) else 'data/' + csv_name
    # Dropping first keeps the cell re-runnable.
    con.execute("DROP TABLE IF EXISTS " + table_name)
    con.execute("CREATE TABLE " + table_name + " AS SELECT * FROM '" + csv_path + "';")

#### From DB to Spark

In [16]:
# Get the duckDB tables in spark

# Each table is fetched from DuckDB once (it used to be queried twice per table);
# cells pandas considers null are replaced with None before Spark takes over.
extra_frames = {}
for table_name in ("additional_train", "additional_test", "additional_val"):
    extra_pdf = con.execute("SELECT * FROM " + table_name).fetchdf()
    extra_frames[table_name] = spark.createDataFrame(extra_pdf.where(pd.notnull(extra_pdf), None))

train_extra_df = extra_frames["additional_train"]
test_extra_df = extra_frames["additional_test"]
validation_extra_df = extra_frames["additional_val"]


In [17]:
# Bring the writing/directing tables into Spark. Each table is fetched once
# (it used to be queried twice); pandas nulls are replaced with None first.
writing_pdf = con.execute("SELECT * FROM writing").fetchdf()
writers_df = spark.createDataFrame(writing_pdf.where(pd.notnull(writing_pdf), None))

directing_pdf = con.execute("SELECT * FROM directing").fetchdf()
directors_df = spark.createDataFrame(directing_pdf.where(pd.notnull(directing_pdf), None))

In [18]:
from pyspark.sql.functions import collect_list
# One row per movie, with all of its `writer` values collected into a list column named "writers".
grouped_writers = writers_df.groupby('movie').agg(collect_list('writer').alias("writers"))

#### Merge all dataframes together (writers, directors, extra data)

In [19]:
from pyspark.sql.functions import monotonically_increasing_id

def merge_dfs(df, df_extra):
    """Inner-join a split with its extra movie info on ``tconst == imdb_id``.

    Prints the row counts of both inputs and of the joined result, then
    returns the joined frame.
    """
    same_movie = df.tconst == df_extra.imdb_id
    merged = df.join(df_extra, same_movie, 'inner')
    print(df.count(), ' + ', df_extra.count(), ' --> ', merged.count())
    return merged


# NOTE(review): the merges start from the raw frames, not from
# train_proc / test_proc / val_proc built by preprocessing() - confirm this is intended.
train_merge_df = merge_dfs(train_df, train_extra_df)
test_merge_df = merge_dfs(test_df, test_extra_df)
val_merge_df = merge_dfs(validation_df, validation_extra_df)

                                                                                

7959  +  7924  -->  7924


                                                                                

1086  +  1086  -->  1086


                                                                                

955  +  955  -->  955


In [20]:
# Rename the directing key so it does not clash with grouped_writers.movie.
# The renamed frame gets its own name: re-assigning directors_df made this cell fail
# on a second run, because the `movie` column it renames was already gone.
directors_renamed = directors_df.selectExpr("movie as movie_d","director as director")
# Inner join: only movies present in both the writers and the directors data are kept.
writers_directors = grouped_writers.join(directors_renamed, grouped_writers.movie == directors_renamed.movie_d, 'inner')

In [21]:

def final_merge(df, df_writ_dir):
    """Inner-join a merged split with the writers/directors frame on ``tconst == movie``."""
    return df.join(df_writ_dir, df.tconst == df_writ_dir.movie, 'inner')


train_final_df, test_final_df, val_final_df = (
    final_merge(split, writers_directors)
    for split in (train_merge_df, test_merge_df, val_merge_df)
)

## Prepare for ML algorithm

#### Keep only useful columns

In [22]:
def drop_cols(df, cols):
    """Return ``df`` without the columns named in ``cols``."""
    return df.drop(*cols)


# Columns removed from the Spark frames before encoding.
cols2drop = (
    'num', 'tconst', 'primarytitle', 'originaltitle', 'endyear', 'imdb_id',
    'belongs_to_collection', 'budget', 'id', 'original_title', 'overview',
    'production_companies', 'release_date', 'revenue', 'runtime', 'tagline',
    'title', 'video', 'vote_count', 'spoken_language_list', 'movie', 'movie_d',
)

train_df_clean, test_df_clean, val_df_clean = (
    drop_cols(split, cols2drop)
    for split in (train_final_df, test_final_df, val_final_df)
)


#### Encoding

Trying to encode with One Hot Encoder for spark dataframe

In [23]:
#   ##  import the required libraries
from pyspark.ml.feature import StringIndexer
from pyspark.ml.feature import OneHotEncoder


def hot_encode(df, col):
    """Index the string column ``col`` and one-hot encode that index (Spark version).

    Adds two columns: ``<col>_ind`` (StringIndexer output, indices start at 0)
    and ``<col>OHEVector`` (OneHotEncoder output). Both estimators are fitted
    on ``df`` itself.
    """
    index_col = col + '_ind'
    vector_col = col + 'OHEVector'

    indexed = StringIndexer(inputCol=col, outputCol=index_col).fit(df).transform(df)
    encoder = OneHotEncoder(inputCol=index_col, outputCol=vector_col)
    return encoder.fit(indexed).transform(indexed)


# NOTE(review): every split is fitted on its own, so the same index may stand for
# different categories in train, test and validation - confirm before using these frames.
train_df_encode, test_df_encode, val_df_encode = (
    hot_encode(split, 'genre_list')
    for split in (train_df_clean, test_df_clean, val_df_clean)
)


                                                                                

#### Convert to pandas

In [29]:
# Convert the merged Spark frames to pandas DataFrames.
# NOTE(review): this converts the *_final_df frames, not the *_df_clean / *_df_encode
# frames built in the previous cells - confirm this is intended.
train_df_pandas = train_final_df.toPandas()
test_df_pandas = test_final_df.toPandas()
val_df_pandas = val_final_df.toPandas()

                                                                                

#### One hot encoding for features

In [98]:
from sklearn.preprocessing import MultiLabelBinarizer
    
def convert_2set(val):
    """Return the elements of ``val`` as a set.

    Values that cannot be turned into a set (not iterable, such as None or NaN,
    or containing unhashable items) give an empty set.
    """
    try:
        return set(val)
    except TypeError:
        # Only the failure set() itself raises is handled; anything else should surface.
        return set()
    
def hot_encode(df, col, min_count=5):
    """Multi-hot encode the column ``col`` of a pandas DataFrame.

    Args:
        df: pandas DataFrame holding ``col``.
        col: column whose values are either strings such as "['Drama', 'Comedy']"
            or already collections of labels.
        min_count: a label column is kept only if the label occurs at least
            this many times in ``df``.

    Returns:
        ``df`` joined (on the index) with one 0/1 column per kept label.

    Note:
        The ``min_count`` filter is applied to the frame passed in, so frames
        encoded separately can end up with different label columns.
    """
    try:
        # String values: strip the brackets, remove spaces and quotes, split on commas.
        labels = df[col].str.strip('[]').str.replace(' ','').str.replace("'",'').str.split(',')
    except (AttributeError, TypeError):
        # Not a string column: use the values as they are.
        labels = df[col]
    label_sets = labels.apply(convert_2set)

    mlb = MultiLabelBinarizer()
    # Reuse df's index so the join below lines up row by row.
    one_hot_df = pd.DataFrame(mlb.fit_transform(label_sets), columns=mlb.classes_, index=df.index)

    # Keep only labels that occur at least `min_count` times. DataFrame.drop returns
    # a new frame; that result used to be discarded, so nothing was ever removed.
    rare_labels = [label for label in one_hot_df.columns if one_hot_df[label].sum() < min_count]
    one_hot_df = one_hot_df.drop(columns=rare_labels)

    return df.join(one_hot_df)

In [92]:
# Encode the genres of the train split; encoding of `writers` is still disabled.
train_encoded_df = hot_encode(train_df_pandas, 'genre_list')
# train_encoded_df = hot_encode(train_encoded_df, 'writers')

In [99]:
# Encode the genres of the test and validation splits the same way as the train split.
test_encoded_df = hot_encode(test_df_pandas, 'genre_list')
val_encoded_df = hot_encode(val_df_pandas, 'genre_list')

In [105]:
# Sanity check: every split must end up with the same number of columns.
# NOTE(review): only the column *count* is compared, not the column names, and the
# train frame also holds `label` - confirm this is the intended check.
expected_width = train_encoded_df.shape[1]
for encoded, message in (
    (test_encoded_df, "Test features don't match"),
    (val_encoded_df, "Validation features don't match"),
):
    assert encoded.shape[1] == expected_width, message
print('Ok')

Ok


TODO: Include the directors & writers !

## Train ML Model 

In [106]:
pip install lightgbm

Note: you may need to restart the kernel to use updated packages.


In [107]:
import lightgbm as lgb
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.preprocessing import LabelEncoder

In [144]:
# Columns removed from the encoded frames before training.
cols2drop = [
    'num', 'tconst', 'primarytitle', 'originaltitle', 'imdb_id',
    'belongs_to_collection', 'budget', 'id', 'original_language',
    'original_title', 'overview', 'popularity', 'production_companies',
    'release_date', 'revenue', 'tagline', 'title', 'video', 'endyear',
    'genre_list', 'production_list', 'movie_d', 'director', '',
    'production_countr_list', 'spoken_language_list', 'movie', 'writers',
]

# Feature frames: the train features additionally lose the target column.
x = train_encoded_df.drop(columns=cols2drop + ['label'])
x_test, x_val = (
    frame.drop(columns=cols2drop) for frame in (test_encoded_df, val_encoded_df)
)


In [145]:

def convert_float(df, columns):
    """Convert ``columns`` of ``df`` to numeric values, in place.

    Values that cannot be parsed become 0. A column that ends up holding only
    whole numbers is stored as int64, which is what the deprecated
    ``fillna(..., downcast='infer')`` call used to do.

    Args:
        df: pandas DataFrame; it is modified and also returned.
        columns: names of the columns to convert.

    Returns:
        The same ``df`` object.
    """
    for col in columns:
        values = pd.to_numeric(df[col], errors='coerce').fillna(0)
        # Float columns without a fractional part are stored as integers.
        if (
            values.dtype.kind == 'f'
            and np.isfinite(values).all()
            and (values == values.round()).all()
        ):
            values = values.astype('int64')
        df[col] = values
    return df

# Make the listed train feature columns numeric (unparseable values become 0).
x = convert_float(x, ['runtimeminutes', 'adult', 'startyear'])
# Target vector: the label column of the encoded train frame.
y = train_encoded_df['label']


In [146]:
# Same numeric conversion for the test and validation features.
x_test = convert_float(x_test, ['runtimeminutes', 'adult', 'startyear'])
x_val = convert_float(x_val, ['runtimeminutes', 'adult', 'startyear'])

In [147]:
# train and test split
# NOTE(review): this rebinds x_test, which until here held the features built from
# test_encoded_df; every later use of x_test sees the 33% hold-out of the train data
# instead - confirm this is intended.
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.33,random_state=42)

#### Train the model

In [None]:
# LightGBM parameters for the binary classifier.
lgb_params = dict(
    boosting_type='gbdt',
    objective='binary',
    metric='binary_logloss',
    n_estimators=10000,
    learning_rate=0.3,
    num_leaves=2840,
    max_depth=10,
    min_data_in_leaf=300,
    lambda_l1=35,
    lambda_l2=65,
    min_gain_to_split=7.394615335964813,
    bagging_fraction=0.6,
    bagging_freq=1,
    feature_fraction=0.3,
)
d_train = lgb.Dataset(x_train, label=y_train)

# Train the model. No number of rounds is passed to lgb.train here;
# lgb_params carries n_estimators.
clf = lgb.train(lgb_params, d_train)




In [None]:
# Plot the feature importance of the trained model
lgb.plot_importance(clf)

In [None]:
# Save the model
clf.save_model('model_lgbm.txt')
# NOTE(review): this is the same save_model call with a .json file name - verify that
# the file really contains JSON if anything downstream parses it as such.
clf.save_model("model_lgbm.json")

##### Make predictions for hidden data

In [None]:
# Load the saved model
# model_lgb = lgb.Booster(model_file='model_lgbm.txt')

In [None]:
# Prediction on the TEST set

# The raw predictions are rounded to 0 decimals and then cast from float to int.
y_pred_test = clf.predict(x_test).round(0).astype(int)
y_pred_test

In [None]:
# Prediction on the VALIDATION set

y_pred_val=clf.predict(x_val)
#rounding the values (this line used to read the undefined name `y_pred_tey_pred_valst`)
y_pred_val=y_pred_val.round(0)
#converting from float to integer
y_pred_val=y_pred_val.astype(int)
y_pred_val

In [None]:
### Get some metrics

In [None]:
from sklearn.metrics import mean_squared_error,roc_auc_score,precision_score, accuracy_score
# Accuracy of the rounded predictions for x_test against y_test.
# The undefined name `y_pred` is replaced by y_pred_test, which holds the
# predictions computed for x_test; arguments are passed as (y_true, y_pred).
accuracy_score(y_test.values.astype(int), y_pred_test)

In [None]:
# The Booster returned by lgb.train has no score() method, so accuracy is computed
# from its rounded predictions instead.
train_accuracy = accuracy_score(y_train.values.astype(int), clf.predict(x_train).round(0).astype(int))
test_accuracy = accuracy_score(y_test.values.astype(int), clf.predict(x_test).round(0).astype(int))
print('Training accuracy {:.4f}'.format(train_accuracy))
print('Testing accuracy {:.4f}'.format(test_accuracy))