In [68]:
import duckdb
import numpy as np
import pandas as pd
import pyspark
from pyspark.shell import spark
from pyspark.sql import SparkSession
from pyspark.sql import functions as func
from pyspark.sql.types import ArrayType, FloatType, DecimalType, StringType, IntegerType
from pyspark.sql.functions import size
from pyspark.storagelevel import StorageLevel
from pyspark import SparkContext
from pyspark.sql.functions import col,when
from pyspark.sql import SparkSession
from pyspark.pandas.spark import functions as SF

 - store csvs in duckdb (one or more tables?)
 - convert to spark dataframes
 - preprocess
 - feature adding
 - feed to model

#### Connect to database and get all CSVs in one table

In [74]:
con = duckdb.connect(database=':memory:')

In [298]:
#con.execute('''DROP TABLE train''')

<duckdb.DuckDBPyConnection at 0x55b8c8ecf0>

In [299]:
con.execute('''
CREATE TABLE train(num INT, tconst VARCHAR, primaryTitle VARCHAR, originalTitle VARCHAR, startYear varchar,
endYear varchar, runtimeMinutes VARCHAR, numVotes FLOAT, label BOOL);
''')

<duckdb.DuckDBPyConnection at 0x55b8c8ecf0>

In [307]:
#With copy the CSVs are appended to one table
#This was run 8 times
con.execute('''
COPY train FROM 'train-8.csv' (AUTO_DETECT TRUE)
''')

<duckdb.DuckDBPyConnection at 0x55b8c8ecf0>

In [308]:
con.execute(''' SELECT * FROM train''').fetch_df()

Unnamed: 0,num,tconst,primarytitle,originaltitle,startyear,endyear,runtimeminutes,numvotes,label
0,4,tt0010600,The Doll,Die Puppe,1919,\N,66,1898.0,True
1,7,tt0011841,Way Down East,Way Down East,1920,\N,145,5376.0,True
2,9,tt0012494,Déstiny,Der müde Tod,1921,\N,97,5842.0,True
3,25,tt0015163,The Navigator,The Navigator,1924,\N,59,9652.0,True
4,38,tt0016220,The Phantom of the Opera,The Phantom of the Opera,1925,\N,93,17887.0,True
...,...,...,...,...,...,...,...,...,...
7954,9966,tt9625664,Trauma Center,,2019,\N,87,12951.0,False
7955,9981,tt9741310,Slaxx,Slaxx,2020,\N,77,2464.0,False
7956,9982,tt9742392,Kindred,Kindred,2020,\N,101,1719.0,False
7957,9996,tt9850386,The Bee Gees: How Can You Mend a Broken Heart,,2020,\N,111,4144.0,True


In [310]:
# Re-check this! Changed the null values because I had some problems converting it to spark Df
con.execute(''' UPDATE train
SET originalTitle = 'none'
WHERE originalTitle IS NULL
''').fetch_df()

Unnamed: 0,Count
0,3988


#### From DB to Spark Df and begin preprocessing

In [110]:
from pyspark.sql import SQLContext
from pyspark import SparkContext
sc = SparkContext.getOrCreate()
sqlContext = SQLContext(sc)

In [268]:
# This gives an error, not sure why
# from pyspark.sql import SQLContext
# sqlContext = SQLContext(sc)
# train_df = sqlContext.sql(con.execute(''' SELECT * FROM train''').fetch_df())

In [311]:
train_df=spark.createDataFrame(con.execute("SELECT * FROM train").fetch_df())

In [312]:
train_df.show()

                                                                                

+---+---------+--------------------+--------------------+---------+-------+--------------+--------+-----+
|num|   tconst|        primarytitle|       originaltitle|startyear|endyear|runtimeminutes|numvotes|label|
+---+---------+--------------------+--------------------+---------+-------+--------------+--------+-----+
|  4|tt0010600|            The Doll|           Die Puppe|     1919|     \N|            66|  1898.0| true|
|  7|tt0011841|       Way Down East|       Way Down East|     1920|     \N|           145|  5376.0| true|
|  9|tt0012494|             Déstiny|        Der müde Tod|     1921|     \N|            97|  5842.0| true|
| 25|tt0015163|       The Navigator|       The Navigator|     1924|     \N|            59|  9652.0| true|
| 38|tt0016220|The Phantom of th...|The Phantom of th...|     1925|     \N|            93| 17887.0| true|
| 42|tt0016630|     Báttling Bútlér|     Battling Butler|     1926|     \N|            77|  3285.0| true|
| 81|tt0021015|Juno and the Paycock|          

In [185]:
train_df.printSchema()

root
 |-- num: long (nullable = true)
 |-- tconst: string (nullable = true)
 |-- primarytitle: string (nullable = true)
 |-- originaltitle: string (nullable = true)
 |-- startyear: string (nullable = true)
 |-- endyear: string (nullable = true)
 |-- runtimeminutes: string (nullable = true)
 |-- numvotes: double (nullable = true)
 |-- label: boolean (nullable = true)



Check for duplicates (this is tricky)

In [186]:
train_df = train_df.dropDuplicates(['tconst'])
train_df.show()

                                                                                

+---+---------+--------------------+-----------------+---------+-------+--------------+--------+-----+
|num|   tconst|        primarytitle|    originaltitle|startyear|endyear|runtimeminutes|numvotes|label|
+---+---------+--------------------+-----------------+---------+-------+--------------+--------+-----+
|  2|tt0009369|              Mickey|           Mickey|     1918|     \N|            93|  1119.0|false|
|  4|tt0010600|            The Doll|        Die Puppe|     1919|     \N|            66|  1898.0| true|
|  5|tt0011439|   The Mark of Zorro|The Mark of Zorro|     1920|     \N|            79|  2439.0| true|
|  6|tt0011607|  The Parson's Widow|       Prästänkan|     1920|     \N|            94|  1264.0| true|
|  7|tt0011841|       Way Down East|    Way Down East|     1920|     \N|           145|  5376.0| true|
|  8|tt0012349|             The Kid|             none|     1921|     \N|            68|121452.0| true|
|  9|tt0012494|             Déstiny|     Der müde Tod|     1921|     \N| 

In [187]:
train_df.count()

                                                                                

7959

It seems like we don't have duplicate, but we search closer

In [203]:
train_df = train_df.dropDuplicates(['primarytitle'])
train_df.show()

22/03/21 13:06:48 WARN Executor: Issue communicating with driver in heartbeater]
java.lang.NullPointerException
	at org.apache.spark.storage.memory.MemoryStore.getSize(MemoryStore.scala:133)
	at org.apache.spark.storage.BlockManager.org$apache$spark$storage$BlockManager$$getCurrentBlockStatus(BlockManager.scala:873)
	at org.apache.spark.storage.BlockManager.$anonfun$reportAllBlocks$3(BlockManager.scala:608)
	at org.apache.spark.storage.BlockManager.$anonfun$reportAllBlocks$3$adapted(BlockManager.scala:607)
	at scala.collection.Iterator.foreach(Iterator.scala:943)
	at scala.collection.Iterator.foreach$(Iterator.scala:943)
	at scala.collection.AbstractIterator.foreach(Iterator.scala:1431)
	at org.apache.spark.storage.BlockManager.reportAllBlocks(BlockManager.scala:607)
	at org.apache.spark.storage.BlockManager.reregister(BlockManager.scala:627)
	at org.apache.spark.executor.Executor.reportHeartBeat(Executor.scala:1009)
	at org.apache.spark.executor.Executor.$anonfun$heartbeater$1(Executo

+----+----------+--------------------+--------------------+---------+-------+--------------+--------+-----+
| num|    tconst|        primarytitle|       originaltitle|startyear|endyear|runtimeminutes|numvotes|label|
+----+----------+--------------------+--------------------+---------+-------+--------------+--------+-----+
|5705|tt10534500|               #Home|                none|       \N|   2021|           158| 10591.0| true|
|2203| tt0090556|      'night, Mớthér|      'night, Mother|     1986|     \N|            96|  2244.0| true|
|4397| tt0349047|(T)Raumschiff Sur...|                none|     2004|     \N|            87| 15303.0|false|
|7602| tt2395385|                  +1|                  +1|     2013|     \N|            96|  8065.0|false|
|5061| tt0473567|           ...Yahaan|           ...Yahaan|     2005|     \N|           142|     NaN| true|
|4583| tt0381838|              ...ing|              ...ing|     2003|     \N|           104|  2187.0| true|
|7633| tt2416424|00 Schneide

In [204]:
train_df.count()

                                                                                

7846

##### Drop rows that have null values in runtimeminutes/numvotes

In [313]:
train_df = train_df.dropna(subset=["runtimeminutes","numvotes"])
train_df.show()

                                                                                

+---+---------+--------------------+--------------------+---------+-------+--------------+--------+-----+
|num|   tconst|        primarytitle|       originaltitle|startyear|endyear|runtimeminutes|numvotes|label|
+---+---------+--------------------+--------------------+---------+-------+--------------+--------+-----+
|  4|tt0010600|            The Doll|           Die Puppe|     1919|     \N|            66|  1898.0| true|
|  7|tt0011841|       Way Down East|       Way Down East|     1920|     \N|           145|  5376.0| true|
|  9|tt0012494|             Déstiny|        Der müde Tod|     1921|     \N|            97|  5842.0| true|
| 25|tt0015163|       The Navigator|       The Navigator|     1924|     \N|            59|  9652.0| true|
| 38|tt0016220|The Phantom of th...|The Phantom of th...|     1925|     \N|            93| 17887.0| true|
| 42|tt0016630|     Báttling Bútlér|     Battling Butler|     1926|     \N|            77|  3285.0| true|
| 81|tt0021015|Juno and the Paycock|          

##### Add YearSinceRealease feature

In [283]:
from pyspark.sql.functions import when

Replace \N with 2022 in endyear

In [314]:
train_df = train_df.withColumn("endyear", when(train_df.endyear == "\\N","2022")
                                 .otherwise(train_df.endyear))

Swap start and endyear where necesseary

In [315]:
train_df = train_df.withColumn("startyear", when(train_df.startyear == "\\N", train_df.endyear)
                              .otherwise(train_df.startyear))

In [316]:
train_df = train_df.withColumn("endyear", when(train_df.endyear == train_df.startyear, "2022")
                              .otherwise(train_df.endyear))

Check if it was done correctly

In [317]:
test1 = train_df.filter(train_df.primarytitle == "The Philadelphia Story")
test1.show()

                                                                                

+---+---------+--------------------+-------------+---------+-------+--------------+--------+-----+
|num|   tconst|        primarytitle|originaltitle|startyear|endyear|runtimeminutes|numvotes|label|
+---+---------+--------------------+-------------+---------+-------+--------------+--------+-----+
|272|tt0032904|The Philadelphia ...|         none|     1940|   2022|           112| 66874.0| true|
+---+---------+--------------------+-------------+---------+-------+--------------+--------+-----+



In [174]:
test2 = train_df.filter(train_df.primarytitle == "The Thief of Bagdad")
test2.show()

                                                                                

+---+---------+-------------------+-------------+---------+-------+--------------+--------+-----+
|num|   tconst|       primarytitle|originaltitle|startyear|endyear|runtimeminutes|numvotes|label|
+---+---------+-------------------+-------------+---------+-------+--------------+--------+-----+
|279|tt0033152|The Thief of Bagdad|         none|     1940|   2022|           106| 12840.0| true|
| 31|tt0015400|The Thief of Bagdad|         none|     1924|   2022|           155|  6001.0| true|
+---+---------+-------------------+-------------+---------+-------+--------------+--------+-----+



#### It works!

New feature: YearSinceRelease

In [288]:
train_df = train_df.withColumn('YearSinceRealease', ( train_df['endyear'] - train_df['startyear'] ))

In [289]:
train_df.show()

                                                                                

+---+---------+--------------------+--------------------+---------+-------+--------------+--------+-----+-----------------+
|num|   tconst|        primarytitle|       originaltitle|startyear|endyear|runtimeminutes|numvotes|label|YearSinceRealease|
+---+---------+--------------------+--------------------+---------+-------+--------------+--------+-----+-----------------+
|  4|tt0010600|            The Doll|           Die Puppe|     1919|   2022|            66|  1898.0| true|            103.0|
|  7|tt0011841|       Way Down East|       Way Down East|     1920|   2022|           145|  5376.0| true|            102.0|
|  9|tt0012494|             Déstiny|        Der müde Tod|     1921|   2022|            97|  5842.0| true|            101.0|
| 25|tt0015163|       The Navigator|       The Navigator|     1924|   2022|            59|  9652.0| true|             98.0|
| 38|tt0016220|The Phantom of th...|The Phantom of th...|     1925|   2022|            93| 17887.0| true|             97.0|
| 42|tt0

Convert it to pandas dataframe in order to feed it to the model

In [290]:
pandas_train = train_df.toPandas()

                                                                                

In [291]:
pandas_train

Unnamed: 0,num,tconst,primarytitle,originaltitle,startyear,endyear,runtimeminutes,numvotes,label,YearSinceRealease
0,4,tt0010600,The Doll,Die Puppe,1919,2022,66,1898.0,True,103.0
1,7,tt0011841,Way Down East,Way Down East,1920,2022,145,5376.0,True,102.0
2,9,tt0012494,Déstiny,Der müde Tod,1921,2022,97,5842.0,True,101.0
3,25,tt0015163,The Navigator,The Navigator,1924,2022,59,9652.0,True,98.0
4,38,tt0016220,The Phantom of the Opera,The Phantom of the Opera,1925,2022,93,17887.0,True,97.0
...,...,...,...,...,...,...,...,...,...,...
7164,9966,tt9625664,Trauma Center,none,2019,2022,87,12951.0,False,3.0
7165,9981,tt9741310,Slaxx,Slaxx,2020,2022,77,2464.0,False,2.0
7166,9982,tt9742392,Kindred,Kindred,2020,2022,101,1719.0,False,2.0
7167,9996,tt9850386,The Bee Gees: How Can You Mend a Broken Heart,none,2020,2022,111,4144.0,True,2.0


#### Additional data

In [196]:
con.execute('''
CREATE TABLE additional_data AS SELECT * FROM 'movies_info.csv';
''')

<duckdb.DuckDBPyConnection at 0x55b8c8ecf0>

In [272]:
con.execute('''
SELECT * FROM additional_data''').fetch_df()

Unnamed: 0,imdb_id,adult,belongs_to_collection,budget,id,original_language,original_title,overview,popularity,production_companies,...,runtime,tagline,title,video,vote_average,vote_count,genre_list,production_list,production_countr_list,spoken_language_list
0,tt0010600,False,,0.0,48256.0,de,Die Puppe,The misadventures of an effete young man who m...,4.861,"[{'id': 12950, 'logo_path': None, 'name': 'Pro...",...,66.0,,The Doll,False,7.3,60.0,"['Comedy', 'Fantasy']",['Projektions-AG Union (PAGU)'],['DE'],[]
1,tt0011841,False,,0.0,31509.0,en,Way Down East,A naive country girl is tricked into a sham ma...,7.617,"[{'id': 4759, 'logo_path': None, 'name': 'D.W....",...,145.0,A simple story for plain people.,Way Down East,False,7.0,70.0,"['Drama', 'Action', 'Romance']","['D.W. Griffith Productions', 'United Artists']",['US'],[]
2,tt0012494,False,,0.0,29267.0,de,Der müde Tod,As a young couple stops and rests in a small v...,7.593,"[{'id': 6762, 'logo_path': None, 'name': 'Decl...",...,105.0,Love is Stronger Than Death,Destiny,False,7.5,107.0,"['Drama', 'Fantasy', 'Thriller']",['Decla-Bioscop'],['DE'],['de']
3,tt0015163,False,,0.0,32318.0,en,The Navigator,The wealthy and impulsive Rollo Treadway decid...,7.881,"[{'id': 12190, 'logo_path': None, 'name': 'Bus...",...,65.0,"Hurry! Hurry! Throw out the ""laff"" line!",The Navigator,False,7.3,152.0,"['Action', 'Comedy', 'Romance']",['Buster Keaton Productions'],['US'],[]
4,tt0016220,False,,0.0,964.0,en,The Phantom of the Opera,"A grotesquely disfigured composer known as ""Th...",12.830,"[{'id': 33, 'logo_path': '/8lvHyhjr8oUKOOy2dKX...",...,101.0,The greatest horror film of modern cinema!,The Phantom of the Opera,False,7.2,243.0,"['Drama', 'Horror']",['Universal Pictures'],['US'],[]
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7919,tt9625664,False,,0.0,641790.0,en,Trauma Center,Lt. Wakes is a vengeful police detective deter...,21.290,"[{'id': 121204, 'logo_path': None, 'name': 'Bo...",...,87.0,The enemy is closer than you think,Trauma Center,False,5.7,310.0,"['Action', 'Thriller']","['BondIt Media Capital', 'Buffalo 8', 'Pimient...",['US'],"['en', 'es']"
7920,tt9741310,False,,0.0,605133.0,en,Slaxx,When a possessed pair of jeans begins to kill ...,7.712,"[{'id': 62055, 'logo_path': None, 'name': 'EMA...",...,77.0,An ass to die for.,Slaxx,False,5.8,98.0,"['Comedy', 'Horror']","['EMAfilms', 'Entertainment Squad']",['CA'],"['en', 'hi']"
7921,tt9742392,False,,0.0,717672.0,en,Kindred,When her boyfriend Ben suddenly dies in an acc...,5.826,"[{'id': 137874, 'logo_path': None, 'name': 'Re...",...,101.0,Family is everything.,Kindred,False,5.9,11.0,"['Drama', 'Mystery', 'Horror', 'Thriller']","['Reiver Pictures', 'IFC Midnight', 'Head Gear...",['GB'],['en']
7922,tt9850386,False,,0.0,730009.0,en,The Bee Gees: How Can You Mend a Broken Heart,The story of the triumphs and hurdles of broth...,9.072,"[{'id': 862, 'logo_path': '/udTjbqPmcTbfrihMuL...",...,111.0,,The Bee Gees: How Can You Mend a Broken Heart,False,8.0,37.0,"['Music', 'Documentary']","['The Kennedy/Marshall Company', 'White Horse ...",['US'],['en']


Has issues with some features of the table.

LISTS cannot be stored as a single value in relational databases!!!
Find solution for this!

In [192]:
moredata_df=spark.createDataFrame(con.execute("SELECT * FROM additional_data").fetchdf())

TypeError: field tagline: Can not merge type <class 'pyspark.sql.types.DoubleType'> and <class 'pyspark.sql.types.StringType'>

In [None]:
output = df1.join(df2,['EMP_CODE'],how='inner').distinct()

#### ML Model 

In [292]:
pandas_train

Unnamed: 0,num,tconst,primarytitle,originaltitle,startyear,endyear,runtimeminutes,numvotes,label,YearSinceRealease
0,4,tt0010600,The Doll,Die Puppe,1919,2022,66,1898.0,True,103.0
1,7,tt0011841,Way Down East,Way Down East,1920,2022,145,5376.0,True,102.0
2,9,tt0012494,Déstiny,Der müde Tod,1921,2022,97,5842.0,True,101.0
3,25,tt0015163,The Navigator,The Navigator,1924,2022,59,9652.0,True,98.0
4,38,tt0016220,The Phantom of the Opera,The Phantom of the Opera,1925,2022,93,17887.0,True,97.0
...,...,...,...,...,...,...,...,...,...,...
7164,9966,tt9625664,Trauma Center,none,2019,2022,87,12951.0,False,3.0
7165,9981,tt9741310,Slaxx,Slaxx,2020,2022,77,2464.0,False,2.0
7166,9982,tt9742392,Kindred,Kindred,2020,2022,101,1719.0,False,2.0
7167,9996,tt9850386,The Bee Gees: How Can You Mend a Broken Heart,none,2020,2022,111,4144.0,True,2.0


In [213]:
pip install lightgbm

Collecting lightgbm
  Downloading lightgbm-3.3.2-py3-none-manylinux2014_aarch64.whl (2.1 MB)
     |████████████████████████████████| 2.1 MB 1.4 MB/s            
Installing collected packages: lightgbm
Successfully installed lightgbm-3.3.2
Note: you may need to restart the kernel to use updated packages.


In [214]:
import lightgbm as lgb
from sklearn.model_selection import train_test_split
from sklearn import metrics

In [293]:
# To define the input and output feature
x = pandas_train.drop(['num','tconst','primarytitle','originaltitle', 'endyear',],axis=1)
x['runtimeminutes'] = x['runtimeminutes'].astype(float)
y = pandas_train['label']
# train and test split
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.33,random_state=42)

ValueError: could not convert string to float: '\\N'

In [217]:
model = lgb.LGBMClassifier(learning_rate=0.09,max_depth=-5,random_state=42)
model.fit(x_train,y_train,eval_set=[(x_test,y_test),(x_train,y_train)],
          verbose=20,eval_metric='logloss')



ValueError: DataFrame.dtypes for data must be int, float or bool.
Did not expect the data types in the following fields: tconst, primarytitle, startyear, runtimeminutes

In [None]:
print('Training accuracy {:.4f}'.format(model.score(x_train,y_train)))
print('Testing accuracy {:.4f}'.format(model.score(x_test,y_test)))