In [1]:
# IMPORT ALL NECESSARY PACKAGES

from pyspark.ml import Pipeline
from pyspark.ml.feature import StringIndexer, OneHotEncoder, VectorAssembler
from pyspark.ml.regression import LinearRegression, LinearRegressionModel
import pandas as pd
import numpy as np
import json

In [6]:
# READ THE TRAIN/TEST/VAL DATASETS:

from os import listdir
from os.path import isfile, join

path = "../data/"


def _read_movies_csv(csv_path):
    """Load one movies CSV, retrying with ';' as the separator.

    The file is first parsed with the default ',' separator. When pandas
    cannot tokenize it that way, it is parsed again with ``sep=';'``.

    Args:
        csv_path: Path of the CSV file to read.

    Returns:
        The parsed ``pandas.DataFrame``.
    """
    try:
        return pd.read_csv(csv_path)
    except pd.errors.ParserError:
        # Only a tokenizing failure is worth retrying with another separator;
        # any other error (missing file, bad encoding, ...) should surface.
        return pd.read_csv(csv_path, sep=';')


# Sort the names so the row order of the concatenated train set does not
# depend on the arbitrary order returned by `listdir`.
files = sorted(f for f in listdir(path) if isfile(join(path, f)))

train_frames = []
for f in files:
    if 'train-' in f:
        print('Reading train file...', f)
        df = _read_movies_csv(join(path, f))
        train_frames.append(df)

    if 'test_' in f:
        print('Reading test file...', f)
        test_movies_df = _read_movies_csv(join(path, f))

    if 'validation_' in f:
        print('Reading validation file...', f)
        validation_movies_df = _read_movies_csv(join(path, f))

if not train_frames:
    raise FileNotFoundError("No 'train-' files found in " + path)

# Concatenate once instead of growing the frame inside the loop. Every part
# keeps its own row labels, so index values repeat across parts.
train_movies_df = pd.concat(train_frames)

print('Train size:', train_movies_df.shape[0])
display(train_movies_df.head(5))
# `DataFrame.info()` prints its report itself and returns None, so it is
# called directly instead of being wrapped in `display`.
train_movies_df.info()

Reading test file... test_hidden.csv
Reading train file... train-1.csv
Reading train file... train-2.csv
Reading train file... train-3.csv
Reading train file... train-4.csv
Reading train file... train-5.csv
Reading train file... train-6.csv
Reading train file... train-7.csv
Reading train file... train-8.csv
Reading validation file... validation_hidden.csv
Train size: 7959


Unnamed: 0.1,Unnamed: 0,tconst,primaryTitle,originalTitle,startYear,endYear,runtimeMinutes,numVotes,label
0,4,tt0010600,The Doll,Die Puppe,1919,\N,66,1898.0,True
1,7,tt0011841,Way Down East,Way Down East,1920,\N,145,5376.0,True
2,9,tt0012494,Déstiny,Der müde Tod,1921,\N,97,5842.0,True
3,25,tt0015163,The Navigator,The Navigator,1924,\N,59,9652.0,True
4,38,tt0016220,The Phantom of the Opera,The Phantom of the Opera,1925,\N,93,17887.0,True


<class 'pandas.core.frame.DataFrame'>
Int64Index: 7959 entries, 0 to 992
Data columns (total 9 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Unnamed: 0      7959 non-null   int64  
 1   tconst          7959 non-null   object 
 2   primaryTitle    7959 non-null   object 
 3   originalTitle   3971 non-null   object 
 4   startYear       7959 non-null   object 
 5   endYear         7959 non-null   object 
 6   runtimeMinutes  7959 non-null   object 
 7   numVotes        7169 non-null   float64
 8   label           7959 non-null   bool   
dtypes: bool(1), float64(1), int64(1), object(6)
memory usage: 567.4+ KB


None

In [None]:
# Scratch look-up: load the first train part and show the row(s) of one title.
df = pd.read_csv("../data/train-1.csv")
df.loc[df['tconst'] == 'tt0023986']

In [None]:
# GET THE WRITINGS INFORMATION:

# A context manager closes the file handle as soon as the JSON is parsed.
with open('../data/writing.json') as file:
    list_writings = json.load(file)

# Only the 'movie' and 'writer' fields of each record are kept.
df_writings = pd.DataFrame(list_writings, columns=['movie', 'writer'])

# Report the number of rows (as done for the train set); `DataFrame.size`
# would report rows x columns instead.
print('Writings size:', df_writings.shape[0])
print('Unique writers:', len(df_writings['writer'].unique()))
display(df_writings.head(5))

In [None]:
# A BIT OF EDA:
print('Votes mean:', train_movies_df['numVotes'].mean())
print('Votes median:', train_movies_df['numVotes'].median())
print('Votes min/max:', train_movies_df['numVotes'].min(), train_movies_df['numVotes'].max())
print('---')

# The year columns are loaded with dtype `object` and use '\N' for a missing
# year, so min/max on them would compare strings. Convert to numbers first
# (unparseable entries become NaN and are skipped by min/max). The conversion
# is local to this cell; train_movies_df itself is not modified.
start_year = pd.to_numeric(train_movies_df['startYear'], errors='coerce')
end_year = pd.to_numeric(train_movies_df['endYear'], errors='coerce')

print('Min/max start year:', start_year.min(), start_year.max())
# The end-year line reports the endYear maximum (it previously printed the
# startYear maximum).
print('Min/max end year:', end_year.min(), end_year.max())

In [None]:
# Where startYear is missing ('\N'), fall back to the endYear of the same row.
# Both columns come from train_movies_df itself (not from the unrelated `df`
# left over from another cell). np.where works positionally, so the repeated
# row labels of train_movies_df cannot cause misalignment.
missing_start = (train_movies_df['startYear'] == '\\N').to_numpy()
train_movies_df['startYear'] = np.where(
    missing_start,
    train_movies_df['endYear'],
    train_movies_df['startYear'],
)

In [None]:
# Display the endYear column of the train set for a visual check.
train_movies_df['endYear']

In [None]:
def check_int(value):
    """Convert ``value`` to an ``int``, returning 0 when that is not possible.

    Args:
        value: Anything accepted by ``int()``, e.g. a year given as a string.

    Returns:
        ``int(value)``, or 0 when the conversion fails (for instance for the
        '\\N' placeholder, ``None`` or NaN).
    """
    try:
        return int(value)
    except (ValueError, TypeError, OverflowError):
        # ValueError: non-numeric text or NaN; TypeError: None and other
        # unsupported types; OverflowError: infinite floats.
        return 0
# Replace every endYear entry with its integer value (0 when not convertible).
train_movies_df['endYear'] = train_movies_df['endYear'].map(check_int)

# Multi-label classification for genre extraction

Example request to the IMDb-API `Title` endpoint (replace `<API_KEY>` with your own key): https://imdb-api.com/en/API/Title/<API_KEY>/tt0015224

Fields returned by the `Title` endpoint:

['id', 'title', 'originalTitle', 'fullTitle', 'type', 'year', 'image', 'releaseDate', 'runtimeMins', 'runtimeStr', 'plot', 'plotLocal', 'plotLocalIsRtl', 'awards', 'directors', 'directorList', 'writers', 'writerList', 'stars', 'starList', 'actorList', 'fullCast', 'genres', 'genreList', 'companies', 'companyList', 'countries', 'countryList', 'languages', 'languageList', 'contentRating', 'imDbRating', 'imDbRatingVotes', 'metacriticRating', 'ratings', 'wikipedia', 'posters', 'images', 'trailer', 'boxOffice', 'tagline', 'keywords', 'keywordList', 'similars', 'tvSeriesInfo', 'tvEpisodeInfo', 'errorMessage']

In [None]:
import http.client
import mimetypes
import json

In [None]:
import os

# Connection and request settings shared by the IMDb-API helper functions.
conn = http.client.HTTPSConnection("imdb-api.com", 443)
# Prefer a key supplied through the IMDB_API_KEY environment variable so the
# credential does not have to live in the notebook.
# NOTE(review): the literal fallback only keeps the notebook running as
# before; rotate that key and remove the fallback.
api_key = os.environ.get('IMDB_API_KEY', 'k_2qlsgnp9')
payload = ''
headers = {}

In [None]:
def get_movie_info(movie_id):
    """Request the IMDb-API `Title` record of ``movie_id`` and return it parsed.

    Uses the notebook-level ``conn``, ``api_key``, ``payload`` and ``headers``.

    Args:
        movie_id: Title identifier appended to the request path.

    Returns:
        The JSON response body decoded as UTF-8 and parsed with ``json.loads``.
    """
    endpoint = "/en/API/Title/" + api_key + "/" + movie_id
    conn.request("GET", endpoint, payload, headers)
    raw_body = conn.getresponse().read()
    return json.loads(raw_body.decode("utf-8"))

def get_genres(movie_id):
    """Return the 'genres' field of the title record of ``movie_id``.

    Args:
        movie_id: Title identifier passed to ``get_movie_info``.

    Returns:
        The value stored under 'genres', or ``None`` when the lookup fails
        (request error, unparsable response, missing 'genres' key).
    """
    try:
        return get_movie_info(movie_id)['genres']
    except Exception as e:
        # Python 3 exceptions have no `.message` attribute, so report the
        # exception type and arguments instead. Returning None (rather than
        # `e.args`) keeps failed look-ups detectable with `pd.isna`.
        print(type(e).__name__, e.args)
        return None

In [None]:
# Try the genre look-up on a single, known title before running it on the data.
movie_id = 'tt0015224' # Example: Peter Pan
get_genres(movie_id)

In [None]:
# Work on a copy: a plain assignment would only alias train_movies_df, so
# adding the 'genre' column would silently modify the train set as well.
movies_wGenre = train_movies_df.copy()
# One `get_genres` call (one HTTP request) per row.
movies_wGenre['genre'] = movies_wGenre['tconst'].apply(get_genres)
movies_wGenre.head(5)

In [None]:
# Total number of movies, then the number of movies without a genre value.
display(len(movies_wGenre))
display(len(movies_wGenre[movies_wGenre['genre'].isna()]))

In [None]:
# Keep only the rows that have a genre value.
list_genres = movies_wGenre.loc[movies_wGenre['genre'].notna(), 'genre']
# Each value is a comma-separated string; split it and trim the pieces.
gens = [
    piece.strip()
    for genre_text in list_genres
    for piece in genre_text.split(',')
]
print('Num of genres: ', len(set(gens)))
set(gens)

## Genre from another API

Genre list endpoint of The Movie Database (TMDB) API (replace `<API_KEY>` with your own key): https://api.themoviedb.org/3/genre/movie/list?api_key=<API_KEY>


In [74]:
import http.client
import mimetypes
import json

In [75]:
import os

# Connection and request settings shared by the TMDB requests below.
conn = http.client.HTTPSConnection("api.themoviedb.org")
# Prefer a key supplied through the TMDB_API_KEY environment variable so the
# credential does not have to live in the notebook.
# NOTE(review): the literal fallback only keeps the notebook running as
# before; rotate that key and remove the fallback.
api_key = os.environ.get('TMDB_API_KEY', '40f5afa6d3b4da3ef1e32a4e4bb3fe20')
payload = ''
headers = {
    'Content-Type': 'application/json;charset=utf-8',
    # NOTE(review): '<<access_token>>' is a placeholder, not a real token; the
    # requests in this notebook pass the key in the query string.
    'Authorization': 'Bearer <<access_token>>'
}

In [76]:
# One-off request for a single title (tt0015224) to inspect the response:
# send the GET, read the raw body, decode it as UTF-8 and parse the JSON.
conn.request("GET", "/3/movie/tt0015224?api_key="+api_key, payload, headers)
res = conn.getresponse()
data = res.read()
data_json = data.decode("utf-8")
json.loads(data_json)

{'adult': False,
 'backdrop_path': '/43vkdSbAd3n8c0RyOi8KR9UTEoY.jpg',
 'belongs_to_collection': None,
 'budget': 0,
 'genres': [{'id': 10751, 'name': 'Family'},
  {'id': 12, 'name': 'Adventure'},
  {'id': 14, 'name': 'Fantasy'}],
 'homepage': '',
 'id': 120672,
 'imdb_id': 'tt0015224',
 'original_language': 'en',
 'original_title': 'Peter Pan',
 'overview': 'Peter Pan enters the nursery of the Darling children and, with the help of fairy dust, leads them off to Never Never Land, where they meet the nefarious Captain Hook.',
 'popularity': 2.732,
 'poster_path': '/3JJVzUGwq3KXkBelai6tykQ33fl.jpg',
 'production_companies': [{'id': 29729,
   'logo_path': None,
   'name': 'Famous Players-Lasky Corporation',
   'origin_country': 'US'},
  {'id': 4,
   'logo_path': '/fycMZt242LVjagMByZOLUGbCvv3.png',
   'name': 'Paramount',
   'origin_country': 'US'}],
 'production_countries': [{'iso_3166_1': 'US',
   'name': 'United States of America'}],
 'release_date': '1924-12-29',
 'revenue': 0,
 'runti

In [77]:
def get_movie_info(movie_id, keys=[]):
    """Fetch the TMDB record of ``movie_id`` and return it as a dict.

    Uses the notebook-level ``conn``, ``api_key``, ``payload`` and ``headers``.

    Args:
        movie_id: Identifier placed in the ``/3/movie/<id>`` request path.
        keys: Field names used to build a placeholder record when the
            response carries a falsy 'success' entry.

    Returns:
        The parsed JSON response. When the response contains 'success' with a
        falsy value, a dict mapping every name in ``keys`` to ``None`` plus
        'imdb_id' set to ``movie_id`` is returned instead.
    """
    conn.request("GET", "/3/movie/" + movie_id + "?api_key=" + api_key, payload, headers)
    body_text = conn.getresponse().read().decode("utf-8")
    info = json.loads(body_text)
    if 'success' in info and not info['success']:
        # Failed look-up: keep a row for this movie, with every field empty.
        info = dict.fromkeys(keys)
        info['imdb_id'] = movie_id
    return info

# Example call for a single title; the parsed record is shown as cell output.
get_movie_info('tt0015224')

{'adult': False,
 'backdrop_path': '/43vkdSbAd3n8c0RyOi8KR9UTEoY.jpg',
 'belongs_to_collection': None,
 'budget': 0,
 'genres': [{'id': 10751, 'name': 'Family'},
  {'id': 12, 'name': 'Adventure'},
  {'id': 14, 'name': 'Fantasy'}],
 'homepage': '',
 'id': 120672,
 'imdb_id': 'tt0015224',
 'original_language': 'en',
 'original_title': 'Peter Pan',
 'overview': 'Peter Pan enters the nursery of the Darling children and, with the help of fairy dust, leads them off to Never Never Land, where they meet the nefarious Captain Hook.',
 'popularity': 2.732,
 'poster_path': '/3JJVzUGwq3KXkBelai6tykQ33fl.jpg',
 'production_companies': [{'id': 29729,
   'logo_path': None,
   'name': 'Famous Players-Lasky Corporation',
   'origin_country': 'US'},
  {'id': 4,
   'logo_path': '/fycMZt242LVjagMByZOLUGbCvv3.png',
   'name': 'Paramount',
   'origin_country': 'US'}],
 'production_countries': [{'iso_3166_1': 'US',
   'name': 'United States of America'}],
 'release_date': '1924-12-29',
 'revenue': 0,
 'runti

In [95]:
# Download the record of every test-set movie, keyed by its tconst.
info_dict = {}
count = 0
# Field names of a known-good record; they shape the placeholder used for
# movies whose look-up fails.
keys = list(get_movie_info('tt0015224'))
for movie_id in test_movies_df['tconst']:
    info_dict[movie_id] = get_movie_info(movie_id, keys)
    count += 1
    # Progress message every 10 movies.
    if not count % 10:
        print(count)


10
20
30
40
50
60
70
80
90
100
110
120
130
140
150
160
170
180
190
200
210
220
230
240
250
260
270
280
290
300
310
320
330
340
350
360
370
380
390
400
410
420
430
440
450
460
470
480
490
500
510
520
530
540
550
560
570
580
590
600
610
620
630
640
650
660
670
680
690
700
710
720
730
740
750
760
770
780
790
800
810
820
830
840
850
860
870
880
890
900
910
920
930
940
950
960
970
980
990
1000
1010
1020
1030
1040
1050
1060
1070
1080


In [97]:
# Bind the downloaded records to a second name (same dict object, no copy).
info_dict_test = info_dict

In [98]:
# Save the raw data as json
# The whole mapping is serialized to one JSON string and written to a file in
# the working directory; the next cell reads this file back.
json_object = json.dumps(info_dict_test)
with open("info_movies_test_json_all.json", "w") as outfile:
    outfile.write(json_object)

In [99]:
# Read the downloaded json file
# (the file written by the previous cell) back into a dict.
file_name = 'info_movies_test_json_all.json'

with open(file_name) as json_file:
    movies_info = json.load(json_file)

In [100]:
# Convert to dataframe: one row per record, one column per field.
list_movies_info = list(movies_info.values())
df_movies_info = pd.DataFrame(list_movies_info)

# Delete empty rows:
# df_movies_info.dropna(subset=['imdb_id'], inplace=True)
# Set imdb_id as index:
# df_movies_info = df_movies_info.set_index('imdb_id')
df_movies_info.shape

(1086, 25)

In [85]:
# Display the frame built from the downloaded records.
df_movies_info

Unnamed: 0,adult,backdrop_path,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,...,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count
0,False,/kv7Cnk2FZzFzdbTMVPjMrOhSJdH.jpg,,0.0,"[{'id': 18, 'name': 'Drama'}, {'id': 36, 'name...",,5153.0,tt0003740,it,Cabiria,...,1914-06-01,0.0,127.0,"[{'english_name': 'No Language', 'iso_639_1': ...",Released,All Nations Bow to This - The Greatest Spectac...,Cabiria,False,7.2,101.0
1,False,/h7h6UyaEOOT28JDUcvNMrp6xUma.jpg,,0.0,"[{'id': 18, 'name': 'Drama'}]",,108017.0,tt0008663,sv,Terje Vigen,...,1917-01-29,0.0,56.0,"[{'english_name': 'No Language', 'iso_639_1': ...",Released,,A Man There Was,False,7.1,49.0
2,False,/djSHA8FwFvuEvLHRvukSB8XmPPU.jpg,,0.0,"[{'id': 10752, 'name': 'War'}, {'id': 18, 'nam...",,70804.0,tt0010307,fr,J'accuse,...,1919-04-25,0.0,165.0,"[{'english_name': 'No Language', 'iso_639_1': ...",Released,,I Accuse,False,7.5,34.0
3,False,/oC6EnuxAE4ZR15rUPqV5nFNCLLB.jpg,,121000.0,"[{'id': 35, 'name': 'Comedy'}, {'id': 10749, '...",,22596.0,tt0014429,en,Safety Last!,...,1923-04-01,1500000.0,74.0,"[{'english_name': 'No Language', 'iso_639_1': ...",Released,"You're Going to Explode With ""Safety Laughs"" w...",Safety Last!,False,7.9,334.0
4,False,/s7U6UG12M5K3XPuMCybt7cMRDzm.jpg,"{'id': 158038, 'name': 'Die Nibelungen Collect...",0.0,"[{'id': 12, 'name': 'Adventure'}, {'id': 18, '...",,31506.0,tt0015175,de,Die Nibelungen: Siegfried,...,1924-02-14,0.0,148.0,"[{'english_name': 'No Language', 'iso_639_1': ...",Released,,Die Nibelungen: Siegfried,False,7.7,89.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
950,False,/bK2NadJBzYBLr0JZp5m7l1hU9Do.jpg,,0.0,"[{'id': 18, 'name': 'Drama'}]",,617710.0,tt9686154,ar,You will die at twenty,...,2020-02-12,0.0,103.0,"[{'english_name': 'Arabic', 'iso_639_1': 'ar',...",Released,,You Will Die at Twenty,False,7.3,28.0
951,False,/elZrfwQUB3PEKjhQtRp7cCu8WKp.jpg,,0.0,"[{'id': 18, 'name': 'Drama'}]",,750272.0,tt9690328,en,Paper Spiders,...,2021-05-07,0.0,109.0,"[{'english_name': 'English', 'iso_639_1': 'en'...",Released,,Paper Spiders,False,7.0,10.0
952,False,/spNaUwpTBIAv7zIv5bVaXVJMJGL.jpg,,0.0,"[{'id': 35, 'name': 'Comedy'}]",,706646.0,tt9735790,en,Me You Madness,...,2021-02-12,0.0,97.0,"[{'english_name': 'English', 'iso_639_1': 'en'...",Released,She's making a killing,Me You Madness,False,6.1,10.0
953,False,/gAv25JelDgr0w2P1q6gcmJ6MSzh.jpg,,0.0,"[{'id': 18, 'name': 'Drama'}, {'id': 35, 'name...",,620637.0,tt9769668,ta,துக்ளக் தர்பார்,...,2021-09-10,0.0,146.0,"[{'english_name': 'Hindi', 'iso_639_1': 'hi', ...",Released,,Tughlaq Durbar,False,6.9,7.0


In [101]:
# GET GENRES:
def get_genres(list_genres):
    """Extract the 'name' of every entry of ``list_genres``.

    Args:
        list_genres: Iterable of mappings that each carry a 'name' key.

    Returns:
        The list of names. When the input is not an iterable of such mappings
        (for example ``None`` or NaN), the input is returned unchanged.
    """
    try:
        return [gen['name'] for gen in list_genres]
    except (TypeError, KeyError):
        # TypeError: not iterable / entry not subscriptable by key;
        # KeyError: entry without 'name'. Anything else should surface.
        return list_genres

# New column holding the genre names of every movie.
df_movies_info['genre_list'] = df_movies_info['genres'].apply(get_genres)

In [102]:
# GET PRODUCTION COMPANIES
def get_production_names(list_production):
    """Extract the 'name' of every entry of ``list_production``.

    Args:
        list_production: Iterable of mappings that each carry a 'name' key.

    Returns:
        The list of names. When the input is not an iterable of such mappings
        (for example ``None`` or NaN), the input is returned unchanged.
    """
    try:
        return [prod['name'] for prod in list_production]
    except (TypeError, KeyError):
        # TypeError: not iterable / entry not subscriptable by key;
        # KeyError: entry without 'name'. Anything else should surface.
        return list_production

# New column holding the production company names of every movie.
df_movies_info['production_list'] = df_movies_info['production_companies'].apply(get_production_names)

In [103]:
# GET PRODUCTION COUNTRIES
def get_production_countries(list_production):
    """Extract the 'iso_3166_1' code of every entry of ``list_production``.

    Args:
        list_production: Iterable of mappings that each carry an
            'iso_3166_1' key.

    Returns:
        The list of codes. When the input is not an iterable of such mappings
        (for example ``None`` or NaN), the input is returned unchanged.
    """
    try:
        return [prod['iso_3166_1'] for prod in list_production]
    except (TypeError, KeyError):
        # TypeError: not iterable / entry not subscriptable by key;
        # KeyError: entry without 'iso_3166_1'. Anything else should surface.
        return list_production


# New column holding the production country codes of every movie.
df_movies_info['production_countr_list'] = df_movies_info['production_countries'].apply(get_production_countries)

In [104]:
# GET SPOKEN LANGUAGE:
def get_spoken_language(list_language):
    """Extract the 'iso_639_1' code of every entry of ``list_language``.

    Entries whose code is 'xx' are skipped.

    Args:
        list_language: Iterable of mappings that each carry an 'iso_639_1' key.

    Returns:
        The list of codes. When the input is not an iterable of such mappings
        (for example ``None`` or NaN), the input is returned unchanged.
    """
    try:
        codes = (lan['iso_639_1'] for lan in list_language)
        return [code for code in codes if code != 'xx']
    except (TypeError, KeyError):
        # TypeError: not iterable / entry not subscriptable by key;
        # KeyError: entry without 'iso_639_1'. Anything else should surface.
        return list_language
    
# New column holding the spoken language codes of every movie.
df_movies_info['spoken_language_list'] = df_movies_info['spoken_languages'].apply(get_spoken_language)

In [105]:
# REMOVE UNNECESSARY COLUMNS:
# The API error fields may be absent from the frame. `errors='ignore'` drops
# whichever of them exist; the previous try/except dropped none of them as
# soon as a single one was missing.
df_movies_info = df_movies_info.drop(
    columns=['success', 'status_code', 'status_message'], errors='ignore'
)
df_movies_info = df_movies_info.drop(columns=['backdrop_path', 'poster_path', 'homepage', 'status'])

# REMOVE PROCESSED COLUMNS:
# NOTE(review): 'production_companies' was processed as well but is not
# dropped here - confirm whether it should be kept.
df_movies_info = df_movies_info.drop(columns=['production_countries', 'spoken_languages', 'genres'])

In [93]:
# Show the first rows after the column clean-up.
df_movies_info.head(5)

Unnamed: 0,adult,belongs_to_collection,budget,id,imdb_id,original_language,original_title,overview,popularity,production_companies,...,runtime,tagline,title,video,vote_average,vote_count,genre_list,production_list,production_countr_list,spoken_language_list
0,False,,0.0,5153.0,tt0003740,it,Cabiria,Young Cabiria is kidnapped by pirates and sold...,7.108,"[{'id': 14962, 'logo_path': None, 'name': 'Ita...",...,127.0,All Nations Bow to This - The Greatest Spectac...,Cabiria,False,7.2,101.0,"[Drama, History]",[Italia Film],[IT],[]
1,False,,0.0,108017.0,tt0008663,sv,Terje Vigen,"Terje Vigen, a sailor, suffers the loss of his...",4.073,"[{'id': 17620, 'logo_path': '/e1EieGeoDpBCx8Qd...",...,56.0,,A Man There Was,False,7.1,49.0,[Drama],[Svenska Biografteatern],[SE],[]
2,False,,0.0,70804.0,tt0010307,fr,J'accuse,"The story of two men, one married, the other t...",4.24,"[{'id': 21594, 'logo_path': None, 'name': 'Pat...",...,165.0,,I Accuse,False,7.5,34.0,"[War, Drama, History, Romance]",[Pathé Frères],[FR],[]
3,False,,121000.0,22596.0,tt0014429,en,Safety Last!,When a store clerk organizes a contest to clim...,12.053,"[{'id': 2159, 'logo_path': None, 'name': 'Hal ...",...,74.0,"You're Going to Explode With ""Safety Laughs"" w...",Safety Last!,False,7.9,334.0,"[Comedy, Romance, Action]",[Hal Roach Studios],[US],[]
4,False,"{'id': 158038, 'name': 'Die Nibelungen Collect...",0.0,31506.0,tt0015175,de,Die Nibelungen: Siegfried,"Siegfried, son of King Siegmund of Xanten, tra...",9.257,"[{'id': 12372, 'logo_path': None, 'name': 'UFA...",...,148.0,,Die Nibelungen: Siegfried,False,7.7,89.0,"[Adventure, Drama, Fantasy]","[UFA, Decla-Bioscop]",[DE],[]


In [106]:
# Write the processed frame to a ';'-separated CSV in the working directory.
# NOTE(review): no `index=False` is passed, so the row index is written as an
# extra first column - confirm that readers of this file expect it.
df_movies_info.to_csv('movies_info_test.csv', sep = ';')  

## Error checking

In [121]:
# Reload the saved CSV (same ';' separator as used when writing it).
df_movies_info = pd.read_csv('movies_info_test.csv', sep = ';')

In [122]:
# (rows, columns) of the reloaded frame.
df_movies_info.shape

(1086, 23)

In [123]:
# Test-set movies for which no extra information was downloaded.
extra_df = set(df_movies_info['imdb_id'])
data_df = set(test_movies_df['tconst'])
missing_rows = data_df - extra_df
# Define `missing_df` here: the following cell displays it, but it was only
# ever assigned in a commented-out line, so a fresh run failed with NameError.
# The rows are taken from test_movies_df, the frame this check compares against.
missing_df = test_movies_df[test_movies_df['tconst'].isin(missing_rows)]
missing_rows

set()

In [109]:
# Show the first rows of `missing_df`.
# NOTE(review): no visible cell assigns `missing_df` (only a commented-out
# line mentions it) - confirm where it is defined before running this cell.
missing_df.head(5)

Unnamed: 0.1,Unnamed: 0,tconst,primaryTitle,originalTitle,startYear,endYear,runtimeMinutes,numVotes,label


# Influence of startYear

In [None]:
# Number of True/False labelled movies per startYear and the share of True.
movies_true = train_movies_df[train_movies_df['label']==True]
movies_false = train_movies_df[train_movies_df['label']==False]

count_perYear_true = movies_true.groupby(by="startYear").count()[['label']]
count_perYear_true = count_perYear_true.rename(columns={"label": "True"})
count_perYear_false = movies_false.groupby(by="startYear").count()[['label']]
count_perYear_false = count_perYear_false.rename(columns={"label": "False"})

# Years present for only one of the two labels get a count of 0 for the other.
result = pd.concat([count_perYear_true, count_perYear_false], axis=1)
result = result.fillna(0)
result['Total'] = result['True'] + result['False']
result['RatioTrue'] = (result['True'] / result['Total'])*100

# Remove the '\N' (missing year) group when there is one. `errors='ignore'`
# avoids a KeyError when no such group exists, e.g. once the missing start
# years have been filled in by an earlier cell.
result = result.drop(['\\N'], axis=0, errors='ignore')
result.index = result.index.map(int) 
result = result.sort_index()

# Draw both lines on the same, explicitly shared axes.
ax = result['RatioTrue'].plot.line(legend=True)
result['Total'].plot.line(legend=True, ax=ax)

# MLlib

## Gradient Boosting Trees

In [None]:
from pyspark.ml import Pipeline
from pyspark.ml.classification import GBTClassifier
from pyspark.ml.feature import StringIndexer, VectorIndexer
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

In [None]:
# Load and parse the data file, converting it to a DataFrame.
# NOTE(review): `spark` is not created in any visible cell - presumably a
# SparkSession provided by the environment; confirm.
data = spark.read.format("libsvm").load("iris_scale.txt")
data

In [None]:

# Index labels, adding metadata to the label column.
# Fit on whole dataset to include all labels in index.
labelIndexer = StringIndexer(inputCol="label", outputCol="indexedLabel").fit(data)
# Automatically identify categorical features, and index them.
# Set maxCategories so features with > 4 distinct values are treated as continuous.
featureIndexer =\
    VectorIndexer(inputCol="features", outputCol="indexedFeatures", maxCategories=4).fit(data)

# Split the data into training and test sets (30% held out for testing).
# A fixed seed makes the split, and therefore the reported error, repeatable.
(trainingData, testData) = data.randomSplit([0.7, 0.3], seed=42)

# Train a GBT model (seeded for the same reason as the split).
gbt = GBTClassifier(labelCol="indexedLabel", featuresCol="indexedFeatures", maxIter=10, seed=42)

# Chain indexers and GBT in a Pipeline
pipeline = Pipeline(stages=[labelIndexer, featureIndexer, gbt])

# Train model.  This also runs the indexers.
model = pipeline.fit(trainingData)

# Make predictions.
predictions = model.transform(testData)

# Select example rows to display.
predictions.select("prediction", "indexedLabel", "features").show(5)

# Select (prediction, true label) and compute test error
evaluator = MulticlassClassificationEvaluator(
    labelCol="indexedLabel", predictionCol="prediction", metricName="accuracy")
accuracy = evaluator.evaluate(predictions)
print("Test Error = %g" % (1.0 - accuracy))

# The fitted GBT model is the third pipeline stage (after the two indexers).
gbtModel = model.stages[2]
print(gbtModel)  # summary only

## One vs rest

In [None]:
from pyspark.ml.classification import LogisticRegression, OneVsRest
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

In [None]:
# load data file.
# NOTE(review): `spark` is not created in any visible cell - presumably a
# SparkSession provided by the environment; confirm.
inputData = spark.read.format("libsvm") \
    .load("data/mllib/sample_multiclass_classification_data.txt")

In [None]:
# generate the train/test split.
# A fixed seed makes the split, and therefore the reported error, repeatable.
(train, test) = inputData.randomSplit([0.8, 0.2], seed=42)

# instantiate the base classifier.
lr = LogisticRegression(maxIter=10, tol=1E-6, fitIntercept=True)

# instantiate the One Vs Rest Classifier.
ovr = OneVsRest(classifier=lr)

# train the multiclass model.
ovrModel = ovr.fit(train)

# score the model on test data.
predictions = ovrModel.transform(test)

# obtain evaluator.
evaluator = MulticlassClassificationEvaluator(metricName="accuracy")

# compute the classification error on test data.
accuracy = evaluator.evaluate(predictions)
print("Test Error = %g" % (1.0 - accuracy))