In [1]:
# IMPORT ALL NECESSARY PACKAGES

from pyspark.ml import Pipeline
from pyspark.ml.feature import StringIndexer, OneHotEncoder, VectorAssembler
from pyspark.ml.regression import LinearRegression, LinearRegressionModel
import pandas as pd
import numpy as np
import json

In [2]:
# READ THE TRAIN/TEST/VAL DATASETS:

from os import listdir
from os.path import isfile, join

path = "../data/"
files = [f for f in listdir(path) if isfile(join(path, f))]

train_movies_df = None
for f in files:
    if 'train' in f:
        print('Reading train file...', f)
        try:
            df = pd.read_csv(path + "/" + f)
        except:
            df = pd.read_csv(path + "/" + f, sep=';')
        try:
            train_movies_df = pd.concat([train_movies_df, df])
        except:
            # First file
            train_movies_df = df
            
    if 'test' in f:
        print('Reading test file...', f)
        test_movies_df = pd.read_csv(path + "/" + f)
    
    if 'validation' in f:
        print('Reading validation file...', f)
        validation_movies_df = pd.read_csv(path + "/" + f)
        
print('Train size:', train_movies_df.shape[0])
display(train_movies_df.head(10))
display(train_movies_df.info())

Reading train file... movies_info_train.csv
Reading test file... test_hidden.csv
Reading train file... train-1.csv
Reading train file... train-2.csv
Reading train file... train-3.csv
Reading train file... train-4.csv
Reading train file... train-5.csv
Reading train file... train-6.csv
Reading train file... train-7.csv
Reading train file... train-8.csv
Reading validation file... validation_hidden.csv
Train size: 15883


Unnamed: 0.1,imdb_id,adult,belongs_to_collection,budget,id,original_language,original_title,overview,popularity,production_companies,...,spoken_language_list,Unnamed: 0,tconst,primaryTitle,originalTitle,startYear,endYear,runtimeMinutes,numVotes,label
0,tt0010600,False,,0.0,48256.0,de,Die Puppe,The misadventures of an effete young man who m...,4.861,"[{'id': 12950, 'logo_path': None, 'name': 'Pro...",...,[],,,,,,,,,
1,tt0011841,False,,0.0,31509.0,en,Way Down East,A naive country girl is tricked into a sham ma...,7.617,"[{'id': 4759, 'logo_path': None, 'name': 'D.W....",...,[],,,,,,,,,
2,tt0012494,False,,0.0,29267.0,de,Der müde Tod,As a young couple stops and rests in a small v...,7.593,"[{'id': 6762, 'logo_path': None, 'name': 'Decl...",...,['de'],,,,,,,,,
3,tt0015163,False,,0.0,32318.0,en,The Navigator,The wealthy and impulsive Rollo Treadway decid...,7.881,"[{'id': 12190, 'logo_path': None, 'name': 'Bus...",...,[],,,,,,,,,
4,tt0016220,False,,0.0,964.0,en,The Phantom of the Opera,"A grotesquely disfigured composer known as ""Th...",12.83,"[{'id': 33, 'logo_path': '/8lvHyhjr8oUKOOy2dKX...",...,[],,,,,,,,,
5,tt0016630,False,,0.0,51371.0,en,Battling Butler,Meek millionaire Alfred Butler goes on a campi...,7.361,"[{'id': 12190, 'logo_path': None, 'name': 'Bus...",...,[],,,,,,,,,
6,tt0021015,False,,0.0,47695.0,en,Juno and the Paycock,"During the Irish revolution, a family earns a ...",4.232,"[{'id': 305, 'logo_path': None, 'name': 'Briti...",...,['en'],,,,,,,,,
7,tt0023973,False,,0.0,81110.0,en,The Eagle and the Hawk,The pilots of a Royal Air Force squadron in Wo...,3.45,"[{'id': 4, 'logo_path': '/fycMZt242LVjagMByZOL...",...,['en'],,,,,,,,,
8,tt0023986,False,,0.0,43599.0,en,Employees' Entrance,Kurt Anderson is the tyrannical manager of a N...,4.13,"[{'id': 3245, 'logo_path': '/9dBTQp9XitrHkx20i...",...,['en'],,,,,,,,,
9,tt0024184,False,"{'id': 259401, 'name': 'The Invisible Man Coll...",328000.0,10787.0,en,The Invisible Man,"Working in Dr. Cranley’s laboratory, scientist...",16.478,"[{'id': 33, 'logo_path': '/8lvHyhjr8oUKOOy2dKX...",...,[],,,,,,,,,


<class 'pandas.core.frame.DataFrame'>
Int64Index: 15883 entries, 0 to 992
Data columns (total 31 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   imdb_id                 7924 non-null   object 
 1   adult                   7924 non-null   object 
 2   belongs_to_collection   1082 non-null   object 
 3   budget                  7924 non-null   float64
 4   id                      7924 non-null   float64
 5   original_language       7924 non-null   object 
 6   original_title          7924 non-null   object 
 7   overview                7873 non-null   object 
 8   popularity              7924 non-null   float64
 9   production_companies    7924 non-null   object 
 10  release_date            7916 non-null   object 
 11  revenue                 7924 non-null   float64
 12  runtime                 7921 non-null   float64
 13  tagline                 4637 non-null   object 
 14  title                   7924 non-null   

None

In [3]:
df = pd.read_csv("../data/train-1.csv")
len(df)
df[df['tconst']=='tt0023986']

Unnamed: 0.1,Unnamed: 0,tconst,primaryTitle,originalTitle,startYear,endYear,runtimeMinutes,numVotes,label
8,119,tt0023986,Émplớyéés' Éntráncé,,1933,\N,75,,True


In [4]:
# GET THE WRITINGS INFORMATION:

file = f = open('../data/writing.json')
list_writings = json.load(file)

# list_2df = [[dict_['movie'], dict_['writer']] for dict_ in list_writings]
# df_writings = pd.DataFrame(list_2df, columns=['movie', 'writer'])
df_writings = pd.DataFrame(list_writings, columns=['movie', 'writer'])

print('Writings size:', df_writings.size)
print('Unique writers:', len(df_writings['writer'].unique()))
display(df_writings.head(5))

Writings size: 44856
Unique writers: 15248


Unnamed: 0,movie,writer
0,tt0003740,nm0195339
1,tt0003740,nm0515385
2,tt0003740,nm0665163
3,tt0003740,nm0758215
4,tt0008663,nm0406585


In [5]:
# A BIT OF EDA:
print('Votes mean:', train_movies_df['numVotes'].mean())
print('Votes median:', train_movies_df['numVotes'].median())
print('Votes min/max:', train_movies_df['numVotes'].min(), train_movies_df['numVotes'].max())
print('---')

train_movies_df.astype({'startYear': 'int32', 'endYear': 'int32'}, errors='ignore').dtypes


print('Min/max start year:', train_movies_df['startYear'].min(), train_movies_df['startYear'].max())
print('Min/max end year:', train_movies_df['endYear'].min(), train_movies_df['startYear'].max())

Votes mean: 29520.51081043381
Votes median: 3559.0
Votes min/max: 1001.0 2503641.0
---


TypeError: '<=' not supported between instances of 'float' and 'str'

In [6]:
# train_movies_df[train_movies_df['endYear']=='\\N'].apply(train_movies_df['startYear'])
train_movies_df.startYear.replace('\\N',df.endYear,inplace=True)

ValueError: Series.replace cannot use dict-value and non-None to_replace

In [7]:
train_movies_df['endYear']

0      NaN
1      NaN
2      NaN
3      NaN
4      NaN
      ... 
988     \N
989     \N
990     \N
991     \N
992     \N
Name: endYear, Length: 15883, dtype: object

In [None]:
def check_int(value):
    try:
        int(value)
        return int(value)
    except ValueError:
        return 0
train_movies_df['endYear'] = train_movies_df['endYear'].apply(check_int)

# Multi-label classification for genre extraction

https://imdb-api.com/en/API/Title/k_2qlsgnp9/tt0015224

['id', 'title', 'originalTitle', 'fullTitle', 'type', 'year', 'image', 'releaseDate', 'runtimeMins', 'runtimeStr', 'plot', 'plotLocal', 'plotLocalIsRtl', 'awards', 'directors', 'directorList', 'writers', 'writerList', 'stars', 'starList', 'actorList', 'fullCast', 'genres', 'genreList', 'companies', 'companyList', 'countries', 'countryList', 'languages', 'languageList', 'contentRating', 'imDbRating', 'imDbRatingVotes', 'metacriticRating', 'ratings', 'wikipedia', 'posters', 'images', 'trailer', 'boxOffice', 'tagline', 'keywords', 'keywordList', 'similars', 'tvSeriesInfo', 'tvEpisodeInfo', 'errorMessage']

In [None]:
import http.client
import mimetypes
import json

In [None]:
conn = http.client.HTTPSConnection("imdb-api.com", 443)
api_key = 'k_2qlsgnp9'
payload = ''
headers = {}

In [None]:
def get_movie_info(movie_id):
    conn.request("GET", "/en/API/Title/"+api_key+"/"+movie_id, payload, headers)
    res = conn.getresponse()
    data = res.read()
    data_json = data.decode("utf-8")
    return json.loads(data_json)

def get_genres(movie_id):
    try:
        info = get_movie_info(movie_id)
        return info['genres']
    except Exception as e:
        print(e.message, e.args)
        return e.args

In [None]:
movie_id = 'tt0015224' # Example: Peter Pan
get_genres(movie_id)

In [None]:
movies_wGenre = train_movies_df
movies_wGenre['genre'] = movies_wGenre['tconst'].apply((lambda x: get_genres(x)))
movies_wGenre.head(5)

In [None]:
display(movies_wGenre.shape[0])
display(movies_wGenre[pd.isna(movies_wGenre['genre'])].shape[0])

In [None]:
list_genres = movies_wGenre[[not i for i in pd.isna(movies_wGenre['genre'])]]['genre']
gens = [gen.strip() for movie in list_genres for gen in movie.split(',')]
print('Num of genres: ', len(set(gens)))
set(gens)

## Genre from another API

https://api.themoviedb.org/3/genre/movie/list?api_key=40f5afa6d3b4da3ef1e32a4e4bb3fe20

In [8]:
import http.client
import mimetypes
import json

In [9]:
conn = http.client.HTTPSConnection("api.themoviedb.org")
api_key = '40f5afa6d3b4da3ef1e32a4e4bb3fe20'
payload = ''
headers = {
    'Content-Type': 'application/json;charset=utf-8',
    'Authorization': 'Bearer <<access_token>>'
}

In [28]:
conn.request("GET", "/3/movie/tt0015224?api_key="+api_key, payload, headers)
res = conn.getresponse()
data = res.read()
data_json = data.decode("utf-8")
json.loads(data_json)

{'adult': False,
 'backdrop_path': '/43vkdSbAd3n8c0RyOi8KR9UTEoY.jpg',
 'belongs_to_collection': None,
 'budget': 0,
 'genres': [{'id': 10751, 'name': 'Family'},
  {'id': 12, 'name': 'Adventure'},
  {'id': 14, 'name': 'Fantasy'}],
 'homepage': '',
 'id': 120672,
 'imdb_id': 'tt0015224',
 'original_language': 'en',
 'original_title': 'Peter Pan',
 'overview': 'Peter Pan enters the nursery of the Darling children and, with the help of fairy dust, leads them off to Never Never Land, where they meet the nefarious Captain Hook.',
 'popularity': 2.732,
 'poster_path': '/3JJVzUGwq3KXkBelai6tykQ33fl.jpg',
 'production_companies': [{'id': 29729,
   'logo_path': None,
   'name': 'Famous Players-Lasky Corporation',
   'origin_country': 'US'},
  {'id': 4,
   'logo_path': '/fycMZt242LVjagMByZOLUGbCvv3.png',
   'name': 'Paramount',
   'origin_country': 'US'}],
 'production_countries': [{'iso_3166_1': 'US',
   'name': 'United States of America'}],
 'release_date': '1924-12-29',
 'revenue': 0,
 'runti

In [29]:
def get_movie_info(movie_id):
    conn.request("GET", "/3/movie/"+movie_id+"?api_key="+api_key, payload, headers)
    res = conn.getresponse()
    data = res.read()
    data_json = data.decode("utf-8")
    return json.loads(data_json)

get_movie_info('tt0015224')

{'adult': False,
 'backdrop_path': '/43vkdSbAd3n8c0RyOi8KR9UTEoY.jpg',
 'belongs_to_collection': None,
 'budget': 0,
 'genres': [{'id': 10751, 'name': 'Family'},
  {'id': 12, 'name': 'Adventure'},
  {'id': 14, 'name': 'Fantasy'}],
 'homepage': '',
 'id': 120672,
 'imdb_id': 'tt0015224',
 'original_language': 'en',
 'original_title': 'Peter Pan',
 'overview': 'Peter Pan enters the nursery of the Darling children and, with the help of fairy dust, leads them off to Never Never Land, where they meet the nefarious Captain Hook.',
 'popularity': 2.732,
 'poster_path': '/3JJVzUGwq3KXkBelai6tykQ33fl.jpg',
 'production_companies': [{'id': 29729,
   'logo_path': None,
   'name': 'Famous Players-Lasky Corporation',
   'origin_country': 'US'},
  {'id': 4,
   'logo_path': '/fycMZt242LVjagMByZOLUGbCvv3.png',
   'name': 'Paramount',
   'origin_country': 'US'}],
 'production_countries': [{'iso_3166_1': 'US',
   'name': 'United States of America'}],
 'release_date': '1924-12-29',
 'revenue': 0,
 'runti

In [30]:
info_dict = {}
count = 0
for row in validation_movies_df.iterrows():
    movie_id = row[1]['tconst']
    info_dict[movie_id] = get_movie_info(movie_id)
    count +=1 
    if count%10 == 0:
        print(count)
info_dict

10
20
30
40
50
60
70
80
90
100
110
120
130
140
150
160
170
180
190
200
210
220
230
240
250
260
270
280
290
300
310
320
330
340
350
360
370
380
390
400
410
420
430
440
450
460
470
480
490
500
510
520
530
540
550
560
570
580
590
600
610
620
630
640
650
660
670
680
690
700
710
720
730
740
750
760
770
780
790
800
810
820
830
840
850
860
870
880
890
900
910
920
930
940
950


{'tt0003740': {'adult': False,
  'backdrop_path': '/kv7Cnk2FZzFzdbTMVPjMrOhSJdH.jpg',
  'belongs_to_collection': None,
  'budget': 0,
  'genres': [{'id': 18, 'name': 'Drama'}, {'id': 36, 'name': 'History'}],
  'homepage': '',
  'id': 5153,
  'imdb_id': 'tt0003740',
  'original_language': 'it',
  'original_title': 'Cabiria',
  'overview': "Young Cabiria is kidnapped by pirates and sold as a slave in Carthage. Just as she's to be sacrificed to Moloch, Cabiria is rescued by Fulvius Axilla, a good-hearted Roman spy, and his powerful slave, Maciste. The trio are broken up as Cabiria is entrusted to a woman of noble birth. With Cabiria's fate unknown, Maciste punished for his heroism, and Fulvius sent away to fight for Rome, is there any hope of our heroes reuniting?",
  'popularity': 7.108,
  'poster_path': '/oZhgcHHapkzYmB0OhQcRfZ7LdLM.jpg',
  'production_companies': [{'id': 14962,
    'logo_path': None,
    'name': 'Italia Film',
    'origin_country': ''}],
  'production_countries': [{'is

In [31]:
info_dict_validation = info_dict

In [32]:
# Save the raw data as json
json_object = json.dumps(info_dict_validation)
with open("info_movies_val_json2.json", "w") as outfile:
    outfile.write(json_object)

In [33]:
# Read the downloaded json file
file_name = 'info_movies_val_json2.json'

with open(file_name) as json_file:
    movies_info = json.load(json_file)

In [35]:
# Convert to dataframe:
list_movies_info = [val for val in movies_info.values()]
df_movies_info = pd.DataFrame(list_movies_info)

# Delete empty rows:
df_movies_info.dropna(subset=['imdb_id'], inplace=True)
# Set imdb_id as index:
df_movies_info = df_movies_info.set_index('imdb_id')
df_movies_info.shape

(948, 27)

In [36]:
# GET GENRES:
def get_genres(list_genres):
    try:
        list_processed = [gen['name'] for gen in list_genres]
        return list_processed
    except:
        return list_genres

df_movies_info['genre_list'] = df_movies_info['genres'].apply((lambda x: get_genres(x)))

In [37]:
# GET PRODUCTION COMPANIES
def get_production_names(list_production):
    try:
        list_processed = [prod['name'] for prod in list_production]
        return list_processed
    except:
        return list_production

df_movies_info['production_list'] = df_movies_info['production_companies'].apply((lambda x: get_production_names(x)))

In [38]:
# GET PRODUCTION COUNTRIES
def get_production_countries(list_production):
    try:
        list_processed = [prod['iso_3166_1'] for prod in list_production]
        return list_processed
    except:
        return list_production


df_movies_info['production_countr_list'] = df_movies_info['production_countries'].apply((lambda x: get_production_countries(x)))

In [39]:
# GET SPOKEN LANGUAGE:
def get_spoken_language(list_language):
    try:
        list_processed = [lan['iso_639_1'] for lan in list_language if lan['iso_639_1'] != 'xx' ]
        return list_processed
    except:
        return list_production
    
df_movies_info['spoken_language_list'] = df_movies_info['spoken_languages'].apply((lambda x: get_spoken_language(x)))

In [40]:
# REMOVE UNNECESSARY COLUMNS:
df_movies_info = df_movies_info.drop(columns=['success', 'status_code', 'status_message'])
df_movies_info = df_movies_info.drop(columns=['backdrop_path', 'poster_path', 'homepage', 'status'])

# REMOVE PROCESSED COLUMNS:
df_movies_info = df_movies_info.drop(columns=['production_countries', 'spoken_languages', 'genres'])

In [41]:
df_movies_info.head(5)

Unnamed: 0_level_0,adult,belongs_to_collection,budget,id,original_language,original_title,overview,popularity,production_companies,release_date,...,runtime,tagline,title,video,vote_average,vote_count,genre_list,production_list,production_countr_list,spoken_language_list
imdb_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
tt0003740,False,,0.0,5153.0,it,Cabiria,Young Cabiria is kidnapped by pirates and sold...,7.108,"[{'id': 14962, 'logo_path': None, 'name': 'Ita...",1914-06-01,...,127.0,All Nations Bow to This - The Greatest Spectac...,Cabiria,False,7.2,101.0,"[Drama, History]",[Italia Film],[IT],[]
tt0008663,False,,0.0,108017.0,sv,Terje Vigen,"Terje Vigen, a sailor, suffers the loss of his...",4.073,"[{'id': 17620, 'logo_path': '/e1EieGeoDpBCx8Qd...",1917-01-29,...,56.0,,A Man There Was,False,7.1,49.0,[Drama],[Svenska Biografteatern],[SE],[]
tt0010307,False,,0.0,70804.0,fr,J'accuse,"The story of two men, one married, the other t...",4.24,"[{'id': 21594, 'logo_path': None, 'name': 'Pat...",1919-04-25,...,165.0,,I Accuse,False,7.5,34.0,"[War, Drama, History, Romance]",[Pathé Frères],[FR],[]
tt0014429,False,,121000.0,22596.0,en,Safety Last!,When a store clerk organizes a contest to clim...,12.053,"[{'id': 2159, 'logo_path': None, 'name': 'Hal ...",1923-04-01,...,74.0,"You're Going to Explode With ""Safety Laughs"" w...",Safety Last!,False,7.9,334.0,"[Comedy, Romance, Action]",[Hal Roach Studios],[US],[]
tt0015175,False,"{'id': 158038, 'name': 'Die Nibelungen Collect...",0.0,31506.0,de,Die Nibelungen: Siegfried,"Siegfried, son of King Siegmund of Xanten, tra...",9.257,"[{'id': 12372, 'logo_path': None, 'name': 'UFA...",1924-02-14,...,148.0,,Die Nibelungen: Siegfried,False,7.7,89.0,"[Adventure, Drama, Fantasy]","[UFA, Decla-Bioscop]",[DE],[]


In [42]:
df_movies_info.to_csv('movies_info_val.csv', sep = ';')  

# Influence of startYear

In [None]:
# train_moviesa
movies_true = train_movies_df[train_movies_df['label']==True]
movies_false = train_movies_df[train_movies_df['label']==False]

count_perYear_true = movies_true.groupby(by="startYear").count()[['label']]
count_perYear_true = count_perYear_true.rename(columns={"label": "True"})
count_perYear_false = movies_false.groupby(by="startYear").count()[['label']]
count_perYear_false = count_perYear_false.rename(columns={"label": "False"})

result = pd.concat([count_perYear_true, count_perYear_false], axis=1)
result = result.fillna(0)
result['Total'] = result['True'] + result['False']
result['RatioTrue'] = (result['True'] / result['Total'])*100

result = result.drop(['\\N'], axis=0)
result.index = result.index.map(int) 
result = result.sort_index()

result['RatioTrue'].plot.line(legend=True)
result['Total'].plot.line(legend=True)

# MLlib

## Gradient Boosting Trees

In [None]:
from pyspark.ml import Pipeline
from pyspark.ml.classification import GBTClassifier
from pyspark.ml.feature import StringIndexer, VectorIndexer
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

In [None]:
# Load and parse the data file, converting it to a DataFrame.
data = spark.read.format("libsvm").load("iris_scale.txt")
data

In [None]:

# Index labels, adding metadata to the label column.
# Fit on whole dataset to include all labels in index.
labelIndexer = StringIndexer(inputCol="label", outputCol="indexedLabel").fit(data)
# Automatically identify categorical features, and index them.
# Set maxCategories so features with > 4 distinct values are treated as continuous.
featureIndexer =\
    VectorIndexer(inputCol="features", outputCol="indexedFeatures", maxCategories=4).fit(data)

# Split the data into training and test sets (30% held out for testing)
(trainingData, testData) = data.randomSplit([0.7, 0.3])

# Train a GBT model.
gbt = GBTClassifier(labelCol="indexedLabel", featuresCol="indexedFeatures", maxIter=10)

# Chain indexers and GBT in a Pipeline
pipeline = Pipeline(stages=[labelIndexer, featureIndexer, gbt])

# Train model.  This also runs the indexers.
model = pipeline.fit(trainingData)

# Make predictions.
predictions = model.transform(testData)

# Select example rows to display.
predictions.select("prediction", "indexedLabel", "features").show(5)

# Select (prediction, true label) and compute test error
evaluator = MulticlassClassificationEvaluator(
    labelCol="indexedLabel", predictionCol="prediction", metricName="accuracy")
accuracy = evaluator.evaluate(predictions)
print("Test Error = %g" % (1.0 - accuracy))

gbtModel = model.stages[2]
print(gbtModel)  # summary only

## One vs rest

In [None]:
from pyspark.ml.classification import LogisticRegression, OneVsRest
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

In [None]:
# load data file.
inputData = spark.read.format("libsvm") \
    .load("data/mllib/sample_multiclass_classification_data.txt")

In [None]:
# generate the train/test split.
(train, test) = inputData.randomSplit([0.8, 0.2])

# instantiate the base classifier.
lr = LogisticRegression(maxIter=10, tol=1E-6, fitIntercept=True)

# instantiate the One Vs Rest Classifier.
ovr = OneVsRest(classifier=lr)

# train the multiclass model.
ovrModel = ovr.fit(train)

# score the model on test data.
predictions = ovrModel.transform(test)

# obtain evaluator.
evaluator = MulticlassClassificationEvaluator(metricName="accuracy")

# compute the classification error on test data.
accuracy = evaluator.evaluate(predictions)
print("Test Error = %g" % (1.0 - accuracy))