# Datamart
This notebook restricts the data to movies from the last 10 years (2010–2019) and to all data linked to this perimeter.

In [1]:
import pandas as pd
import numpy as np
import os
import pyspark
import pyspark.sql.functions as F

In [2]:
# Root folder of the parsed IMDb parquet tables read by this notebook
# (path is relative to the notebook's working directory).
dir_parsed_imdb = "../data/parsed_data/imdb.db"
# Root folder of the datamart; the restricted tables are written under it.
dir_edited_datamart = "../data/edited_data/datamart.db"

In [4]:
# Stop a SparkContext left over from a previous run of this notebook, if any,
# before a new one is created.
if "sc" in globals():
    sc.stop()

In [3]:
# Reuse the active SparkContext when there is one, otherwise create it.
# `getOrCreate` is a classmethod: calling it on `SparkContext()` would first
# construct a brand-new context and raise if one is already running.
sc = pyspark.SparkContext.getOrCreate()
# SQL entry point used for every parquet read below.
spark = pyspark.sql.SparkSession(sc)

Exception: Java gateway process exited before sending its port number

#### perimeter

In [5]:
# Load the full parsed title_basics table.
title_basics = spark.read.format("parquet").load(os.path.join(dir_parsed_imdb, "title_basics"))

In [6]:
# Perimeter = titles typed "movie" whose start year lies in 2010-2019
# (both bounds included); only the title identifier is kept.
is_movie = F.col("titleType") == "movie"
in_period = F.col("startYear").between(2010, 2019)
perimeter = title_basics.where(is_movie & in_period).select("tconst")

In [7]:
# Number of titles in the perimeter.
perimeter_size = perimeter.count()
print(perimeter_size)

150783


In [8]:
# Persist the perimeter identifiers, replacing any previous export.
perimeter.write.mode("overwrite").parquet(os.path.join(dir_edited_datamart, "perimeter"))

#### link.name.basics.title

In [9]:
# Reload the saved perimeter and load the parsed name <-> title link table.
perimeter = spark.read.format("parquet").load(os.path.join(dir_edited_datamart, "perimeter"))
link_name_basics_titles = spark.read.format("parquet").load(
    os.path.join(dir_parsed_imdb, "link_name_basics_titles")
)

In [10]:
# Step 1: distinct, non-null names linked to at least one perimeter title.
names_in_perimeter = (
    link_name_basics_titles
    .join(perimeter.select("tconst"), on=["tconst"], how="inner")
    .select("nconst")
    .na.drop()
    .distinct()
)
# Step 2: keep every link row (perimeter title or not) of those names.
link_name_basics_titles = names_in_perimeter.join(
    link_name_basics_titles, on=["nconst"], how="inner"
)

In [11]:
# Sanity check: row count, then a five-row preview.
link_row_count = link_name_basics_titles.count()
print(link_row_count)
link_name_basics_titles.show(n=5)

4696606
+---------+---------+
|   nconst|   tconst|
+---------+---------+
|nm0000198|tt4555426|
|nm0000198|tt1340800|
|nm0000198|tt0468569|
|nm0000198|tt0103874|
|nm0000354|tt3659388|
+---------+---------+
only showing top 5 rows



In [12]:
# Persist the restricted link table, replacing any previous export.
link_name_basics_titles.write.mode("overwrite").parquet(
    os.path.join(dir_edited_datamart, "link_name_basics_titles")
)

#### tconst (title identifiers linked to the perimeter)

In [13]:
# Distinct title identifiers present in the restricted link table.
link_name_basics_titles = spark.read.format("parquet").load(
    os.path.join(dir_edited_datamart, "link_name_basics_titles")
)
tconst = link_name_basics_titles.select("tconst").distinct()
tconst_count = tconst.count()
print(tconst_count)

669916


In [14]:
# Persist the title identifiers, replacing any previous export.
tconst.write.mode("overwrite").parquet(os.path.join(dir_edited_datamart, "tconst"))

#### nconst (name identifier for the perimeter)

In [10]:
# Distinct name identifiers present in the restricted link table.
link_name_basics_titles = spark.read.format("parquet").load(
    os.path.join(dir_edited_datamart, "link_name_basics_titles")
)
nconst = link_name_basics_titles.select("nconst").distinct()
nconst_count = nconst.count()
print(nconst_count)

1959210


In [11]:
# Persist the name identifiers, replacing any previous export.
nconst.write.mode("overwrite").parquet(os.path.join(dir_edited_datamart, "nconst"))

#### title.basics

In [15]:
# Reload the saved title identifiers. The name must be `tconst` because the
# join in the next cell reads `tconst`; the previous spelling (`tcsont`) left
# that join relying on a variable from an earlier cell.
tconst = spark.read.parquet(os.path.join(dir_edited_datamart, "tconst"))

In [16]:
# Keep only the title_basics rows whose identifier is in `tconst`.
# NOTE(review): `title_basics` is expected to still be the frame loaded near
# the top of the notebook; it is not re-read in this section.
title_basics = title_basics.join(tconst, on=["tconst"], how="inner")

In [17]:
# Sanity check: row count, then a five-row preview.
title_basics_rows = title_basics.count()
print(title_basics_rows)
title_basics.show(n=5)

669562
+---------+---------+--------------------+--------------------+-------+---------+-------+--------------+--------------+
|   tconst|titleType|        primaryTitle|       originalTitle|isAdult|startYear|endYear|runtimeMinutes|        genres|
+---------+---------+--------------------+--------------------+-------+---------+-------+--------------+--------------+
|tt0010060|    movie|The Delicious Lit...|The Delicious Lit...|      0|     1919|   null|            63|  Comedy,Drama|
|tt0026930|    short|   Riders to the Sea|   Riders to the Sea|      0|     1935|   null|            40|         Short|
|tt0032968|    movie|The Ramparts We W...|The Ramparts We W...|      0|     1940|   null|            99|         Drama|
|tt0037961|    movie|        Oregon Trail|        Oregon Trail|      0|     1945|   null|            55|       Western|
|tt0041676|    movie|      Mickey Magnate|        Mágnás Miska|      0|     1949|   null|            95|Comedy,Musical|
+---------+---------+------------

In [18]:
# Persist the restricted title_basics table, replacing any previous export.
title_basics.write.mode("overwrite").parquet(
    os.path.join(dir_edited_datamart, "title_basics")
)

#### title.akas

In [19]:
# Reload the title identifiers and load the parsed title_akas table.
tconst = spark.read.format("parquet").load(os.path.join(dir_edited_datamart, "tconst"))
title_akas = spark.read.format("parquet").load(os.path.join(dir_parsed_imdb, "title_akas"))

In [20]:
# Print the title_basics column names and types.
# NOTE(review): presumably shown here to compare its key column (`tconst`)
# with the title_akas key printed in the next cell — confirm.
title_basics.printSchema()

root
 |-- tconst: string (nullable = true)
 |-- titleType: string (nullable = true)
 |-- primaryTitle: string (nullable = true)
 |-- originalTitle: string (nullable = true)
 |-- isAdult: integer (nullable = true)
 |-- startYear: integer (nullable = true)
 |-- endYear: integer (nullable = true)
 |-- runtimeMinutes: string (nullable = true)
 |-- genres: string (nullable = true)



In [21]:
# Print the title_akas column names and types; the printed schema shows the
# title key column is named `titleId` in this table.
title_akas.printSchema()

root
 |-- titleId: string (nullable = true)
 |-- ordering: integer (nullable = true)
 |-- title: string (nullable = true)
 |-- region: string (nullable = true)
 |-- language: string (nullable = true)
 |-- types: string (nullable = true)
 |-- attributes: string (nullable = true)
 |-- isOriginalTitle: integer (nullable = true)



In [22]:
# title_akas names its key `titleId`, so the identifiers are renamed on the
# fly before the inner join that restricts the table.
title_ids = tconst.select(F.col("tconst").alias("titleId"))
title_akas = title_akas.join(title_ids, on=["titleId"], how="inner")

In [23]:
# Persist the restricted title_akas table, replacing any previous export.
title_akas.write.mode("overwrite").parquet(
    os.path.join(dir_edited_datamart, "title_akas")
)

#### title.ratings

In [24]:
# Reload the title identifiers and load the parsed title_ratings table.
tconst = spark.read.format("parquet").load(os.path.join(dir_edited_datamart, "tconst"))
title_ratings = spark.read.format("parquet").load(os.path.join(dir_parsed_imdb, "title_ratings"))

In [25]:
# Keep only the ratings whose title identifier is in `tconst`.
title_ratings = title_ratings.join(tconst, on=["tconst"], how="inner")

In [26]:
# Persist the restricted title_ratings table, replacing any previous export.
title_ratings.write.mode("overwrite").parquet(
    os.path.join(dir_edited_datamart, "title_ratings")
)

#### name.basics

In [12]:
# Reload the name identifiers and load the parsed name_basics table.
nconst = spark.read.format("parquet").load(os.path.join(dir_edited_datamart, "nconst"))
name_basics = spark.read.format("parquet").load(os.path.join(dir_parsed_imdb, "name_basics"))

In [13]:
# Keep only the name_basics rows whose identifier is in `nconst`.
name_basics = name_basics.join(nconst, on=["nconst"], how="inner")

In [14]:
# Sanity check: row count, then a five-row preview.
name_basics_rows = name_basics.count()
print(name_basics_rows)
name_basics.show(n=5)

1959210
+---------+-------------------+---------+---------+--------------------+--------------------+
|   nconst|        primaryName|birthYear|deathYear|   primaryProfession|      knownForTitles|
+---------+-------------------+---------+---------+--------------------+--------------------+
|nm0000198|        Gary Oldman|     1958|     null|actor,soundtrack,...|tt4555426,tt13408...|
|nm0000354|         Matt Damon|     1970|     null|producer,actor,so...|tt3659388,tt01192...|
|nm0002222|Christopher Bradley|     1961|     null|actor,director,wr...|tt0109503,tt01047...|
|nm0002481|      Jim Greenhorn|     null|     null|    sound_department|tt1478964,tt01176...|
|nm0002941|       Kevin Loader|     1956|     null|producer,director...|tt1266029,tt12267...|
+---------+-------------------+---------+---------+--------------------+--------------------+
only showing top 5 rows



In [15]:
# Persist the restricted name_basics table, replacing any previous export.
name_basics.write.mode("overwrite").parquet(
    os.path.join(dir_edited_datamart, "name_basics")
)

#### title.crew 

In [27]:
# Reload the title identifiers from the datamart as a Spark DataFrame, like
# every other Spark section of this notebook. The previous line had an
# unbalanced ")", pointed at the parsed_data folder instead of the datamart,
# and produced a pandas frame that the Spark join in the next cell cannot use.
tconst = spark.read.parquet(os.path.join(dir_edited_datamart, "tconst"))
# Parsed title_crew table to be restricted.
title_crew = spark.read.parquet(os.path.join(dir_parsed_imdb, "title_crew"))

In [28]:
# Keep only the crew rows whose title identifier is in `tconst`.
title_crew = title_crew.join(tconst, on=["tconst"], how="inner")

In [29]:
# Persist the restricted title_crew table, replacing any previous export.
title_crew.write.mode("overwrite").parquet(
    os.path.join(dir_edited_datamart, "title_crew")
)

#### link.title.directors

In [30]:
# Reload the title identifiers and load the parsed title <-> director links.
tconst = spark.read.format("parquet").load(os.path.join(dir_edited_datamart, "tconst"))
link_title_directors = spark.read.format("parquet").load(
    os.path.join(dir_parsed_imdb, "link_title_directors")
)

In [31]:
# Keep only the director links whose title identifier is in `tconst`.
link_title_directors = link_title_directors.join(tconst, on=["tconst"], how="inner")

In [32]:
# Persist the restricted director links, replacing any previous export.
link_title_directors.write.mode("overwrite").parquet(
    os.path.join(dir_edited_datamart, "link_title_directors")
)

#### title.principals

In [4]:
# This section uses pandas rather than Spark. The paths are built from the
# shared directory constants, like the other sections, instead of being
# repeated as literals.
title_principals = pd.read_parquet(os.path.join(dir_parsed_imdb, "title_principals"))
# Title identifiers previously exported to the datamart.
tconst = pd.read_parquet(os.path.join(dir_edited_datamart, "tconst"))

In [5]:
# Non-null value count of each column before the restriction.
title_principals.count(axis="index")

tconst        35314063
ordering      35314063
nconst        35314063
category      35314063
job            5818858
characters    17998270
dtype: int64

In [6]:
# Index both frames by title identifier, then inner-join on that index so
# that only principals of titles present in `tconst` remain. The result keeps
# `tconst` as its index.
principals_by_title = title_principals.set_index("tconst")
kept_titles = tconst.set_index("tconst")
title_principals = principals_by_title.join(kept_titles, how="inner")

In [7]:
# Non-null value count of each column after the restriction, then a preview
# of the first five rows.
principals_non_null = title_principals.count()
print(principals_non_null)
title_principals.head(n=5)

ordering      4947376
nconst        4947376
category      4947376
job            690475
characters    2321269
dtype: int64


Unnamed: 0_level_0,ordering,nconst,category,job,characters
tconst,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
tt0000038,1,nm2960633,self,,"[""Himself - First baseman""]"
tt0000038,2,nm0340719,self,,"[""Himself""]"
tt0000038,3,nm2958402,self,,"[""Himself""]"
tt0000038,4,nm0374658,director,,
tt0000417,1,nm0617588,actor,,"[""Prof. Barbenfouillis"",""The Moon""]"


In [9]:
# Write the restricted principals to the datamart. The destination is built
# from the shared datamart directory constant instead of a repeated literal.
title_principals.to_parquet(os.path.join(dir_edited_datamart, "title_principals"))