In [9]:
from pyspark import SparkConf
from pyspark.sql import SparkSession
import pyspark.sql.types as t

<details><summary>Click here for the dataset description</summary>

## **Dataset Description (IMDb dataset)**

### **Files**
* **name.basics.tsv** - 
* **title.akas.tsv** - 
* **title.basics.tsv** - 
* **title.crew.tsv** - 
* **title.episode.tsv** - 
* **title.principals.tsv** - 
* **title.ratings.tsv** - 

### **name.basics.tsv**

|      Name         |    Type          |   Description                                     |
| -------------     | ---------------- | ------------------------------------------------- |
|     nconst        |   string         | alphanumeric unique identifier of the name/person |
|   primaryName     | string           | name by which the person is most often credited   |
|     birthYear     | in YYYY format   | birth year                                        |
|     deathYear     | in YYYY format   | if applicable, else '\N'                          |
| primaryProfession | array of strings | the top-3 professions of the person               |
|   knownForTitles  | array of tconsts | titles the person is known for                    |

### **title.akas.tsv**

|      Name       |    Type   |   Description    |
| -------------   | --------- | ---------------- |
| titleId         | string    | a tconst, an alphanumeric unique identifier of the title |
| ordering        | integer   | a number to uniquely identify rows for a given titleId |
| title           | string    | the localized title |
| region          | string    | the region for this version of the title |
| language        | string    | the language of the title |
| types           | array     | Enumerated set of attributes for this alternative title. One or more of the following: "alternative", "dvd", "festival", "tv", "video", "working", "original", "imdbDisplay". New values may be added in the future without warning |
| attributes      | array     | Additional terms to describe this alternative title, not enumerated |
| isOriginalTitle | boolean   | 0: not original title; 1: original title |

### **title.basics.tsv**

|      Name      |    Type      |   Description   |
| -------------  | ------------ | ----------------|
| tconst         | string       | alphanumeric unique identifier of the title |
| titleType      | string       | the type/format of the title (e.g. movie, short, tvseries, tvepisode, video, etc) |
| primaryTitle   | string       | the more popular title / the title used by the filmmakers on promotional materials at the point of release |
| originalTitle  | string       | original title, in the original language |
| isAdult        | boolean      | 0: non-adult title; 1: adult title |
| startYear      | YYYY         | represents the release year of a title. In the case of TV Series, it is the series start year |
| endYear        | YYYY         | TV Series end year. ‘\N’ for all other title types |
| runtimeMinutes | integer      | primary runtime of the title, in minutes |
| genres         | string array | includes up to three genres associated with the title |

### **title.crew.tsv**

|      Name     |    Type          |   Description                               |
| ------------- | ---------------  | ------------------------------------------- |
| tconst        | string           | alphanumeric unique identifier of the title |
| directors     | array of nconsts | director(s) of the given title              |
| writers       | array of nconsts | writer(s) of the given title                |

### **title.episode.tsv**

|      Name     |    Type   |   Description                                   |
| ------------- | --------- | ----------------------------------------------- |
| tconst        | string    | alphanumeric identifier of episode              |
| parentTconst  | string    | alphanumeric identifier of the parent TV Series |
| seasonNumber  | integer   | season number the episode belongs to            |
| episodeNumber | integer   | episode number of the tconst in the TV series   |

### **title.principals.tsv**

|      Name     |    Type   |   Description                                             |
| ------------- | --------- | --------------------------------------------------------- |
| tconst        | string    | alphanumeric unique identifier of the title               |
| ordering      | integer   | a number to uniquely identify rows for a given titleId    |
| nconst        | string    | alphanumeric unique identifier of the name/person         |
| category      | string    | the category of job that person was in                    |
| job           | string    | the specific job title if applicable, else '\N'           |
| characters    | string    | the name of the character played if applicable, else '\N' |

### **title.ratings.tsv**

|      Name     |    Type   |   Description                                        |
| ------------- | --------- | ---------------------------------------------------- |
| tconst        | string    | alphanumeric unique identifier of the title          |
| averageRating | float     | weighted average of all the individual user ratings  |
| numVotes      |  integer  | number of votes the title has received               |
    
</details>

In [2]:
path = 'C:/files/grid-dynamics'

In [3]:
spark_session = (
    SparkSession.builder
        .master('local')
        .appName('project')
        .config(conf=SparkConf())
        .getOrCreate()
)

In [25]:
name_basics_schema = t.StructType([
    t.StructField('nconst', t.StringType(), True),
    t.StructField('primaryName', t.StringType(), True),
    t.StructField('birthYear', t.DateType(), True),
    t.StructField('deathYear', t.DateType(), True),
    t.StructField('primaryProfession', t.StringType(), True),
    t.StructField('knownForTitles', t.StringType(), True)
])

akas_schema = t.StructType([
    t.StructField('titleId', t.StringType(), True),
    t.StructField('ordering', t.IntegerType(), True),
    t.StructField('title', t.StringType(), True),
    t.StructField('region', t.StringType(), True),
    t.StructField('language', t.StringType(), True),
    t.StructField('types', t.StringType(), True),
    t.StructField('attributes', t.StringType(), True),
    t.StructField('isOriginalTitle', t.BooleanType(), True)
])

title_basics_schema = t.StructType([
    t.StructField('tconst', t.StringType(), True),
    t.StructField('titleType', t.StringType(), True),
    t.StructField('primaryTitle', t.StringType(), True),
    t.StructField('originalTitle', t.StringType(), True),
    t.StructField('isAdult', t.BooleanType(), True),
    t.StructField('startYear', t.DateType(), True),
    t.StructField('endYear', t.DateType(), True),
    t.StructField('runtimeMinutes', t.IntegerType(), True),
    t.StructField('genres', t.StringType(), True),
])

crew_schema = t.StructType([
    t.StructField('tconst', t.StringType(), True),
    t.StructField('directors', t.StringType(), True),
    t.StructField('writers', t.StringType(), True),
])

episode_schema = t.StructType([
    t.StructField('tconst', t.StringType(), True),
    t.StructField('parentTconst', t.StringType(), True),
    t.StructField('seasonNumber', t.IntegerType(), True),
    t.StructField('episodeNumber', t.IntegerType(), True),
])

principals_schema = t.StructType([
    t.StructField('tconst', t.StringType(), True),
    t.StructField('ordering', t.IntegerType(), True),
    t.StructField('nconst', t.StringType(), True),
    t.StructField('category', t.StringType(), True),
    t.StructField('job', t.StringType(), True),
    t.StructField('characters', t.StringType(), True)
])

ratings_schema = t.StructType([
    t.StructField('tconst', t.StringType(), True),
    t.StructField('averageRating', t.FloatType(), True),
    t.StructField('numVotes', t.IntegerType(), True)
])

In [31]:
name_basics = spark_session.read.csv(f'{path}/name.basics.tsv', 
                                    sep=r'\t', 
                                    header=True,
                                    nullValue='null',
                                    schema=name_basics_schema,
                                    dateFormat='yyyy')

akas = spark_session.read.csv(f'{path}/title.akas.tsv', 
                                    sep=r'\t', 
                                    header=True,
                                    nullValue='null',
                                    schema=akas_schema)

title_basics = spark_session.read.csv(f'{path}/title.basics.tsv', 
                                    sep=r'\t', 
                                    header=True,
                                    nullValue='null',
                                    schema=title_basics_schema,
                                    dateFormat='yyyy')

crew = spark_session.read.csv(f'{path}/title.crew.tsv', 
                                    sep=r'\t', 
                                    header=True,
                                    nullValue='null',
                                    schema=crew_schema)

episode = spark_session.read.csv(f'{path}/title.episode.tsv', 
                                    sep=r'\t', 
                                    header=True,
                                    nullValue='null',
                                    schema=episode_schema)

principals = spark_session.read.csv(f'{path}/title.principals.tsv', 
                                    sep=r'\t', 
                                    header=True,
                                    nullValue='null',
                                    schema=principals_schema)

ratings = spark_session.read.csv(f'{path}/title.ratings.tsv', 
                                    sep=r'\t', 
                                    header=True,
                                    nullValue='null',
                                    schema=ratings_schema)

In [32]:
principals.show()

+---------+--------+---------+---------------+--------------------+--------------+
|   tconst|ordering|   nconst|       category|                 job|    characters|
+---------+--------+---------+---------------+--------------------+--------------+
|tt0000001|       1|nm1588970|           self|                  \N|      ["Self"]|
|tt0000001|       2|nm0005690|       director|                  \N|            \N|
|tt0000001|       3|nm0374658|cinematographer|director of photo...|            \N|
|tt0000002|       1|nm0721526|       director|                  \N|            \N|
|tt0000002|       2|nm1335271|       composer|                  \N|            \N|
|tt0000003|       1|nm0721526|       director|                  \N|            \N|
|tt0000003|       2|nm1770680|       producer|            producer|            \N|
|tt0000003|       3|nm1335271|       composer|                  \N|            \N|
|tt0000003|       4|nm5442200|         editor|                  \N|            \N|
|tt0