# Przygotowanie danych

## Importy

In [1]:
import os
import re
import ast
import requests
import datetime

import pandas as pd
import numpy as np

from tqdm import tqdm
from uuid import uuid4
from flag import flag
from sqlalchemy import create_engine

## Stałe

In [2]:
# --- Local directory layout -------------------------------------------------
MKDIR_DATA = 'data/'
MKDIR_IMAGE = 'image/'

# One sub-folder per kind of downloaded image.
MKDIR_MOVIES = os.path.join(MKDIR_IMAGE, 'movies')
MKDIR_COLLECTIONS = os.path.join(MKDIR_IMAGE, 'collections')
MKDIR_PEOPLE = os.path.join(MKDIR_IMAGE, 'people')

# --- Input CSV files (expected inside MKDIR_DATA) ---------------------------
MOVIES_CSV = os.path.join(MKDIR_DATA, 'movies_metadata.csv')
KEYWORDS_CSV = os.path.join(MKDIR_DATA, 'keywords.csv')
CREDITS_CSV = os.path.join(MKDIR_DATA, 'credits.csv')
RATINGS_CSV = os.path.join(MKDIR_DATA, 'ratings.csv')
LINKS_CSV = os.path.join(MKDIR_DATA, 'links.csv')

# --- TMDB API ---------------------------------------------------------------
# SECURITY: secrets should not live in the notebook. Each secret below can be
# supplied through an environment variable; the previous literal is kept only
# as a fallback so existing runs behave exactly as before.
API_KEY = os.environ.get('TMDB_API_KEY', 'API KEY')
URL_API = 'https://api.themoviedb.org/3/'

URL_IMAGE = 'http://image.tmdb.org/t/p/original/'

# Per-folder download budget, in GiB (compared against bytes * 1024**3).
MAX_FOLDER_IMAGE_GB = 0.1

# --- Database connection ----------------------------------------------------
DB_HOST = os.environ.get('WEBFILM_DB_HOST', '127.0.0.1')
DB_NAME = os.environ.get('WEBFILM_DB_NAME', 'webfilm')
DB_USER = os.environ.get('WEBFILM_DB_USER', 'webfilm')
# NOTE(review): this fallback password is committed in clear text; rotate it
# and drop the default once WEBFILM_DB_PASS is set everywhere.
DB_PASS = os.environ.get('WEBFILM_DB_PASS', 'nhy6&UJM')

## Utworzenie folderów

In [3]:
# Working directories that must exist before data or images are written.
list_folders = [
    MKDIR_DATA,
    MKDIR_IMAGE,
    MKDIR_MOVIES,
    MKDIR_COLLECTIONS,
    MKDIR_PEOPLE,
]
for folder in list_folders:
    # exist_ok=True replaces the separate os.path.exists() check and removes
    # the window in which the directory could appear between check and create.
    os.makedirs(folder, exist_ok=True)

## Ustawienia środowiska

In [4]:
# Notebook display settings: never hide columns, but truncate long frames to
# PANDAS_MAX_ROW rows.
PANDAS_MAX_ROW = 10

for option_name, option_value in (
    ('display.max_columns', None),
    ('display.max_rows', PANDAS_MAX_ROW),
):
    pd.set_option(option_name, option_value)

## Pobranie danych

In [5]:
!kaggle datasets download -p $MKDIR_DATA --unzip rounakbanik/the-movies-dataset

# Rewrite MOVIES_CSV in place, removing the line break at the end of three
# specific physical lines so each is joined with the line that follows it
# (presumably records that were split across lines in the raw file - confirm).
#
# NOTE: the line numbers are 1-based and refer to the freshly downloaded file;
# running this on an already repaired file would strip different lines.
usunac_nowy_wiersz = [19763, 29572, 35671]

# Collect the pieces in a list and join once instead of growing a string.
fragments = []
# The file is read and written explicitly as UTF-8 instead of relying on the
# platform default encoding.
with open(MOVIES_CSV, 'r', encoding='utf-8') as file:
    for wiersz, line in enumerate(tqdm(file), start=1):
        if wiersz in usunac_nowy_wiersz:
            # strip() drops the trailing newline (and surrounding whitespace).
            line = line.strip()
        fragments.append(line)
tekst = ''.join(fragments)

# The context managers guarantee both handles are closed even on error.
with open(MOVIES_CSV, 'w', encoding='utf-8') as file:
    file.write(tekst)

Downloading the-movies-dataset.zip to data
 99%|███████████████████████████████████████▌| 225M/228M [00:08<00:00, 30.4MB/s]
100%|████████████████████████████████████████| 228M/228M [00:08<00:00, 27.5MB/s]


45573it [00:00, 111015.61it/s]


## Wczytanie danych
Wczytanie i podzielenie danych

### Movies

In [6]:
# Load the movie metadata. keep_default_na=False keeps empty cells as '' so
# later cells can test for missing values with `== ''`.
# Duplicated ids are dropped, rows are ordered by id, three columns are renamed
# to *_id, and five columns that are not used further are removed.
movies = (
    pd.read_csv(MOVIES_CSV, keep_default_na=False)
    .drop_duplicates('id')
    .sort_values('id')
    .rename(columns={
        'belongs_to_collection': 'collection_id',
        'original_language': 'original_language_id',
        'status': 'status_id',
    })
    .drop(columns=[
        'poster_path',
        'imdb_id',
        'popularity',
        'vote_average',
        'vote_count',
    ])
)

movies.head()

Unnamed: 0,adult,collection_id,budget,genres,homepage,id,original_language_id,original_title,overview,production_companies,production_countries,release_date,revenue,runtime,spoken_languages,status_id,tagline,title,video
4342,False,,0,"[{'id': 18, 'name': 'Drama'}, {'id': 80, 'name...",,2,fi,Ariel,Taisto Kasurinen is a Finnish coal miner whose...,"[{'name': 'Villealfa Filmproduction Oy', 'id':...","[{'iso_3166_1': 'FI', 'name': 'Finland'}]",1988-10-21,0,69.0,"[{'iso_639_1': 'fi', 'name': 'suomi'}, {'iso_6...",Released,,Ariel,False
12947,False,,0,"[{'id': 18, 'name': 'Drama'}, {'id': 35, 'name...",,3,fi,Varjoja paratiisissa,"An episode in the life of Nikander, a garbage ...","[{'name': 'Villealfa Filmproduction Oy', 'id':...","[{'iso_3166_1': 'FI', 'name': 'Finland'}]",1986-10-16,0,76.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,,Shadows in Paradise,False
17,False,,4000000,"[{'id': 80, 'name': 'Crime'}, {'id': 35, 'name...",,5,en,Four Rooms,It's Ted the Bellhop's first night on the job....,"[{'name': 'Miramax Films', 'id': 14}, {'name':...","[{'iso_3166_1': 'US', 'name': 'United States o...",1995-12-09,4300000,98.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Twelve outrageous guests. Four scandalous requ...,Four Rooms,False
474,False,,0,"[{'id': 28, 'name': 'Action'}, {'id': 53, 'nam...",,6,en,Judgment Night,"While racing to a boxing match, Frank, Mike, J...","[{'name': 'Universal Pictures', 'id': 33}, {'n...","[{'iso_3166_1': 'JP', 'name': 'Japan'}, {'iso_...",1993-10-15,12136938,110.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Don't move. Don't whisper. Don't even breathe.,Judgment Night,False
256,False,"{'id': 10, 'name': 'Star Wars Collection', 'po...",11000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 28, '...",http://www.starwars.com/films/star-wars-episod...,11,en,Star Wars,Princess Leia is captured and held hostage by ...,"[{'name': 'Lucasfilm', 'id': 1}, {'name': 'Twe...","[{'iso_3166_1': 'US', 'name': 'United States o...",1977-05-25,775398007,121.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,"A long time ago in a galaxy far, far away...",Star Wars,False


### Collection

In [7]:
# Replace the serialized collection dict in movies.collection_id with the bare
# collection id (None when the movie has no collection) and gather every
# (id, name) pair seen along the way.
collection = []
for row_label, row in tqdm(movies.iterrows(), total=movies.shape[0]):
    raw_collection = row['collection_id']
    if raw_collection == '':
        movies.at[row_label, 'collection_id'] = None
        continue
    parsed = ast.literal_eval(raw_collection)
    collection.append(dict(id=parsed['id'], name=parsed['name']))
    movies.at[row_label, 'collection_id'] = parsed['id']

100%|███████████████████████████████████| 45433/45433 [00:09<00:00, 4567.53it/s]


In [8]:
# Build the collection table: order by id, then keep the first row per name.
# NOTE(review): duplicates are removed by 'name', not by 'id'; if two distinct
# collection ids share a name, movies may reference the dropped id - confirm
# this is intended.
collection = pd.DataFrame(collection)
collection = collection.sort_values(by='id').drop_duplicates(subset='name')
collection.head()

Unnamed: 0,id,name
0,10,Star Wars Collection
17,84,Indiana Jones Collection
28,119,The Lord of the Rings Collection
25,131,Three Colors Collection
37,151,Star Trek: The Original Series Collection


### Genres

In [9]:
# Parse the serialized genre list of every movie into:
#   genres        - every genre dict encountered (deduplicated later)
#   movies_genres - one {genre_id, movie_id} link per movie/genre pair
# The source column is dropped afterwards.
genres = []
movies_genres = []
for _, row in tqdm(movies.iterrows(), total=movies.shape[0]):
    if row['genres'] == '':
        continue
    parsed_genres = ast.literal_eval(row['genres'])
    movies_genres.extend(
        {'genre_id': genre['id'], 'movie_id': row['id']}
        for genre in parsed_genres
    )
    genres.extend(parsed_genres)
movies = movies.drop(columns='genres')

100%|███████████████████████████████████| 45433/45433 [00:18<00:00, 2512.31it/s]


In [10]:
# Genre table: one row per genre id, ordered by id.
genres = pd.DataFrame(genres)
genres = genres.sort_values(by='id').drop_duplicates(subset='id')
genres.head()

Unnamed: 0,id,name
6784,12,Adventure
65263,14,Fantasy
37860,16,Animation
29936,18,Drama
19538,27,Horror


In [11]:
# Movie/genre link table with exact duplicate pairs removed.
movies_genres = pd.DataFrame(movies_genres)
movies_genres = movies_genres.drop_duplicates()
movies_genres.head()

Unnamed: 0,genre_id,movie_id
0,18,2
1,80,2
2,18,3
3,35,3
4,80,5


### Production companies

In [12]:
def check_name_in_list(list_dict, search_name):
    """Find the first dict in ``list_dict`` whose ``'name'`` equals ``search_name``.

    Args:
        list_dict: iterable of dicts, each having a ``'name'`` key.
        search_name: value to compare against each ``'name'``.

    Returns:
        ``(True, matching_dict)`` for the first match, otherwise ``(False, -1)``.
    """
    not_found = object()
    match = next(
        (entry for entry in list_dict if entry['name'] == search_name),
        not_found,
    )
    if match is not_found:
        return False, -1
    return True, match

In [13]:
# Parse the serialized production-company list of every movie into:
#   production_companies - the first dict seen for each distinct company name
#   movies_companies     - one {movie_id, company_id} link per movie/company
# When a company name was already seen, the link uses the id of the FIRST
# entry with that name, so equal names always map to a single company id.
#
# A dict keyed by name replaces the linear scan over production_companies for
# every entry; the resulting lists are the same.
production_companies = []
movies_companies = []
company_by_name = {}
for _, row in tqdm(movies.iterrows(), total=movies.shape[0]):
    if row['production_companies'] == '':
        continue
    for entry in ast.literal_eval(row['production_companies']):
        known = company_by_name.get(entry['name'])
        if known is None:
            # First time this name appears: register the company.
            company_by_name[entry['name']] = entry
            production_companies.append(entry)
            known = entry
        movies_companies.append({
            'movie_id': row['id'],
            'company_id': known['id'],
        })
movies = movies.drop('production_companies', axis=1)

100%|████████████████████████████████████| 45433/45433 [03:54<00:00, 193.65it/s]


In [14]:
# Company table: one row per company id, ordered by id.
company = pd.DataFrame(production_companies)
company = company.sort_values(by='id').drop_duplicates(subset='id')
company.head()

Unnamed: 0,name,id
7,Lucasfilm,1
51,Walt Disney Pictures,2
9,Pixar Animation Studios,3
10,Paramount Pictures,4
45,Columbia Pictures,5


In [15]:
# Movie/company link table with exact duplicate pairs removed.
movies_companies = pd.DataFrame(movies_companies)
movies_companies = movies_companies.drop_duplicates()
movies_companies.head()

Unnamed: 0,movie_id,company_id
0,2,2303
1,2,2396
2,3,2303
3,5,14
4,5,59


### Production countries

In [16]:
# Parse the serialized production-country list of every movie into:
#   production_countries  - every country dict encountered (deduplicated later)
#   movies_prod_countries - one {movie_id, country_id} link, keyed by the
#                           'iso_3166_1' code
# The source column is dropped afterwards.
production_countries = []
movies_prod_countries = []
for _, row in tqdm(movies.iterrows(), total=movies.shape[0]):
    if row['production_countries'] == '':
        continue
    parsed_countries = ast.literal_eval(row['production_countries'])
    movies_prod_countries.extend(
        {'movie_id': row['id'], 'country_id': country['iso_3166_1']}
        for country in parsed_countries
    )
    production_countries.extend(parsed_countries)
movies = movies.drop(columns='production_countries')

100%|███████████████████████████████████| 45433/45433 [00:09<00:00, 4959.66it/s]


In [17]:
# Country table: one row per ISO 3166-1 code, ordered by code.
countries = pd.DataFrame(production_countries).sort_values('iso_3166_1').drop_duplicates('iso_3166_1')
# Override the display name for code 'CD'. `.loc` is used because the row is
# selected with a boolean mask; `.at` only accepts a single row/column label.
countries.loc[countries['iso_3166_1'] == 'CD', 'name'] = 'Congo, the Democratic Republic of the'
countries.head()

Unnamed: 0,iso_3166_1,name
36535,AE,United Arab Emirates
43061,AF,Afghanistan
45308,AL,Albania
40191,AM,Armenia
38993,AN,Netherlands Antilles


In [18]:
# Movie/country link table with exact duplicate pairs removed.
movies_countries = pd.DataFrame(movies_prod_countries)
movies_countries = movies_countries.drop_duplicates()
movies_countries.head()

Unnamed: 0,movie_id,country_id
0,2,FI
1,3,FI
2,5,US
3,6,JP
4,6,US


### Spoken languages

In [19]:
# Parse the serialized spoken-language list of every movie into:
#   spoken_languages        - every language dict encountered
#   movies_spoken_languages - one {movie_id, language_id} link, keyed by the
#                             'iso_639_1' code
# The source column is dropped afterwards.
spoken_languages = []
movies_spoken_languages = []
for _, row in tqdm(movies.iterrows(), total=movies.shape[0]):
    if row['spoken_languages'] == '':
        continue
    parsed_languages = ast.literal_eval(row['spoken_languages'])
    movies_spoken_languages.extend(
        {'movie_id': row['id'], 'language_id': language['iso_639_1']}
        for language in parsed_languages
    )
    spoken_languages.extend(parsed_languages)
movies = movies.drop(columns='spoken_languages')

100%|███████████████████████████████████| 45433/45433 [00:07<00:00, 6302.45it/s]


In [20]:
# Language table: one row per ISO 639-1 code, ordered by code.
languages = pd.DataFrame(spoken_languages)
languages = languages.sort_values(by='iso_639_1').drop_duplicates(subset='iso_639_1')
languages.head()

Unnamed: 0,iso_639_1,name
47548,ab,
1274,af,Afrikaans
36875,am,
8625,ar,العربية
10013,as,


In [21]:
# Movie/language link table with exact duplicate pairs removed.
# Note: this rebinds `spoken_languages`, which previously held the raw list of
# language dicts, to the link DataFrame.
spoken_languages = pd.DataFrame(movies_spoken_languages)
spoken_languages = spoken_languages.drop_duplicates()
spoken_languages.head()

Unnamed: 0,movie_id,language_id
0,2,fi
1,2,de
2,3,en
3,3,fi
4,3,sv


### Keywords

In [22]:
# Load the raw keywords file; keep_default_na=False keeps empty cells as ''
# so the parsing cell can skip them with `!= ''`.
keywords = pd.read_csv(KEYWORDS_CSV, keep_default_na=False)
keywords.head()

Unnamed: 0,id,keywords
0,862,"[{'id': 931, 'name': 'jealousy'}, {'id': 4290,..."
1,8844,"[{'id': 10090, 'name': 'board game'}, {'id': 1..."
2,15602,"[{'id': 1495, 'name': 'fishing'}, {'id': 12392..."
3,31357,"[{'id': 818, 'name': 'based on novel'}, {'id':..."
4,11862,"[{'id': 1009, 'name': 'baby'}, {'id': 1599, 'n..."


In [23]:
# Parse the serialized keyword list of every row of `keywords` into:
#   keywords_list   - an {id, name} dict per keyword occurrence
#   movies_keywords - one {movie_id, keyword_id} link per occurrence
keywords_list = []
movies_keywords = []
for _, row in tqdm(keywords.iterrows(), total=keywords.shape[0]):
    if row['keywords'] == '':
        continue
    for keyword in ast.literal_eval(row['keywords']):
        movies_keywords.append(dict(movie_id=row['id'], keyword_id=keyword['id']))
        keywords_list.append(dict(id=keyword['id'], name=keyword['name']))

100%|███████████████████████████████████| 46419/46419 [00:13<00:00, 3428.58it/s]


In [24]:
# Keyword table: order by id, then keep the first row per name.
# This rebinds `keywords` from the raw CSV frame to the keyword table.
# NOTE(review): duplicates are removed by 'name', not by 'id'; links in
# movies_keywords may reference an id dropped here - confirm this is intended.
keywords = pd.DataFrame(keywords_list)
keywords = keywords.sort_values(by='id').drop_duplicates(subset='name')
keywords.head()

Unnamed: 0,id,name
20195,30,individual
78924,65,holiday
88272,74,germany
85300,75,gunslinger
17875,83,saving the world


In [25]:
# Movie/keyword link table with exact duplicate pairs removed.
movies_keywords = pd.DataFrame(movies_keywords)
movies_keywords = movies_keywords.drop_duplicates()
movies_keywords.head()

Unnamed: 0,movie_id,keyword_id
0,862,931
1,862,4290
2,862,5202
3,862,6054
4,862,9713


### Credits

In [26]:
# Load the raw credits file (serialized 'cast' and 'crew' lists per movie id).
# Unlike the other loaders this one does not pass keep_default_na=False, so an
# empty cell would be read as NaN rather than ''.
credits = pd.read_csv(CREDITS_CSV)
credits.head()

Unnamed: 0,cast,crew,id
0,"[{'cast_id': 14, 'character': 'Woody (voice)',...","[{'credit_id': '52fe4284c3a36847f8024f49', 'de...",862
1,"[{'cast_id': 1, 'character': 'Alan Parrish', '...","[{'credit_id': '52fe44bfc3a36847f80a7cd1', 'de...",8844
2,"[{'cast_id': 2, 'character': 'Max Goldman', 'c...","[{'credit_id': '52fe466a9251416c75077a89', 'de...",15602
3,"[{'cast_id': 1, 'character': ""Savannah 'Vannah...","[{'credit_id': '52fe44779251416c91011acb', 'de...",31357
4,"[{'cast_id': 1, 'character': 'George Banks', '...","[{'credit_id': '52fe44959251416c75039ed7', 'de...",11862


#### Cast

In [27]:
# Parse the serialized cast list of every credits row into:
#   cast   - one {movie_id, character, person_id, order} dict per cast entry
#   person - one {id, name, surname, gender} dict per cast entry
# A person's full name is split at the first space: the part before it becomes
# 'name', everything after it 'surname' ('' when there is no space).
cast = []
person = []
for _, row in tqdm(credits.iterrows(), total=credits.shape[0]):
    if row['cast'] == '':
        continue
    for member in ast.literal_eval(row['cast']):
        cast.append(dict(
            movie_id=row['id'],
            character=member['character'],
            person_id=member['id'],
            order=member['order'],
        ))
        first_name, _sep, rest_of_name = member.get('name', '').partition(' ')
        person.append(dict(
            id=member['id'],
            name=first_name,
            surname=rest_of_name,
            gender=member['gender'],
        ))

100%|████████████████████████████████████| 45476/45476 [00:51<00:00, 889.12it/s]


In [28]:
# Cast table with exact duplicate rows removed.
cast = pd.DataFrame(cast)
cast = cast.drop_duplicates()
cast.head()

Unnamed: 0,movie_id,character,person_id,order
0,862,Woody (voice),31,0
1,862,Buzz Lightyear (voice),12898,1
2,862,Mr. Potato Head (voice),7167,2
3,862,Slinky Dog (voice),12899,3
4,862,Rex (voice),12900,4


#### Crew

In [29]:
# Parse the serialized crew list of every credits row into:
#   crew   - one {movie_id, department, job, person_id} dict per crew entry
#   person - extended with one {id, name, surname, gender} dict per crew entry
#            (the list is created by the cast cell, which must run first)
# A person's full name is split at the first space: the part before it becomes
# 'name', everything after it 'surname' ('' when there is no space).
crew = []
for label, data in tqdm(credits.iterrows(), total=credits.shape[0]):
    # Guard on the column that is actually parsed below ('crew'); the previous
    # check looked at 'cast', so an empty crew cell was not skipped.
    if data['crew'] != '':
        tmp = ast.literal_eval(data['crew'])
        for x in tmp:
            crew.append({
                'movie_id': data['id'],
                'department': x['department'],
                'job': x['job'],
                'person_id': x['id'],
            })
            name, _, surname = x.get('name', '').partition(' ')
            person.append({
                'id': x['id'],
                'name': name,
                'surname': surname,
                'gender': x['gender'],
            })

100%|███████████████████████████████████| 45476/45476 [00:44<00:00, 1014.30it/s]


In [30]:
# Crew table with exact duplicate rows removed.
crew = pd.DataFrame(crew)
crew = crew.drop_duplicates()
crew.head()

Unnamed: 0,movie_id,department,job,person_id
0,862,Directing,Director,7879
1,862,Writing,Screenplay,12891
2,862,Writing,Screenplay,7
3,862,Writing,Screenplay,12892
4,862,Writing,Screenplay,12893


In [31]:
# People table built from the cast and crew entries: one row per person id,
# ordered by id.
people = pd.DataFrame(person)
people = people.sort_values(by='id').drop_duplicates(subset='id')
people.head()

Unnamed: 0,id,name,surname,gender
161631,1,George,Lucas,2
195489,2,Mark,Hamill,2
340878,3,Harrison,Ford,2
20669,4,Carrie,Fisher,1
142927,5,Peter,Cushing,2


### Links

In [32]:
# Load the id mapping file (movieId / imdbId / tmdbId) and convert tmdbId to a
# numeric column, downcast to the smallest integer type when possible.
links = pd.read_csv(LINKS_CSV, keep_default_na=False, low_memory=True)
links = links.assign(tmdbId=pd.to_numeric(links['tmdbId'], downcast='integer'))
links.head()

Unnamed: 0,movieId,imdbId,tmdbId
0,1,114709,862.0
1,2,113497,8844.0
2,3,113228,15602.0
3,4,114885,31357.0
4,5,113041,11862.0


### Ratings

In [34]:
def process_rating_chunk(ratings):
    """Translate a frame of raw ratings to TMDB movie ids.

    The frame is joined with the module-level ``links`` table on ``movieId``,
    the columns are renamed (``userId`` -> ``user_id``, ``tmdbId`` ->
    ``movie_id``, ``timestamp`` -> ``date``), rows are ordered by ``date`` and
    ``date`` is converted from epoch seconds to datetime. The helper columns
    ``movieId`` and ``imdbId`` are removed from the result.

    Args:
        ratings: DataFrame with at least ``userId``, ``movieId`` and
            ``timestamp`` columns.

    Returns:
        A new DataFrame; the input frame is not modified.
    """
    merged = ratings.merge(links, on='movieId')
    merged = merged.rename(columns={
        'userId': 'user_id',
        'tmdbId': 'movie_id',
        'timestamp': 'date',
    }).sort_values(['date'])
    merged['date'] = pd.to_datetime(merged['date'], unit='s')

    # NOTE(review): doubling of the rating scale is currently disabled, yet the
    # saved `ratings` output in this notebook shows values up to 10.0 - confirm
    # whether it should be re-enabled:
    #     ratings['rating'] = ratings.rating.apply(lambda x: x * 2)

    return merged.drop(columns=['movieId', 'imdbId'])

In [None]:
# Load all ratings, translate them to TMDB ids, keep only ratings of movies
# present in `movies`, and keep one rating per (user, movie): the last row,
# which is the most recent one because process_rating_chunk sorts by date.
ratings = process_rating_chunk(
    pd.read_csv(RATINGS_CSV, keep_default_na=False, low_memory=True)
)

is_known_movie = ratings['movie_id'].isin(movies['id'])
ratings = ratings[is_known_movie].drop_duplicates(
    subset=['user_id', 'movie_id'],
    keep='last',
)
ratings

In [None]:
# Number of ratings per rating value after casting the rating to int32
# (the cast truncates any fractional part).
ratings.astype({'rating': 'int32'}).groupby(['rating']).size()

In [None]:
# Bar chart of the number of ratings per rating value, saved to a PNG file.
# NOTE(review): the y-axis label says "in millions" but the plotted values are
# raw counts with no scaling applied here - confirm the label.
rating_counts = ratings.groupby(['rating']).size()
ax = rating_counts.plot.bar(title='Rozkład ocen')
ax.set_xlabel('Oceny')
ax.set_ylabel('Liczba ocen w mln')
figure = ax.get_figure()
figure.savefig('rozkład ocen.png')

In [68]:
# Display the processed ratings (truncated by the display.max_rows option).
ratings

Unnamed: 0,user_id,rating,date,movie_id
20925665,38150,8.0,1995-01-09 11:46:44,1600.0
12937605,44717,6.0,1995-01-09 11:46:49,8012.0
4842674,44717,6.0,1995-01-09 11:46:49,623.0
3870682,44717,10.0,1995-01-09 11:46:49,807.0
6295319,187396,10.0,1996-01-29 00:00:00,9598.0
...,...,...,...,...
11605097,85434,4.0,2017-08-04 06:38:59,13342.0
23907954,85434,6.0,2017-08-04 06:39:06,10140.0
19982064,85434,8.0,2017-08-04 06:41:23,19186.0
20803205,199634,6.0,2017-08-04 06:53:43,1724.0


# Pobranie zdjęć

In [35]:
# Shared state for the image-download cells below:
#   gallery    - list of every gallery id handed out so far
#   photos     - list of {url, gallery_id, order} dicts
#   gallery_id - next gallery id to hand out
gallery = []
photos = []
gallery_id = 1

# File name recorded when no image could be stored.
NO_IMAGE_NAME = '00000000-0000-0000-0000-000000000000'

def download_poster(folderToSave, poster_path) -> "tuple[str, int]":
    """Download one image from URL_IMAGE into ``folderToSave``.

    The file is stored under a freshly generated UUID4 name (no extension).

    Args:
        folderToSave: existing directory the image is written to.
        poster_path: image path appended to URL_IMAGE; may be None or ''.

    Returns:
        ``(file_name, size_in_bytes)`` on success, or
        ``('00000000-0000-0000-0000-000000000000', 0)`` when ``poster_path`` is
        empty, the server does not answer with HTTP 200, or a network/file
        error occurs. The function never raises for those cases.
    """
    # A missing path would otherwise be requested as ".../None".
    if not poster_path:
        return (NO_IMAGE_NAME, 0)

    imgUrl = f'{URL_IMAGE}{poster_path}'
    name = str(uuid4())
    image = os.path.join(folderToSave, name)
    try:
        # The context manager releases the connection; the timeout keeps one
        # stalled request from blocking the whole download loop. TLS
        # verification is left at the library default (enabled).
        with requests.get(imgUrl, stream=True, timeout=30) as r:
            if r.status_code != 200:
                return (NO_IMAGE_NAME, 0)
            with open(image, 'wb') as f:
                for chunk in r.iter_content(chunk_size=1024):
                    f.write(chunk)
        return (name, os.path.getsize(image))
    except (requests.RequestException, OSError):
        # Best effort: do not leave a truncated file behind.
        try:
            os.remove(image)
        except OSError:
            # Nothing was written, or the file cannot be removed; either way
            # the caller still receives the "no image" result.
            pass
        return (NO_IMAGE_NAME, 0)

## Movies

In [36]:
# Give every movie a gallery with two photo slots:
#   order 1 - the poster downloaded from TMDB, or the all-zero placeholder
#   order 2 - always the all-zero placeholder
# Downloads stop once the accumulated size reaches MAX_FOLDER_IMAGE_GB; the
# remaining movies get placeholders only.
# NOTE: this cell appends to the shared `gallery`/`photos` lists, so it must
# run exactly once after they are initialised.
size = 0
size_limit = MAX_FOLDER_IMAGE_GB * 1024**3
for label, data in tqdm(movies.iterrows(), total=movies.shape[0]):
    image = '00000000-0000-0000-0000-000000000000'
    if size < size_limit:
        try:
            response = requests.get(
                f'{URL_API}movie/{data["id"]}?api_key={API_KEY}',
                timeout=30,
            )
            poster_path = response.json()['poster_path']
            image, s = download_poster(MKDIR_MOVIES, poster_path)
            size = size + s
        except Exception:
            # Any API/parsing failure falls back to the placeholder.
            # `Exception` (not a bare except) keeps Ctrl-C working during
            # this long loop.
            image = '00000000-0000-0000-0000-000000000000'
    photos.append({
        'url': image,
        'gallery_id': gallery_id,
        'order': 1,
    })
    photos.append({
        'url': '00000000-0000-0000-0000-000000000000',
        'gallery_id': gallery_id,
        'order': 2,
    })
    # Register the gallery only once its photos are recorded, so the shared
    # lists stay in sync if the loop is interrupted.
    gallery.append(gallery_id)
    gallery_id = gallery_id + 1
movies['gallery_id'] = gallery
movies

100%|████████████████████████████████████| 45433/45433 [02:05<00:00, 361.03it/s]


Unnamed: 0,adult,collection_id,budget,homepage,id,original_language_id,original_title,overview,release_date,revenue,runtime,status_id,tagline,title,video,gallery_id
4342,False,,0,,2,fi,Ariel,Taisto Kasurinen is a Finnish coal miner whose...,1988-10-21,0,69.0,Released,,Ariel,False,1
12947,False,,0,,3,fi,Varjoja paratiisissa,"An episode in the life of Nikander, a garbage ...",1986-10-16,0,76.0,Released,,Shadows in Paradise,False,2
17,False,,4000000,,5,en,Four Rooms,It's Ted the Bellhop's first night on the job....,1995-12-09,4300000,98.0,Released,Twelve outrageous guests. Four scandalous requ...,Four Rooms,False,3
474,False,,0,,6,en,Judgment Night,"While racing to a boxing match, Frank, Mike, J...",1993-10-15,12136938,110.0,Released,Don't move. Don't whisper. Don't even breathe.,Judgment Night,False,4
256,False,10,11000000,http://www.starwars.com/films/star-wars-episod...,11,en,Star Wars,Princess Leia is captured and held hostage by ...,1977-05-25,775398007,121.0,Released,"A long time ago in a galaxy far, far away...",Star Wars,False,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
45075,False,,0,,465044,en,Abduction,A horror comedy spoofing conspiracy theory mov...,2017-06-28,0,90.0,Released,Horrifically Funny,Abduction,False,45429
45270,False,,0,,467731,en,Tragedy in a Temporary Town,Fifteen-year-old girl Dotty Fisher is assaulte...,1956-02-19,0,60.0,Released,,Tragedy in a Temporary Town,False,45430
21890,False,,0,,468343,fi,Silja - nuorena nukkunut,"In the 1910s, beautiful young Silja loses both...",1956-01-01,0,87.0,Released,,Silja - nuorena nukkunut,False,45431
45395,False,,1254040,http://lmtr.fi/,468707,fi,Lauri Mäntyvaaran tuuheet ripset,,2017-07-28,0,90.0,Released,,Thick Lashes of Lauri Mäntyvaara,False,45432


## Collection

In [37]:
# Give every collection a gallery with two photo slots:
#   order 1 - poster, order 2 - backdrop (all-zero placeholder when missing)
# Downloads stop once the accumulated size reaches MAX_FOLDER_IMAGE_GB.
# NOTE: this cell appends to the shared `gallery`/`photos` lists, so it must
# run exactly once, after the movies cell.
size = 0
size_limit = MAX_FOLDER_IMAGE_GB * 1024**3
# Position where this cell's ids start in `gallery`. Taken from the list
# length rather than the last id value, so it does not rely on ids being
# exactly 1..len(gallery).
gallery_last = len(gallery)
for label, data in tqdm(collection.iterrows(), total=collection.shape[0]):
    poster = '00000000-0000-0000-0000-000000000000'
    backdrop = '00000000-0000-0000-0000-000000000000'
    if size < size_limit:
        try:
            response = requests.get(
                f'{URL_API}collection/{data["id"]}?api_key={API_KEY}',
                timeout=30,
            )
            if response.status_code == 200:
                details = response.json()
                # Read both paths before downloading, so a missing key cannot
                # leave the gallery with a half-recorded set of photos.
                poster_path = details['poster_path']
                backdrop_path = details['backdrop_path']
                poster, s = download_poster(MKDIR_COLLECTIONS, poster_path)
                size = size + s
                backdrop, s = download_poster(MKDIR_COLLECTIONS, backdrop_path)
                size = size + s
        except Exception:
            # Best effort: whatever was not fetched stays a placeholder.
            pass
    photos.append({
        'url': poster,
        'gallery_id': gallery_id,
        'order': 1,
    })
    photos.append({
        'url': backdrop,
        'gallery_id': gallery_id,
        'order': 2,
    })
    gallery.append(gallery_id)
    gallery_id = gallery_id + 1
collection['gallery_id'] = gallery[gallery_last:]
collection

100%|███████████████████████████████████████| 1695/1695 [02:12<00:00, 12.75it/s]


Unnamed: 0,id,name,gallery_id
0,10,Star Wars Collection,45434
17,84,Indiana Jones Collection,45435
28,119,The Lord of the Rings Collection,45436
25,131,Three Colors Collection,45437
37,151,Star Trek: The Original Series Collection,45438
...,...,...,...
4382,479692,Robert,47124
241,479888,The Thing Collection,47125
3849,479971,Sağ Salim Serisi,47126
1996,480071,"Maria, ihm schmeckt's nicht!",47127


## People

In [38]:
# Give every person a gallery with a single photo slot (order 1): the profile
# image from TMDB, or the all-zero placeholder.
# Downloads stop once the accumulated size reaches MAX_FOLDER_IMAGE_GB.
# NOTE: this cell appends to the shared `gallery`/`photos` lists, so it must
# run exactly once, after the collection cell.
size = 0
size_limit = MAX_FOLDER_IMAGE_GB * 1024**3
# Position where this cell's ids start in `gallery`. Taken from the list
# length rather than the last id value, so it does not rely on ids being
# exactly 1..len(gallery).
gallery_last = len(gallery)
for label, data in tqdm(people.iterrows(), total=people.shape[0]):
    image = '00000000-0000-0000-0000-000000000000'
    if size < size_limit:
        try:
            response = requests.get(
                f'{URL_API}person/{data["id"]}?api_key={API_KEY}',
                timeout=30,
            )
            if response.status_code == 200:
                profile_path = response.json()['profile_path']
                image, s = download_poster(MKDIR_PEOPLE, profile_path)
                size = size + s
        except Exception:
            # Any API/parsing failure falls back to the placeholder.
            image = '00000000-0000-0000-0000-000000000000'
    photos.append({
        'url': image,
        'gallery_id': gallery_id,
        'order': 1,
    })
    gallery.append(gallery_id)
    gallery_id = gallery_id + 1
people['gallery_id'] = gallery[gallery_last:]
people

100%|██████████████████████████████████| 353343/353343 [07:34<00:00, 777.04it/s]


Unnamed: 0,id,name,surname,gender,gallery_id
161631,1,George,Lucas,2,47129
195489,2,Mark,Hamill,2,47130
340878,3,Harrison,Ford,2,47131
20669,4,Carrie,Fisher,1,47132
142927,5,Peter,Cushing,2,47133
...,...,...,...,...,...
528206,1907923,Pia,Edlund,1,400467
528207,1907926,Stina,Elinderson,0,400468
922497,1907940,Mark,Devries,0,400469
1001848,1908001,Joe,Barton,2,400470


In [39]:
# Turn the collected photo dicts (url, gallery_id, order) into a DataFrame.
# This rebinds `photos` from a list to a DataFrame.
photos = pd.DataFrame(photos)
photos.head()

Unnamed: 0,url,gallery_id,order
0,a19c1a68-e9b5-4af0-9b7b-175ab593501f,1,1
1,00000000-0000-0000-0000-000000000000,1,2
2,b49c5b5b-c969-4b86-b06c-6ee8a71b385f,2,1
3,00000000-0000-0000-0000-000000000000,2,2
4,c7df5eaa-476e-44be-aa45-e6b6eb9b68bd,3,1


In [40]:
# Single-column table holding every gallery id that was handed out.
gallery_table = pd.DataFrame({'id': gallery})
gallery_table.head()

Unnamed: 0,id
0,1
1,2
2,3
3,4
4,5


# Ustawienie wartości domyślnych

## Movies

In [41]:
# Normalise "missing" markers in `movies`, row by row.
# Each (column, marker, replacement) triple replaces the marker value in that
# column: a 0 budget and '' in most columns become None, while an empty status
# becomes 'Unknown'.
marker_replacements = (
    ('budget', 0, None),
    ('homepage', '', None),
    ('original_language_id', '', None),
    ('release_date', '', None),
    ('revenue', '', None),
    ('runtime', '', None),
    ('status_id', '', 'Unknown'),
)
for row_label, row in tqdm(movies.iterrows(), total=movies.shape[0]):
    for column, marker, replacement in marker_replacements:
        if row[column] == marker:
            movies.at[row_label, column] = replacement
    # An original title identical to the title is blanked out.
    if row['title'] == row['original_title']:
        movies.at[row_label, 'original_title'] = ''
# Every movie starts out visible.
movies['visibility'] = True
movies.head()

100%|███████████████████████████████████| 45433/45433 [00:09<00:00, 4669.28it/s]


Unnamed: 0,adult,collection_id,budget,homepage,id,original_language_id,original_title,overview,release_date,revenue,runtime,status_id,tagline,title,video,gallery_id,visibility
4342,False,,,,2,fi,,Taisto Kasurinen is a Finnish coal miner whose...,1988-10-21,0,69.0,Released,,Ariel,False,1,True
12947,False,,,,3,fi,Varjoja paratiisissa,"An episode in the life of Nikander, a garbage ...",1986-10-16,0,76.0,Released,,Shadows in Paradise,False,2,True
17,False,,4000000.0,,5,en,,It's Ted the Bellhop's first night on the job....,1995-12-09,4300000,98.0,Released,Twelve outrageous guests. Four scandalous requ...,Four Rooms,False,3,True
474,False,,,,6,en,,"While racing to a boxing match, Frank, Mike, J...",1993-10-15,12136938,110.0,Released,Don't move. Don't whisper. Don't even breathe.,Judgment Night,False,4,True
256,False,10.0,11000000.0,http://www.starwars.com/films/star-wars-episod...,11,en,,Princess Leia is captured and held hostage by ...,1977-05-25,775398007,121.0,Released,"A long time ago in a galaxy far, far away...",Star Wars,False,5,True


## Przeliczenie średniej ocen

In [42]:
# Attach per-movie rating statistics to `movies`:
#   average_vote - mean rating, count_vote - number of ratings.
# Only movies whose status is 'Released' get statistics; all other movies (and
# released movies without ratings) keep NaN because of the left join.
# NOTE: re-running this cell merges the statistics a second time and produces
# suffixed duplicate columns.

# Select released ids with a label-aligned mask instead of indexing the
# `unique()` array positionally with a boolean Series.
released_ids = movies.loc[movies.status_id == 'Released', 'id']

# The string 'mean' replaces passing np.mean as the aggregation function.
vote_stats = (
    ratings[ratings.movie_id.isin(released_ids)]
    .groupby(['movie_id'])
    .agg(
        average_vote=pd.NamedAgg(column='rating', aggfunc='mean'),
        count_vote=pd.NamedAgg(column='rating', aggfunc='count'),
    )
)

movies = movies.merge(vote_stats, left_on='id', right_on='movie_id', how='left')

In [43]:
# Display the movies that have no rating at all.
# Only the set of rated movie ids is needed, so it is taken directly from the
# column instead of aggregating mean/count per movie and using just the index.
rated_movie_ids = ratings['movie_id'].dropna().unique()
movies[~movies.id.isin(rated_movie_ids)]

Unnamed: 0,adult,collection_id,budget,homepage,id,original_language_id,original_title,overview,release_date,revenue,runtime,status_id,tagline,title,video,gallery_id,visibility,average_vote,count_vote
620,False,,,,798,de,,Pünktchen und Anton is a 1953 German film base...,1953-08-26,0,91.0,Released,,Pünktchen und Anton,False,621,True,,
1166,False,,,,1848,en,,Private eye Philip Marlowe and his bride move ...,1998-07-25,0,92.0,Released,Sex. Murder. Marlowe. This town has everything.,Poodle Springs,False,1167,True,,
1879,False,177062,,,3941,en,,The 8th film in the Blondie series - Blondie ...,1941-02-27,0,72.0,Released,Where there's a Bumstead...there's always trou...,Blondie Goes Latin,False,1880,True,,
2026,False,,,,4529,en,,"Professor Bower, an American physicist, is eff...",1966-10-20,0,106.0,Released,,The Defector,False,2027,True,,
2081,False,,,,4709,fr,La Fête à Henriette,Two scriptwriters argue about the fate of Henr...,1952-12-17,0,118.0,Released,,Holiday for Henrietta,False,2082,True,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
45117,False,,,,429392,en,,Ian Harvie is not quite the kind of man you mi...,2016-12-29,0,58.0,Released,,Ian Harvie: May the Best Cock Win,False,45118,True,,
45125,False,,,,429788,en,,Armed with his ferociously aggressive style of...,2016-11-24,0,59.0,Released,,Aries Spears: Comedy Blueprint,False,45126,True,,
45126,False,,,http://www.sho.com/titles/3437477/tony-roberts...,429792,en,,Detroit native Tony T. Roberts returns to his ...,2016-12-02,0,50.0,Released,,Tony Roberts: Motorcity Motormouth,False,45127,True,,
45129,False,,,,429803,en,,The Live Series is an exclusive comedy special...,2015-02-17,0,0.0,Released,,Barry Hilton: The Live Series,False,45130,True,,


In [44]:
# Display `movies` with the merged average_vote / count_vote columns.
movies

Unnamed: 0,adult,collection_id,budget,homepage,id,original_language_id,original_title,overview,release_date,revenue,runtime,status_id,tagline,title,video,gallery_id,visibility,average_vote,count_vote
0,False,,,,2,fi,,Taisto Kasurinen is a Finnish coal miner whose...,1988-10-21,0,69.0,Released,,Ariel,False,1,True,7.347328,262.0
1,False,,,,3,fi,Varjoja paratiisissa,"An episode in the life of Nikander, a garbage ...",1986-10-16,0,76.0,Released,,Shadows in Paradise,False,2,True,7.540230,87.0
2,False,,4000000.0,,5,en,,It's Ted the Bellhop's first night on the job....,1995-12-09,4300000,98.0,Released,Twelve outrageous guests. Four scandalous requ...,Four Rooms,False,3,True,6.818062,6090.0
3,False,,,,6,en,,"While racing to a boxing match, Frank, Mike, J...",1993-10-15,12136938,110.0,Released,Don't move. Don't whisper. Don't even breathe.,Judgment Night,False,4,True,5.849725,1271.0
4,False,10,11000000.0,http://www.starwars.com/films/star-wars-episod...,11,en,,Princess Leia is captured and held hostage by ...,1977-05-25,775398007,121.0,Released,"A long time ago in a galaxy far, far away...",Star Wars,False,5,True,8.264599,77045.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
45428,False,,,,465044,en,,A horror comedy spoofing conspiracy theory mov...,2017-06-28,0,90.0,Released,Horrifically Funny,Abduction,False,45429,True,1.000000,1.0
45429,False,,,,467731,en,,Fifteen-year-old girl Dotty Fisher is assaulte...,1956-02-19,0,60.0,Released,,Tragedy in a Temporary Town,False,45430,True,7.000000,1.0
45430,False,,,,468343,fi,,"In the 1910s, beautiful young Silja loses both...",1956-01-01,0,87.0,Released,,Silja - nuorena nukkunut,False,45431,True,6.500000,4.0
45431,False,,1254040.0,http://lmtr.fi/,468707,fi,Lauri Mäntyvaaran tuuheet ripset,,2017-07-28,0,90.0,Released,,Thick Lashes of Lauri Mäntyvaara,False,45432,True,8.000000,1.0


## Rozdzielenie Movies

In [45]:
# Replace the textual status of every movie with a 1-based integer id.
# Ids are assigned in order of first appearance; afterwards `status` is the
# list of status names, where the name at position i has id i + 1.
status_ids = {}
for row_label, row in tqdm(movies.iterrows(), total=movies.shape[0]):
    status_name = row['status_id']
    # setdefault assigns the next free id only the first time a name is seen.
    movies.at[row_label, 'status_id'] = status_ids.setdefault(
        status_name, len(status_ids) + 1
    )
status = list(status_ids)
movies.head()

100%|███████████████████████████████████| 45433/45433 [00:05<00:00, 7944.08it/s]


Unnamed: 0,adult,collection_id,budget,homepage,id,original_language_id,original_title,overview,release_date,revenue,runtime,status_id,tagline,title,video,gallery_id,visibility,average_vote,count_vote
0,False,,,,2,fi,,Taisto Kasurinen is a Finnish coal miner whose...,1988-10-21,0,69.0,1,,Ariel,False,1,True,7.347328,262.0
1,False,,,,3,fi,Varjoja paratiisissa,"An episode in the life of Nikander, a garbage ...",1986-10-16,0,76.0,1,,Shadows in Paradise,False,2,True,7.54023,87.0
2,False,,4000000.0,,5,en,,It's Ted the Bellhop's first night on the job....,1995-12-09,4300000,98.0,1,Twelve outrageous guests. Four scandalous requ...,Four Rooms,False,3,True,6.818062,6090.0
3,False,,,,6,en,,"While racing to a boxing match, Frank, Mike, J...",1993-10-15,12136938,110.0,1,Don't move. Don't whisper. Don't even breathe.,Judgment Night,False,4,True,5.849725,1271.0
4,False,10.0,11000000.0,http://www.starwars.com/films/star-wars-episod...,11,en,,Princess Leia is captured and held hostage by ...,1977-05-25,775398007,121.0,1,"A long time ago in a galaxy far, far away...",Star Wars,False,5,True,8.264599,77045.0


In [46]:
# Status lookup table: ids 1..n paired with the status names in list order.
# This rebinds `status` from a list of names to a DataFrame.
status = pd.DataFrame(
    list(enumerate(status, start=1)),
    columns=['id', 'name'],
)
status.head()

Unnamed: 0,id,name
0,1,Released
1,2,In Production
2,3,Rumored
3,4,Planned
4,5,Unknown


### Zamiana kolejności statusów

In [47]:
# Mapping old status id -> new status id; ids 1 and 2 trade places.
new_order = {
    1: 2,
    2: 1,
}

# Remap the whole column in one pass instead of iterating row by row.
# Ids that are not keys of `new_order` are passed through unchanged.
movies['status_id'] = movies['status_id'].map(
    lambda status_id: new_order.get(status_id, status_id)
)
movies.head()

100%|███████████████████████████████████| 45433/45433 [00:05<00:00, 8882.93it/s]


Unnamed: 0,adult,collection_id,budget,homepage,id,original_language_id,original_title,overview,release_date,revenue,runtime,status_id,tagline,title,video,gallery_id,visibility,average_vote,count_vote
0,False,,,,2,fi,,Taisto Kasurinen is a Finnish coal miner whose...,1988-10-21,0,69.0,2,,Ariel,False,1,True,7.347328,262.0
1,False,,,,3,fi,Varjoja paratiisissa,"An episode in the life of Nikander, a garbage ...",1986-10-16,0,76.0,2,,Shadows in Paradise,False,2,True,7.54023,87.0
2,False,,4000000.0,,5,en,,It's Ted the Bellhop's first night on the job....,1995-12-09,4300000,98.0,2,Twelve outrageous guests. Four scandalous requ...,Four Rooms,False,3,True,6.818062,6090.0
3,False,,,,6,en,,"While racing to a boxing match, Frank, Mike, J...",1993-10-15,12136938,110.0,2,Don't move. Don't whisper. Don't even breathe.,Judgment Night,False,4,True,5.849725,1271.0
4,False,10.0,11000000.0,http://www.starwars.com/films/star-wars-episod...,11,en,,Princess Leia is captured and held hostage by ...,1977-05-25,775398007,121.0,2,"A long time ago in a galaxy far, far away...",Star Wars,False,5,True,8.264599,77045.0


In [48]:
# Apply the same id remapping to the status lookup table so that it stays
# consistent with movies.status_id. Ids missing from `new_order` are kept.
status['id'] = status['id'].map(
    lambda status_id: new_order.get(status_id, status_id)
)

# sort_values returns a sorted copy for display only; `status` itself keeps
# its original row order.
status.sort_values('id').head()

100%|████████████████████████████████████████████| 7/7 [00:00<00:00, 353.02it/s]


Unnamed: 0,id,name
1,1,In Production
0,2,Released
2,3,Rumored
3,4,Planned
4,5,Unknown


## Zmiana statusu

In [49]:
# Manually set status_id 2 for the movies with ids 48259 and 44265.
# NOTE(review): the reason for overriding these two titles is not recorded
# in the notebook - confirm and document.
movies.loc[movies['id'].isin([48259, 44265]), 'status_id'] = 2

In [50]:
# Inspection only: movies with status_id 1, 4 or 6 that nevertheless have a
# release date, ordered by that date.
movies[
    movies['status_id'].isin([4, 1, 6]) & movies['release_date'].notna()
].sort_values('release_date')

Unnamed: 0,adult,collection_id,budget,homepage,id,original_language_id,original_title,overview,release_date,revenue,runtime,status_id,tagline,title,video,gallery_id,visibility,average_vote,count_vote
10790,False,,,,25073,en,,A screenwriter who travels to an abandoned hou...,2009-01-01,0,89.0,1,,Deadline,False,10791,True,,
21653,False,102940,,http://breakblade.jp/,55438,en,Break Blade 6: Doukoku no Toride,Baldr and his men carries Delphine back to the...,2011-03-25,0,50.0,6,,Broken Blade: Book Six - Enclave of Lamentations,False,21654,True,,
22965,False,101646,,,61123,fi,Vares – Huhtikuun tytöt,Tough Finnish detective Jussi Vares gets hired...,2011-04-20,0,95.0,1,,Vares - The Girls of April,False,22966,True,,
21118,False,,,http://thelastmountainmovie.com/,53328,en,,"In the valleys of Appalachia, a battle is bein...",2011-06-03,90425,95.0,6,A Fight For Our Future,The Last Mountain,False,21119,True,,
22108,False,,,http://www.magictripmovie.com/,57211,en,,A freewheeling portrait of Ken Kesey and the M...,2011-08-05,0,107.0,6,A drug-fuelled road trip in the 60s,Magic Trip,False,22109,True,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
44762,False,,,,412059,en,,"In forgotten towns along the American border, ...",2018-04-04,0,105.0,6,,Mobile Homes,False,44763,True,,
42108,False,,,,332283,en,,The love affair between poet Percy Shelley and...,2018-04-25,0,0.0,6,,Mary Shelley,False,42109,True,,
15842,False,14890,,,38700,en,,The continuing adventures of Miami detectives ...,2018-11-07,0,0.0,4,,Bad Boys for Life,False,15843,True,,
40973,False,,12000000.0,,299782,en,,"Orson Welles' unfinished masterpiece, restored...",2018-12-31,0,0.0,6,,The Other Side of the Wind,False,40974,True,,


# Oczyszczenie danych

## Movies

In [51]:
# Trim leading/trailing whitespace from the free-text columns of `movies`.
for text_column in ['homepage', 'original_title', 'overview', 'tagline', 'title']:
    movies[text_column] = movies[text_column].str.strip()

movies.head()

Unnamed: 0,adult,collection_id,budget,homepage,id,original_language_id,original_title,overview,release_date,revenue,runtime,status_id,tagline,title,video,gallery_id,visibility,average_vote,count_vote
0,False,,,,2,fi,,Taisto Kasurinen is a Finnish coal miner whose...,1988-10-21,0,69.0,2,,Ariel,False,1,True,7.347328,262.0
1,False,,,,3,fi,Varjoja paratiisissa,"An episode in the life of Nikander, a garbage ...",1986-10-16,0,76.0,2,,Shadows in Paradise,False,2,True,7.54023,87.0
2,False,,4000000.0,,5,en,,It's Ted the Bellhop's first night on the job....,1995-12-09,4300000,98.0,2,Twelve outrageous guests. Four scandalous requ...,Four Rooms,False,3,True,6.818062,6090.0
3,False,,,,6,en,,"While racing to a boxing match, Frank, Mike, J...",1993-10-15,12136938,110.0,2,Don't move. Don't whisper. Don't even breathe.,Judgment Night,False,4,True,5.849725,1271.0
4,False,10.0,11000000.0,http://www.starwars.com/films/star-wars-episod...,11,en,,Princess Leia is captured and held hostage by ...,1977-05-25,775398007,121.0,2,"A long time ago in a galaxy far, far away...",Star Wars,False,5,True,8.264599,77045.0


## Collections

In [52]:
# Trim leading/trailing whitespace from collection names.
collection['name'] = collection['name'].str.strip()
collection.head()

Unnamed: 0,id,name,gallery_id
0,10,Star Wars Collection,45434
17,84,Indiana Jones Collection,45435
28,119,The Lord of the Rings Collection,45436
25,131,Three Colors Collection,45437
37,151,Star Trek: The Original Series Collection,45438


## Genres

In [53]:
# Trim leading/trailing whitespace from genre names.
genres['name'] = genres['name'].str.strip()
genres.head()

Unnamed: 0,id,name
6784,12,Adventure
65263,14,Fantasy
37860,16,Animation
29936,18,Drama
19538,27,Horror


## Countries

In [54]:
# Trim leading/trailing whitespace from country names.
countries['name'] = countries['name'].str.strip()
countries.head()

Unnamed: 0,iso_3166_1,name
36535,AE,United Arab Emirates
43061,AF,Afghanistan
45308,AL,Albania
40191,AM,Armenia
38993,AN,Netherlands Antilles


## Languages

In [55]:
# Trim leading/trailing whitespace from language names.
languages['name'] = languages['name'].str.strip()
languages.head()

Unnamed: 0,iso_639_1,name
47548,ab,
1274,af,Afrikaans
36875,am,
8625,ar,العربية
10013,as,


## Cast

In [56]:
# Trim whitespace from character names, then drop the credits whose character
# name is an empty string. Rows with a missing name are kept, because a
# missing value compares as "not equal" to ''.
cast['character'] = cast['character'].str.strip()
cast = cast.loc[cast['character'].ne('')]
cast.head()

Unnamed: 0,movie_id,character,person_id,order
0,862,Woody (voice),31,0
1,862,Buzz Lightyear (voice),12898,1
2,862,Mr. Potato Head (voice),7167,2
3,862,Slinky Dog (voice),12899,3
4,862,Rex (voice),12900,4


## Crew

In [57]:
# Trim leading/trailing whitespace from the textual crew columns.
for text_column in ['department', 'job']:
    crew[text_column] = crew[text_column].str.strip()

crew.head()

Unnamed: 0,movie_id,department,job,person_id
0,862,Directing,Director,7879
1,862,Writing,Screenplay,12891
2,862,Writing,Screenplay,7
3,862,Writing,Screenplay,12892
4,862,Writing,Screenplay,12893


## People

In [58]:
# Trim leading/trailing whitespace from people's first names and surnames.
for text_column in ['name', 'surname']:
    people[text_column] = people[text_column].str.strip()

people.head()

Unnamed: 0,id,name,surname,gender,gallery_id
161631,1,George,Lucas,2,47129
195489,2,Mark,Hamill,2,47130
340878,3,Harrison,Ford,2,47131
20669,4,Carrie,Fisher,1,47132
142927,5,Peter,Cushing,2,47133


# Zapisanie danych do bazy danych

## Przygotowanie połączenia

In [59]:
from urllib.parse import quote

# Percent-encode the credentials before embedding them in the database URL.
# Characters such as '@', '/', ':' or '%' in a user name or password would
# otherwise be read as URL syntax when the connection string is parsed;
# SQLAlchemy decodes the escapes again before connecting.
# safe='' makes quote() escape '/' as well, which it leaves alone by default.
engine = create_engine(
    f'mysql+pymysql://{quote(DB_USER, safe="")}:{quote(DB_PASS, safe="")}@{DB_HOST}/{DB_NAME}'
)

### Zapisanie tabel

In [60]:
# Target table name -> DataFrame to load, in the order the loads are run.
# NOTE(review): presumably ordered so that referenced tables are filled before
# the tables that point at them - confirm against the database schema.
list_tabels = [
    {'nazwa_tabeli': table_name, 'data': frame}
    for table_name, frame in [
        ('photos_gallery', gallery_table),
        ('photos_photo', photos),
        ('movies_collection', collection),
        ('movies_movie_status', status),
        ('movies_language', languages),
        ('movies_country', countries),
        ('movies_company', company),
        ('movies_keyword', keywords),
        ('movies_genre', genres),
        ('movies_movie', movies),
        ('movies_movie_genres', movies_genres),
        ('movies_movie_keywords', movies_keywords),
        ('movies_movie_production_companies', movies_companies),
        ('movies_movie_production_countries', movies_countries),
        ('movies_movie_spoken_languages', spoken_languages),
        ('movies_person', people),
        ('movies_crew', crew),
        ('movies_cast', cast),
    ]
]

# Append every frame to its table, sending multi-row INSERTs in chunks of
# 1000 rows; the DataFrame index is not written.
for table_spec in tqdm(list_tabels):
    table_spec['data'].to_sql(
        table_spec['nazwa_tabeli'],
        con=engine,
        if_exists='append',
        index=False,
        method='multi',
        chunksize=1000,
    )


100%|███████████████████████████████████████████| 18/18 [08:54<00:00, 29.71s/it]
