# Movie Recommendation System

## Cel projektu

- Zbudowanie systemu rekomendacji filmów na podstawie ocen użytkowników.
- Wykorzystanie SQL do analizy danych i Python do implementacji modelu rekomendacji.

## Wczytywanie danych

In [36]:
import pandas as pd
import numpy as np
from sqlalchemy import create_engine
import seaborn as sns
import matplotlib.pyplot as plt

In [38]:
movies_metadata = pd.read_csv("../input/movies_metadata.csv", low_memory=False) # wyłaczenie trybu oszczedzania pamięci
credits = pd.read_csv("../input/credits.csv")
keywords = pd.read_csv("../input/keywords.csv")
links = pd.read_csv("../input/links.csv")
links_small = pd.read_csv("../input/links_small.csv")
ratings = pd.read_csv("../input/ratings.csv")
ratings_small = pd.read_csv("../input/ratings_small.csv")

### Analiza struktury danych

- movies_metadata.csv  zawiera informacje o filmach takie jak tytuł, data, premiery, budżet itp.
- credits.csv  zawiera informacje o obsadzie i ekipie filmowej
- ratings.csv  zawiera oceny filmów wystawione przez użytkowników
- keywords.csv  zawiera słowa kluczowe
- links.csv  zawiera identyfikatory filmów w róznych bazach danych  (np. IMDb, TMDb)

Format JSON który zastoswałem podczas tworzenia tabeli jest nie poprawny poniewaz w standardach JSON używany jest  podwójny cudzysłów a użyty jest pojedynczy cudzysłów.



In [42]:
def fix_json_format(value):
    if isinstance(value,str):
        value = value.replace("'",'"')
    return value

movies_metadata_json = movies_metadata.map(fix_json_format)
credits_json = credits.map(fix_json_format)
keywords_json = keywords.map(fix_json_format)
links_json = links.map(fix_json_format)
links_small_json = links_small.map(fix_json_format)
ratings_json = ratings.map(fix_json_format)
ratings_small_json = ratings_small.map(fix_json_format)


In [43]:
movies_metadata_json.to_csv("../input/movies_metadata_json.csv", index=False)
credits_json.to_csv("../input/credits_json.csv", index=False)
keywords_json.to_csv("../input/keywords_json.csv", index=False)
links_json.to_csv("../input/links_json.csv", index=False)
links_small_json.to_csv("../input/links_small_json.csv", index=False)
ratings_json.to_csv("../input/ratings_json.csv", index=False)
ratings_small_json.to_csv("../input/ratings_small_json.csv", index=False)

In [44]:
movies_metadata_json = pd.read_csv("../input/movies_metadata_json.csv", low_memory=False) # wyłaczenie trybu oszczedzania pamięci
credits_json = pd.read_csv("../input/credits_json.csv")
keywords_json = pd.read_csv("../input/keywords_json.csv")
links_json = pd.read_csv("../input/links_json.csv")
links_small_json = pd.read_csv("../input/links_small_json.csv")
ratings_json = pd.read_csv("../input/ratings_json.csv")
ratings_small_json = pd.read_csv("../input/ratings_small_json.csv")

#### movies_metadata

In [46]:
pd.set_option('display.max_columns',None)
pd.set_option('display.max_rows',None)

movies_metadata_json.iloc[0]

adult                                                                False
belongs_to_collection    {"id": 10194, "name": "Toy Story Collection", ...
budget                                                            30000000
genres                   [{"id": 16, "name": "Animation"}, {"id": 35, "...
homepage                              http://toystory.disney.com/toy-story
id                                                                     862
imdb_id                                                          tt0114709
original_language                                                       en
original_title                                                   Toy Story
overview                 Led by Woody, Andy"s toys live happily in his ...
popularity                                                       21.946943
poster_path                               /rhIRbceoE9lR4veEXuwCC2wARtG.jpg
production_companies        [{"name": "Pixar Animation Studios", "id": 3}]
production_countries     

In [4]:
movies_metadata.columns

Index(['adult', 'belongs_to_collection', 'budget', 'genres', 'homepage', 'id',
       'imdb_id', 'original_language', 'original_title', 'overview',
       'popularity', 'poster_path', 'production_companies',
       'production_countries', 'release_date', 'revenue', 'runtime',
       'spoken_languages', 'status', 'tagline', 'title', 'video',
       'vote_average', 'vote_count'],
      dtype='object')

-- Tabela: movies_metadata

CREATE TABLE movies_metadata (
adult BOOLEAN,
belongs_to_collection JSON, 
budget BIGINT, 
genres JSON, 
homepage TEXT, 
id INTEGER PRIMARY KEY,
imdb_id VARCHAR(20), 
original_language VARCHAR(10), 
original_title TEXT, 
overviewt TEXT,
popularity FLOAT, 
poster_path BINARY, 
production_companies JSON,
production_countries JSON, 
release_date DATE, 
revenue BIGINT, 
runtime FLOAT,
spoken_languages JSON, 
status VARCHAR(20), 
tagline TEXT, 
title TEXT, 
video BOOLEAN,
vote_average FLOAT, 
vote_count INTEGER
);

#### credits

In [5]:
credits.sample(5)

Unnamed: 0,cast,crew,id
6410,"[{'cast_id': 1, 'character': ""Danny O'Shea"", '...","[{'credit_id': '52fe43f7c3a368484e0083f3', 'de...",20726
16482,"[{'cast_id': 3, 'character': 'Atsushi', 'credi...","[{'credit_id': '58a6b3d192514174420056d1', 'de...",25051
10849,"[{'cast_id': 1, 'character': 'Leprechaun', 'cr...","[{'credit_id': '52fe47cc9251416c750a5dbf', 'de...",19287
34199,"[{'cast_id': 1, 'character': 'Sgt. Michael Ran...","[{'credit_id': '52fe497b9251416c910aeb2f', 'de...",86251
8909,"[{'cast_id': 6, 'character': 'Ashik Kerib', 'c...","[{'credit_id': '52fe49079251416c750bb4e1', 'de...",92663


In [8]:
credits['crew'].isnull().sum()

0

In [238]:
display(credits.head())

Unnamed: 0,cast,crew,id
0,"[{'cast_id': 14, 'character': 'Woody (voice)',...","[{'credit_id': '52fe4284c3a36847f8024f49', 'de...",862
1,"[{'cast_id': 1, 'character': 'Alan Parrish', '...","[{'credit_id': '52fe44bfc3a36847f80a7cd1', 'de...",8844
2,"[{'cast_id': 2, 'character': 'Max Goldman', 'c...","[{'credit_id': '52fe466a9251416c75077a89', 'de...",15602
3,"[{'cast_id': 1, 'character': ""Savannah 'Vannah...","[{'credit_id': '52fe44779251416c91011acb', 'de...",31357
4,"[{'cast_id': 1, 'character': 'George Banks', '...","[{'credit_id': '52fe44959251416c75039ed7', 'de...",11862


In [38]:
credits.columns

Index(['cast', 'crew', 'id'], dtype='object')

-- Tabela: credits
CREATE TABLE credits (
cast JSON, 
crew JSON, 
FOREIGN KEY (id) REFERENCES movies_metadata(id)
)

#### keywords

In [60]:
keywords.sample()

Unnamed: 0,id,keywords
26456,35986,"[{'id': 10291, 'name': 'organized crime'}, {'i..."


In [63]:
keywords.columns

Index(['id', 'keywords'], dtype='object')

-- Tabela: keywords
CREATE TABLE keywords (
FOREIGN KEY (id) REFERENCES movies_metadata(id)
keywords JSON
)

#### links

In [236]:
links.sample()

Unnamed: 0,movieId,imdbId,tmdbId
31774,138120,1457759,297173.0


In [70]:
links.columns

Index(['movieId', 'imdbId', 'tmdbId'], dtype='object')

#### ratings

In [68]:
ratings.sample()

Unnamed: 0,userId,movieId,rating,timestamp
7911066,81533,4446,2.0,1073359461


In [72]:
ratings.columns

Index(['userId', 'movieId', 'rating', 'timestamp'], dtype='object')