## Import library

In [2]:
import pandas as pd
import numpy as np

## Read data from .csv file

In [3]:
movies = pd.read_csv('dataset/movies.csv')
links = pd.read_csv('dataset/links.csv')
ratings = pd.read_csv('dataset/ratings.csv')
tags = pd.read_csv('dataset/tags.csv')

## Overview data

In [41]:
links.shape

(9742, 5)

In [5]:
ratings['userId'].unique

<bound method Series.unique of 0           1
1           1
2           1
3           1
4           1
         ... 
100831    610
100832    610
100833    610
100834    610
100835    610
Name: userId, Length: 100836, dtype: int64>

In [39]:
movies.shape

(9742, 3)

In [49]:
movies.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9742 entries, 0 to 9741
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   movieId  9742 non-null   int64 
 1   title    9742 non-null   object
 2   genres   9742 non-null   object
dtypes: int64(1), object(2)
memory usage: 228.5+ KB


In [6]:
links.head()

Unnamed: 0,movieId,imdbId,tmdbId
0,1,114709,862.0
1,2,113497,8844.0
2,3,113228,15602.0
3,4,114885,31357.0
4,5,113041,11862.0


In [21]:
links.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9742 entries, 0 to 9741
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   movieId  9742 non-null   int64  
 1   imdbId   9742 non-null   int64  
 2   tmdbId   9734 non-null   float64
dtypes: float64(1), int64(2)
memory usage: 228.5 KB


In [10]:
links[['movieId', 'tmdbId']].head()

Unnamed: 0,movieId,tmdbId
0,1,862.0
1,2,8844.0
2,3,15602.0
3,4,31357.0
4,5,11862.0


In [52]:
ratings.describe()

Unnamed: 0,userId,movieId,rating,timestamp
count,100836.0,100836.0,100836.0,100836.0
mean,326.127564,19435.295718,3.501557,1205946000.0
std,182.618491,35530.987199,1.042529,216261000.0
min,1.0,1.0,0.5,828124600.0
25%,177.0,1199.0,3.0,1019124000.0
50%,325.0,2991.0,3.5,1186087000.0
75%,477.0,8122.0,4.0,1435994000.0
max,610.0,193609.0,5.0,1537799000.0


In [53]:
ratings.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100836 entries, 0 to 100835
Data columns (total 4 columns):
 #   Column     Non-Null Count   Dtype  
---  ------     --------------   -----  
 0   userId     100836 non-null  int64  
 1   movieId    100836 non-null  int64  
 2   rating     100836 non-null  float64
 3   timestamp  100836 non-null  int64  
dtypes: float64(1), int64(3)
memory usage: 3.1 MB


In [54]:
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


### Merge data from moives table and links table to get poster of movies using tmdbId in links table

In [57]:
movies = movies.merge(links, on="movieId")

In [58]:
movies.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 9742 entries, 0 to 9741
Data columns (total 5 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   movieId  9742 non-null   int64  
 1   title    9742 non-null   object 
 2   genres   9742 non-null   object 
 3   imdbId   9742 non-null   int64  
 4   tmdbId   9734 non-null   float64
dtypes: float64(1), int64(2), object(2)
memory usage: 456.7+ KB


In [10]:
movies.head()

Unnamed: 0,movieId,title,genres,imdbId,tmdbId,poster_url
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,114709,862,https://image.tmdb.org/t/p/w500//uXDfjJbdP4ijW...
1,2,Jumanji (1995),Adventure|Children|Fantasy,113497,8844,https://image.tmdb.org/t/p/w500//6aGn2X51bahFo...
2,3,Grumpier Old Men (1995),Comedy|Romance,113228,15602,https://image.tmdb.org/t/p/w500//1FSXpj5e8l4KH...
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,114885,31357,https://image.tmdb.org/t/p/w500//4uw6HKq4vlhrS...
4,5,Father of the Bride Part II (1995),Comedy,113041,11862,https://image.tmdb.org/t/p/w500//rj4LBtwQ0uGrp...


In [59]:
def convertFloatToInt(value):
    return int(value)

In [83]:
movies['tmdbId'] = movies['tmdbId'].fillna(0)

In [84]:
count_nan = movies['tmdbId'].isnull().sum()
count_nan

0

In [86]:
movies['tmdbId'] = movies['tmdbId'].apply(convertFloatToInt)

In [87]:
movies['tmdbId'].value_counts(ascending=False)

0        8
4912     2
862      1
3432     1
14207    1
        ..
3134     1
8217     1
22478    1
26483    1
37891    1
Name: tmdbId, Length: 9734, dtype: int64

### Get poster url and overview for moives by tmdbId through API to themoviedb.org

In [13]:
import requests
import json

In [66]:
def get_poster_url(movie_id):
    if(movie_id == 0):
        return ""
    url = f"https://api.themoviedb.org/3/movie/{movie_id}?api_key=8265bd1679663a7ea12ac168da84d2e8&language=en-US"
    try:
        data = requests.get(url)
        data = data.json()
        if 'poster_path' in data:
            poster_path = data['poster_path']
            full_path = "https://image.tmdb.org/t/p/w500/" + str(poster_path)
        else: full_path = ""
        return full_path
    except:
        return ""
get_poster_url(12509)

'https://image.tmdb.org/t/p/w500//azsBSw2zw2uNHiCjTnbe9TJVEDB.jpg'

In [70]:
movies['poster_url'] = movies['tmdbId']
movies['poster_url'] = movies['poster_url'].apply(get_poster_url)

In [93]:
def convertObjectToString(obj):
    return str(obj)

In [95]:
movies['poster_url'] = movies['poster_url'].apply(convertObjectToString)

In [96]:
movies.head()

Unnamed: 0,movieId,title,genres,imdbId,tmdbId,poster_url
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,114709,862,https://image.tmdb.org/t/p/w500//uXDfjJbdP4ijW...
1,2,Jumanji (1995),Adventure|Children|Fantasy,113497,8844,https://image.tmdb.org/t/p/w500//6aGn2X51bahFo...
2,3,Grumpier Old Men (1995),Comedy|Romance,113228,15602,https://image.tmdb.org/t/p/w500//1FSXpj5e8l4KH...
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,114885,31357,https://image.tmdb.org/t/p/w500//4uw6HKq4vlhrS...
4,5,Father of the Bride Part II (1995),Comedy,113041,11862,https://image.tmdb.org/t/p/w500//rj4LBtwQ0uGrp...


In [89]:
movies.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 9742 entries, 0 to 9741
Data columns (total 6 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   movieId     9742 non-null   int64 
 1   title       9742 non-null   object
 2   genres      9742 non-null   object
 3   imdbId      9742 non-null   int64 
 4   tmdbId      9742 non-null   int64 
 5   poster_url  9742 non-null   object
dtypes: int64(3), object(3)
memory usage: 532.8+ KB


In [91]:
def get_overview(movie_id):
    if(movie_id == 0):
        return ""
    url = f"https://api.themoviedb.org/3/movie/{movie_id}?api_key=8265bd1679663a7ea12ac168da84d2e8&language=en-US"
    try:
        data = requests.get(url)
        data = data.json()
        if 'overview' in data:
            return data['overview']
        return ""
    except:
        return ""
get_overview(12509)

'In August of 1949, Life Magazine ran a banner headline that begged the question: "Jackson Pollock: Is he the greatest living painter in the United States?" The film is a look back into the life of an extraordinary man, a man who has fittingly been called "an artist dedicated to concealment, a celebrity who nobody knew." As he struggled with self-doubt, engaging in a lonely tug-of-war between needing to express himself and wanting to shut the world out, Pollock began a downward spiral.'

In [29]:
movies['overview'] = movies['tmdbId']
movies['overview'] = movies['overview'].apply(get_overview)

In [35]:
movies.head()

Unnamed: 0,movieId,imdbId,tmdbId,poster_url,overview
0,1,114709,862,https://image.tmdb.org/t/p/w500//uXDfjJbdP4ijW...,"Led by Woody, Andy's toys live happily in his ..."
1,2,113497,8844,https://image.tmdb.org/t/p/w500//6aGn2X51bahFo...,When siblings Judy and Peter discover an encha...
2,3,113228,15602,https://image.tmdb.org/t/p/w500//1FSXpj5e8l4KH...,A family wedding reignites the ancient feud be...
3,4,114885,31357,https://image.tmdb.org/t/p/w500//4uw6HKq4vlhrS...,"Cheated on, mistreated and stepped on, the wom..."
4,5,113041,11862,https://image.tmdb.org/t/p/w500//rj4LBtwQ0uGrp...,Just when George Banks has recovered from his ...


In [71]:
movies.head()

Unnamed: 0,movieId,title,genres,imdbId,tmdbId,poster_url
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,114709,862.0,https://image.tmdb.org/t/p/w500//uXDfjJbdP4ijW...
1,2,Jumanji (1995),Adventure|Children|Fantasy,113497,8844.0,https://image.tmdb.org/t/p/w500//6aGn2X51bahFo...
2,3,Grumpier Old Men (1995),Comedy|Romance,113228,15602.0,https://image.tmdb.org/t/p/w500//1FSXpj5e8l4KH...
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,114885,31357.0,https://image.tmdb.org/t/p/w500//4uw6HKq4vlhrS...
4,5,Father of the Bride Part II (1995),Comedy,113041,11862.0,https://image.tmdb.org/t/p/w500//rj4LBtwQ0uGrp...


In [97]:
movies.to_csv("dataset/moives_data_final", index=False)