# Phase II: Data Curation, Exploratory Analysis and Plotting



## Imports

In [2]:
import requests
import pandas as pd

## Getting the Data

In [72]:
# NOTE(review): the API key is committed in the notebook source; consider
# loading it from an environment variable or a secrets store before sharing.
api_key = "03bcc17f7d105b13199e0325b659d4ab"
page = 1
url = f"https://api.themoviedb.org/3/movie/top_rated?api_key={api_key}&language=en-US&page={page}"

# Fetch one page of top-rated movies. The timeout stops the cell from hanging
# on a stalled connection, and raise_for_status() reports an HTTP failure
# directly instead of a later KeyError on data['results'].
response = requests.get(url, timeout=10)
response.raise_for_status()
data = response.json()

# Build the raw dataframe from the 'results' entries of the response.
df = pd.DataFrame(data['results'])

def get_languages():
    """
    This function gets the list of movie languages from the TMDB.

    The request is authenticated with the module-level ``api_key``.

    Returns:
        response: The decoded JSON body of the languages endpoint.
            clean_data() iterates it as a sequence of entries that carry
            'iso_639_1' and 'english_name' keys.

    Raises:
        requests.HTTPError: If TMDB answers with an error status code.
    """
    url = "https://api.themoviedb.org/3/configuration/languages"
    params = {"api_key": api_key}
    headers = {"accept": "application/json"}

    # The timeout keeps a stalled connection from blocking the notebook forever.
    response = requests.get(url, headers=headers, params=params, timeout=10)
    # Fail here rather than handing an error payload to the caller.
    response.raise_for_status()
    return response.json()

def get_movie_genres():
    """
    This function gets the list of movie genres from the TMDB.

    The request is authenticated with the module-level ``api_key``.

    Returns:
        response (dict): The decoded JSON body of the genre endpoint.
            clean_data() reads its 'genres' entry, whose items carry
            'id' and 'name' keys.

    Raises:
        requests.HTTPError: If TMDB answers with an error status code.
    """
    url = "https://api.themoviedb.org/3/genre/movie/list?language=en"
    params = {"api_key": api_key}
    headers = {"accept": "application/json"}

    # The timeout keeps a stalled connection from blocking the notebook forever.
    response = requests.get(url, headers=headers, params=params, timeout=10)
    # Fail here rather than handing an error payload to the caller.
    response.raise_for_status()

    return response.json()

# Preview the first rows of the raw results (displayed as the cell output).
df.head()


Unnamed: 0,adult,backdrop_path,genre_ids,id,original_language,original_title,overview,popularity,poster_path,release_date,title,video,vote_average,vote_count
0,False,/v8xVDqt8uCul3c3mgx4VpGCwxJC.jpg,"[18, 80]",278,en,The Shawshank Redemption,Imprisoned in the 1940s for the double murder ...,30.3534,/9cqNxx0GxF0bflZmeSMuL5tnGzr.jpg,1994-09-23,The Shawshank Redemption,False,8.713,29052
1,False,/jdHsptJbtalEuVhCV5i7kSC3g0x.jpg,"[18, 80]",238,en,The Godfather,"Spanning the years 1945 to 1955, a chronicle o...",31.2192,/3bhkrj58Vtu7enYsRolD1fZdja1.jpg,1972-03-14,The Godfather,False,8.686,21954
2,False,/kGzFbGhp99zva6oZODW5atUtnqi.jpg,"[18, 80]",240,en,The Godfather Part II,In the continuing saga of the Corleone crime f...,18.6726,/hek3koDUyRQk7FIhPXsa6mT2Zc3.jpg,1974-12-20,The Godfather Part II,False,8.572,13263
3,False,/zb6fM1CX41D9rF9hdgclu0peUmy.jpg,"[18, 36, 10752]",424,en,Schindler's List,The true story of how businessman Oskar Schind...,15.6155,/sF1U4EUQS8YHUYjNl3pMGNIQyr0.jpg,1993-12-15,Schindler's List,False,8.567,16785
4,False,/tj6iPnz18hGfr0LKqWmG6Cp3niO.jpg,[18],389,en,12 Angry Men,The defense and the prosecution have rested an...,13.6471,/ow3wq89wM8qd5X7hWKxiRfsFf9C.jpg,1957-04-10,12 Angry Men,False,8.5,9468


## Cleaning the Data

In [None]:
def _split_release_date(release_date):
    """Split a 'YYYY-MM-DD' string into (year, month, day) ints, or three Nones if it is blank or not a string."""
    if not isinstance(release_date, str) or not release_date.strip():
        return None, None, None
    return int(release_date[0:4]), int(release_date[5:7]), int(release_date[8:10])


def clean_data(df, languages=None, genres=None):
    """
    Cleans the data given into these metrics: id, title, release year/month/day, popularity, language, genres, rating average, rating count.

    Args:
        df (DataFrame): the inputted dataframe to be cleaned. Each row must
            provide 'id', 'title', 'release_date', 'original_language',
            'popularity', 'genre_ids', 'vote_average' and 'vote_count'.
        languages (dict, optional): maps a language code to its English name.
            When omitted, the table is fetched with get_languages().
        genres (dict, optional): maps a genre id to its name. When omitted,
            the table is fetched with get_movie_genres().

    Returns:
        cleaned_df (DataFrame): a cleaned dataframe sorted by 'movie_id' with
            a fresh 0..n-1 index. Rows whose release date is blank or missing
            get empty release_year/release_month/release_day values.

    Raises:
        KeyError: if a language code or genre id is absent from its lookup table.
    """
    # Only hit the network when the caller did not supply the lookup tables,
    # so cleaning several pages does not re-download them every time.
    if languages is None:
        languages = {lang['iso_639_1']: lang['english_name'] for lang in get_languages()}
    if genres is None:
        genres = {genre['id']: genre['name'] for genre in get_movie_genres()['genres']}

    # Output column order of the cleaned dataframe.
    columns = [
        'movie_id',
        'title',
        'popularity',
        'rating',
        'number_of_raters',
        'genres',
        'release_year',
        'release_month',
        'release_day',
        'original_language',
    ]

    records = []
    for movie in df.to_dict(orient='records'):
        year, month, day = _split_release_date(movie['release_date'])
        records.append({
            'movie_id': movie['id'],
            'title': movie['title'],
            'popularity': float(movie['popularity']),
            'rating': float(movie['vote_average']),
            'number_of_raters': int(movie['vote_count']),
            # translate every genre id into its readable name
            'genres': [genres[genre_id] for genre_id in movie['genre_ids']],
            'release_year': year,
            'release_month': month,
            'release_day': day,
            'original_language': languages[movie['original_language']],
        })

    # Passing the column list keeps the schema even when there are no rows.
    cleaned_df = pd.DataFrame(records, columns=columns)
    cleaned_df = cleaned_df.sort_values(by='movie_id')  # sorting is cosmetic, not required
    return cleaned_df.reset_index(drop=True)  # renumber the rows after sorting

# Clean the raw results and display the resulting table as the cell output.
cleaned_df = clean_data(df)
cleaned_df

Unnamed: 0,movie_id,title,popularity,rating,number_of_raters,genres,release_year,release_month,release_day,original_language
0,13,Forrest Gump,22.4576,8.464,28743,"[Comedy, Drama, Romance]",1994,6,23,English
1,122,The Lord of the Rings: The Return of the King,25.9816,8.49,25560,"[Adventure, Fantasy, Action]",2003,12,17,English
2,129,Spirited Away,20.8639,8.535,17566,"[Animation, Family, Fantasy]",2001,7,20,Japanese
3,155,The Dark Knight,33.3306,8.524,34561,"[Drama, Action, Crime, Thriller]",2008,7,16,English
4,238,The Godfather,31.2192,8.686,21954,"[Drama, Crime]",1972,3,14,English
5,240,The Godfather Part II,18.6726,8.572,13263,"[Drama, Crime]",1974,12,20,English
6,278,The Shawshank Redemption,30.3534,8.713,29052,"[Drama, Crime]",1994,9,23,English
7,346,Seven Samurai,8.7686,8.5,4006,"[Action, Drama]",1954,4,26,Japanese
8,389,12 Angry Men,13.6471,8.5,9468,[Drama],1957,4,10,English
9,424,Schindler's List,15.6155,8.567,16785,"[Drama, History, War]",1993,12,15,English


## Visualizations