Data Wrangling

In [2]:
import pandas as pd
import pyarrow.parquet as pq
import os

def read_parquet_files_to_dataframe(directory_path):
    """Read every ``.parquet`` file in a directory into a single DataFrame.

    Files are processed in sorted filename order so that the row order of the
    result is reproducible; ``os.listdir`` returns entries in arbitrary order.

    Parameters
    ----------
    directory_path : str or path-like
        Directory scanned (non-recursively) for names ending in ``.parquet``.

    Returns
    -------
    pandas.DataFrame
        Row-wise concatenation of all files, re-indexed from 0.

    Raises
    ------
    ValueError
        If the directory contains no ``.parquet`` files.
    """
    files = sorted(f for f in os.listdir(directory_path) if f.endswith('.parquet'))

    # Fail with an actionable message instead of letting pd.concat complain
    # about an empty list of objects.
    if not files:
        raise ValueError(f"No .parquet files found in {directory_path!r}")

    # Read each file into its own DataFrame
    dataframes = [
        pq.ParquetFile(os.path.join(directory_path, file)).read().to_pandas()
        for file in files
    ]

    # Concatenate all the DataFrames into a single DataFrame
    return pd.concat(dataframes, ignore_index=True)

# Load every parquet file under tmdb/ into one combined frame
merged_dataframe = read_parquet_files_to_dataframe(directory_path='tmdb/')


In [7]:
# Show untruncated cell contents and every column when rendering frames.
# Full option names are spelled out: pandas resolves abbreviated names by
# pattern matching, which can become ambiguous if similar options are added.
pd.set_option('display.max_colwidth', None)
pd.set_option('display.max_columns', None)
merged_dataframe.head(1)


Unnamed: 0,adult,backdrop_path,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,popularity,poster_path,production_companies,production_countries,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count,Title,Year,Rated,Released,Runtime,Genre,Director,Writer,Actors,Plot,Language,Country,Awards,Poster,Ratings,Metascore,imdbRating,imdbVotes,imdbID,Type,DVD,BoxOffice,Production,Website,Response,belongs_to_collection.id,belongs_to_collection.name,belongs_to_collection.poster_path,belongs_to_collection.backdrop_path
0,False,/9Kg322bGsEbmp94LjCCVGz3cpNw.jpg,,10500000,"[{'id': 878, 'name': 'Science Fiction'}, {'id': 12, 'name': 'Adventure'}, {'id': 10751, 'name': 'Family'}, {'id': 14, 'name': 'Fantasy'}]",http://www.et20.com/,601,tt0083866,en,E.T. the Extra-Terrestrial,"An alien is left behind on Earth and saved by the 10-year-old Elliot who decides to keep him hidden in his home. While a task force hunts for the extra-terrestrial, Elliot, his brother, and his little sister Gertie form an emotional bond with their new friend, and try to help him find his way home.",45.478,/an0nD6uq6byfxXCfk6lQBzdL2J1.jpg,"[{'id': 33, 'logo_path': '/8lvHyhjr8oUKOOy2dKXoALWKdp0.png', 'name': 'Universal Pictures', 'origin_country': 'US'}, {'id': 56, 'logo_path': '/cEaxANEisCqeEoRvODv2dO1I0iI.png', 'name': 'Amblin Entertainment', 'origin_country': 'US'}]","[{'iso_3166_1': 'US', 'name': 'United States of America'}]",1982-06-11,792965500,115,"[{'english_name': 'English', 'iso_639_1': 'en', 'name': 'English'}]",Released,He is afraid. He is alone. He is three million light years from home.,E.T. the Extra-Terrestrial,False,7.512,10482,E.T. the Extra-Terrestrial,1982,PG,11 Jun 1982,115 min,"Adventure, Family, Sci-Fi",Steven Spielberg,Melissa Mathison,"Henry Thomas, Drew Barrymore, Peter Coyote",A troubled child summons the courage to help a friendly alien escape from Earth and return to his home planet.,English,United States,Won 4 Oscars. 52 wins & 36 nominations total,https://m.media-amazon.com/images/M/MV5BMTQ2ODFlMDAtNzdhOC00ZDYzLWE3YTMtNDU4ZGFmZmJmYTczXkEyXkFqcGdeQXVyMTQxNzMzNDI@._V1_SX300.jpg,"[{'Source': 'Internet Movie Database', 'Value': '7.9/10'}, {'Source': 'Rotten Tomatoes', 'Value': '99%'}, {'Source': 'Metacritic', 'Value': '92/100'}]",92,7.9,425802,tt0083866,movie,22 Jul 2015,"$437,141,279",,,True,,,,


In [69]:
def extract_rating(ratings, source):
    """Return the rating value published by ``source``, or ``None`` if absent.

    Parameters
    ----------
    ratings : iterable of dict, None, or float NaN
        Entries shaped like ``{'Source': ..., 'Value': ...}``. ``None`` and
        float NaN are treated as "no ratings available".
    source : str
        Name to match against each entry's ``'Source'`` key.

    Returns
    -------
    The ``'Value'`` of the first entry whose ``'Source'`` equals ``source``,
    or ``None`` when there is no such entry.
    """
    # A missing cell may arrive as None or float NaN; neither is iterable.
    # Explicit checks are used rather than `not ratings`, which is not a safe
    # emptiness test for array-like inputs.
    if ratings is None or isinstance(ratings, float):
        return None

    return next(
        (rating['Value'] for rating in ratings if rating['Source'] == source),
        None,
    )

# Columns carried over from the merged frame, in the order they should appear.
selected_columns = [
    'id', 'title', 'release_date', 'original_language', 'spoken_languages', 'genres',
    'production_companies', 'production_countries', 'runtime', 'status', 'Rated', 'adult',
    'budget', 'revenue', 'BoxOffice', 'popularity', 'vote_average', 'vote_count',
    'Awards', 'Ratings', 'Metascore', 'imdbRating', 'imdbVotes',
]

# Columns holding a list of dicts, mapped to the dict key that is kept
# when the list is flattened into a comma-separated string.
nested_columns = {
    'genres': 'name',
    'production_companies': 'name',
    'production_countries': 'iso_3166_1',
    'spoken_languages': 'iso_639_1',
}


def _join_field(items, key):
    """Join the ``key`` value of every dict in ``items`` with ', '."""
    return ', '.join([item[key] for item in items])


df = merged_dataframe[selected_columns].copy()

# Pull the Rotten Tomatoes entry out of the per-movie list of ratings.
df['Rotten_Tomatoes_Rating'] = df['Ratings'].apply(lambda x: extract_rating(x, 'Rotten Tomatoes'))

for column, key in nested_columns.items():
    df[column] = df[column].apply(_join_field, args=(key,))

# Parse the release date once and derive both calendar parts from it.
release_dates = pd.to_datetime(df['release_date'])
df['release_month'] = release_dates.dt.month
df['release_year'] = release_dates.dt.year

df = df.drop(columns=['release_date', 'Ratings'])

# Standardize column names
df.columns = df.columns.str.lower().str.replace(' ', '_', regex=False)
df = df.rename(columns={
    'popularity': 'tmdb_popularity',
    'vote_average': 'tmdb_vote_average',
    'vote_count': 'tmdb_vote_count',
    'metascore': 'metacritic_rating',
    'imdbrating': 'imdb_rating',
    'imdbvotes': 'imdb_votes',
})

# Strip currency / thousands / percent symbols before numeric conversion.
# regex=False is explicit because '$' is an end-of-string anchor when the
# pattern is interpreted as a regular expression.
df['boxoffice'] = (
    df['boxoffice']
    .str.replace('$', '', regex=False)
    .str.replace(',', '', regex=False)
)
df['imdb_votes'] = df['imdb_votes'].str.replace(',', '', regex=False)
df['rotten_tomatoes_rating'] = df['rotten_tomatoes_rating'].str.replace('%', '', regex=False)

# Values that cannot be parsed as numbers become NaN (errors='coerce').
for column in ['boxoffice', 'imdb_rating', 'imdb_votes',
               'rotten_tomatoes_rating', 'metacritic_rating']:
    df[column] = pd.to_numeric(df[column], errors='coerce')

# Lower-case every remaining text (object dtype) column.
categorical_vars = list(df.dtypes[df.dtypes == 'object'].index)
for c in categorical_vars:
    df[c] = df[c].str.lower()

df

Unnamed: 0,id,title,original_language,spoken_languages,genres,production_companies,production_countries,runtime,status,rated,adult,budget,revenue,boxoffice,tmdb_popularity,tmdb_vote_average,tmdb_vote_count,awards,metacritic_rating,imdb_rating,imdb_votes,rotten_tomatoes_rating,release_month,release_year
0,601,e.t. the extra-terrestrial,en,en,"science fiction, adventure, family, fantasy","universal pictures, amblin entertainment",us,115,released,pg,False,10500000,792965500,437141279.0,45.478,7.512,10482,won 4 oscars. 52 wins & 36 nominations total,92.0,7.9,425802.0,99.0,6,1982
1,602,independence day,en,en,"action, adventure, science fiction","20th century fox, centropolis entertainment",us,145,released,pg-13,False,75000000,817400891,306169268.0,40.066,6.869,8894,won 1 oscar. 35 wins & 35 nominations total,59.0,7.0,594628.0,68.0,6,1996
2,603,the matrix,en,en,"action, science fiction","village roadshow pictures, groucho ii film partnership, silver pictures, warner bros. pictures",us,136,released,r,False,63000000,463517383,172076928.0,69.459,8.206,23852,won 4 oscars. 42 wins & 52 nominations total,73.0,8.7,1992249.0,83.0,3,1999
3,604,the matrix reloaded,en,"en, fr","adventure, action, thriller, science fiction","village roadshow pictures, silver pictures, npv entertainment",us,138,released,r,False,150000000,738599701,281576461.0,41.368,7.040,10140,8 wins & 34 nominations,62.0,7.2,616809.0,74.0,5,2003
4,605,the matrix revolutions,en,"en, fr","adventure, action, thriller, science fiction","village roadshow pictures, npv entertainment, silver pictures",us,129,released,r,False,150000000,424988211,139313948.0,39.996,6.710,9129,5 wins & 36 nominations,47.0,6.7,531515.0,34.0,11,2003
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1125,196,back to the future part iii,en,en,"adventure, comedy, science fiction","universal pictures, amblin entertainment",us,119,released,pg,False,40000000,244527583,88277583.0,31.724,7.500,9687,5 wins & 11 nominations,55.0,7.4,468495.0,80.0,5,1990
1126,197,braveheart,en,"en, fr, la, gd","action, drama, history, war","icon entertainment international, the ladd company, b.h. finance c.v.",us,177,released,r,False,72000000,213216216,75609945.0,42.167,7.933,9355,won 5 oscars. 34 wins & 34 nominations total,68.0,8.3,1069827.0,76.0,5,1995
1127,198,to be or not to be,en,"en, de","comedy, war",united artists,us,99,released,passed,False,1200000,1500000,3270000.0,11.815,7.889,630,nominated for 1 oscar. 2 wins & 2 nominations total,86.0,8.2,40902.0,96.0,3,1942
1128,199,star trek: first contact,en,en,"science fiction, action, adventure, thriller",paramount,us,111,released,pg-13,False,46000000,150000000,92027888.0,27.217,7.294,1593,nominated for 1 oscar. 8 wins & 21 nominations total,71.0,7.6,130183.0,93.0,11,1996


In [39]:
# Count missing values in the raw BoxOffice column. The cleaned frame `df`
# exposes this column as 'boxoffice' (column names are lower-cased during
# wrangling), so the original spelling only exists on the merged frame.
null_count = merged_dataframe['BoxOffice'].isnull().sum()
null_count

0

In [18]:
# List every column available in the merged frame
merged_dataframe.columns

Index(['adult', 'backdrop_path', 'belongs_to_collection', 'budget', 'genres',
       'homepage', 'id', 'imdb_id', 'original_language', 'original_title',
       'overview', 'popularity', 'poster_path', 'production_companies',
       'production_countries', 'release_date', 'revenue', 'runtime',
       'spoken_languages', 'status', 'tagline', 'title', 'video',
       'vote_average', 'vote_count', 'Title', 'Year', 'Rated', 'Released',
       'Runtime', 'Genre', 'Director', 'Writer', 'Actors', 'Plot', 'Language',
       'Country', 'Awards', 'Poster', 'Ratings', 'Metascore', 'imdbRating',
       'imdbVotes', 'imdbID', 'Type', 'DVD', 'BoxOffice', 'Production',
       'Website', 'Response', 'belongs_to_collection.id',
       'belongs_to_collection.name', 'belongs_to_collection.poster_path',
       'belongs_to_collection.backdrop_path'],
      dtype='object')

In [70]:
# Show the dtype of each column of df
df.dtypes

id                          int64
title                      object
original_language          object
spoken_languages           object
genres                     object
production_companies       object
production_countries       object
runtime                     int64
status                     object
rated                      object
adult                        bool
budget                      int64
revenue                     int64
boxoffice                 float64
tmdb_popularity           float64
tmdb_vote_average         float64
tmdb_vote_count             int64
awards                     object
metacritic_rating         float64
imdb_rating               float64
imdb_votes                float64
rotten_tomatoes_rating    float64
release_month               int32
release_year                int32
dtype: object