# Data Cleaning and Processing
Clean up the production-country data that was added to the movie records:
- convert from JSON to CSV
- remove movies that have no production country
- standardize the production-country data


In [2]:
import json
import pandas as pd

In [3]:
# Load the enriched production JSON.
# The encoding is pinned to UTF-8 so that any non-ASCII text in the file is
# decoded the same way regardless of the platform's default locale encoding.
with open('Data/NetflixMovies_production_enriched.json', 'r', encoding='utf-8') as f:
    production_data = json.load(f)

In [4]:
# Build a DataFrame from the parsed JSON and preview its first five rows.
df = pd.DataFrame(data=production_data)
df.head(n=5)


Unnamed: 0,Unnamed: 1,searched,tconst,Title,Available Globally?,Release Date,Hours Viewed,Runtime,Views,Simple_Title,net_year,titleType,primaryTitle,originalTitle,startYear,runtimeMinutes,genres,productionCountries,originCountry,overview
0,0,Back in Action,tt21191806,Back in Action,Yes,2025-01-17,313000000,1:54,164700000,Back in Action,2025.0,movie,Back in Action,Back in Action,2025,114,"Action,Comedy","[{'iso_3166_1': 'US', 'name': 'United States o...",[US],Fifteen years after vanishing from the CIA to ...
1,1,STRAW,tt32550101,STRAW,Yes,2025-06-06,185200000,1:48,102900000,STRAW,2025.0,movie,Straw,Straw,2025,105,"Drama,Thriller","[{'iso_3166_1': 'US', 'name': 'United States o...",[US],What will be her last straw? A devastatingly b...
2,2,The Life List,tt2172954,The Life List,Yes,2025-03-28,198900000,2:05,95500000,The Life List,2025.0,movie,The Life List,The Life List,2025,123,"Comedy,Drama,Romance","[{'iso_3166_1': 'US', 'name': 'United States o...",[US],When her mother sends her on a quest to comple...
3,3,Exterritorial,tt30876483,Exterritorial,Yes,2025-04-30,159000000,1:49,87500000,Exterritorial,2025.0,movie,Exterritorial,Exterritorial,2025,109,"Action,Mystery,Thriller","[{'iso_3166_1': 'DE', 'name': 'Germany'}, {'is...","[AT, DE]","When her son vanishes inside a US consulate, e..."
4,4,Havoc,tt14123284,Havoc,Yes,2025-04-25,154900000,1:47,86900000,Havoc,2025.0,movie,Havoc,Havoc,2025,107,"Action,Crime,Drama","[{'iso_3166_1': 'GB', 'name': 'United Kingdom'...","[US, GB]",When a drug heist swerves lethally out of cont...


In [5]:
# Look at the first row's productionCountries value, then drill into its first
# entry to read the country name (only this last expression is displayed).
df["productionCountries"].iloc[0]
df["productionCountries"].iloc[0][0]["name"]

'United States of America'

In [6]:
# Row 3's productionCountries value lists more than one country.
df["productionCountries"].iloc[3]

[{'iso_3166_1': 'DE', 'name': 'Germany'},
 {'iso_3166_1': 'AT', 'name': 'Austria'}]

`productionCountries` contains a list of one or more dictionaries. Each dictionary holds a country name (`name`) and its ISO 3166-1 country code (`iso_3166_1`).

In [7]:
# Extract the country names from productionCountries into a new CountryName column.
# Values that are not lists are mapped to an empty list.
def _country_names(entries):
    """Return the 'name' of every country dict in entries, or [] if entries is not a list."""
    if not isinstance(entries, list):
        return []
    return [entry['name'] for entry in entries]

df['CountryName'] = df['productionCountries'].apply(_country_names)


In [8]:
# Preview the frame again to confirm the new CountryName column.
df.head(n=5)

Unnamed: 0,Unnamed: 1,searched,tconst,Title,Available Globally?,Release Date,Hours Viewed,Runtime,Views,Simple_Title,...,titleType,primaryTitle,originalTitle,startYear,runtimeMinutes,genres,productionCountries,originCountry,overview,CountryName
0,0,Back in Action,tt21191806,Back in Action,Yes,2025-01-17,313000000,1:54,164700000,Back in Action,...,movie,Back in Action,Back in Action,2025,114,"Action,Comedy","[{'iso_3166_1': 'US', 'name': 'United States o...",[US],Fifteen years after vanishing from the CIA to ...,[United States of America]
1,1,STRAW,tt32550101,STRAW,Yes,2025-06-06,185200000,1:48,102900000,STRAW,...,movie,Straw,Straw,2025,105,"Drama,Thriller","[{'iso_3166_1': 'US', 'name': 'United States o...",[US],What will be her last straw? A devastatingly b...,[United States of America]
2,2,The Life List,tt2172954,The Life List,Yes,2025-03-28,198900000,2:05,95500000,The Life List,...,movie,The Life List,The Life List,2025,123,"Comedy,Drama,Romance","[{'iso_3166_1': 'US', 'name': 'United States o...",[US],When her mother sends her on a quest to comple...,[United States of America]
3,3,Exterritorial,tt30876483,Exterritorial,Yes,2025-04-30,159000000,1:49,87500000,Exterritorial,...,movie,Exterritorial,Exterritorial,2025,109,"Action,Mystery,Thriller","[{'iso_3166_1': 'DE', 'name': 'Germany'}, {'is...","[AT, DE]","When her son vanishes inside a US consulate, e...","[Germany, Austria]"
4,4,Havoc,tt14123284,Havoc,Yes,2025-04-25,154900000,1:47,86900000,Havoc,...,movie,Havoc,Havoc,2025,107,"Action,Crime,Drama","[{'iso_3166_1': 'GB', 'name': 'United Kingdom'...","[US, GB]",When a drug heist swerves lethally out of cont...,"[United Kingdom, United States of America]"


In [15]:
# Turn Runtime into minutes. Start with a quick type probe of a sample Runtime
# value (its result is not displayed, since it is not the cell's last expression).
type(df['Runtime'].iloc[0])

def runtime_to_minutes(runtime):
    """Convert an ``"H:MM"`` runtime string into a total number of minutes.

    Parameters
    ----------
    runtime : object
        Expected to be a string such as ``"1:54"``. Only the first two
        colon-separated parts (hours, minutes) are used; any further parts
        are ignored.

    Returns
    -------
    int or None
        ``hours * 60 + minutes``, or ``None`` when ``runtime`` is not a
        string, contains no ``':'``, or its hour/minute parts are not
        valid integers.
    """
    if not isinstance(runtime, str) or ':' not in runtime:
        return None
    parts = runtime.split(':')
    try:
        hours = int(parts[0])
        minutes = int(parts[1])
    except ValueError:
        # A malformed value (e.g. "1:xx" or ":30") is treated like any other
        # unusable input instead of aborting the whole column conversion.
        return None
    return hours * 60 + minutes
# Derive RuntimeMinutes by converting each Runtime value individually.
df['RuntimeMinutes'] = df['Runtime'].map(runtime_to_minutes)


In [17]:
# df.head() is evaluated here but not shown: only the cell's last expression is displayed.
df.head()
# Check the element type of the new RuntimeMinutes column using the first row.
type(df.iloc[0]['RuntimeMinutes'])

numpy.int64

In [18]:
# Write the cleaned frame to CSV, leaving out the index column.
# NOTE(review): list-valued columns (e.g. CountryName) presumably end up as their
# string representation in the CSV; confirm downstream readers can parse that.
df.to_csv(path_or_buf='Data/NetflixMovies_production_cleaned.csv', index=False)