In [11]:
import pandas as pd
import json

# Assuming `json_data` is the list of dictionaries you've provided.
file_path='coredb.json'
with open(file_path, 'r', encoding='utf-8') as file:
    data = json.load(file)

df = pd.DataFrame(data)

# 1. Remove unnecessary columns
columns_to_drop = ['_id', 'data_status', 'runtime', 'vote_average', 'vote_count', 'createdAt', 'updatedAt', 'trailer_id', 
                   'adult', 'budget', 'revenue', 'homepage', 'original_title', 
                   'poster_path', 'backdrop_path', 'release_status', 'imdb_id', '__v']
df = df.drop(columns=columns_to_drop)

# 2. Extract 'genres' as comma-separated names
df['genres'] = df['genres'].apply(lambda x: ', '.join([genre['name'] for genre in x]) if isinstance(x, list) else None)

# 3. Extract 'release_date' year
def extract_release_year(x):
    if isinstance(x, dict) and '$date' in x:
        return str(x['$date'])[:4]  # Extract the year portion
    return None

df['release_date'] = df['release_date'].apply(extract_release_year)

# 4. Extract director and cast names (combine them)
def extract_credits(credits):
    if isinstance(credits, list):
        directors = [person['name'] for person in credits if person['type'] == 'director']
        cast = [person['name'] for person in credits if person['type'] == 'cast']
        return ', '.join(directors + cast)  # Combine both director and cast names
    return None

df['credits'] = df['credits'].apply(extract_credits)

# 5. Extract 'keywords' as comma-separated names
df['keywords'] = df['keywords'].apply(lambda x: ', '.join([keyword['name'] for keyword in x]) if isinstance(x, list) else None)


# 6. Extract 'spoken_languages' as a comma-separated list of names
df['spoken_languages'] = df['spoken_languages'].apply(
    lambda x: ', '.join([language['name'] for language in x]) if isinstance(x, list) else None
)

# Save the cleaned DataFrame to CSV
output_file_path = 'cleaned_coredb.csv'
df.to_csv(output_file_path, index=False)

print(f"Data has been saved to {output_file_path}")

Data has been saved to cleaned_coredb.csv


In [7]:
# Print the dataframe to check the result
df.head()

Unnamed: 0,tmdb_id,media_type,title,overview,genres,release_date,tagline,credits,keywords
0,2,movie,Ariel,A Finnish man goes to the city to find a job a...,"Comedy, Drama, Romance, Crime",1988,,"Aki Kaurismäki, Tomi Salmela, Matti Pellonpää","prison, underdog, helsinki, finland, factory w..."
1,3,movie,Shadows in Paradise,"Nikander, a rubbish collector and would-be ent...","Comedy, Drama, Romance",1986,,"Aki Kaurismäki, Aki Kaurismäki, Matti Pellonpä...","helsinki, finland, salesclerk, garbage"
2,5,movie,Four Rooms,It's Ted the Bellhop's first night on the job....,Comedy,1995,Twelve outrageous guests. Four scandalous requ...,"Allison Anders, Alexandre Rockwell, Robert Rod...","hotel, new year's eve, witch, bet, sperm, hote..."
3,6,movie,Judgment Night,"Four young friends, while taking a shortcut en...","Action, Crime, Thriller",1993,Don't move. Don't whisper. Don't even breathe.,"Stephen Hopkins, Jeremy Piven, Stephen Dorff, ...","drug dealer, chicago, illinois, escape, one ni..."
4,8,movie,Life in Loops (A Megacities RMX),Timo Novotny labels his new project an experim...,Documentary,2006,A Megacities remix.,Timo Novotny,megacities
