# Initial Data Import and Extract:

In [None]:
# Dependencies
import pandas as pd
import json
# New library
import itertools 
# A new library just for fun!
import emoji 


In [None]:
# Load the full movie listing from its CSV file
movies_csv = "Resources/tmdb_5000_movies.csv"
movies = pd.read_csv(movies_csv)
movies.head()

In [None]:
# Keep only rows whose original language is English.
# Some foreign-language titles did not translate characters properly in the original list.
is_english = movies["original_language"] == "en"
en_movies = movies[is_english]
en_movies.head()

In [None]:
# Collect the ids of the English-language movies as a plain Python list
movie_id_list = en_movies['id'].to_list()


In [None]:
# Load the credits listing (cast and crew for each movie) from its CSV file
credits_csv = "Resources/tmdb_5000_credits.csv"
credits_orig = pd.read_csv(credits_csv)
credits_orig.head()

In [None]:
# Keep only the credits rows whose movie_id belongs to the en_movies dataframe
in_en_movies = credits_orig['movie_id'].isin(movie_id_list)
credits = credits_orig[in_en_movies]

credits.head()

____________

# Transform:

# Create Genres Dataframe

In [None]:
# Collect every distinct (genre id, genre name) pair found in the English movies
ids_names = set()

# Each 'genres' value is a JSON array of objects carrying an 'id' and a 'name'
for row in en_movies['genres']:
    genres = json.loads(row)
    ids_names.update((genre['id'], genre['name']) for genre in genres)

# Print the pairs in sorted order so the listing is the same on every run.
# The loop variables deliberately avoid `id`, which would rebind the builtin
# id() for every later cell of the notebook.
for genre_id, genre_name in sorted(ids_names):
    print(f"{genre_id}: {genre_name}")

In [None]:
# Create Genres DataFrame.
# The pairs are sorted first: a set has no defined order, so building the frame
# straight from it would give a different row order from run to run.
genres_df = pd.DataFrame(sorted(ids_names), columns=['genre_id', 'genre'])

# Convert ids to integers
genres_df['genre_id'] = genres_df['genre_id'].astype(int)

genres_df

#### Using emoji Library with Genres DF

In [None]:
# Add emojis to a copy of the genres df
emoji_genres = genres_df.copy()

# Dictionary mapping each genre name to an emoji alias
emoji_dic = {
    'Horror': ':face_screaming_in_fear:',
    'Science Fiction': ':alien:',
    'Foreign': ':globe_showing_europe_africa:',
    'Family': ':family_man_woman_girl_boy:',
    'Action': ':person_fencing:',
    'Music': ':musical_notes:',
    'War': ':military_medal:',
    'Comedy': ':rolling_on_the_floor_laughing:',
    'Romance': ':sparkling_heart:',
    'Drama': ':performing_arts:',
    'Thriller': ':bomb:',
    'Adventure': ':person_climbing:',
    'TV Movie': ':television:',
    'Documentary': ':video_camera:',
    'Crime': ':supervillain:',
    'History': ':classical_building:',
    'Animation': ':unicorn:',
    'Fantasy': ':troll:',
    'Mystery': ':detective:',
    'Western': ':cowboy_hat_face:',
}

# Create a DF of emoji aliases
emoji_df = pd.DataFrame(list(emoji_dic.items()), columns=['genre', 'alias'])

# Left merge keeps every genre, including any that has no entry in emoji_dic
emoji_genres = pd.merge(emoji_genres, emoji_df, on='genre', how='left')

# Turn each alias into its emoji. A genre missing from emoji_dic comes out of the
# left merge with a NaN alias; give it a blank emoji instead of passing NaN to emojize.
emoji_genres['genres_emoji'] = [
    emoji.emojize(alias, language='alias') if isinstance(alias, str) else ''
    for alias in emoji_genres['alias']
]

# Save as csv. A forward slash is used because '\e' is not a valid string escape
# and a backslash is only a path separator on Windows.
emoji_genres.to_csv('csv_outputs/emoji_genres.csv', index=False)

emoji_genres

#### Create dataframe for Movie Ids and Genre Ids

In [None]:
# Create df of movie ids and their raw genres JSON
movie_genres = en_movies[["id", "genres"]]

# Collect one (movie_id, genre_id) row per genre listed for each movie.
# Rows are gathered in a list and the frame is built once at the end; calling
# pd.concat inside the loop re-copies the accumulated frame on every iteration.
genre_rows = []
for movie_id, genres_json in zip(movie_genres['id'], movie_genres['genres']):
    genre_rows.extend((movie_id, genre['id']) for genre in json.loads(genres_json))

movie_genreids_df = pd.DataFrame(genre_rows, columns=['movie_id', 'genre_id'])

# Convert ids to integers
movie_genreids_df['genre_id'] = movie_genreids_df['genre_id'].astype(int)

# Save as csv. A forward slash is used because '\m' is not a valid string escape
# and a backslash is only a path separator on Windows.
movie_genreids_df.to_csv('csv_outputs/movieids_genreids.csv', index=False)

# Check first 10 rows of the DataFrame
movie_genreids_df.head(10)


# Create Cast Dataframe

In [None]:
# Trial run on the first movie in credits (avatar)

# Label of the first row, used to look up that movie's id and cast JSON
first_label = credits.index[0]
movie_id = credits["movie_id"].loc[first_label]
data = credits["cast"].loc[first_label]

# Decode the full cast info and keep its first five entries
data_list = json.loads(data)
five_list = data_list[:5]

# Split the five cast entries into parallel id and name lists
actor_id, actor_name = [], []
for member in five_list:
    actor_id.append(member['id'])
    actor_name.append(member['name'])

five_df = pd.DataFrame({'movie_id': movie_id, 'actor_id': actor_id, 'name': actor_name})
five_df

In [None]:
# Build one (movie_id, actor_id, name) row for each of the first five distinct
# actors in every movie's cast list.
# Actors may star as two or more characters in a single movie, so repeated
# actor ids within one movie are skipped.
MAX_ACTORS_PER_MOVIE = 5

# Rows are gathered in a list and the frame is built once at the end; calling
# pd.concat inside the loop re-copies the accumulated frame on every iteration.
cast_rows = []
for movie_id, cast_json in zip(credits['movie_id'], credits['cast']):
    # Actor ids already taken for this movie
    seen_ids = set()

    for actor in json.loads(cast_json):
        actor_id = actor['id']
        if actor_id in seen_ids:
            continue

        seen_ids.add(actor_id)
        cast_rows.append((movie_id, actor_id, actor['name']))

        # Stop once this movie has its full quota of unique actors
        if len(seen_ids) == MAX_ACTORS_PER_MOVIE:
            break

credits_actor_name = pd.DataFrame(cast_rows, columns=['movie_id', 'actor_id', 'name'])

# Convert ids to integers
credits_actor_name['actor_id'] = credits_actor_name['actor_id'].astype(int)

# Check df
credits_actor_name.head(10)


#### Create Dataframe of Movie Ids and Actor Ids

In [None]:
# Create DF of movie ids and actor ids only (names dropped)
credits_actor_df = credits_actor_name.drop(columns=['name'])

# Save as csv. A forward slash is used because '\c' is not a valid string escape
# and a backslash is only a path separator on Windows.
credits_actor_df.to_csv('csv_outputs/credits_actor.csv', index=False)

credits_actor_df.head(10)

#### Create Dataframe of Actors and Ids

In [None]:
# Set of the distinct (actor id, actor name) pairs found in credits_actor_name,
# built by pairing the two columns row by row
ids_cast = set(zip(credits_actor_name['actor_id'], credits_actor_name['name']))

# Show ten of the pairs held in the set
for actor_id, actor_name in itertools.islice(ids_cast, 10):
    print(f"{actor_id}: {actor_name}")

In [None]:
# Create Actors DataFrame.
# The pairs are sorted first: a set has no defined order, so building the frame
# straight from it would give a different row order in the CSV from run to run.
cast_df = pd.DataFrame(sorted(ids_cast), columns=['actor_id', 'actor'])

# Save as csv. A forward slash is used because a backslash is only a path
# separator on Windows.
cast_df.to_csv('csv_outputs/actors.csv', index=False)

cast_df.head(10)


# Create Directors Dataframe

In [None]:
# Create set to hold the distinct (director id, director name) pairs
ids_director = set()

# Each 'crew' value is a JSON array of crew-member objects
for row in credits['crew']:
    crew = json.loads(row)

    for crew_member in crew:
        # Only members whose 'job' is exactly "Director" are kept
        if crew_member.get('job') == "Director":
            ids_director.add((crew_member['id'], crew_member['name']))

# Print the first 10 pairs in sorted order so the preview is the same on every run.
# The loop variables deliberately avoid `id`, which would rebind the builtin
# id() for every later cell of the notebook.
for director_id, director_name in itertools.islice(sorted(ids_director), 10):
    print(f"{director_id}: {director_name}")

In [None]:
# Create Director DataFrame.
# The pairs are sorted first: a set has no defined order, so building the frame
# straight from it would give a different row order in the CSV from run to run.
director_df = pd.DataFrame(sorted(ids_director), columns=['director_id', 'director'])

# Convert ids to integers
director_df['director_id'] = director_df['director_id'].astype(int)

# Save as csv. A forward slash is used because '\d' is not a valid string escape
# and a backslash is only a path separator on Windows.
director_df.to_csv('csv_outputs/directors.csv', index=False)

director_df.head(10)

### Create a Dataframe for Movie Ids and Director Ids 

In [None]:
# Create df of movie ids and their raw crew JSON
movie_crew = credits[["movie_id", "crew"]]

# Collect one (movie_id, director_id) row per crew member whose 'job' is "Director".
# Rows are gathered in a list and the frame is built once at the end; calling
# pd.concat inside the loop re-copies the accumulated frame on every iteration.
director_rows = []
for movie_id, crew_json in zip(movie_crew['movie_id'], movie_crew['crew']):
    director_rows.extend(
        (movie_id, crew_member.get('id'))
        for crew_member in json.loads(crew_json)
        if crew_member.get('job') == "Director"
    )

movieid_directorid_df = pd.DataFrame(director_rows, columns=['movie_id', 'director_id'])

# Convert ids to integers
movieid_directorid_df['movie_id'] = movieid_directorid_df['movie_id'].astype(int)
movieid_directorid_df['director_id'] = movieid_directorid_df['director_id'].astype(int)

# Save as csv. A forward slash is used because '\m' is not a valid string escape
# and a backslash is only a path separator on Windows.
movieid_directorid_df.to_csv('csv_outputs/movieids_director_ids.csv', index=False)

# Check first 10 rows of the DataFrame
movieid_directorid_df.head(10)


# Create Keyword Dataframe

In [None]:
# Collect every distinct (keyword id, keyword name) pair found in the English movies
ids_keyword = set()

# Each 'keywords' value is a JSON array of objects carrying an 'id' and a 'name'
for row in en_movies['keywords']:
    keywords = json.loads(row)
    ids_keyword.update((kw['id'], kw['name']) for kw in keywords)

# Print the first 10 pairs in sorted order so the preview is the same on every run.
# The loop variables deliberately avoid `id`, which would rebind the builtin
# id() for every later cell of the notebook.
for kw_id, keyword in itertools.islice(sorted(ids_keyword), 10):
    print(f"{kw_id}: {keyword}")

In [None]:
# Create keywords DataFrame.
# The pairs are sorted first: a set has no defined order, so building the frame
# straight from it would give a different row order in the CSV from run to run.
keywords_df = pd.DataFrame(sorted(ids_keyword), columns=['kw_id', 'keyword'])

# Convert ids to integers
keywords_df['kw_id'] = keywords_df['kw_id'].astype(int)

# Save as csv. A forward slash is used because '\k' is not a valid string escape
# and a backslash is only a path separator on Windows.
keywords_df.to_csv('csv_outputs/keywords.csv', index=False)

keywords_df.head(10)

### Create Dataframe for Movie Ids and Keyword Ids

In [None]:
# Create df of movie ids and their raw keywords JSON
movie_keywords = en_movies[["id", "keywords"]]

# Collect one (movie_id, kw_id) row per keyword listed for each movie.
# Rows are gathered in a list and the frame is built once at the end; calling
# pd.concat inside the loop re-copies the accumulated frame on every iteration.
keyword_rows = []
for movie_id, keywords_json in zip(movie_keywords['id'], movie_keywords['keywords']):
    keyword_rows.extend((movie_id, kw['id']) for kw in json.loads(keywords_json))

movie_keywords_df = pd.DataFrame(keyword_rows, columns=['movie_id', 'kw_id'])

# Convert ids to integers
movie_keywords_df['kw_id'] = movie_keywords_df['kw_id'].astype(int)

# Save as csv. A forward slash is used because '\m' is not a valid string escape
# and a backslash is only a path separator on Windows.
movie_keywords_df.to_csv('csv_outputs/movieids_kwids.csv', index=False)

# Check first 25 rows of the DataFrame
movie_keywords_df.head(25)

# Create Movies Dataframe

In [None]:
# Identify which columns are needed (keys are the output column names)
data = {'movie_id': en_movies['id'],
        'title': en_movies['title'],
        'release_date_str': en_movies['release_date'],
        'revenue': en_movies['revenue'],
        'tagline': en_movies['tagline'],
        'average_vote': en_movies['vote_average'],
        'popularity': en_movies['popularity']}

# Create Movies DataFrame
movies_df = pd.DataFrame(data=data)

# Convert ids to integers
movies_df['movie_id'] = movies_df['movie_id'].astype(int)

# Fill null values in tagline
movies_df['tagline'] = movies_df['tagline'].fillna('*No Tag Line*')

# Convert release date column to datetime type.
# Missing release dates are intentionally NOT filled beforehand: the previous
# fillna(0000-00-00) was integer arithmetic (0 - 0 - 0 == 0), not a date
# placeholder. Leaving them as NaN lets to_datetime return NaT, which is written
# to the CSV as an empty field instead of a made-up date.
movies_df['release_date'] = pd.to_datetime(movies_df['release_date_str'], format='mixed')

# Drop the original release date column
movies_df.drop(columns=['release_date_str'], inplace=True)

# Save as csv. A forward slash is used because '\m' is not a valid string escape
# and a backslash is only a path separator on Windows.
movies_df.to_csv('csv_outputs/movies.csv', index=False)

movies_df.head(20)

______________