# Introduction
In this notebook, we perform data cleaning operations on several dataframes to extract character and actor first names from a dataset consisting of movies, cast information, and character details. The primary objective is to prepare the data for additional analysis.

## Import Libraries

In [77]:
import ast
import re
import warnings
from ast import literal_eval
from functools import lru_cache

import numpy as np
import pandas as pd

warnings.filterwarnings('ignore')

## Data Loading
We start by loading the necessary datasets into pandas dataframes:

In [78]:
# These datasets were obtained from Kaggle: https://www.kaggle.com/datasets/rounakbanik/the-movies-dataset/data?select=credits.csv
# They contain metadata for all 45,000 movies listed in the Full MovieLens Dataset
# and cover movies released on or before July 2017.
credits_df = pd.read_csv("Data/credits.csv")
movies_df = pd.read_csv("Data/movies_metadata.csv")

### Define Function to Extract Character, Name, and Gender

We define a function extract_character_name_gender() to extract character, name, and gender values from valid entries.

In [79]:
# Convert the string representation of the list in the 'cast' column into an
# actual list of dictionaries using ast.literal_eval.
# Only string values are parsed: ast.literal_eval raises on anything that is
# not a string, so without the guard this cell would fail when re-run after
# the column has already been converted (or when a value is missing).
credits_df['cast'] = credits_df['cast'].apply(
    lambda raw_cast: ast.literal_eval(raw_cast) if isinstance(raw_cast, str) else raw_cast
)

# Define a function to extract character, name, and gender values from valid entries
def extract_character_name_gender(row):
    """Collect the character, name and gender of every complete cast entry.

    Parameters
    ----------
    row : object
        One cell of the 'cast' column. Anything that is not a list yields
        three empty lists.

    Returns
    -------
    tuple of (list, list, list)
        Parallel lists ``(characters, names, genders)``. Entries that lack any
        of the 'character', 'name' or 'gender' keys are skipped.
    """
    required_keys = ('character', 'name', 'gender')
    characters, names, genders = [], [], []

    # Guard clause: non-list cells contribute nothing
    if not isinstance(row, list):
        return characters, names, genders

    for member in row:
        # Keep only entries that carry all three required keys
        if all(key in member for key in required_keys):
            characters.append(member['character'])
            names.append(member['name'])
            genders.append(member['gender'])

    return characters, names, genders

### Apply Function to Extract Values

We apply the extract_character_name_gender() function to the 'cast' column to extract character, name, and gender values.

In [80]:
# Run the extractor over every movie's 'cast' value; each result is a
# (characters, names, genders) tuple of parallel lists
extracted_data = credits_df['cast'].apply(func=extract_character_name_gender)

### Create New Columns for Extracted Data
We create new columns in the DataFrame for the extracted data.

In [None]:
# Create new columns in the DataFrame for the extracted data.
extracted_columns = ['characters', 'names', 'genders']
extracted_df = pd.DataFrame(extracted_data.tolist(), columns=extracted_columns, index=credits_df.index)

# Drop any copies of these columns left over from a previous run of this cell
# before concatenating; otherwise every re-run appends a duplicate set of
# columns and later column selection / explode no longer sees unique labels.
credits_df = pd.concat(
    [credits_df.drop(columns=extracted_columns, errors='ignore'), extracted_df],
    axis=1,
)

### Verify the Result
We verify that the 'characters', 'names', and 'genders' columns now contain the extracted values only from valid entries.

In [None]:
# Inspect the 'characters', 'names' and 'genders' columns, which hold the
# values extracted from the valid cast entries
print(credits_df.loc[:, ['characters', 'names', 'genders']])

### Explode Columns

We explode the 'characters', 'names', and 'genders' columns to separate the list elements into individual rows.


In [None]:
# Turn each element of the parallel 'characters', 'names' and 'genders' lists
# into its own row (the three columns are exploded together)
df_exploded = credits_df.explode(column=['characters', 'names', 'genders'])

### Reset Index
We reset the index of the DataFrame so that every exploded row gets its own sequential integer label; the movie 'id' column is left unchanged.

In [None]:
# Replace the index with a fresh 0..n-1 range; the old index labels are
# discarded (drop=True) and the 'id' column is not touched
df_exploded = df_exploded.reset_index(drop=True)

# Show dataframe
df_exploded.head()

### Save Extracted Data to CSV

We select the columns 'id', 'characters', 'names', and 'genders' from the exploded DataFrame `df_exploded` and save them to a new CSV file named `cast_and_characters.csv`.

In [None]:
# Keep only the columns needed downstream and write them to a CSV file
df_exploded.loc[:, ['id', 'characters', 'names', 'genders']].to_csv(
    'Data/cast_and_characters.csv',
    index=False,
    header=True,
)

### Read New Dataset
We read the newly created CSV file cast_and_characters.csv into a DataFrame named cast_df.

In [None]:
# Load the per-cast-member CSV written in the previous step
cast_df = pd.read_csv(filepath_or_buffer="Data/cast_and_characters.csv")

### Convert 'id' Columns to Strings
We convert the 'id' columns in both cast_df and movies_df DataFrames to strings for consistency.

In [None]:
# Give both frames a string-typed 'id' so the later merge compares like with like
cast_df = cast_df.assign(id=cast_df['id'].astype(str))
movies_df = movies_df.assign(id=movies_df['id'].astype(str))

# Data Preprocessing
### Filtering English Movies
We filter the movies dataframe to consider only English language movies for analysis.

In [None]:
# Keep only the movies whose original language is English ('en')
english_movies_df = movies_df.loc[movies_df['original_language'].eq('en')]

### Merging Dataframes
We merge the cast dataframe with the filtered movies dataframe based on the movie ID.

In [None]:
# Merge the two DataFrames on the 'id' column
merged_df = pd.merge(cast_df, english_movies_df, on='id')

# Convert 'names' and 'characters' to strings. Missing values are replaced
# with '' first: astype(str) alone would turn NaN into the literal text 'nan',
# which would later be reported as a first name.
merged_df['names'] = merged_df['names'].fillna('').astype(str)
merged_df['characters'] = merged_df['characters'].fillna('').astype(str)

# Define dictionary to map the values
# Female = F; Male = M
gender_mapping = {'0.0': 'unknown', '1.0': 'F', '2.0': 'M'}

# Apply the mapping to the 'genders' column. The codes are normalised to
# float64 before being rendered as text so they always look like '1.0'; an
# integer-typed column would otherwise render as '1', miss every key in
# gender_mapping and map every row to NaN. Unparseable/missing codes stay NaN.
gender_codes = pd.to_numeric(merged_df['genders'], errors='coerce').astype('float64')
merged_df['genders'] = gender_codes.astype(str).map(gender_mapping)

#Show dataframe
merged_df.head()

### Extracting First Names
We define a function to extract first names from full names and apply it to create a new column in the dataframe.

In [None]:
# Function to extract first name
def extract_first_name(full_name):
    """Return the first whitespace-separated token of ``full_name``.

    Parameters
    ----------
    full_name : object
        A full name such as ``"Tom Hanks"``.

    Returns
    -------
    str or None
        The first token, or None when ``full_name`` is not a string
        (e.g. NaN or None) or contains no non-whitespace characters.
    """
    # Missing values reach this function as float NaN / None, which have no
    # .split(); treat them as "no first name" instead of raising.
    if not isinstance(full_name, str):
        return None

    # split() with no argument discards leading/trailing/repeated whitespace
    name_parts = full_name.split()
    return name_parts[0] if name_parts else None

# Derive each actor's first name from the full name in 'names'
merged_df['actor_first_name'] = merged_df['names'].apply(extract_first_name)

# Display the updated DataFrame
merged_df.head()

## Data Cleaning
We clean the data to remove irrelevant or redundant character names. We identify exact matches, specific words, and prefixes to be replaced or removed.

In [None]:
# Character strings to discard outright. clean_names() tests a character
# string for membership in this list, so an entry only applies when the whole
# string is identical to it (case-sensitive). Most entries are generic role
# labels or credit annotations; a few look like ordinary names and were
# presumably added by hand after inspecting the data (verify before reuse).
exact_matches_to_replace = [
    "Himself", "Herself", "Doctor", "Dancer", "Narrator", "Reporter", "Nurse","himself", 
    "(uncredited)", "Additional", "Policeman", "Bartender", "Waitress", "Mother", 
    "Priest", "(voice)", "Extra", "Detective", "Student", "Waiter", "Soldier",'uncredited', 
    "Guard", "Model", "Cop", "Executioner", "Auctioneer", "Teacher", "Father", 
    "Receptionist", "Man (uncredited)", "Woman (uncredited)", "Minor Role (uncredited)", 
    "Bit Part", "Bodyguard", "Indian", "Pedestrian", "Coroner", "Hostess", "Minor Role", 
    "Secretary", "Taxi Driver", "Captain", "Sergeant", "Driver", "Self", "The Doctor", 
    "Mayor", "Deputy", "Mom", "Cab Driver", "Maid", "Lawyer", "Preacher", "Paramedic", 
    "Bus Driver", "Journalist", "Doorman", "Bouncer", "Grandma", "Bailiff", "Janitor", 
    "Cashier", "Thug", "Technician", "Doc", "Cook", "Kid", "Mechanic", "Announcer", 
    "Debutante", "Truck Driver", "Librarian", "Cowboy", "Orderly", "News Anchor", 
    "Gas Station Attendant", "Baker", "Grandmother", "Actor", "Grandfather", "Principal", 
    "Desk Sergeant", "Gambler", "Prosecutor", "Jesus", "Juror", "Conductor", "Patient", 
    "Barman", "Chauffeur", "Surgeon", "Cheerleader", "Landlord", "Bit Role", "Warden", 
    "Cabbie", "Dad", "Prisoner", "Barfly", "Newscaster", "Bridesmaid", "Vicar", "Inspector", 
    "Bride", "Musician", "E.G.", "Undetermined Role", "Grandpa", "State Trooper", "Gangster", 
    "Fireman", "Party Goer", "Marine", "Biker", "Governor", "Little Boy", "Wife", "Bellboy", 
    "Townswoman", "Bunny", "Headmaster", "Hostage", "Radio Announcer", "Postman", "Chef", 
    "Gang Member", "Board Member", "Fighter", "Stripper", "Shopper", "Convict", "Pastor", 
    "Monster", "Valet", "Claude", "Engineer", "Tiny", "Concierge", "The President", 
    "Mailman", "Tour Guide", "Inmate", "Train Passenger", "Foreman", "Assistant", "Rabbi", 
    "Newsboy", "Trucker", "Nana", "Passenger", "Blonde", "Maitre D'", "Green", 
    "High School Student", "Resident", "Miner", "Attendant", "Trooper", "Dealer", 
    "Therapist", "Housekeeper", "The Boy", "-", "EMT", "Audience Member", "Bully", 
    "The Kid", "Mother Superior", "Train Conductor", "Servant", "Mall Shopper", 
    "Drummer", "Testimonial", "Jury Foreman", "The Judge", "Constable", "Showgirl", 
    "President", "The Stranger", "Wolf", "Rocker", "Jury Member", "Jock", "Innkeeper", 
    "Actress", "Tramp", "The General", "Groom", "Husband", "Cat", "Workman", "The Mayor", 
    "Daughter", "Chaplain", "Voice", "Limo Driver", "Reverend", "College Student", 
    "Bystander", "Granny", "Producer", "Lab Technician", "Prime Minister", 
    "Medical Examiner", "Bear", "Seaman", "Hairdresser", "Graduate Student", "O'Brien", 
    "Magistrate", "Pharmacist", "Mutant", "Cameraman", "Death Eater", "Warrior", 
    "Editor", "Sniper", "Terrorist", "Old Lady", "Q", "Comedian", "Undertaker", 
    "Chairman", "Pike", "Realtor", "Lefty", "Mum", "Elder", "Moose", "Steward",
]


# Words and phrases that mark a character string as a generic role rather than
# a personal name; clean_names() discards any character string in which one of
# them occurs.
# Every line ends with a comma on purpose: adjacent string literals with no
# comma between them are silently concatenated by Python, which previously
# produced entries such as 'TeacherBoy' and 'FarmerTaxi' and dropped the
# intended words.
words_to_replace = [
    'Additional', 'Extra', 'Narrarator', 'Narrator', 'Police', 'Cop', 'Worker', 'Teacher',
    'Boy', 'Girl', 'Friend', 'Girl', 'Guard', 'Guest', 'Partier', 'Patron', 'Pilot', 'Player',
    'Reporter', 'Sailor', 'Worker', 'Townsman', 'Courtroom', 'Photographer', 'Minor Role',
    'Villager', 'Himself', 'Dancer', 'Neighbor', 'Villager', 'Soldier', 'Nurse', 'Farmer',
    'Taxi', 'Townsperson', 'Woman', 'Man', 'Coach', 'Chief', '(uncredited)', 'Mercenary',
    'Schoolgirl', 'Clerk', 'Guy', 'Local', 'Salesman', 'Officer', 'Agent', 'Operator', 'Customer',
    'Kid', 'King', 'Queen', 'Navigator', 'Set', 'Emcee', 'Official', 'Bank', 'Airline', 'Driver',
    'Attendant', 'Hooker', 'Monk', 'Junior', 'Bishop', 'Stewardess', 'Referee', 'Henchman', 'Thug',
    'Interviewer', 'Fisherman', 'Landlady', 'Psychiatrist', 'Interviewer', 'Interviewee', 'Principal',
    'Grandfather', 'Tourist', 'Flight', 'Prostitute', 'Customer', 'Businessman', 'Director',
    'Scientist', 'Themselves', 'Singer', 'Attorney', 'Waitress',
    'Dance', 'Team', 'Pirate', 'Corporal', '(archive footage)', 'Butler', 'Choir', 'Zombie', '(voice)',
    'Priest', 'Messenger', 'Street', 'Dealer', 'Creature', 'Spider', 'Butcher', '#1', '#2', 'M',
    'Carpenter', 'ss', 'Detective', 'Gardener', 'Jailer', 'Blacksmith', 'Teller', 'Witch',
    'herself', 'Pawnbroker', 'Ghost', 'Pimp', 'Claudius', 'Background actor', 'Colonel',
]

# Titles, ranks and descriptors to strip from the start of a character name.
# The list is de-duplicated (first occurrence kept) and ordered longest-first
# so that a first-match scan always picks the most specific prefix; with the
# hand-written order 'Mr' preceded 'Mrs.' and 'Prince' preceded 'Princess',
# turning 'Mrs. Smith' into 's. Smith' and 'Princess Leia' into 'ss Leia'.
# sorted() is stable, so equally long prefixes keep their original order.
prefixes_to_remove = sorted(
    dict.fromkeys([
        'Mr.', 'Dr.', 'Mr', 'Mr', 'Ms', 'Ms.', 'Mrs', 'Mrs.', 'Mme', 'Mme.',
        'Doctor', 'Dr.', 'Sheriff', 'Minister', 'Judge', 'Nun', 'Child', 'Sgt.', 'Sergeant',
        'Sen.', 'Senator', 'Sr.', 'Senior', 'General', 'Col.', 'Colonel', 'Young', 'Lord',
        'Prince', 'Princess', 'Farmer', 'Professor', 'Prof.', 'Pvt.', 'Lt.', 'Lieutenant',
        'Gen.', 'General', 'Maj.', 'Major', 'The', 'Miss', 'Sir', 'Uncle', 'Det.', 'Deputy', 'Big',
        'Sergeant', 'Herself', 'Inspector', 'Doctor', 'Sister', 'Little', 'Old', '.', 'Female', 'First',
        'Reverend', 'Principal', 'Private', 'Commander', 'Count', 'Doc', 'Admiral', 'Cpl.', 'Rev.', 'le',
        'Cmdr.', 'Dr', 'Countess', 'Monsieur', 'Gen.', 'Ranger', 'Sen.', 'Pope', 'Professor',
    ]),
    key=len,
    reverse=True,
)

# Function to clean names based on criteria
@lru_cache(maxsize=None)
def _compile_word_pattern(words):
    """Compile one regex that matches any of ``words`` as a whole word/phrase.

    ``words`` must be a tuple so it can be used as a cache key. Returns None
    for an empty tuple (an empty alternation would match every string).
    """
    if not words:
        return None
    # Longest alternatives first so a phrase is tried before a word inside it
    alternatives = sorted(dict.fromkeys(words), key=len, reverse=True)
    joined = '|'.join(re.escape(word) for word in alternatives)
    # Lookarounds instead of \b so entries such as '(voice)' or '#1', which
    # start or end with a non-word character, still match correctly
    return re.compile(r'(?<!\w)(?:' + joined + r')(?!\w)')


@lru_cache(maxsize=None)
def _longest_first(prefixes):
    """Return the non-empty ``prefixes`` (a tuple), de-duplicated, longest first."""
    unique = dict.fromkeys(prefix for prefix in prefixes if prefix)
    return tuple(sorted(unique, key=len, reverse=True))


def clean_names(name, exact_matches=None, words=None, prefixes=None):
    """Clean one character name, or return NaN when it is not a usable name.

    Parameters
    ----------
    name : object
        The raw character string.
    exact_matches, words, prefixes : iterable of str, optional
        Override the module-level ``exact_matches_to_replace``,
        ``words_to_replace`` and ``prefixes_to_remove`` lists, which are used
        when an argument is left as None.

    Returns
    -------
    str or float
        * ``np.nan`` if ``name`` is not a string, equals an entry of
          ``exact_matches``, or contains an entry of ``words`` as a whole word
          or phrase.
        * Otherwise ``name`` with its longest matching prefix removed and the
          remainder stripped, or ``name`` unchanged when no prefix applies.

    Notes
    -----
    Words are matched on word boundaries, not as raw substrings: a plain
    ``word in name`` test makes 'M' or 'ss' discard 'Mary' and 'Jessica'.
    The trade-off is that inflected forms ('Dancers') are matched only if they
    are listed themselves. Likewise a prefix is removed only when it is not
    glued to the following letters, so 'The' strips 'The Dude' but leaves
    'Theodore' alone.
    """
    if exact_matches is None:
        exact_matches = exact_matches_to_replace
    if words is None:
        words = words_to_replace
    if prefixes is None:
        prefixes = prefixes_to_remove

    # Missing values (NaN / None) are not names
    if not isinstance(name, str):
        return np.nan

    # Replace exact matches with NaN
    if name in exact_matches:
        return np.nan

    # Replace names containing one of the listed words/phrases with NaN
    word_pattern = _compile_word_pattern(tuple(words))
    if word_pattern is not None and word_pattern.search(name):
        return np.nan

    # Remove the most specific (longest) prefix that applies
    for prefix in _longest_first(tuple(prefixes)):
        if not name.startswith(prefix):
            continue
        remainder = name[len(prefix):]
        # "Glued" = the prefix ends in a letter/digit and so does the start of
        # the remainder, i.e. the prefix is merely the beginning of a longer word
        glued = prefix[-1].isalnum() and remainder[:1].isalnum()
        if not glued:
            return remainder.strip()

    return name

# Apply cleaning function to the characters column
merged_df['cleaned_character_names'] = merged_df['characters'].apply(lambda x: clean_names(x))

# Display the cleaned DataFrame
print(merged_df['cleaned_character_names'])

In [None]:
# Create the column of character first names from the cleaned character names.
# Names rejected by clean_names() are NaN; na_action='ignore' passes them
# through untouched. Converting the column with astype(str) first would turn
# each NaN into the literal text 'nan', which would then be reported as the
# character's first name.
merged_df['character_first_name'] = merged_df['cleaned_character_names'].map(
    extract_first_name, na_action='ignore'
)

# Convert the 'release_date' column to datetime
merged_df['release_date'] = pd.to_datetime(merged_df['release_date'])

In [None]:
# Preview the first three rows of the merged frame
merged_df.head(n=3)

In [None]:
# Print the columns that make up the final output
print(merged_df.loc[:, ['title', 'release_date', 'actor_first_name', 'genders', 'character_first_name']])

### Save Extracted Data to CSV
We select the columns `title`, `release_date`, `actor_first_name`, `genders`, and `character_first_name` from the DataFrame `merged_df` and save them to a new CSV file named `cleaned_cast_and_characters.csv`.

In [None]:
# Keep only the output columns and write them to the cleaned CSV file
merged_df.loc[:, ['title', 'release_date', 'actor_first_name', 'genders', 'character_first_name']].to_csv(
    'Data/cleaned_cast_and_characters.csv',
    index=False,
    header=True,
)