# Importing packages

In [1]:
!pip install pandas
!pip install re
!pip install gender-guesser
!pip install genderize
!pip install matplotlib
!pip install random

Defaulting to user installation because normal site-packages is not writeable

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.0[0m[39;49m -> [0m[32;49m24.3.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Defaulting to user installation because normal site-packages is not writeable
[31mERROR: Could not find a version that satisfies the requirement re (from versions: none)[0m[31m
[0m[31mERROR: No matching distribution found for re[0m[31m
[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.0[0m[39;49m -> [0m[32;49m24.3.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Defaulting to user installation because normal site-packages is not writeable

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31

In [2]:
import os
import pandas as pd
import re
import gender_guesser.detector as gender
import matplotlib.pyplot as plt
import random

from genderize import Genderize, GenderizeException

# Set working directory

In [3]:
os.chdir("/work/VNV/Module 1 - Data acqusition and preparation")
print(os.getcwd())

# set a seed
random.seed(76)

/work/VNV/Module 1 - Data acqusition and preparation


# The Cornell Movie-Dialog Corpus
Link: https://www.kaggle.com/datasets/rajathmc/cornell-moviedialog-corpus

This corpus has several txt-files that need to be cleaned and combined in order to get the dataframe that we are interested in. Those marked in **bold** are the ones we are focusing on

movie_characters_metadata.txt contains:\
    - **characterID**\
    - **character name**\
    - movieID\
    - movie title\
    - **gender** ("?" for unlabeled cases)\
    - position in credits ("?" for unlabeled cases)

movie_lines.txt contains:\
    - lineID\
    - **characterID** (who uttered this phrase)\
    - movieID\
    - **character name**\
    - **text of the utterance**

movie_titles_metadata.txt contains:\
    - movieID\
    - movie title\
    - **movie year**\
    - IMDB rating\
    - no. IMDB votes\
    - genres in the format ['genre1','genre2','genreN']

## Movie_characters_metadata

### Cleaning the data

In [4]:
# Read the file lines
with open('/work/VNV/Module 1 - Data acqusition and preparation/data/movie_characters_metadata.txt', 'r', encoding='latin1') as file:
    lines = file.readlines()

# Split each line into columns based on the '+++$$+++' delimiter
split_lines = [line.strip().split(r'+++$+++') for line in lines]

# Ensure all rows have the expected number of columns
split_lines = [line for line in split_lines if len(line) == 6]

# Convert the list of split lines into a DataFrame
cornell_movie_meta_df = pd.DataFrame(split_lines, columns=['characterID', 'character_name', 'movieID', 'movie_title', 'gender', 'position_in_credits'])

# Display the first few rows of the DataFrame
print(cornell_movie_meta_df.head())

  characterID character_name movieID                   movie_title gender  \
0         u0         BIANCA      m0    10 things i hate about you      f    
1         u1          BRUCE      m0    10 things i hate about you      ?    
2         u2        CAMERON      m0    10 things i hate about you      m    
3         u3       CHASTITY      m0    10 things i hate about you      ?    
4         u4           JOEY      m0    10 things i hate about you      m    

  position_in_credits  
0                   4  
1                   ?  
2                   3  
3                   ?  
4                   6  


In [5]:
# Check the unique values for gender
print(cornell_movie_meta_df['gender'].unique())

# Strip leading and trailing spaces from the columns
cornell_movie_meta_df['gender'] = cornell_movie_meta_df['gender'].str.strip()
cornell_movie_meta_df['character_name'] = cornell_movie_meta_df['character_name'].str.strip()

# Normalize the character names by capitalizing the first letter of each word - VERY IMPORTANT FOR GENDER-GUESSER TO WORK 
cornell_movie_meta_df['character_name'] = cornell_movie_meta_df['character_name'].str.title()

# Replace uppercase variations of gender with lowercase 'f' and 'm' for the 'gender' column
cornell_movie_meta_df['gender'] = cornell_movie_meta_df['gender'].replace({'F': 'f', 'M': 'm'})

# Check the unique values again
print(cornell_movie_meta_df['gender'].unique())

# Save the cleaned dataFrame to a CSV file DO WE NEED?
cornell_movie_meta_df.to_csv('/work/VNV/Module 1 - Data acqusition and preparation/data/clean_cornell.csv', index=False)

[' f ' ' ? ' ' m ' ' M ' ' F ']
['f' '?' 'm']


### Check if the composition of gendered/non-gendered is the same as the documentation states

In [6]:
# Check the unique values in gender
print(cornell_movie_meta_df['gender'].unique())

['f' '?' 'm']


In [7]:
print(cornell_movie_meta_df['gender'].value_counts())

gender
?    6020
m    2049
f     966
Name: count, dtype: int64


### The composition is not same - let's back up
3015 are gendered here, comapred to the 3774 that the documentation states should be gendered. Therefore we go back and try to check the composition from the txt-file while preprocessing it as little as possible. Although, the overall number of characters is in accordance with the literature, which indicates that 3015 is the correct number

In [8]:
# Read the file lines
with open('/work/VNV/Module 1 - Data acqusition and preparation/data/movie_characters_metadata.txt', 'r', encoding='latin1') as file:
    lines = file.readlines()

# Initialize a counter dictionary to count occurrences of different gender labels
gender_counts = {'f': 0, '?': 0, 'm': 0, 'M': 0, 'F': 0}

# Loop through each line and process the 5th input after '+++$$+++'
for line in lines:
    # Split the line by the delimiter
    parts = line.split('+++$+++')
    
    # Check if the line has at least 5 parts (5th input should exist)
    if len(parts) >= 5:
        gender = parts[4].strip()  # Gender is the 5th element (after 4th '+++$$+++')
        
        # Increment the corresponding gender count if it's in our dictionary
        if gender in gender_counts:
            gender_counts[gender] += 1

# Print the counts of each gender label
print("Gender counts:", gender_counts)


Gender counts: {'f': 921, '?': 6020, 'm': 1899, 'M': 150, 'F': 45}


**Based on this we determine that 3015 is the actual number of gendered characters.**

## Inferring gender using "gender-guesser" and "genderize"
Only 3,774 characters out of 9,035 have a gender annotation. To boost these numbers we want to infer gender of the character based on the name. We do this using both "gender-guessser" and "genderize" to hopefully boost accuracy.

Gender-guesser: https://pypi.org/project/gender-guesser/

Genderize: https://genderize.io/

### **Using Genderize**

In [9]:
# Trying out Genderize

# Defining our API key
API_key = "freebie-17c383ac-f296-4448-bb9f-2758878e0b2e"

# Initialize Genderize
genderize = Genderize(api_key=API_key)

# Test a few character names
test_names = ['Bianca', 'Bruce', 'Cameron', 'Chastity', 'Joey', 'Sam']

for name in test_names:
    result = genderize.get([name])
    print(f"Name: {name}, Genderize result: {result}")

Name: Bianca, Genderize result: [{'count': 222327, 'name': 'Bianca', 'gender': 'female', 'probability': 0.99}]
Name: Bruce, Genderize result: [{'count': 217022, 'name': 'Bruce', 'gender': 'male', 'probability': 1.0}]
Name: Cameron, Genderize result: [{'count': 35922, 'name': 'Cameron', 'gender': 'male', 'probability': 0.94}]
Name: Chastity, Genderize result: [{'count': 3666, 'name': 'Chastity', 'gender': 'female', 'probability': 1.0}]
Name: Joey, Genderize result: [{'count': 59100, 'name': 'Joey', 'gender': 'male', 'probability': 0.8}]
Name: Sam, Genderize result: [{'count': 645671, 'name': 'Sam', 'gender': 'male', 'probability': 0.9}]


#### Looping over the rows

In [10]:
# Create a copy of the original DataFrame to keep it intact
genderize_labels = cornell_movie_meta_df.copy()

# Defining our API key
API_key = "freebie-17c383ac-f296-4448-bb9f-2758878e0b2e"

# Initialize Genderize
genderize = Genderize(api_key=API_key)

# Initialize a counter to keep track of how many predictions are made
genderize_prediction_counter = 0

# Loop through each row and apply the logic
for index, row in genderize_labels.iterrows():
    try:
        # Get a prediction of gender from the Genderize API based on the character's name
        result = genderize.get([row['character_name']])
        
        # Only update gender if the probability of the prediction by Genderize is above 0.9
        if result and result[0].get('probability') > 0.7:
            
            # Map the gender to "m" for male and "f" for female
            predicted_gender = result[0]['gender']
            if predicted_gender == 'male':
                genderize_labels.at[index, 'genderize_gender'] = 'm'
                genderize_prediction_counter += 1  # Increment the counter
            elif predicted_gender == 'female':
                genderize_labels.at[index, 'genderize_gender'] = 'f'
                genderize_prediction_counter += 1  # Increment the counter
            else:
                genderize_labels.at[index, 'genderize_gender'] = '?'

    # If an error occurs, print the character name that caused it
    except GenderizeException as e:
        print(f"Error for {row['character_name']}: {e}")
        # Continue with the next row even if an error occurs
        continue

# Save the DataFrame to a CSV file
genderize_labels.to_csv('/work/VNV/Module 1 - Data acqusition and preparation/output/genderize_labels.csv', index=False)

# Check that the result looks as expected
print(genderize_labels.head())
print(f"No. of predictions generated by Genderize: {genderize_prediction_counter}")

Error for ManS Voice: ("Invalid UTF8 in 'name' parameter", 422, {'Server': 'nginx/1.16.1', 'Date': 'Thu, 02 Jan 2025 15:54:20 GMT', 'Content-Type': 'application/json', 'Content-Length': '44', 'Connection': 'keep-alive', 'vary': 'accept-encoding', 'cache-control': 'max-age=0, private, must-revalidate', 'x-request-id': 'GBbrJ18D3SlNFaIfIzei', 'access-control-allow-credentials': 'true', 'access-control-allow-origin': '*', 'access-control-expose-headers': 'x-rate-limit-limit,x-rate-limit-remaining,x-rate-limit-reset'})
  characterID character_name movieID                   movie_title gender  \
0         u0          Bianca     m0    10 things i hate about you       f   
1         u1           Bruce     m0    10 things i hate about you       ?   
2         u2         Cameron     m0    10 things i hate about you       m   
3         u3        Chastity     m0    10 things i hate about you       ?   
4         u4            Joey     m0    10 things i hate about you       m   

  position_in_c

### Looking at how many characters are left with no gender label and what names they have

In [11]:
# Count the number of instances of f, m and ? in the original dataset
print(cornell_movie_meta_df['gender'].value_counts())

# Count the number of instances of f, m and ? in the Genderize modified dataset
print(genderize_labels['gender'].value_counts())

gender
?    6020
m    2049
f     966
Name: count, dtype: int64
gender
?    6020
m    2049
f     966
Name: count, dtype: int64


### Looking at the names that Genderize could not predict a gender for with 90% certainty

In [12]:
# Filter the DataFrame to get rows with missing gender
missing_genderizer_df = genderize_labels[genderize_labels['gender'] == '?']

# Display the first 50 names with missing gender
print(missing_genderizer_df['character_name'].head(50))

1                   Bruce
3                Chastity
8              Miss Perky
10                 Sharon
12                 Alonso
13                 Arojaz
14                Beatrix
15              Bobadilla
17               Fernando
18                 Isabel
20                 Mendez
21                 Moxica
22                 Pinzon
23                 Sailor
24                Sanchez
25                 Utapan
28                  Duffy
31                Hawkins
32                  Honey
33    Immigration Officer
36                   Leon
37                 Maggie
38                    Max
39                  Milos
40                 Mugger
43         Robert Hawkins
44                   Rose
45      Secondary Officer
46         Stephen Geller
47                  Tommy
48                 Vendor
49                    B20
51                    C12
52                  Child
53               Co-Pilot
54                  Elena
55                  Floyd
57               Michaels
58          

### **Using gender-guesser**

#### Trying gender-guesser out

In [13]:
import gender_guesser.detector as gender_detector

# Initialize the Gender-Guesser detector
d = gender_detector.Detector()

# Function to get gender for a list of names
def get_genders(names):
    for name in names:
        detected_gender = d.get_gender(name)
        print(f"Name: {name} - Detected Gender: {detected_gender}")

# Input names to test
names_to_test = ["Bruce", "Cameron", "Bianca", "Chastity", "Joey", "Sharon"]

# Get the gender predictions
get_genders(names_to_test)

Name: Bruce - Detected Gender: male
Name: Cameron - Detected Gender: mostly_male
Name: Bianca - Detected Gender: female
Name: Chastity - Detected Gender: female
Name: Joey - Detected Gender: male
Name: Sharon - Detected Gender: female


In [14]:
import gender_guesser.detector as gender_detector

# Create a copy of the original DataFrame to keep it intact
genderguesser_labels = cornell_movie_meta_df.copy()

# Initialize Gender-Guesser
d = gender_detector.Detector()

# Initialize a counter to keep track of how many predictions are made
genderguesser_prediction_counter = 0

# Loop through each row and apply the logic
for index, row in genderguesser_labels.iterrows():

    # Query the Gender-Guesser with the character's name
    detected_gender = d.get_gender(row['character_name'])
    
    # Adjust gender values: 'male' and 'mostly_male' to 'm', 'female' and 'mostly_female' to 'f'
    if detected_gender in ['male', 'mostly_male']:
        genderguesser_labels.at[index, 'genderguesser_gender'] = 'm'  # Set to 'm' for male
        genderguesser_prediction_counter += 1  # Increment the counter
    elif detected_gender in ['female', 'mostly_female']:
        genderguesser_labels.at[index, 'genderguesser_gender'] = 'f'  # Set to 'f' for female
        genderguesser_prediction_counter += 1  # Increment the counter
    elif detected_gender == 'unknown':
        genderguesser_labels.at[index, 'genderguesser_gender'] = '?'  # Leave '?' if unknown

# Save the DataFrame to a CSV file
genderguesser_labels.to_csv('/work/VNV/Module 1 - Data acqusition and preparation/output/genderguesser_labels.csv', index=False)

# Check the result
print(genderguesser_labels.head())
print(f"No. of predictions generated by Gender-Guesser: {genderguesser_prediction_counter}")

  characterID character_name movieID                   movie_title gender  \
0         u0          Bianca     m0    10 things i hate about you       f   
1         u1           Bruce     m0    10 things i hate about you       ?   
2         u2         Cameron     m0    10 things i hate about you       m   
3         u3        Chastity     m0    10 things i hate about you       ?   
4         u4            Joey     m0    10 things i hate about you       m   

  position_in_credits genderguesser_gender  
0                   4                    f  
1                   ?                    m  
2                   3                    m  
3                   ?                    f  
4                   6                    m  
No. of predictions generated by Gender-Guesser: 4160


### Looking at how many characters are left with no gender label and what names they have

In [15]:
# Count the number of instances of f, m and ? in the original dataset
print(cornell_movie_meta_df['gender'].value_counts())

# Count the number of instances of f, m and ? in the Gender-Guesser modified dataset
print(genderguesser_labels['gender'].value_counts())

gender
?    6020
m    2049
f     966
Name: count, dtype: int64
gender
?    6020
m    2049
f     966
Name: count, dtype: int64


### Looking at the names that Gender-Guesser could not predict a gender for

In [16]:
# Filter the DataFrame to get rows with missing gender
missing_genderguesser_df = genderguesser_labels[genderguesser_labels['gender'] == '?']

# Display the first 50 names with missing gender
print(missing_genderguesser_df['character_name'].head(50))

1                   Bruce
3                Chastity
8              Miss Perky
10                 Sharon
12                 Alonso
13                 Arojaz
14                Beatrix
15              Bobadilla
17               Fernando
18                 Isabel
20                 Mendez
21                 Moxica
22                 Pinzon
23                 Sailor
24                Sanchez
25                 Utapan
28                  Duffy
31                Hawkins
32                  Honey
33    Immigration Officer
36                   Leon
37                 Maggie
38                    Max
39                  Milos
40                 Mugger
43         Robert Hawkins
44                   Rose
45      Secondary Officer
46         Stephen Geller
47                  Tommy
48                 Vendor
49                    B20
51                    C12
52                  Child
53               Co-Pilot
54                  Elena
55                  Floyd
57               Michaels
58          

### **Assessing the predictions from Genderize and GenderGuesser**

### Looking at the predictions for Genderize and Gender-Guesser

In [17]:
# Count the number of instances of f, m and ? in the original dataset
print(cornell_movie_meta_df['gender'].value_counts())

# Count the number of instances of f, m and ? in the Genderize modified dataset
print(genderize_labels['gender'].value_counts())

# Count the number of instances of f, m and ? in the Gender-Guesser modified dataset
print(genderguesser_labels['gender'].value_counts())

gender
?    6020
m    2049
f     966
Name: count, dtype: int64
gender
?    6020
m    2049
f     966
Name: count, dtype: int64
gender
?    6020
m    2049
f     966
Name: count, dtype: int64


So we see that Genderize predicted a gender for 1505 more characters than Gender-Guesser, of which 1266 were predicted to be male and 239 to be female

### Making a dataframe ready to investigate how Genderize and Gender-Guesser predicts

In [18]:
# Rename gender variables in genderize_labels and genderguesser_labels to reflect which method was used to assess gender DO WE NEED?
#genderguesser_labels = genderguesser_labels.rename(columns={'gender': 'genderguesser_gender'})
#genderize_labels = genderize_labels.rename(columns={'gender': 'genderize_gender'})

# Check that they are the same length (9035)
print(f"Lenght of genderguesser_labels: {len(genderguesser_labels)}")
print(f"Lenght of genderize_labels: {len(genderize_labels)}")

# Concatenate the DataFrames along columns (axis=1) based on index
cornell_movie_meta_compare_df = pd.concat(
    [cornell_movie_meta_df,  # Keep all columns from the original DataFrame
     genderize_labels[['genderize_gender']],  # Keep gender predictions from Genderize
     genderguesser_labels[['genderguesser_gender']]],  # Keep gender predictions from Gender-Guesser
    axis=1)

# Check the result and see that it looks good and the len is correct (it should be 9035)
print(cornell_movie_meta_compare_df.head())
print(f"Lenght of cornell_movie_meta_compare_df: {len(cornell_movie_meta_compare_df)}")

# Save the dataframe for later use
cornell_movie_meta_compare_df.to_csv('/work/VNV/Module 1 - Data acqusition and preparation/output/cornell_all_three_gender_labels.csv', index=False)

Lenght of genderguesser_labels: 9035
Lenght of genderize_labels: 9035
  characterID character_name movieID                   movie_title gender  \
0         u0          Bianca     m0    10 things i hate about you       f   
1         u1           Bruce     m0    10 things i hate about you       ?   
2         u2         Cameron     m0    10 things i hate about you       m   
3         u3        Chastity     m0    10 things i hate about you       ?   
4         u4            Joey     m0    10 things i hate about you       m   

  position_in_credits genderize_gender genderguesser_gender  
0                   4                f                    f  
1                   ?                m                    m  
2                   3                m                    m  
3                   ?                f                    f  
4                   6                m                    m  
Lenght of cornell_movie_meta_compare_df: 9035


### Asessing the agreement between Genderize and Gender-Guesser

In [19]:
# Keep only the rows where there wasn't a gender label in the original dataframe
genderize_vs_genderguesser = cornell_movie_meta_compare_df[cornell_movie_meta_compare_df['gender'] == '?']

# Check result and see that the length is correct (6020 - since there were 3015 characters with gender and 9035 characters in total)
print(f"Lenght of genderize_vs_genderguesser after removing non-gender-annotated characters: {len(genderize_vs_genderguesser)}")

# Initialize lists to store agreement and disagreement
Agreement = []
Disagreement = []

# Iterate through each row in the DataFrame
for index, row in genderize_vs_genderguesser.iterrows():
    # Check if both 'genderize_gender' and 'genderguesser_gender' are not '?'
    if row['genderize_gender'] != '?' and row['genderguesser_gender'] != '?':
        # If they agree, append to Agreement
        if row['genderize_gender'] == row['genderguesser_gender']:
            Agreement.append(row['character_name'])
        # If they disagree, append to Disagreement with both predictions
        else:
            disagreement_details = {
                'character_name': row['character_name'],
                'genderize_gender': row['genderize_gender'],
                'genderguesser_gender': row['genderguesser_gender']
            }
            Disagreement.append(disagreement_details)

# Check the results
print(f"Agreement: {len(Agreement)}")
print(f"Disagreement: {len(Disagreement)}")

# Calculate the percentage for Agreement and Disagreement
Amount_of_guesses = len(Agreement)+len(Disagreement)
Agreement_percentage = (len(Agreement)/Amount_of_guesses)*100
disagreement_percentage = (len(Disagreement)/Amount_of_guesses)*100

print(f"Agreement percentage: {Agreement_percentage}")
print(f"Disagreement percentage: {disagreement_percentage}")

# Convert the Disagreement list into a DataFrame
disagreement_df = pd.DataFrame(Disagreement)

# Set pandas to display all rows in the DataFrame
pd.set_option('display.max_rows', None)

# Print the names they disagreed on
print(disagreement_df)

Lenght of genderize_vs_genderguesser after removing non-gender-annotated characters: 6020
Agreement: 2163
Disagreement: 196
Agreement percentage: 91.69139465875371
Disagreement percentage: 8.30860534124629
    character_name genderize_gender genderguesser_gender
0            Duffy              NaN                    m
1            Casey              NaN                  NaN
2            Carey              NaN                  NaN
3               Ma                f                  NaN
4              Len              NaN                    m
5           Bailey              NaN                    f
6              Doc                m                  NaN
7              Man                m                  NaN
8            Lenny              NaN                    m
9         Gillette              NaN                    f
10             Man                m                  NaN
11          Kendal              NaN                    f
12             Man                m                  

Here it seems like Genderize is more correct in its predictions

## Checking agreement between Genderize and Cornell

In [20]:
# Save the dataframe for later use
cornell_movie_meta_compare_df = pd.read_csv('/work/VNV/Module 1 - Data acqusition and preparation/output/cornell_all_three_gender_labels.csv')

In [21]:
# Make a copy of cornell_movie_meta_compare_df
cornell_vs_genderize = cornell_movie_meta_compare_df.copy()

# Keep only the rows where cornell already had gender labels
cornell_vs_genderize = cornell_vs_genderize[cornell_vs_genderize['gender'] != '?']

# Check the length of the df (should be 3015)
print(f"Number of character names: {len(cornell_vs_genderize)}")

# View the dataframe
print(cornell_vs_genderize.head(20))

Number of character names: 3015
   characterID character_name movieID                   movie_title gender  \
0          u0          Bianca     m0    10 things i hate about you       f   
2          u2         Cameron     m0    10 things i hate about you       m   
4          u4            Joey     m0    10 things i hate about you       m   
5          u5             Kat     m0    10 things i hate about you       f   
6          u6        Mandella     m0    10 things i hate about you       f   
7          u7         Michael     m0    10 things i hate about you       m   
9          u9         Patrick     m0    10 things i hate about you       m   
11        u11          Walter     m0    10 things i hate about you       m   
16        u16        Columbus     m1    1492: conquest of paradise       m   
19        u19        Marchena     m1    1492: conquest of paradise       m   
26        u26          Cutler     m2                    15 minutes       m   
27        u27          Daphne   

In [22]:
# Initialize lists to store agreement and disagreement
Agreement = []
Disagreement = []

# Iterate through each row in the DataFrame
for index, row in cornell_vs_genderize.iterrows():
    # If they agree, append to Agreement
    if row['genderize_gender'] == row['gender']:
        Agreement.append(row['character_name'])
    # If they disagree, append to Disagreement with both predictions
    else:
        disagreement_details = {
            'character_name': row['character_name'],
            'genderize_gender': row['genderize_gender'],
            'cornell_gender': row['gender']
        }
        Disagreement.append(disagreement_details)

# Check the results
print(f"Agreement: {len(Agreement)}")
print(f"Disagreement: {len(Disagreement)}")

# Calculate the percentage for Agreement and Disagreement
Amount_of_guesses = len(Agreement)+len(Disagreement)
Agreement_percentage = (len(Agreement)/Amount_of_guesses)*100
Disagreement_percentage = (len(Disagreement)/Amount_of_guesses)*100

print(f"Agreement percentage: {Agreement_percentage}")
print(f"Disagreement percentage: {Disagreement_percentage}")

# Convert the Disagreement list into a DataFrame
disagreement_df = pd.DataFrame(Disagreement)

# Print the names they disagreed on
print(disagreement_df.head(50))

Agreement: 2536
Disagreement: 479
Agreement percentage: 84.11276948590381
Disagreement percentage: 15.887230514096185
     character_name genderize_gender cornell_gender
0          Mandella                m              f
1            Korfin              NaN              m
2             Poole              NaN              m
3             Cates                f              m
4       Dino Velvet                f              m
5          Longdale              NaN              m
6              Reef              NaN              m
7         Lariviere              NaN              m
8          Twombley                f              m
9              Newt                m              f
10           Ripley              NaN              f
11        Cavalieri                m              f
12             Lorl              NaN              f
13      Schikaneder              NaN              m
14           Vogler              NaN              m
15             Alex                m              

So, that seems like a large disagreement, let's look at how many genderize was not able to predict for and how many it predicted wrong for

In [23]:
# Count occurrences of NaN in genderize_gender where cornell_gender is 'f'
nan_f_count = ((disagreement_df['genderize_gender'].isna()) & (disagreement_df['cornell_gender'] == 'f')).sum()

# Count occurrences of NaN in genderize_gender where cornell_gender is 'm'
nan_m_count = ((disagreement_df['genderize_gender'].isna()) & (disagreement_df['cornell_gender'] == 'm')).sum()

print(f"Number of NaN Genderize predictions when Cornell says 'f': {nan_f_count}")
print(f"Number of NaN Genderize predictions when Cornell says 'm': {nan_m_count}")
print(f"Number of NaN Genderize predictions when Cornell says 'f' or 'm: {nan_m_count + nan_f_count}")

Number of NaN Genderize predictions when Cornell says 'f': 81
Number of NaN Genderize predictions when Cornell says 'm': 248
Number of NaN Genderize predictions when Cornell says 'f' or 'm: 329


## Checking agreement between Gender-Guesser and Cornell

In [24]:
# Make a copy of cornell_movie_meta_compare_df
cornell_vs_genderguesser = cornell_movie_meta_compare_df.copy()

# Keep only the rows where cornell already had gender labels
cornell_vs_genderguesser = cornell_vs_genderguesser[cornell_vs_genderguesser['gender'] != '?']

# Check the length of the df (should be 3015)
print(f"Number of character names: {len(cornell_vs_genderguesser)}")

# View the dataframe
print(cornell_vs_genderguesser.head(20))

Number of character names: 3015
   characterID character_name movieID                   movie_title gender  \
0          u0          Bianca     m0    10 things i hate about you       f   
2          u2         Cameron     m0    10 things i hate about you       m   
4          u4            Joey     m0    10 things i hate about you       m   
5          u5             Kat     m0    10 things i hate about you       f   
6          u6        Mandella     m0    10 things i hate about you       f   
7          u7         Michael     m0    10 things i hate about you       m   
9          u9         Patrick     m0    10 things i hate about you       m   
11        u11          Walter     m0    10 things i hate about you       m   
16        u16        Columbus     m1    1492: conquest of paradise       m   
19        u19        Marchena     m1    1492: conquest of paradise       m   
26        u26          Cutler     m2                    15 minutes       m   
27        u27          Daphne   

### Checking their agreement

In [25]:
# Initialize lists to store agreement and disagreement
Agreement = []
Disagreement = []

# Iterate through each row in the DataFrame
for index, row in cornell_vs_genderguesser.iterrows():
    # If they agree, append to Agreement
    if row['genderguesser_gender'] == row['gender']:
        Agreement.append(row['character_name'])
    # If they disagree, append to Disagreement with both predictions
    else:
        disagreement_details = {
            'character_name': row['character_name'],
            'genderguesser_gender': row['genderguesser_gender'],
            'cornell_gender': row['gender']
        }
        Disagreement.append(disagreement_details)

# Check the results
print(f"Agreement: {len(Agreement)}")
print(f"Disagreement: {len(Disagreement)}")

# Calculate the percentage for Agreement and Disagreement
Amount_of_guesses = len(Agreement)+len(Disagreement)
Agreement_percentage = (len(Agreement)/Amount_of_guesses)*100
Disagreement_percentage = (len(Disagreement)/Amount_of_guesses)*100

print(f"Agreement percentage: {Agreement_percentage}")
print(f"Disagreement percentage: {Disagreement_percentage}")

# Convert the Disagreement list into a DataFrame
disagreement_genderguesser_df = pd.DataFrame(Disagreement)

# Print the names they disagreed on
print(disagreement_genderguesser_df.head(20))

Agreement: 1817
Disagreement: 1198
Agreement percentage: 60.26533996683251
Disagreement percentage: 39.73466003316749
   character_name genderguesser_gender cornell_gender
0        Mandella                    ?              f
1        Marchena                    ?              m
2          Cutler                    ?              m
3          Korfin                    ?              m
4          Bowman                    ?              m
5           Poole                    ?              m
6           Cates                    ?              m
7         Hammond                    ?              m
8          Korben                    ?              m
9          Leeloo                    ?              f
10    Dino Velvet                    ?              m
11       Longdale                    ?              m
12  Mrs Christian                    ?              m
13    Mrs Mathews                    ?              m
14         Welles                    ?              m
15           Reef 

So, that seems like a large disagreement, let's look at how many genderize was not able to predict for and how many it predicted wrong for

In [26]:
# Count occurrences of NaN in genderize_gender where cornell_gender is 'f'
nan_f_count = ((disagreement_genderguesser_df['genderguesser_gender'] == '?') & (disagreement_genderguesser_df['cornell_gender'] == 'f')).sum()

# Count occurrences of NaN in genderize_gender where cornell_gender is 'm'
nan_m_count = ((disagreement_genderguesser_df['genderguesser_gender'] == '?') & (disagreement_genderguesser_df['cornell_gender'] == 'm')).sum()

print(f"Number of NaN Genderize predictions when Cornell says 'f': {nan_f_count}")
print(f"Number of NaN Genderize predictions when Cornell says 'm': {nan_m_count}")
print(f"Number of NaN Genderize predictions when Cornell says 'f' or 'm: {nan_m_count + nan_f_count}")

Number of NaN Genderize predictions when Cornell says 'f': 173
Number of NaN Genderize predictions when Cornell says 'm': 904
Number of NaN Genderize predictions when Cornell says 'f' or 'm: 1077


## Movie_lines

### Clean the data

In [27]:
# Read the file lines
with open('/work/VNV/Module 1 - Data acqusition and preparation/data/movie_lines.txt', 'r', encoding='latin1') as file:
    lines = file.readlines()

# Split each line into columns based on the '+++$+++' delimiter
split_lines = [line.strip().split(r'+++$+++') for line in lines]

# Convert the list of split lines into a DataFrame
cornell_lines_df = pd.DataFrame(split_lines, columns=['lineID', 'characterID', 'movieID', 'character_name', 'text_of_the_utterance'])

# Save the DataFrame to a CSV file
cornell_lines_df.to_csv('/work/VNV/Module 1 - Data acqusition and preparation/data/movie_lines_cleaned.csv', index=False)

# Display the first few rows of the DataFrame
print(cornell_lines_df.head())

   lineID characterID movieID character_name text_of_the_utterance
0  L1045          u0      m0         BIANCA           They do not!
1  L1044          u2      m0        CAMERON            They do to!
2   L985          u0      m0         BIANCA             I hope so.
3   L984          u2      m0        CAMERON              She okay?
4   L925          u0      m0         BIANCA              Let's go.


## Movie titles metadata

### Clean the data

In [28]:
# Read the file lines
with open('/work/VNV/Module 1 - Data acqusition and preparation/data/movie_titles_metadata.txt', 'r', encoding='latin1') as file:
    lines = file.readlines()

# Split each line into columns based on the '+++$+++' delimiter
split_lines = [line.strip().split(r'+++$+++') for line in lines]

# Remove '/I' from the 'movie_year' column
for line in split_lines:
    line[2] = line[2].replace("/I", "")  # Remove '/I' from the 'movie_year' column (3rd element)

# Convert the list of split lines into a DataFrame
cornell_movie_title_meta_df = pd.DataFrame(split_lines, columns=['movieID', 'movie_title', 'movie_year', 'IMDB_rating', 'no_IMDB_votes', 'genres'])

# Save the DataFrame to a CSV file
cornell_movie_title_meta_df.to_csv('/work/VNV/Module 1 - Data acqusition and preparation/data/movie_title_meta_cleaned.csv', index=False)

# Display the first few rows of the DataFrame
print(cornell_movie_title_meta_df.head())

  movieID                   movie_title movie_year IMDB_rating no_IMDB_votes  \
0     m0    10 things i hate about you       1999        6.90         62847    
1     m1    1492: conquest of paradise       1992        6.20         10421    
2     m2                    15 minutes       2001        6.10         25854    
3     m3         2001: a space odyssey       1968        8.40        163227    
4     m4                       48 hrs.       1982        6.90         22289    

                                              genres  
0                              ['comedy', 'romance']  
1     ['adventure', 'biography', 'drama', 'history']  
2           ['action', 'crime', 'drama', 'thriller']  
3                 ['adventure', 'mystery', 'sci-fi']  
4   ['action', 'comedy', 'crime', 'drama', 'thril...  


### Checking how many movies are from each year + the year range

In [29]:
# Convert the 'movie_year' column to numeric values (integers)
cornell_movie_title_meta_df['movie_year'] = pd.to_numeric(cornell_movie_title_meta_df['movie_year'], errors='coerce')

# Create a new column for the decade (round down to nearest 10)
cornell_movie_title_meta_df['decade'] = (cornell_movie_title_meta_df['movie_year'] // 10) * 10

# Group by decade and count the number of movies
decade_counts = cornell_movie_title_meta_df.groupby('decade')['movie_year'].count()

# Convert the result into a DataFrame for better display
decade_counts_df = pd.DataFrame(decade_counts).reset_index()
decade_counts_df.columns = ['Decade', 'Movie Count']

# Display the table
print(decade_counts_df)

   Decade  Movie Count
0    1920            2
1    1930           16
2    1940           15
3    1950           17
4    1960           19
5    1970           51
6    1980          108
7    1990          244
8    2000          144
9    2010            1


# Combining columns from the different dataframes to make the final ones
movie_characters_metadata.txt contains:\
    - **characterID**\
    - **character name**\
    - movieID\
    - movie title\
    - **gender** ("?" for unlabeled cases)\
    - position in credits ("?" for unlabeled cases)

movie_lines.txt contains:\
    - lineID\
    - **characterID** (who uttered this phrase)\
    - movieID\
    - **character name**\
    - **text of the utterance**

movie_titles_metadata.txt contains:\
    - movieID\
    - movie title\
    - **movie year**\
    - IMDB rating\
    - no. IMDB votes\
    - genres in the format ['genre1','genre2','genreN']

## Loading in data frames

In [30]:
# Load in the cleaned datasets
movie_characters_original = pd.read_csv("/work/VNV/Module 1 - Data acqusition and preparation/data/clean_cornell.csv")
movie_lines = pd.read_csv("/work/VNV/Module 1 - Data acqusition and preparation/data/movie_lines_cleaned.csv")
movie_titles = pd.read_csv("/work/VNV/Module 1 - Data acqusition and preparation/data/movie_title_meta_cleaned.csv")

## Cleaning the dataframes

In [31]:
# Strip leading and trailing spaces from the 'gender' column
movie_characters_original['characterID'] = movie_characters_original['characterID'].str.strip()
movie_characters_original['character_name'] = movie_characters_original['character_name'].str.strip()
movie_characters_original['movieID'] = movie_characters_original['movieID'].str.strip()
movie_characters_original['gender'] = movie_characters_original['gender'].str.strip()

# Normalize the character names by capitalizing the first letter of each word - VERY IMPORTANT FOR GENDERIZE AND GENDER-GUESSER TO WORK -
movie_characters_original['character_name'] = movie_characters_original['character_name'].str.title()

movie_lines['characterID'] = movie_lines['characterID'].str.strip()
movie_lines['movieID'] = movie_lines['movieID'].str.strip()
movie_lines['character_name'] = movie_lines['character_name'].str.strip()
movie_lines['text_of_the_utterance'] = movie_lines['text_of_the_utterance'].str.strip()

movie_titles['movie_year'] = movie_titles['movie_year'].astype(str).str.strip()
movie_titles['movieID'] = movie_titles['movieID'].str.strip()

# Remove characters where a gender is not assigned
print(f"number of characters before cutting: {len(movie_characters_original)}")
movie_characters_original = movie_characters_original[movie_characters_original['gender'] != '?']
print(f"number of characters after cutting: {len(movie_characters_original)}")

number of characters before cutting: 9035
number of characters after cutting: 3015


### Add gender

In [32]:
movie_script_original_df = pd.merge(
    movie_lines,  # keep all the columns from movie_lines 
    movie_characters_original[['characterID', 'gender']],  # Select 'characterID' and 'gender' from movie_characters_original
    on='characterID',  # Merge on 'characterID'
    how='left'  # Use left join to keep all rows from movie_lines
)

print(movie_script_original_df.head())

# Checking to see if the length is still correct
print(f"Length of movie_lines: {len(movie_lines)}")
print(f"Length of movie_script_original_df: {len(movie_script_original_df)}")

   lineID characterID movieID character_name text_of_the_utterance gender
0  L1045           u0      m0         BIANCA          They do not!      f
1  L1044           u2      m0        CAMERON           They do to!      m
2   L985           u0      m0         BIANCA            I hope so.      f
3   L984           u2      m0        CAMERON             She okay?      m
4   L925           u0      m0         BIANCA             Let's go.      f
Length of movie_lines: 304713
Length of movie_script_original_df: 304713


### Add movie year

In [33]:
movie_script_original_df = pd.merge(
    movie_script_original_df,  # keep all the columns from movie_lines 
    movie_titles[['movieID', 'movie_year']],  # Select 'movieID' and 'movie_year' from movie_titles
    on='movieID',  # Merge on 'movieID'
    how='left'  # Use left join to keep all rows from movie_script_original_df
)

print(movie_script_original_df.head())

# Checking to see if the length is still correct
print(f"Length of movie_lines: {len(movie_lines)}")
print(f"Length of movie_script_original_df: {len(movie_script_original_df)}")

   lineID characterID movieID character_name text_of_the_utterance gender  \
0  L1045           u0      m0         BIANCA          They do not!      f   
1  L1044           u2      m0        CAMERON           They do to!      m   
2   L985           u0      m0         BIANCA            I hope so.      f   
3   L984           u2      m0        CAMERON             She okay?      m   
4   L925           u0      m0         BIANCA             Let's go.      f   

  movie_year  
0       1999  
1       1999  
2       1999  
3       1999  
4       1999  
Length of movie_lines: 304713
Length of movie_script_original_df: 304713


### Remove the lines from characters that do not have a gender assigned (as they are irrelevant)

In [34]:
# Check the unique values in gender
print(movie_script_original_df['gender'].unique())

# Remove dialogue lines from characters where a gender is not assigned
print(f"number of lines before cutting characters with no gender label: {len(movie_script_original_df)}")
movie_script_original_df = movie_script_original_df[movie_script_original_df['gender'].notna()]
print(f"number of lines after cutting characters with no gender label: {len(movie_script_original_df)}")

# Check the unique values in gender to check that only f and m are there
print(movie_script_original_df['gender'].unique())

# Remove any characters that have less than 5 lines of dialogue
print(f"number of lines before cutting <5: {len(movie_script_original_df)}")

line_counts = movie_script_original_df['characterID'].value_counts() # Count the number of lines for each character
characters_with_5_or_more_lines = line_counts[line_counts >= 5].index # Keep only characters with 5 or more lines
movie_script_original_df = movie_script_original_df[movie_script_original_df['characterID'].isin(characters_with_5_or_more_lines)] # Filter the movie_script_original_df to keep only rows where characterID is in the list of characters with 5+ lines

print(f"number of lines after cutting <5: {len(movie_script_original_df)}")

# Check the heading
print(movie_script_original_df.head())

['f' 'm' nan]
number of lines before cutting characters with no gender label: 304713
number of lines after cutting characters with no gender label: 242023
['f' 'm']
number of lines before cutting <5: 242023
number of lines after cutting <5: 242023
   lineID characterID movieID character_name text_of_the_utterance gender  \
0  L1045           u0      m0         BIANCA          They do not!      f   
1  L1044           u2      m0        CAMERON           They do to!      m   
2   L985           u0      m0         BIANCA            I hope so.      f   
3   L984           u2      m0        CAMERON             She okay?      m   
4   L925           u0      m0         BIANCA             Let's go.      f   

  movie_year  
0       1999  
1       1999  
2       1999  
3       1999  
4       1999  


### Check how many movies, characters, and dialogue lines are left + when the movies are from

In [35]:
print(f"Number of movies: {movie_script_original_df['movieID'].nunique()}")

print(f"Number of characters: {movie_script_original_df['characterID'].nunique()}")

print(f"Total number of dialog lines: {len(movie_script_original_df)}")

Number of movies: 600
Number of characters: 3015
Total number of dialog lines: 242023


In [36]:
# Convert the 'movie_year' column to numeric values (integers)
movie_script_original_df['movie_year'] = pd.to_numeric(movie_script_original_df['movie_year'], errors='coerce')

# Drop duplicate movies based on 'movieID'
unique_movies_df = movie_script_original_df.drop_duplicates(subset=['movieID'])

# Create a new column for the decade (round down to nearest 10)
unique_movies_df['decade'] = (unique_movies_df['movie_year'] // 10) * 10

# Group by decade and count the number of unique movies
decade_counts = unique_movies_df.groupby('decade')['movie_year'].count()

# Convert the result into a DataFrame for better display
decade_counts_df = pd.DataFrame(decade_counts).reset_index()
decade_counts_df.columns = ['Decade', 'Movie Count']

# Display the table
print(decade_counts_df)

   Decade  Movie Count
0    1920            1
1    1930           15
2    1940           15
3    1950           17
4    1960           19
5    1970           49
6    1980          105
7    1990          239
8    2000          139
9    2010            1


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  unique_movies_df['decade'] = (unique_movies_df['movie_year'] // 10) * 10


### Save the dataframe for future use

In [37]:
# Save the DataFrame to a CSV file
movie_script_original_df.to_csv('/work/VNV/Module 1 - Data acqusition and preparation/output/movie_script_original_df', index=False)

## Dataframe with genders from the original + genderize

In [38]:
# Create a copy of the original DataFrame to keep it intact
cornell_movie_meta_genderize_df = cornell_movie_meta_df.copy()

# Defining our API key
API_key = "freebie-17c383ac-f296-4448-bb9f-2758878e0b2e"

# Initialize Genderize
genderize = Genderize(api_key=API_key)

# Initialize a counter to keep track of how many predictions are made
genderize_prediction_counter = 0

# Loop through each row and apply the logic
for index, row in cornell_movie_meta_genderize_df.iterrows():
    if row['gender'] == '?':  # Check if gender is missing
        try:
            # Get a prediction of gender from the Genderize API based on the character's name
            result = genderize.get([row['character_name']])
            
            # Only update gender if the probability of the prediction by Genderize is above 0.9
            if result and result[0].get('probability') > 0.9:
                
                # Map the gender to "m" for male and "f" for female
                predicted_gender = result[0]['gender']
                if predicted_gender == 'male':
                    cornell_movie_meta_genderize_df.at[index, 'gender'] = 'm'
                    genderize_prediction_counter += 1  # Increment the counter
                elif predicted_gender == 'female':
                    cornell_movie_meta_genderize_df.at[index, 'gender'] = 'f'
                    genderize_prediction_counter += 1  # Increment the counter

        # If an error occurs, print the character name that caused it
        except GenderizeException as e:
            print(f"Error for {row['character_name']}: {e}")
            # Continue with the next row even if an error occurs
            continue

# Save the DataFrame to a CSV file
cornell_movie_meta_genderize_df.to_csv('/work/VNV/Module 1 - Data acqusition and preparation/output/updated_cornell_movie_meta_genderize.csv', index=False)

# Check the result
print(cornell_movie_meta_genderize_df.head())
print(genderize_prediction_counter)

Error for ManS Voice: ("Invalid UTF8 in 'name' parameter", 422, {'Server': 'nginx/1.16.1', 'Date': 'Thu, 02 Jan 2025 16:06:42 GMT', 'Content-Type': 'application/json', 'Content-Length': '44', 'Connection': 'keep-alive', 'vary': 'accept-encoding', 'cache-control': 'max-age=0, private, must-revalidate', 'x-request-id': 'GBbr0_eHxGIOMZEWHcHD', 'access-control-allow-credentials': 'true', 'access-control-allow-origin': '*', 'access-control-expose-headers': 'x-rate-limit-limit,x-rate-limit-remaining,x-rate-limit-reset'})
  characterID character_name movieID                   movie_title gender  \
0         u0          Bianca     m0    10 things i hate about you       f   
1         u1           Bruce     m0    10 things i hate about you       m   
2         u2         Cameron     m0    10 things i hate about you       m   
3         u3        Chastity     m0    10 things i hate about you       f   
4         u4            Joey     m0    10 things i hate about you       m   

  position_in_c

## Load in the data

In [39]:
movie_characters_genderize = pd.read_csv("/work/VNV/Module 1 - Data acqusition and preparation/output/updated_cornell_movie_meta_genderize.csv")
movie_lines = pd.read_csv("/work/VNV/Module 1 - Data acqusition and preparation/data/movie_lines_cleaned.csv")
movie_titles = pd.read_csv("/work/VNV/Module 1 - Data acqusition and preparation/data/movie_title_meta_cleaned.csv")

## Clean the dataframes

In [40]:
# Strip leading and trailing spaces from the 'gender' column
movie_characters_genderize['characterID'] = movie_characters_genderize['characterID'].str.strip()
movie_characters_genderize['character_name'] = movie_characters_genderize['character_name'].str.strip()
movie_characters_genderize['movieID'] = movie_characters_genderize['movieID'].str.strip()

movie_lines['characterID'] = movie_lines['characterID'].str.strip()
movie_lines['movieID'] = movie_lines['movieID'].str.strip()
movie_lines['character_name'] = movie_lines['character_name'].str.strip()
movie_lines['text_of_the_utterance'] = movie_lines['text_of_the_utterance'].str.strip()

movie_titles['movie_year'] = movie_titles['movie_year'].astype(str).str.strip()
movie_titles['movieID'] = movie_titles['movieID'].str.strip()

# Remove characters where a gender is not assigned
print(f"number of characters before cutting: {len(movie_characters_genderize)}")
movie_characters_genderize = movie_characters_genderize[movie_characters_genderize['gender'] != '?']
print(f"number of characters after cutting: {len(movie_characters_genderize)}")

number of characters before cutting: 9035
number of characters after cutting: 6776


### Add gender

In [41]:
movie_script_genderize_df = pd.merge(
    movie_lines,  # keep all the columns from movie_lines 
    movie_characters_genderize[['characterID', 'gender']],  # Select 'characterID' and 'gender' from movie_characters_genderize
    on='characterID',  # Merge on 'characterID'
    how='left'  # Use left join to keep all rows from movie_lines
)

print(movie_script_genderize_df.head())

# Checking to see if the length is still correct
print(f"Length of movie_lines: {len(movie_lines)}")
print(f"Length of movie_script_genderize_df: {len(movie_script_genderize_df)}")

   lineID characterID movieID character_name text_of_the_utterance gender
0  L1045           u0      m0         BIANCA          They do not!      f
1  L1044           u2      m0        CAMERON           They do to!      m
2   L985           u0      m0         BIANCA            I hope so.      f
3   L984           u2      m0        CAMERON             She okay?      m
4   L925           u0      m0         BIANCA             Let's go.      f
Length of movie_lines: 304713
Length of movie_script_genderize_df: 304713


### Add movie year

In [42]:
movie_script_genderize_df = pd.merge(
    movie_script_genderize_df,  # keep all the columns from movie_lines 
    movie_titles[['movieID', 'movie_year']],  # Select 'movieID' and 'movie_year' from movie_titles
    on='movieID',  # Merge on 'movieID'
    how='left'  # Use left join to keep all rows from movie_script_genderize_df
)

print(movie_script_genderize_df.head())

# Checking to see if the length is still correct
print(f"Length of movie_lines: {len(movie_lines)}")
print(f"Length of movie_script_genderize_df: {len(movie_script_genderize_df)}")

   lineID characterID movieID character_name text_of_the_utterance gender  \
0  L1045           u0      m0         BIANCA          They do not!      f   
1  L1044           u2      m0        CAMERON           They do to!      m   
2   L985           u0      m0         BIANCA            I hope so.      f   
3   L984           u2      m0        CAMERON             She okay?      m   
4   L925           u0      m0         BIANCA             Let's go.      f   

  movie_year  
0       1999  
1       1999  
2       1999  
3       1999  
4       1999  
Length of movie_lines: 304713
Length of movie_script_genderize_df: 304713


### Remove the lines from characters that do not have a gender assigned (as they are irrelevant)

In [43]:
# Check the unique values in gender
print(movie_script_genderize_df['gender'].unique())

# Remove dialogue lines from characters where a gender is not assigned
print(f"number of lines before cutting characters with no gender label: {len(movie_script_genderize_df)}")
movie_script_genderize_df = movie_script_genderize_df[movie_script_genderize_df['gender'].notna()]
print(f"number of lines after cutting characters with no gender label: {len(movie_script_genderize_df)}")

# Check the unique values in gender to check that only f and m are there
print(movie_script_genderize_df['gender'].unique())

# Remove any characters that have less than 5 lines of dialogue
print(f"number of lines before cutting <5: {len(movie_script_genderize_df)}")

line_counts = movie_script_genderize_df['characterID'].value_counts() # Count the number of lines for each character
characters_with_5_or_more_lines = line_counts[line_counts >= 5].index # Keep only characters with 5 or more lines
movie_script_genderize_df = movie_script_genderize_df[movie_script_genderize_df['characterID'].isin(characters_with_5_or_more_lines)] # Filter the movie_script_genderize_df to keep only rows where characterID is in the list of characters with 5+ lines

print(f"number of lines after cutting <5: {len(movie_script_genderize_df)}")

# Check the heading
print(movie_script_genderize_df.head())

['f' 'm' nan]
number of lines before cutting characters with no gender label: 304713
number of lines after cutting characters with no gender label: 281275
['f' 'm']
number of lines before cutting <5: 281275
number of lines after cutting <5: 278176
   lineID characterID movieID character_name text_of_the_utterance gender  \
0  L1045           u0      m0         BIANCA          They do not!      f   
1  L1044           u2      m0        CAMERON           They do to!      m   
2   L985           u0      m0         BIANCA            I hope so.      f   
3   L984           u2      m0        CAMERON             She okay?      m   
4   L925           u0      m0         BIANCA             Let's go.      f   

  movie_year  
0       1999  
1       1999  
2       1999  
3       1999  
4       1999  


### Check how many movies, characters, and dialogue lines are left + when the movies are from

In [44]:
print(f"Number of movies: {movie_script_genderize_df['movieID'].nunique()}")

print(f"Number of characters: {movie_script_genderize_df['characterID'].nunique()}")

print(f"Total number of dialog lines: {len(movie_script_genderize_df)}")

Number of movies: 616
Number of characters: 5929
Total number of dialog lines: 278176


In [45]:
# Convert the 'movie_year' column to numeric values (integers)
movie_script_genderize_df['movie_year'] = pd.to_numeric(movie_script_genderize_df['movie_year'], errors='coerce')

# Drop duplicate movies based on 'movieID'
unique_movies_df = movie_script_genderize_df.drop_duplicates(subset=['movieID'])

# Create a new column for the decade (round down to nearest 10)
unique_movies_df['decade'] = (unique_movies_df['movie_year'] // 10) * 10

# Group by decade and count the number of unique movies
decade_counts = unique_movies_df.groupby('decade')['movie_year'].count()

# Convert the result into a DataFrame for better display
decade_counts_df = pd.DataFrame(decade_counts).reset_index()
decade_counts_df.columns = ['Decade', 'Movie Count']

# Display the table
print(decade_counts_df)

   Decade  Movie Count
0    1920            2
1    1930           16
2    1940           15
3    1950           17
4    1960           19
5    1970           51
6    1980          108
7    1990          244
8    2000          143
9    2010            1


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  unique_movies_df['decade'] = (unique_movies_df['movie_year'] // 10) * 10


In [46]:
print(unique_movies_df.head())

      lineID characterID movieID character_name  \
0     L1045           u0      m0         BIANCA   
669   L2181          u12      m1         ALONSO   
942   L3546          u26      m2         CUTLER   
1614  L3778          u55      m3          FLOYD   
1886  L5048          u75      m4         LUTHER   

                                  text_of_the_utterance gender  movie_year  \
0                                          They do not!      f        1999   
669   Can't be that far, I say.  Also, I don't like ...      m        1992   
942   Officers, there's your killer, do your duty, a...      m        2001   
1614          We're trying to get there. I hope we can.      m        1968   
1886                                 Great, just great.      m        1982   

      decade  
0       1990  
669     1990  
942     2000  
1614    1960  
1886    1980  


In [47]:
# Create decade
# Convert the 'movie_year' column to numeric values (integers)
movie_script_genderize_df['movie_year'] = pd.to_numeric(movie_script_genderize_df['movie_year'], errors='coerce')

# Drop duplicate movies based on 'characterID'
unique_characters_df = movie_script_genderize_df.drop_duplicates(subset=['characterID'])

# Create a new column for the decade (round down to nearest 10)
unique_characters_df['decade'] = (unique_characters_df['movie_year'] // 10) * 10

# Group by decade and gender
decade_counts = unique_characters_df.groupby(['decade', 'gender']).size().reset_index(name='count')

# Pivot the table to separate female and male counts
decade_pivot = decade_counts.pivot(index='decade', columns='gender', values='count').reset_index()

# Rename columns for clarity
decade_pivot.columns = ['Decade', 'Female Characters', 'Male Characters']

# Convert counts to integers (so it says e.g., 22 and not 22.0)
decade_pivot['Female Characters'] = decade_pivot['Female Characters'].astype(int)
decade_pivot['Male Characters'] = decade_pivot['Male Characters'].astype(int)

# Display the table
print(decade_pivot)

   Decade  Female Characters  Male Characters
0    1920                  1                6
1    1930                 43              128
2    1940                 51              105
3    1950                 49               99
4    1960                 48              130
5    1970                132              365
6    1980                293              699
7    1990                704             1682
8    2000                462              921
9    2010                  3                8


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  unique_characters_df['decade'] = (unique_characters_df['movie_year'] // 10) * 10


### Save the dataframe for future use

In [48]:
# Save the DataFrame to a CSV file
movie_script_genderize_df.to_csv('/work/VNV/Module 1 - Data acqusition and preparation/output/movie_script_genderize_df.csv', index=False)