In [1]:
import pandas as pd
import numpy as py
import openai
import json
import os
import datetime

from pprint import pprint
from dotenv import load_dotenv

In [2]:
# Read credentials from the local env file, then build the API client that the
# rest of the notebook shares.
load_dotenv("auths.env")
openai_api_key = os.environ.get("OPENAI_API_KEY")
client = openai.OpenAI(api_key=openai_api_key)

In [3]:
def get_gpt_response(system_content, user_content, model="gpt-4o-mini"):
    """Send one system + user message pair to the chat completions API.

    Args:
        system_content: Text sent as the ``system`` message.
        user_content: Text sent as the ``user`` message.
        model: Model name passed to the API. Defaults to ``"gpt-4o-mini"``,
            the value that was previously hard-coded, so existing two-argument
            calls behave exactly as before.

    Returns:
        The ``content`` of the first choice's message in the completion.
    """
    messages = [
        {"role": "system", "content": system_content},
        {"role": "user", "content": user_content},
    ]
    # Relies on the module-level ``client`` built earlier in the notebook.
    completion = client.chat.completions.create(model=model, messages=messages)

    return completion.choices[0].message.content

In [4]:
def extract_awardwinners_info(start_year, end_year):
    """Request Best Actress Oscar winner data for a year range and save it as JSON.

    Builds a fixed system prompt and a user prompt parameterised by the year
    range, sends both through ``get_gpt_response``, parses the reply as JSON and
    writes the result to
    ``best_actress_oscar_winners/raw_{start_year}-{end_year}.json``.

    Args:
        start_year: Start of the year range, interpolated into the prompt and
            the output file name.
        end_year: End of the year range, interpolated into the prompt and the
            output file name.

    Returns:
        None. The parsed data is only written to disk.

    Raises:
        json.JSONDecodeError: If the reply is not valid JSON even after a
            surrounding Markdown code fence has been removed.
    """
    # NOTE(review): the reply is model-generated and is not validated here;
    # the records should be checked against the sources they cite.
    systemcontent_extractor = """
    You are a data researcher and extractor, capable of browsing the web for data and delivering it in a semi-structured manner.
    """

    usercontent_bestactresses = f"""
    I'd like your help with data research and extraction. I want you to get data about the Oscar winners for Best Actress from {start_year} to {end_year}.

    For each winner, I'd like to know the following info:
    - ceremony_year: Year in which the ceremony was held
    - ceremony_number: Number of the ceremony
    - full_name: Full name
    - artistic_name: Artistic name
    - country_birth: Country she was born in
    - countries_citizenship: Country(s) she has citizenship to
    - role: Name of the role(s) she played
    - movie: Name of the movie(s) she was awarded for
    - release_year: Year in which the movie was released
    - main_production_country: Main country where said movie(s) were produced
    - all_production_countries: All countries involved in the production of the movie
    - main_language: Main language spoken in said movie(s)
    - all_languages: All languages credited as spoken in the movie(s)
    - source_ceremony: Source for the info on the ceremony (e.g. "https://en.wikipedia.org/wiki/94th_Academy_Awards")
    - source_actress: Source for the info on the actress (e.g. "https://en.wikipedia.org/wiki/Frances_McDormand")
    - source_movie: Source for the info on the movie (e.g. "https://en.wikipedia.org/wiki/Nomadland")

    Also, please follow these instructions:
    - Prioritize Wikipedia as the source from this data;
    - Return the data as a list of dictionaries that can be formatted as JSON;
    - Return ONLY the relevant and formatted data, without any text before or after it, not even "```json" or similar,
    - countries_citizenship, all_production_countries and all_languages should be returned as lists, even if they have only one item.
    """

    raw_reply = get_gpt_response(systemcontent_extractor, usercontent_bestactresses)

    # The prompt asks for bare JSON, but the reply may still arrive wrapped in a
    # Markdown fence (``` or ```json). Peel the fence off before parsing so a
    # cosmetic wrapper does not abort the extraction.
    payload = (raw_reply or "").strip()
    if payload.startswith("```"):
        # Drop the opening fence line, then everything from the closing fence on.
        _, _, payload = payload.partition("\n")
        payload = payload.rsplit("```", 1)[0]

    winners_info = json.loads(payload)

    out_dir = 'best_actress_oscar_winners'
    # open(..., 'w') does not create missing directories, so make sure it exists.
    os.makedirs(out_dir, exist_ok=True)

    with open(f'{out_dir}/raw_{start_year}-{end_year}.json', 'w', encoding='UTF-8') as f:
        json.dump(winners_info, f)

In [16]:
# Step through 1929 -> 2019 in ten-year windows; each window ends where the
# next one begins. The extraction call is intentionally left disabled, so this
# cell only prints the windows it would cover:
#   extract_awardwinners_info(start_year, end_year)
# NOTE(review): adjacent windows share a boundary year (e.g. 1939 appears in
# both 1929-1939 and 1939-1949); if the prompt's range is treated as inclusive
# this may fetch that year twice - confirm before re-enabling the call.
start_year = 1929
while start_year < 2019:
    end_year = start_year + 10
    print(f"Extracted years {start_year} to {end_year}")
    start_year = end_year

Extracted years 1929 to 1939
Extracted years 1939 to 1949
Extracted years 1949 to 1959
Extracted years 1959 to 1969
Extracted years 1969 to 1979
Extracted years 1979 to 1989
Extracted years 1989 to 1999
Extracted years 1999 to 2009
Extracted years 2009 to 2019


In [19]:
# One-off extraction for the 2009-2019 window.
extract_awardwinners_info(start_year=2009, end_year=2019)

In [21]:
fp_10y = 'best_actress_oscar_winners/every10years/'

# Collect the winner records from every JSON file in the folder into one flat list.
winners_list = []
# os.listdir returns names in arbitrary order; sorting keeps the row order
# stable across runs and machines.
for file in sorted(os.listdir(fp_10y)):
    # Skip anything that is not a JSON export (e.g. checkpoint folders or OS
    # metadata files), which would otherwise fail to open or parse.
    if not file.endswith('.json'):
        continue
    # Read as UTF-8 explicitly instead of relying on the platform default;
    # the extraction step in this notebook writes its files as UTF-8.
    with open(os.path.join(fp_10y, file), 'r', encoding='UTF-8') as f:
        winners_list.extend(json.load(f))

In [22]:
# One row per winner record; the columns come from the dictionary keys.
df_bawinners_orig = pd.DataFrame(winners_list)

In [23]:
# Preview the first 20 rows.
df_bawinners_orig.head(20)

Unnamed: 0,year,ceremony_number,full_name,artistic_name,country_birth,countries_citizenship,role,movie,release_year,main_production_country,all_production_countries,main_language,all_languages,source_ceremony,source_actress,source_movie
0,1929,1,Janet Gaynor,Janet Gaynor,United States,[United States],Multiple roles in films,"Seventh Heaven, Street Angel, Sunrise",1927,United States,[United States],Silent (with English intertitles),[English],https://en.wikipedia.org/wiki/1st_Academy_Awards,https://en.wikipedia.org/wiki/Janet_Gaynor,https://en.wikipedia.org/wiki/Seventh_Heaven_(...
1,1930,2,Mary Pickford,Mary Pickford,Canada,"[Canada, United States]",Multiple roles in films,Coquette,1929,United States,[United States],English,[English],https://en.wikipedia.org/wiki/2nd_Academy_Awards,https://en.wikipedia.org/wiki/Mary_Pickford,https://en.wikipedia.org/wiki/Coquette_(1929_f...
2,1931,3,Helen Hayes,Helen Hayes,United States,[United States],Mary Stuart,The Sin of Madelon Claudet,1931,United States,[United States],English,[English],https://en.wikipedia.org/wiki/3rd_Academy_Awards,https://en.wikipedia.org/wiki/Helen_Hayes,https://en.wikipedia.org/wiki/The_Sin_of_Madel...
3,1932,4,Marie Dressler,Marie Dressler,Canada,"[Canada, United States]",Mrs. Cora P. Hargreaves,Min and Bill,1930,United States,[United States],English,[English],https://en.wikipedia.org/wiki/4th_Academy_Awards,https://en.wikipedia.org/wiki/Marie_Dressler,https://en.wikipedia.org/wiki/Min_and_Bill
4,1933,5,Helen Hayes,Helen Hayes,United States,[United States],"Catherine in ""The Private Life of Henry VIII""",The Sin of Madelon Claudet,1931,United States,[United States],English,[English],https://en.wikipedia.org/wiki/5th_Academy_Awards,https://en.wikipedia.org/wiki/Helen_Hayes,https://en.wikipedia.org/wiki/The_Sin_of_Madel...
5,1934,6,Bette Davis,Bette Davis,United States,[United States],Judith Traherne,Of Human Bondage,1934,United States,[United States],English,[English],https://en.wikipedia.org/wiki/6th_Academy_Awards,https://en.wikipedia.org/wiki/Bette_Davis,https://en.wikipedia.org/wiki/Of_Human_Bondage...
6,1935,7,Hepburn,Katharine Hepburn,United States,[United States],Sabrina Fairchild,Sabrina,1935,United States,[United States],English,[English],https://en.wikipedia.org/wiki/7th_Academy_Awards,https://en.wikipedia.org/wiki/Katharine_Hepburn,https://en.wikipedia.org/wiki/Sabrina_(1954_film)
7,1936,8,Luise Rainer,Luise Rainer,Germany,"[Germany, United States]",Sophie Tucker,The Great Ziegfeld,1936,United States,[United States],English,[English],https://en.wikipedia.org/wiki/8th_Academy_Awards,https://en.wikipedia.org/wiki/Luise_Rainer,https://en.wikipedia.org/wiki/The_Great_Ziegfeld
8,1937,9,Tameka Banks,Tameka Banks,United States,[United States],Annabel,The Good Earth,1937,United States,[United States],English,[English],https://en.wikipedia.org/wiki/9th_Academy_Awards,https://en.wikipedia.org/wiki/Tameka_Banks,https://en.wikipedia.org/wiki/The_Good_Earth_(...
9,1938,10,Vivien Leigh,Vivien Leigh,India,"[United Kingdom, United States]",Scarlett O'Hara,Gone with the Wind,1939,United States,[United States],English,[English],https://en.wikipedia.org/wiki/10th_Academy_Awards,https://en.wikipedia.org/wiki/Vivien_Leigh,https://en.wikipedia.org/wiki/Gone_with_the_Wi...
