In [112]:
import pandas as pd
import numpy as py
import openai
import json
import os
import datetime

from pprint import pprint
from dotenv import load_dotenv

In [25]:
load_dotenv("auths.env")
openai_api_key = os.getenv("OPENAI_API_KEY")
client = openai.OpenAI(api_key=openai_api_key)

In [49]:
def get_gpt_response(system_content, user_content):
    completion = client.chat.completions.create(
        model="gpt-4o-mini",
        messages=[
            {"role": "system", "content": system_content},
            {"role": "user", "content": user_content}
        ]
    )

    return completion.choices[0].message.content

In [75]:
def extract_awardwinners_info(start_year, end_year):
    systemcontent_extractor = """
    You are a data researcher and extractor, capable of browsing the web for data and delivering it in a semi-structured manner.
    """

    usercontent_bestactresses = f"""
    I'd like your help with data research and extraction. I want you to get data about the Oscar winners for Best Actress from {start_year} to {end_year}.

    For each winner, I'd like to know the following info:
    - year: Year of the awards
    - ceremony_number: Number of the ceremony
    - full_name: Full name
    - artistic_name: Artistic name
    - country_birth: Country she was born in
    - countries_citizenship: Country(s) she has citizenship to
    - role: Name of the role(s) she played
    - movie: Name of the movie(s) she was awarded for
    - release_year: Year in which the movie was released
    - main_production_country: Main country where said movie(s) were produced
    - all_production_countries: All countries involved in the production of the movie
    - main_language: Main language spoken in said movie(s)
    - all_languages: All languages credited as spoken in the movie(s)
    - source_ceremony: Source for the info on the ceremony (e.g. "https://en.wikipedia.org/wiki/94th_Academy_Awards")
    - source_actress: Source for the info on the actress (e.g. "https://en.wikipedia.org/wiki/Frances_McDormand")
    - source_movie: Source for the info on the movie (e.g. "https://en.wikipedia.org/wiki/Nomadland")

    Also, please follow these instructions:
    - Prioritize Wikipedia as the source from this data;
    - Return the data as a list of dictionaries that can be formatted as JSON;
    - Return ONLY the relevant and formatted data, without any text before or after it, not even "```json" or similar,
    - countries_citizenship, all_production_countries and all_languages should be returned as lists, even if they have only one item.
    """

    winners_info = json.loads(get_gpt_response(systemcontent_extractor, usercontent_bestactresses))

    ts_prefix = datetime.datetime.now().strftime('%Y%m%d') # -%H%M%S

    with open(f'best_actress_oscar_winners/raw_{start_year}-{end_year}.json', 'w', encoding='UTF-8') as f: #{ts_prefix}_
        json.dump(winners_info, f)

In [76]:
extract_awardwinners_info(1929, 1934)

In [105]:
start_year = 1929
for i in range(1, 20):
    end_year = start_year + 5
    extract_awardwinners_info(start_year, end_year)
    print(f"Extracted years {start_year} to {end_year}")
    start_year+=5

Extracted years 1929 to 1934
Extracted years 1934 to 1939
Extracted years 1939 to 1944
Extracted years 1944 to 1949
Extracted years 1949 to 1954
Extracted years 1954 to 1959
Extracted years 1959 to 1964
Extracted years 1964 to 1969
Extracted years 1969 to 1974
Extracted years 1974 to 1979
Extracted years 1979 to 1984
Extracted years 1984 to 1989
Extracted years 1989 to 1994
Extracted years 1994 to 1999
Extracted years 1999 to 2004
Extracted years 2004 to 2009
Extracted years 2009 to 2014
Extracted years 2014 to 2019
Extracted years 2019 to 2024


In [115]:
winners_list = []
for file in os.listdir('best_actress_oscar_winners'):
    with open(f'best_actress_oscar_winners/{file}', 'r') as f:
        for winner in json.load(f):
            winners_list.append(winner)

In [123]:
pd.DataFrame.from_dict(winners_list).iloc[-15:-7]

Unnamed: 0,year,ceremony_number,full_name,artistic_name,country_birth,countries_citizenship,role,movie,release_year,main_production_country,all_production_countries,main_language,all_languages,source_ceremony,source_actress,source_movie
104,2016,88,Brie Larson,Brie Larson,United States,[United States],Ma,Room,2015,Canada,"[Canada, Ireland, United Kingdom, United States]",English,[English],https://en.wikipedia.org/wiki/88th_Academy_Awards,https://en.wikipedia.org/wiki/Brie_Larson,https://en.wikipedia.org/wiki/Room_(2015_film)
105,2017,89,Emma Stone,Emma Stone,United States,[United States],Mia Dolan,La La Land,2016,United States,"[United States, France]",English,"[English, French]",https://en.wikipedia.org/wiki/89th_Academy_Awards,https://en.wikipedia.org/wiki/Emma_Stone,https://en.wikipedia.org/wiki/La_La_Land
106,2018,90,Frances McDormand,Frances McDormand,United States,[United States],Mildred Hayes,"Three Billboards Outside Ebbing, Missouri",2017,United States,[United States],English,[English],https://en.wikipedia.org/wiki/90th_Academy_Awards,https://en.wikipedia.org/wiki/Frances_McDormand,https://en.wikipedia.org/wiki/Three_Billboards...
107,2019,91,Olivia Colman,Olivia Colman,United Kingdom,[United Kingdom],"Anne, Queen of Great Britain",The Favourite,2018,United Kingdom,"[United Kingdom, United States, Ireland]",English,[English],https://en.wikipedia.org/wiki/91st_Academy_Awards,https://en.wikipedia.org/wiki/Olivia_Colman,https://en.wikipedia.org/wiki/The_Favourite
108,2019,91,Olivia Colman,Olivia Colman,United Kingdom,[United Kingdom],Queen Anne,The Favourite,2018,United Kingdom,"[United Kingdom, United States]",English,"[English, French, Italian]",https://en.wikipedia.org/wiki/91st_Academy_Awards,https://en.wikipedia.org/wiki/Olivia_Colman,https://en.wikipedia.org/wiki/The_Favourite
109,2020,92,Frances McDormand,Frances McDormand,United States,[United States],Fern,Nomadland,2020,United States,"[United States, China]",English,[English],https://en.wikipedia.org/wiki/92nd_Academy_Awards,https://en.wikipedia.org/wiki/Frances_McDormand,https://en.wikipedia.org/wiki/Nomadland
110,2021,93,Jessica Chastain,Jessica Chastain,United States,[United States],Tammy Faye Bakker,The Eyes of Tammy Faye,2021,United States,"[United States, Canada]",English,[English],https://en.wikipedia.org/wiki/93rd_Academy_Awards,https://en.wikipedia.org/wiki/Jessica_Chastain,https://en.wikipedia.org/wiki/The_Eyes_of_Tamm...
111,2022,94,Michelle Yeoh,Michelle Yeoh,Malaysia,"[Malaysia, United States]",Evelyn Wang,Everything Everywhere All at Once,2022,United States,[United States],English,"[English, Cantonese, Mandarin]",https://en.wikipedia.org/wiki/94th_Academy_Awards,https://en.wikipedia.org/wiki/Michelle_Yeoh,https://en.wikipedia.org/wiki/Everything_Every...


In [77]:
with open('best_actress_oscar_winners/raw_2020-2024.json', 'r') as f:
    filea = json.load(f)