In [78]:
import pandas as pd
import numpy as np
import openai
import os

In [80]:
# Read the API key from the first line of a local text file. The `with` block
# closes the file handle as soon as the key has been read.
with open('my_gpt_apikey.txt') as key_file:
    myapikey = key_file.readline().rstrip()

# Shared OpenAI client used by the query functions defined further down.
client = openai.OpenAI(api_key = myapikey)

# Use GPT to gather data about the NYT authors

In [None]:
# Load nyt_single_authors.csv into a DataFrame and preview its first rows.
nyt_single = pd.read_csv("nyt_single_authors.csv")
nyt_single.head()

In [None]:
# Distinct values of the "author" column, as an array.
nyt_single_authors = nyt_single["author"].unique()
# Just the first five of those names (a small subset of the full array).
nyt_single_authors_five = nyt_single_authors[0:5]

In [37]:
def _parse_demographics(reply, fields):
    """Split one comma-separated model reply into a ``{field: value}`` dict."""
    # Split at most len(fields) - 1 times so that commas inside the final field
    # (e.g. "University of California, Berkeley") are kept rather than dropped.
    pieces = (reply or "").split(",", len(fields) - 1)
    # Strip the blanks that follow each comma; an empty piece counts as missing.
    values = [piece.strip() or None for piece in pieces]
    # Pad short replies so every field gets a value instead of raising IndexError.
    values += [None] * (len(fields) - len(values))
    return dict(zip(fields, values))


def get_basic_demographics(my_author_list):
    """Ask the chat model for basic demographic facts about each author.

    One chat-completion request is sent per author through the notebook-level
    ``client``. The system prompt asks for a single comma-separated line in the
    order birth year, death year, race/ethnicity, education, institution; that
    line is split into the matching columns.

    Parameters
    ----------
    my_author_list : list-like of str
        Author names; each name is appended to the question sent to the model.

    Returns
    -------
    pandas.DataFrame
        One row per entry of ``my_author_list`` (input order) with the columns
        ``author``, ``birth``, ``death``, ``race_eth``, ``education`` and
        ``institution``. Values are the whitespace-stripped pieces of the
        reply; pieces the reply did not contain are ``None``.
    """
    fields = ['birth', 'death', 'race_eth', 'education', 'institution']

    # The prompts are assembled line by line; `prompt_break` is the newline plus
    # the indentation that separates the lines of the system prompt text.
    prompt_break = "\n" + " " * 16
    system_prompt = prompt_break.join([
        "You are a literary historian. ",
        "You reply with only the requested information, not a complete sentence, and no headers. ",
        "If you do not know information, fill it in with None.",
        "If an author is still living, fill in the death year with N/A.",
        "For example, if the author is Chimamanda Ngozi Adichie, you would return: 1977, N/A, Nigerian, Master's Degree, Johns Hopkins University",
    ])
    question = (
        "What is the birth year, death year, race/ethnicity, highest level of education, and \n"
        + " " * 20
        + "college or university of "
    )

    # Collect plain dict records and build the frame once at the end.
    records = []
    for author in my_author_list:
        completion = client.chat.completions.create(
            model="gpt-4o-mini",
            messages=[
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": question + author},
            ]
        )
        my_response = completion.choices[0].message.content
        record = {'author': author}
        record.update(_parse_demographics(my_response, fields))
        records.append(record)

    # Passing `columns` fixes the column order and keeps the columns present
    # even when the author list is empty.
    return pd.DataFrame(records, columns=['author'] + fields)



In [45]:
def get_birthplace(my_author_list):
    """Ask the chat model for each author's birthplace and tabulate the replies.

    One chat-completion request is sent per author through the notebook-level
    ``client``; the reply text is stored unmodified.

    Parameters
    ----------
    my_author_list : list-like of str
        Author names; each name is appended to the question sent to the model.

    Returns
    -------
    pandas.DataFrame
        Columns ``author`` and ``birthplace``, one row per entry of
        ``my_author_list``.
    """
    # Newline plus the indentation that separates the lines of the prompt text.
    prompt_break = "\n" + " " * 16
    system_prompt = prompt_break.join([
        "You are a literary historian. ",
        "You reply with only the requested information, not a complete sentence, and no headers. ",
        "Give the most detailed information you can find, including city, state, country",
        "If you do not know information, fill it in with None.",
        "For example, if the author is Pearl S. Buck, you would return: Hillsboro, WV, USA",
    ])
    question = "What is the birthplace of "

    # Start with one empty row per author, keyed by the author's name.
    birthplaces = pd.DataFrame(columns=['birthplace'], index= my_author_list)

    for name in my_author_list:
        reply = client.chat.completions.create(
            model="gpt-4o-mini",
            messages=[
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": question + name},
            ],
        ).choices[0].message.content
        birthplaces.loc[name] = pd.Series({'birthplace': reply})

    # Move the names out of the index into a regular "author" column.
    return birthplaces.reset_index().rename(columns = {"index": "author"})



## Get demographics for single authors

In [None]:
# Query the model for every single-author name (one API call per author).
df = get_basic_demographics(nyt_single_authors)

# Save the results before previewing them. index=False keeps the 0..n-1 row
# index out of the file, so re-reading it does not add an "Unnamed: 0" column.
df.to_csv("nyt_single_author_demographics.csv", index=False)
print(df.head())

# Spoken notification through the shell `say` command (available on macOS).
os.system('say "your program has finished"')

In [None]:
# Query the model for the birthplace of every single-author name
# (one API call per author).
df1 = get_birthplace(nyt_single_authors)

# index=False keeps the 0..n-1 row index out of the file, so re-reading it
# does not add an "Unnamed: 0" column.
df1.to_csv("nyt_single_author_birthplace.csv", index=False)
print(df1.head())

# Spoken notification through the shell `say` command (available on macOS).
os.system('say "your program has finished"')

## Get demographics for the fixed (reviewed) authors

In [11]:
# Load nyt_reviewed_authors.csv into a DataFrame and preview its first rows.
nyt_reviewed = pd.read_csv("nyt_reviewed_authors.csv")
nyt_reviewed.head()

Unnamed: 0,author_x,birth,death,race_eth,education,institution,year,week,rank,title_id,title,author_y,problem,author_alt,author_1,author_2,_merge
0,,,,,,,1995.0,1995-12-31,14.0,6336.0,THE SMALL ONE,Alex Walsh. Illustrated by Jesse Clay,True,Alex Walsh,Alex Walsh,,right_only
1,,,,,,,2008.0,2008-03-02,15.0,974.0,CELEBUTANTES,Amanda Goldberg and Ruthanna Khalighi Hopper,True,Amanda Goldberg and Ruthanna Khalighi Hopper,Amanda Goldberg,Ruthanna Khalighi Hopper,right_only
2,,,,,,,2011.0,2011-01-23,9.0,6836.0,THREE SECONDS,Anders Roslund and Borge Hellstrom,True,Anders Roslund and Borge Hellstrom,Anders Roslund,Borge Hellstrom,right_only
3,,,,,,,2011.0,2011-01-30,8.0,6836.0,THREE SECONDS,Anders Roslund and Borge Hellstrom,True,Anders Roslund and Borge Hellstrom,Anders Roslund,Borge Hellstrom,right_only
4,,,,,,,2011.0,2011-02-06,15.0,6836.0,THREE SECONDS,Anders Roslund and Borge Hellstrom,True,Anders Roslund and Borge Hellstrom,Anders Roslund,Borge Hellstrom,right_only


In [17]:
# Distinct values of the "author_1" column; these are the names queried below.
nyt_reviewed_authors = nyt_reviewed["author_1"].unique()
nyt_reviewed_authors

array(['Alex Walsh', 'Amanda Goldberg', 'Anders Roslund',
       'Armin L. Robinson', 'Arnaud de Borchgrave',
       'Arthur Quiller-Couch', 'Barry Lopez', 'Benedict Freedman',
       'Bethenny Frankel', 'Bill Adler', 'Bill Clinton', 'Brian Herbert',
       'Caroline Kennedy', 'Charles Nordhoff', 'Charmian Clift',
       'Clement Moore', 'Dennis Wheatley', 'Don Ferguson',
       'Dorothy Erskine', 'Emma McLaughlin', 'General Sir John Hackett',
       'Gideon Wyck', 'Glenn Beck with Kevin Balfe', 'Greer Hendricks',
       'Guillermo del Toro', 'Hannah Crafts', 'Isabel Bolton',
       'J. R. R. Tolkien', 'J.D. Gilman', 'Jamie Lee Curtis',
       'Janet Ahlberg', 'Janet Ahlberg Evanovich', 'Jim Carrey',
       'Joan D. Vinge', 'Joseph Auslander', 'Joyce Reardon',
       'Judith Reeves-Stevens', 'Julia London', 'Larry Niven',
       'Lucia Berlin', 'Marcie Walsh', 'Margaret Weis', 'Maria Shriver',
       'Marrijane Hayes', 'Marvin Kalb', 'Mary Ann Shaffer',
       'Mary S. Lovell', 'Michae

In [None]:
# Query the model for the demographics of every reviewed author
# (one API call per author), then preview the first rows.
df_review = get_basic_demographics(nyt_reviewed_authors)
df_review.head()

In [None]:
# Show the first rows of df_review again.
df_review.head()

In [None]:
# index=False keeps the 0..n-1 row index out of the file, so re-reading it
# does not add an "Unnamed: 0" column.
df_review.to_csv("nyt_reviewed_author_demographics.csv", index=False)

In [55]:
# Look up birthplaces for the first five reviewed authors only.
# NOTE(review): this five-row result is what a later cell writes to
# "nyt_reviewed_author_birthplace.csv" — confirm the [:5] slice is intended
# and not a leftover from a trial run.
df1_review = get_birthplace(nyt_reviewed_authors[:5])
#os.system('say "your program has finished"')

In [57]:
# Plain-text preview of the first rows of the birthplace results.
print(df1_review.head())

                 author            birthplace
0            Alex Walsh                  None
1       Amanda Goldberg  Los Angeles, CA, USA
2        Anders Roslund     Stockholm, Sweden
3     Armin L. Robinson                  None
4  Arnaud de Borchgrave     Brussels, Belgium


In [59]:
# index=False keeps the 0..n-1 row index out of the file, so re-reading it
# does not add an "Unnamed: 0" column.
df1_review.to_csv("nyt_reviewed_author_birthplace.csv", index=False)

# Add gender (because I forgot it earlier)

In [74]:
# Load authors_demographics_birthplace_NO_gender.csv into a DataFrame and
# preview its first rows.
nyt_demo_authors = pd.read_csv("authors_demographics_birthplace_NO_gender.csv")
nyt_demo_authors.head()

Unnamed: 0.1,Unnamed: 0,author,birth,death,race_eth,education,institution,birthplace,temp,country,state,city,_merge,prestigious,prestige
0,0,Warwick Deeping,1877.0,1950.0,English,,,"Ramsgate, Kent, England","['Ramsgate', ' Kent', ' England']",England,,Ramsgate,both,False,Other
1,1,Mazo de la Roche,1879.0,1961.0,Canadian,,,"Newmarket, Ontario, Canada","['Newmarket', ' Ontario', ' Canada']",Canada,,Newmarket,both,False,Other
2,2,Pearl S. Buck,1892.0,1973.0,Caucasian,Graduate,Cornell University,"Hillsboro, WV, USA","['Hillsboro', ' WV', ' USA']",USA,WV,Hillsboro,both,True,Top School
3,3,Willa Cather,1873.0,1947.0,American,Undergrad,University of Nebraska-Lincoln,"Back Creek Valley, Virginia, USA","['Back Creek Valley', ' Virginia', ' USA']",USA,VA,Back Creek Valley,both,False,Other
4,4,Rafael Sabatini,1875.0,1950.0,Italian,,,"Fermo, Marche, Italy","['Fermo', ' Marche', ' Italy']",Italy,,Fermo,both,False,Other


In [76]:
# Distinct values of the "author" column; show the first ten as a spot check.
my_author_list2 = nyt_demo_authors["author"].unique()
my_author_list2[0:10]

array(['Warwick Deeping', 'Mazo de la Roche', 'Pearl S. Buck',
       'Willa Cather', 'Rafael Sabatini', 'John Galsworthy',
       'A. A. Milne', 'Edna Ferber', 'Clemance Dane', 'Oliver LaFarge'],
      dtype=object)

In [82]:
def get_gender(my_author_list):
    """Ask the chat model for each author's gender and tabulate the replies.

    One chat-completion request is sent per author through the notebook-level
    ``client``; the reply text is stored unmodified.

    Parameters
    ----------
    my_author_list : list-like of str
        Author names; each name is appended to the question sent to the model.

    Returns
    -------
    pandas.DataFrame
        Columns ``author`` and ``gender``, one row per entry of
        ``my_author_list``.
    """
    # Newline plus the indentation that separates the lines of the prompt text.
    line_sep = "\n" + " " * 16
    instructions = line_sep.join([
        "You are a literary historian. ",
        "You reply with only the requested information, not a complete sentence, and no headers.",
        "For example, if the author is Pearl S. Buck, you would return: female",
    ])

    def ask(name):
        # Send one request for `name` and return the text of the first choice.
        response = client.chat.completions.create(
            model="gpt-4o-mini",
            messages=[
                {"role": "system", "content": instructions},
                {"role": "user", "content": "What is the gender of " + name},
            ],
        )
        return response.choices[0].message.content

    # Start with one empty row per author, keyed by the author's name.
    genders = pd.DataFrame(columns=['gender'], index= my_author_list)
    for name in my_author_list:
        genders.loc[name] = pd.Series({'gender': ask(name)})

    # Move the names out of the index into a regular "author" column.
    return genders.reset_index().rename(columns = {"index": "author"})



In [84]:
# Query the model for the gender of every author in my_author_list2
# (one API call per author), then preview the first rows.
df2 = get_gender(my_author_list2)
df2.head()

Unnamed: 0,author,gender
0,Warwick Deeping,male
1,Mazo de la Roche,female
2,Pearl S. Buck,female
3,Willa Cather,female
4,Rafael Sabatini,male


In [88]:
# List the distinct reply values to see which labels the model produced.
df2.gender.unique()

array(['male', 'female', 'unknown', 'transgender non-binary'],
      dtype=object)

In [86]:
# index=False keeps the 0..n-1 row index out of the file, so re-reading it
# does not add an "Unnamed: 0" column.
df2.to_csv("authors_gender.csv", index=False)