In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
# Turn off pandas' chained-assignment warning for the whole notebook.
# This is probably unwise, but the cells below would otherwise emit a lot of them.
pd.set_option("mode.chained_assignment", None)

In [3]:
# Read the tab-separated bestseller list and preview the first rows.
nyt = pd.read_csv("nyt_full.tsv", delimiter="\t")
nyt.head()

Unnamed: 0,year,week,rank,title_id,title,author
0,1931,1931-10-12,1,6477,THE TEN COMMANDMENTS,Warwick Deeping
1,1931,1931-10-12,2,1808,FINCHE'S FORTUNE,Mazo de la Roche
2,1931,1931-10-12,3,5304,THE GOOD EARTH,Pearl S. Buck
3,1931,1931-10-12,4,4038,SHADOWS ON THE ROCK,Willa Cather
4,1931,1931-10-12,5,3946,SCARMOUCHE THE KING MAKER,Rafael Sabatini


In [4]:
# number of distinct values in the author column (before any cleaning)
nyt.author.nunique()

2210

In [5]:
# eyeball the raw author strings
print(nyt.author)

0                        Warwick Deeping
1                       Mazo de la Roche
2                          Pearl S. Buck
3                           Willa Cather
4                        Rafael Sabatini
                      ...               
60381                             Halsey
60382                       Brit Bennett
60383                        Delia Owens
60384                    Fredrik Backman
60385    Clive Cussler and Boyd Morrison
Name: author, Length: 60386, dtype: object


In [6]:
# Delete the phrase "written and illustrated by " wherever it occurs in an author string.
nyt["author"] = nyt["author"].replace(
    to_replace="written and illustrated by ",
    value="",
    regex=True,
)

## separate complicated entries

In [8]:
# If the author string contains a connective (" and ", " with ", " by "), treat it as a
# "problem" entry that needs to be reviewed separately.
# First normalise " And " to " and " so later splitting only has to handle one spelling.
nyt.author = nyt.author.replace(" And "," and ", regex = True)
# na = False: a missing author is flagged False instead of NaN. With a NaN flag the row
# would match neither `problem == True` nor `problem == False` and silently vanish
# from both frames below.
nyt['problem'] = nyt.author.str.contains(' and | with | by ', regex = True, case = False, na = False)
print("how many problem entries", nyt.problem.sum())
print(nyt.problem.sum()/len(nyt))

# new dataframe with just problems
# .copy() gives an independent frame: later cells add and overwrite columns on nyt_review.
nyt_review = nyt[nyt.problem].copy()
print("how many unique problems", len(nyt_review.author.unique()))

# new dataframe without problems, saved for separate processing
nyt_single = nyt[~nyt.problem]
nyt_single.to_csv('nyt_single_authors.csv', index=False)

how many problem entries 2416
0.04000927367270559
how many unique problems 189


In [9]:
# Goal of this cell: when an entry has an author plus an editor, keep only the author;
# when it has only an editor, treat the editor as the author.
# Step 1: rewrite ", edited by " as ". Edited by " so resolve_editors can split on a
# single separator.
nyt_review["author"] = nyt_review["author"].replace(
    to_replace=", edited by ",
    value=". Edited by ",
    regex=True,
)

#strip editor credits from an author string and return the cleaned name
def resolve_editors(name):
    """Return ``name`` with any editor credit removed.

    * If the string starts with "edited by " (any capitalisation), there is no
      separate author, so the editor is treated as the author and only the
      prefix is dropped.
    * Otherwise everything from the first ". Edited by " onward is dropped,
      leaving the author part.
    * Strings without an editor credit are returned unchanged.

    Parameters
    ----------
    name : str
        Raw author string.

    Returns
    -------
    str
        The author (or editor-as-author) name.
    """
    prefix = 'edited by '
    # Compare case-insensitively so "Edited by X" is handled like "edited by X".
    if name[:len(prefix)].lower() == prefix:
        return name[len(prefix):]
    # split() returns the whole string as element 0 when the separator is absent,
    # so names without an editor credit pass through untouched.
    return name.split('. Edited by ')[0]

# author_alt = author with editor credits resolved. Only the 'author' column is needed,
# so map over that Series instead of building a row object for every record.
nyt_review['author_alt'] = nyt_review['author'].map(resolve_editors)

In [10]:
# spot-check the first few distinct author_alt values after editor handling
print(nyt_review['author_alt'].unique()[:5])

['Charles Nordhoff and James N. Hall' 'Gideon Wyck'
 'Charles Nordloff and James Norman Hall'
 'Dennis Wheatley and J. H. Links' 'Somerset Maugham']


In [11]:
def resolve_illustrators(name):
    """Return the text of ``name`` that precedes the first '. Illustrated by '.

    When the marker is not present the whole string is returned unchanged.
    """
    author_part, _marker, _illustrator = name.partition('. Illustrated by ')
    return author_part
    
# Drop illustrator credits from author_alt. Only that one column is read, so map over
# the Series rather than applying a lambda to every row of the frame.
nyt_review['author_alt'] = nyt_review['author_alt'].map(resolve_illustrators)
nyt_review.author_alt.unique()[5:25]

array(['Charles Nordhoff and James Norman Hall', 'Armin L. Robinson',
       'Odell Shepard and Willard Shepard', '? by Isabel Bolton',
       'Benedict Freedman and Nancy Freedman',
       'Joseph Auslander and Audrey Wurdemann',
       'James Street and James Childers',
       'Charmian Clift and George Johnston',
       'Harnett T. Kane and Victor Leclerc', 'Marrijane and Joseph Hayes',
       'Dorothy Erskine and Patrick Dennis',
       'William J. Lederer and Eugene Burdick',
       'Arthur Quiller-Couch and Daphne du Maurier',
       'Fletcher Knebel and Charles W. Bailey II',
       'Eugene Burdick and Harvey Wheeler',
       'Terry Southern and Mason Hoffenberg', 'Nicholas Meyer',
       'J.D. Gilman and John Clive', 'Marvin Kalb and Ted Koppel',
       'General Sir John Hackett and Other Top-ranking NATO Generals and Advisors'],
      dtype=object)

In [12]:
# Hand-written corrections for individual author_alt entries, applied in order:
# a misspelt surname, a credit that ends at "Judith", and two spelling normalisations.
nyt_review["author_alt"] = (
    nyt_review["author_alt"]
    .replace("Charles Nordloff", "Charles Nordhoff", regex=True)
    .replace("William Shatner with Judith$", "William Shatner with Judith Reeves-Stevens", regex=True)
    .replace("Constantini", "Costantini", regex=True)
    .replace("J. T. Ellison", "J.T. Ellison", regex=True)
)


In [13]:
def split_two_authors(name):
    """Split a combined credit into a first author and the remaining co-author text.

    ' and ' is tried first, then ' with '. The string is cut at the first
    occurrence of whichever separator is used.

    Parameters
    ----------
    name : str
        Author string, e.g. "Marvin Kalb and Ted Koppel".

    Returns
    -------
    tuple
        ``(first_author, second_author)``. ``second_author`` is ``np.nan`` when
        neither separator is present.
    """
    for separator in (' and ', ' with '):
        if separator in name:
            # maxsplit=1 keeps everything after the first separator, so a credit with
            # more than two parts ("A and B and C") no longer loses its tail.
            first_author, second_author = name.split(separator, 1)
            return first_author, second_author
    return name, np.nan

# Run split_two_authors on each row's author_alt; result_type='expand' turns the returned
# pair into two columns, which become author_1 and author_2.
nyt_review[['author_1', 'author_2']] = nyt_review.apply(
    lambda row: split_two_authors(row['author_alt']),
    axis=1,
    result_type='expand',
)

In [14]:
# list the distinct first-author strings to find leftovers that still need cleaning
nyt_review.author_1.unique()

array(['Charles Nordhoff', 'Gideon Wyck', 'Dennis Wheatley',
       'Somerset Maugham', 'Armin L. Robinson', 'Odell Shepard',
       '? by Isabel Bolton', 'Benedict Freedman', 'Joseph Auslander',
       'James Street', 'Charmian Clift', 'Harnett T. Kane', 'Marrijane',
       'Dorothy Erskine', 'William J. Lederer', 'Arthur Quiller-Couch',
       'Fletcher Knebel', 'Eugene Burdick', 'Terry Southern',
       'Nicholas Meyer', 'J.D. Gilman', 'Marvin Kalb',
       'General Sir John Hackett', 'Arnaud de Borchgrave',
       'Larry Collins', 'Helen Van Slyke', 'adapted by Joan D. Vinge',
       'created by Bill Adler', 'Robert Moss', 'Whitley Strieber',
       'Stephen King', 'Larry Niven', 'Patti Davis', '! by Terry Brooks',
       'Walter J. Boyne', 'compiled by Mary S. Lovell',
       'Raymond Chandler', 'Judith', 'Barry Lopez', 'Michael Dorris',
       'Margaret Weis', 'Arthur C. Clarke', 'Janet', '? by Jimmy Buffett',
       'adapted by Don Ferguson', 'Richard Marcinko', 'David',
       

In [15]:
# Strip leftovers from author_1, in this order:
#   1. stray "! by " / "? by " fragments (lost characters)
#   2. role prefixes that are other ways of saying editor
#   3. two one-off credits (a "Completed by" suffix and the Shatner collaboration)
nyt_review["author_1"] = (
    nyt_review["author_1"]
    .replace("! by ", "", regex=True)
    .replace(r"\? by ", "", regex=True)
    .replace("selected by ", "", regex=True)
    .replace("compiled by ", "", regex=True)
    .replace("translated by ", "", regex=True)
    .replace("adapted by ", "", regex=True)
    .replace("created by ", "", regex=True)
    .replace(". Completed by Carol Gino", "", regex=True)
    # the longer Shatner pattern must run before the shorter one
    .replace("William Shatner with Judith Reeves-Stevens", "William Shatner", regex=True)
    .replace("William Shatner with Judith", "William Shatner", regex=True)
)

In [16]:
# re-inspect the distinct first authors after the prefix clean-up
nyt_review.author_1.unique()

array(['Charles Nordhoff', 'Gideon Wyck', 'Dennis Wheatley',
       'Somerset Maugham', 'Armin L. Robinson', 'Odell Shepard',
       'Isabel Bolton', 'Benedict Freedman', 'Joseph Auslander',
       'James Street', 'Charmian Clift', 'Harnett T. Kane', 'Marrijane',
       'Dorothy Erskine', 'William J. Lederer', 'Arthur Quiller-Couch',
       'Fletcher Knebel', 'Eugene Burdick', 'Terry Southern',
       'Nicholas Meyer', 'J.D. Gilman', 'Marvin Kalb',
       'General Sir John Hackett', 'Arnaud de Borchgrave',
       'Larry Collins', 'Helen Van Slyke', 'Joan D. Vinge', 'Bill Adler',
       'Robert Moss', 'Whitley Strieber', 'Stephen King', 'Larry Niven',
       'Patti Davis', 'Terry Brooks', 'Walter J. Boyne', 'Mary S. Lovell',
       'Raymond Chandler', 'Judith', 'Barry Lopez', 'Michael Dorris',
       'Margaret Weis', 'Arthur C. Clarke', 'Janet', 'Jimmy Buffett',
       'Don Ferguson', 'Richard Marcinko', 'David', 'William J. Bennett',
       'Tom Hegg', 'Alex Walsh', 'William Shatner', 

In [17]:
# first ten distinct second-author values (nan where no separator was found)
nyt_review.author_2.unique()[:10]

array(['James N. Hall', nan, 'James Norman Hall', 'J. H. Links',
       'Willard Shepard', 'Nancy Freedman', 'Audrey Wurdemann',
       'James Childers', 'George Johnston', 'Victor Leclerc'],
      dtype=object)

In [18]:
# Show the rows whose author_1 contains no space, i.e. only a single name was captured.
nyt_review.loc[lambda frame: ~frame['author_1'].str.contains(' ')]

Unnamed: 0,year,week,rank,title_id,title,author,problem,author_alt,author_1,author_2
13772,1957,1957-02-24,15,792,BON VOYAGE,Marrijane and Joseph Hayes,True,Marrijane and Joseph Hayes,Marrijane,Joseph Hayes
36330,1990,1990-09-16,10,3609,PRIME DIRECTIVE,Judith and Garfield Reeves-Stevens,True,Judith and Garfield Reeves-Stevens,Judith,Garfield Reeves-Stevens
36346,1990,1990-09-23,11,3609,PRIME DIRECTIVE,Judith and Garfield Reeves-Stevens,True,Judith and Garfield Reeves-Stevens,Judith,Garfield Reeves-Stevens
36361,1990,1990-09-30,11,3609,PRIME DIRECTIVE,Judith and Garfield Reeves-Stevens,True,Judith and Garfield Reeves-Stevens,Judith,Garfield Reeves-Stevens
36378,1990,1990-10-07,13,3609,PRIME DIRECTIVE,Judith and Garfield Reeves-Stevens,True,Judith and Garfield Reeves-Stevens,Judith,Garfield Reeves-Stevens
36395,1990,1990-10-14,15,3609,PRIME DIRECTIVE,Judith and Garfield Reeves-Stevens,True,Judith and Garfield Reeves-Stevens,Judith,Garfield Reeves-Stevens
37334,1991,1991-12-22,5,5511,THE JOLLY CHRISTMAS POSTMAN,Janet and Allan Ahlberg,True,Janet and Allan Ahlberg,Janet,Allan Ahlberg
37350,1991,1991-12-29,6,5511,THE JOLLY CHRISTMAS POSTMAN,Janet and Allan Ahlberg,True,Janet and Allan Ahlberg,Janet,Allan Ahlberg
37367,1992,1992-01-05,8,5511,THE JOLLY CHRISTMAS POSTMAN,Janet and Allan Ahlberg,True,Janet and Allan Ahlberg,Janet,Allan Ahlberg
37383,1992,1992-01-12,9,5511,THE JOLLY CHRISTMAS POSTMAN,Janet and Allan Ahlberg,True,Janet and Allan Ahlberg,Janet,Allan Ahlberg


In [19]:
#individually reunite the single names with their last names
# A dict without regex replaces only cells whose WHOLE value equals the key. The earlier
# regex form matched substrings, so e.g. "Janet Evanovich" would have become
# "Janet Ahlberg Evanovich", and re-running the cell kept appending surnames.
nyt_review.author_1 = nyt_review.author_1.replace({
    "Judith": "Judith Reeves-Stevens",
    "Janet": "Janet Ahlberg",
    "David": "David Eddings",
    "Faye": "Faye Kellerman",
    "Marrijane": "Marrijane Hayes",
})

nyt_review.author_1.unique()[:5]

array(['Charles Nordhoff', 'Gideon Wyck', 'Dennis Wheatley',
       'Somerset Maugham', 'Armin L. Robinson'], dtype=object)

In [20]:
# Load the single-author demographics table. Note: this rebinds nyt_single, which earlier
# held the no-problem subset of nyt.
# The CSV's first column is read as 'Unnamed: 0'; rename it to 'author'.
nyt_single = pd.read_csv("nyt_single_author_demographics.csv").rename(
    columns={'Unnamed: 0': 'author'}
)
nyt_single.tail()

Unnamed: 0,author,birth,death,race_eth,education,institution
2015,Lana Del Rey,1985,,White,,
2016,Matt Haig,1975,,White,Bachelor's Degree,University of Kent
2017,Rumaan Alam,1977,,Pakistani-American,Bachelor’s Degree,University of Michigan
2018,Susie Yang,1983,,Asian American,MFA,University of California
2019,Halsey,1994,,White,Bachelor's Degree,Rhode Island School of Design


![join](IMG_0722.jpg)

In [22]:
# Find every nyt_review row whose author_1 has no match in nyt_single.author.
# An outer merge with indicator=True labels each row's origin in '_merge';
# rows labelled 'right_only' exist only in nyt_review.
nyt_review_not_in_single = (
    nyt_single
    .merge(nyt_review, how='outer', left_on='author', right_on='author_1', indicator=True)
    .loc[lambda merged: merged['_merge'] == 'right_only']
)
print(len(nyt_review_not_in_single['author_1'].unique()))
nyt_review_not_in_single.head()

61


Unnamed: 0,author_x,birth,death,race_eth,education,institution,year,week,rank,title_id,title,author_y,problem,author_alt,author_1,author_2,_merge
55,,,,,,,1995.0,1995-12-31,14.0,6336.0,THE SMALL ONE,Alex Walsh. Illustrated by Jesse Clay,True,Alex Walsh,Alex Walsh,,right_only
92,,,,,,,2008.0,2008-03-02,15.0,974.0,CELEBUTANTES,Amanda Goldberg and Ruthanna Khalighi Hopper,True,Amanda Goldberg and Ruthanna Khalighi Hopper,Amanda Goldberg,Ruthanna Khalighi Hopper,right_only
99,,,,,,,2011.0,2011-01-23,9.0,6836.0,THREE SECONDS,Anders Roslund and Borge Hellstrom,True,Anders Roslund and Borge Hellstrom,Anders Roslund,Borge Hellstrom,right_only
100,,,,,,,2011.0,2011-01-30,8.0,6836.0,THREE SECONDS,Anders Roslund and Borge Hellstrom,True,Anders Roslund and Borge Hellstrom,Anders Roslund,Borge Hellstrom,right_only
101,,,,,,,2011.0,2011-02-06,15.0,6836.0,THREE SECONDS,Anders Roslund and Borge Hellstrom,True,Anders Roslund and Borge Hellstrom,Anders Roslund,Borge Hellstrom,right_only


In [44]:
#write to csv
# saves the review rows that had no match in nyt_single (the 'right_only' rows), without the index
nyt_review_not_in_single.to_csv('nyt_reviewed_authors.csv', index=False)