In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import openai

In [2]:
nyt = pd.read_csv("nyt_full.tsv", sep = '\t')
nyt.head()

Unnamed: 0,year,week,rank,title_id,title,author
0,1931,1931-10-12,1,6477,THE TEN COMMANDMENTS,Warwick Deeping
1,1931,1931-10-12,2,1808,FINCHE'S FORTUNE,Mazo de la Roche
2,1931,1931-10-12,3,5304,THE GOOD EARTH,Pearl S. Buck
3,1931,1931-10-12,4,4038,SHADOWS ON THE ROCK,Willa Cather
4,1931,1931-10-12,5,3946,SCARMOUCHE THE KING MAKER,Rafael Sabatini


In [3]:
nyt.author.nunique()

2210

In [4]:
print(nyt.author)

0                        Warwick Deeping
1                       Mazo de la Roche
2                          Pearl S. Buck
3                           Willa Cather
4                        Rafael Sabatini
                      ...               
60381                             Halsey
60382                       Brit Bennett
60383                        Delia Owens
60384                    Fredrik Backman
60385    Clive Cussler and Boyd Morrison
Name: author, Length: 60386, dtype: object


In [5]:
# replace written & illustrated by 
nyt.author = nyt.author.replace("written and illustrated by ","", regex = True)

## separate complicated entries

In [14]:
#if there is a preposition, there's complication
nyt.author = nyt.author.replace(" And "," and ", regex = True)
nyt['problem'] = nyt.author.str.contains(' and | with | by ', regex = True, case = False)
print("how many problem entries", nyt.problem.sum())
print(nyt.problem.sum()/len(nyt))

# new dataframe with just problems
nyt_review = nyt[nyt.problem == True]
print("how many unique problems", len(nyt_review.author.unique()))

#new dataframe without problems 
nyt_single = nyt[nyt.problem == False]
nyt_single.to_csv('nyt_single_authors.csv', index=False)

how many problem entries 2416
0.04000927367270559
how many unique problems 189


In [16]:
# if there's an author with editor, remove editor, else treat editor as author 
nyt_review.author = nyt_review.author.replace(", edited by ",". Edited by ", regex = True)

#remove 'edited by ' and transfer rest of string to the 'author_alt' column
def resolve_editors(name):
    if 'dited ' in name:
        if name[:10] == 'edited by ':
            return name[10:]
        elif name[:10] != 'edited by ':
            substring = ('. Edited by ')
            return name.split(substring)[0]
    else: return name

nyt_review['author_alt'] = nyt_review.apply(lambda x: resolve_editors(x['author']), axis = 1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  nyt_review.author = nyt_review.author.replace(", edited by ",". Edited by ", regex = True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  nyt_review['author_alt'] = nyt_review.apply(lambda x: resolve_editors(x['author']), axis = 1)


In [18]:
print(nyt_review['author_alt'].unique())

['Charles Nordhoff and James N. Hall' 'Gideon Wyck'
 'Charles Nordloff and James Norman Hall'
 'Dennis Wheatley and J. H. Links' 'Somerset Maugham'
 'Charles Nordhoff and James Norman Hall' 'Armin L. Robinson'
 'Odell Shepard and Willard Shepard' '? by Isabel Bolton'
 'Benedict Freedman and Nancy Freedman'
 'Joseph Auslander and Audrey Wurdemann' 'James Street and James Childers'
 'Charmian Clift and George Johnston' 'Harnett T. Kane and Victor Leclerc'
 'Marrijane and Joseph Hayes' 'Dorothy Erskine and Patrick Dennis'
 'William J. Lederer and Eugene Burdick'
 'Arthur Quiller-Couch and Daphne du Maurier'
 'Fletcher Knebel and Charles W. Bailey II'
 'Eugene Burdick and Harvey Wheeler' 'Terry Southern and Mason Hoffenberg'
 'Nicholas Meyer' 'J.D. Gilman and John Clive'
 'Marvin Kalb and Ted Koppel'
 'General Sir John Hackett and Other Top-ranking NATO Generals and Advisors'
 'Arnaud de Borchgrave and Robert Moss'
 'Larry Collins and Dominique Lapierre'
 'Helen Van Slyke with James Elward

In [20]:
def resolve_illustrators(name):
    substring = ('. Illustrated by ')
    return name.split(substring)[0]
    
nyt_review['author_alt'] = nyt_review.apply(lambda x: resolve_illustrators(x['author_alt']), axis = 1)
nyt_review.author_alt.unique()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  nyt_review['author_alt'] = nyt_review.apply(lambda x: resolve_illustrators(x['author_alt']), axis = 1)


array(['Charles Nordhoff and James N. Hall', 'Gideon Wyck',
       'Charles Nordloff and James Norman Hall',
       'Dennis Wheatley and J. H. Links', 'Somerset Maugham',
       'Charles Nordhoff and James Norman Hall', 'Armin L. Robinson',
       'Odell Shepard and Willard Shepard', '? by Isabel Bolton',
       'Benedict Freedman and Nancy Freedman',
       'Joseph Auslander and Audrey Wurdemann',
       'James Street and James Childers',
       'Charmian Clift and George Johnston',
       'Harnett T. Kane and Victor Leclerc', 'Marrijane and Joseph Hayes',
       'Dorothy Erskine and Patrick Dennis',
       'William J. Lederer and Eugene Burdick',
       'Arthur Quiller-Couch and Daphne du Maurier',
       'Fletcher Knebel and Charles W. Bailey II',
       'Eugene Burdick and Harvey Wheeler',
       'Terry Southern and Mason Hoffenberg', 'Nicholas Meyer',
       'J.D. Gilman and John Clive', 'Marvin Kalb and Ted Koppel',
       'General Sir John Hackett and Other Top-ranking NATO Gene

In [60]:
#some custom cleaning
#typo in name
nyt_review.author_alt = nyt_review.author_alt.replace("Charles Nordloff","Charles Nordhoff", regex = True)
nyt_review.author_alt = nyt_review.author_alt.replace("William Shatner with Judith$","William Shatner with Judith Reeves-Stevens", regex = True)
nyt_review.author_alt = nyt_review.author_alt.replace("Constantini","Costantini", regex = True)
nyt_review.author_alt = nyt_review.author_alt.replace("J. T. Ellison","J.T. Ellison", regex = True)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  nyt_review.author_alt = nyt_review.author_alt.replace("Charles Nordloff","Charles Nordhoff", regex = True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  nyt_review.author_alt = nyt_review.author_alt.replace("William Shatner with Judith$","William Shatner with Judith Reeves-Stevens", regex = True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/

In [62]:
def split_two_authors(name):
    substring1 = (' and ')
    substring2 = (' with ')
    if substring1 in name:
        first_author = name.split(substring1)[0]
        second_author = name.split(substring1)[1]
    elif substring2 in name:
        first_author = name.split(substring2)[0]
        second_author = name.split(substring2)[1]
    else:
        first_author = name
        second_author = np.nan
    return first_author, second_author

nyt_review[['author_1', 'author_2']] = nyt_review.apply(lambda x: split_two_authors(x['author_alt']), axis=1, result_type='expand')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  nyt_review[['author_1', 'author_2']] = nyt_review.apply(lambda x: split_two_authors(x['author_alt']), axis=1, result_type='expand')


In [64]:
nyt_review.author_1.unique()

array(['Charles Nordhoff', 'Gideon Wyck', 'Dennis Wheatley',
       'Somerset Maugham', 'Armin L. Robinson', 'Odell Shepard',
       '? by Isabel Bolton', 'Benedict Freedman', 'Joseph Auslander',
       'James Street', 'Charmian Clift', 'Harnett T. Kane', 'Marrijane',
       'Dorothy Erskine', 'William J. Lederer', 'Arthur Quiller-Couch',
       'Fletcher Knebel', 'Eugene Burdick', 'Terry Southern',
       'Nicholas Meyer', 'J.D. Gilman', 'Marvin Kalb',
       'General Sir John Hackett', 'Arnaud de Borchgrave',
       'Larry Collins', 'Helen Van Slyke', 'adapted by Joan D. Vinge',
       'created by Bill Adler', 'Robert Moss', 'Whitley Strieber',
       'Stephen King', 'Larry Niven', 'Patti Davis', '! by Terry Brooks',
       'Walter J. Boyne', 'compiled by Mary S. Lovell',
       'Raymond Chandler', 'Judith', 'Barry Lopez', 'Michael Dorris',
       'Margaret Weis', 'Arthur C. Clarke', 'Janet', '? by Jimmy Buffett',
       'adapted by Don Ferguson', 'Richard Marcinko', 'David',
       

In [66]:
#replace lost characters
nyt_review.author_1 = nyt_review.author_1.replace("! by ","", regex = True)
nyt_review.author_1 = nyt_review.author_1.replace("\? by ","", regex = True)

#replace other ways of saying editor
nyt_review.author_1 = nyt_review.author_1.replace("selected by ","", regex = True)
nyt_review.author_1 = nyt_review.author_1.replace("compiled by ","", regex = True)
nyt_review.author_1 = nyt_review.author_1.replace("translated by ","", regex = True)
nyt_review.author_1 = nyt_review.author_1.replace("adapted by ","", regex = True)
nyt_review.author_1 = nyt_review.author_1.replace("created by ","", regex = True)
#
nyt_review.author_1 = nyt_review.author_1.replace(". Completed by Carol Gino","", regex = True)


  nyt_review.author_1 = nyt_review.author_1.replace("\? by ","", regex = True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  nyt_review.author_1 = nyt_review.author_1.replace("! by ","", regex = True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  nyt_review.author_1 = nyt_review.author_1.replace("\? by ","", regex = True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#return

In [68]:
nyt_review.author_1.unique()

array(['Charles Nordhoff', 'Gideon Wyck', 'Dennis Wheatley',
       'Somerset Maugham', 'Armin L. Robinson', 'Odell Shepard',
       'Isabel Bolton', 'Benedict Freedman', 'Joseph Auslander',
       'James Street', 'Charmian Clift', 'Harnett T. Kane', 'Marrijane',
       'Dorothy Erskine', 'William J. Lederer', 'Arthur Quiller-Couch',
       'Fletcher Knebel', 'Eugene Burdick', 'Terry Southern',
       'Nicholas Meyer', 'J.D. Gilman', 'Marvin Kalb',
       'General Sir John Hackett', 'Arnaud de Borchgrave',
       'Larry Collins', 'Helen Van Slyke', 'Joan D. Vinge', 'Bill Adler',
       'Robert Moss', 'Whitley Strieber', 'Stephen King', 'Larry Niven',
       'Patti Davis', 'Terry Brooks', 'Walter J. Boyne', 'Mary S. Lovell',
       'Raymond Chandler', 'Judith', 'Barry Lopez', 'Michael Dorris',
       'Margaret Weis', 'Arthur C. Clarke', 'Janet', 'Jimmy Buffett',
       'Don Ferguson', 'Richard Marcinko', 'David', 'William J. Bennett',
       'Tom Hegg', 'Alex Walsh', 'William Shatner', 

In [70]:
nyt_review.author_2.unique()

array(['James N. Hall', nan, 'James Norman Hall', 'J. H. Links',
       'Willard Shepard', 'Nancy Freedman', 'Audrey Wurdemann',
       'James Childers', 'George Johnston', 'Victor Leclerc',
       'Joseph Hayes', 'Patrick Dennis', 'Eugene Burdick',
       'Daphne du Maurier', 'Charles W. Bailey II', 'Harvey Wheeler',
       'Mason Hoffenberg', 'John Clive', 'Ted Koppel',
       'Other Top-ranking NATO Generals', 'Robert Moss',
       'Dominique Lapierre', 'James Elward', 'written by Thomas Chastain',
       'Arnaud de Borchgrave', 'James W. Kunetka', 'Peter Straub',
       'Jerry Pournelle', 'Maureen Strange Foster', 'Steven L. Thompson',
       'Robert B. Parker', 'Garfield Reeves-Stevens', 'Louise Erdrich',
       'Tracy Hickman', 'Gentry Lee', 'Allan Ahlberg', 'John Weisman',
       'Leigh Eddings', 'Kevin J. Anderson', 'Jerry B. Jenkins',
       'Katrina Kenison', 'Dave Wolverton', 'Carol Higgins Clark',
       'Sneaky Pie Brown', 'Andrew Gross', 'Nicola Kraus',
       'Paul Kempr

In [None]:
# if there's only one name in the author_1 column, 
