# Overview

This notebook takes the raw datasets and adds metadata for each article (author names, author positions, and tags) using the DP article API and the staff-position CSV files obtained from the DP staff.

## Add Tags and Author Names

Obtain the tags and author names of each article by calling the DP article API.

In [None]:
# import packages

import numpy as np
import pandas as pd
import requests as r
import math
import json

In [None]:
# Read the raw train and test splits. Both files carry an 'Unnamed: 0'
# column (presumably an index written out by an earlier export -- TODO
# confirm), which is discarded immediately after loading.
train_df = pd.read_csv('train.csv', index_col=False).drop(columns=['Unnamed: 0'])
test_df = pd.read_csv('test.csv', index_col=False).drop(columns=['Unnamed: 0'])

In [None]:
# function for getting the tags of an article
def getTags(titleURL):
  """Return the tag slugs of a DP article as a JSON-encoded list.

  Args:
    titleURL: article path appended directly to `https://www.thedp.com`
      (presumably starting with `/` -- verify against the dataset); `.json`
      is added to it to reach the article's JSON endpoint.

  Returns:
    A JSON string holding the list of tag slugs, or the JSON empty list
    (`'[]'`) when the article cannot be fetched or its payload does not
    have the expected `article` / `tags` / `slug` fields.
  """
  try:
    # The timeout keeps a single unresponsive request from stalling the
    # whole `apply` run over the dataset.
    response = r.get(f'https://www.thedp.com{titleURL}.json', timeout=30)
    article = response.json()['article']
    return json.dumps([tag['slug'] for tag in article['tags']])
  except (r.RequestException, ValueError, KeyError, TypeError):
    # Best effort: network failures, non-JSON bodies and unexpected payload
    # shapes yield an empty list. Unlike a bare `except`, this lets
    # KeyboardInterrupt and genuine programming errors propagate.
    return json.dumps([])

# function for getting the author names of an article
def getAuthors(titleURL):
  """Return the author names of a DP article as a JSON-encoded list.

  Args:
    titleURL: article path appended directly to `https://www.thedp.com`
      (presumably starting with `/` -- verify against the dataset); `.json`
      is added to it to reach the article's JSON endpoint.

  Returns:
    A JSON string holding the list of author names, or the JSON empty list
    (`'[]'`) when the article cannot be fetched or its payload does not
    have the expected `article` / `authors` / `name` fields.
  """
  try:
    # The timeout keeps a single unresponsive request from stalling the
    # whole `apply` run over the dataset.
    response = r.get(f'https://www.thedp.com{titleURL}.json', timeout=30)
    article = response.json()['article']
    return json.dumps([author['name'] for author in article['authors']])
  except (r.RequestException, ValueError, KeyError, TypeError):
    # Best effort: network failures, non-JSON bodies and unexpected payload
    # shapes yield an empty list. Unlike a bare `except`, this lets
    # KeyboardInterrupt and genuine programming errors propagate.
    return json.dumps([])

In [None]:
# Look up the author names, then the tags, of every test article and
# attach them as JSON-string columns.
test_df = test_df.assign(
    authors=test_df['article'].apply(getAuthors),
    tags=test_df['article'].apply(getTags),
)
test_df

In [None]:
# Look up the author names, then the tags, of every train article and
# attach them as JSON-string columns.
train_df = train_df.assign(
    authors=train_df['article'].apply(getAuthors),
    tags=train_df['article'].apply(getTags),
)
train_df

In [None]:
# Write the splits, now carrying the `authors` and `tags` columns, to CSV
# (test first, then train) without the row index.
for frame, out_path in (
    (test_df, 'test_name_tag.csv'),
    (train_df, 'train_name_tag.csv'),
):
  frame.to_csv(out_path, index=False)

## Processing Staff Positions CSVs

This section cleans up the CSVs obtained from DP staff and produces a dictionary that maps author names to their positions.

In [None]:
# this function combines positions in different departments into a single array

def combinePositions(row):
  """Collect every position listed on a staff row into one flat list.

  Args:
    row: mapping-like object (e.g. a DataFrame row) with the keys
      'DP Experience', 'DP Business Department', 'Editorial Department',
      'Street Department' and 'UTB Department'.

  Returns:
    A list of strings, in this order:
      * the experience text that follows the first ':' (the whole text when
        there is no colon), with newlines turned into spaces;
      * the business positions, unprefixed;
      * the editorial, Street and UTB positions, prefixed with 'DP ',
        'Street ' and 'UTB ' respectively.
    Department cells have their newlines replaced by ', '. Empty cells and
    non-string cells (such as NaN) contribute nothing.

  Raises:
    KeyError: if one of the expected columns is missing from `row`.
  """
  pos_list = []

  exp = row['DP Experience']
  if isinstance(exp, str) and exp:
    exp = exp.replace('\n', ' ')
    # `partition` copes with a missing colon and with a colon that is not
    # followed by a space; slicing from `find(':') + 2` dropped a character
    # in both of those cases.
    _, colon, after_colon = exp.partition(':')
    label = (after_colon if colon else exp).strip()
    if label:
      pos_list.append(label)

  # (column, prefix) pairs in the order the positions are reported
  departments = (
      ('DP Business Department', ''),
      ('Editorial Department', 'DP '),
      ('Street Department', 'Street '),
      ('UTB Department', 'UTB '),
  )

  for column, prefix in departments:
    value = row[column]
    if isinstance(value, str) and value:
      pos_list.append(prefix + value.replace('\n', ', '))

  return pos_list

In [None]:
# First staff CSV: all staff positions after 2015.
# `na_filter=False` keeps empty cells as '' rather than NaN, which is what
# `combinePositions` relies on for its truthiness checks.
positions_df = (
    pd.read_csv('drive/MyDrive/CIS520 Project/data set/DP Staff Positions/staff-1.csv', index_col=False, na_filter=False)
    # lowercase "first last" lookup key built from the two name columns
    .assign(name=lambda df: (df['Name'] + ' ' + df['Last']).str.lower())
    .drop(columns=['Name', 'Last'])
    # one row per person: the last occurrence wins (assumes the file lists
    # older positions first -- TODO confirm)
    .drop_duplicates(subset=['name'], keep='last')
    # fold the per-department columns into a single list of positions
    .assign(positions=lambda df: df.apply(combinePositions, axis=1))
    .drop(columns=['DP Experience', 'DP Business Department', 'Editorial Department', 'Street Department', 'UTB Department'])
    .set_index('name')
)

# name -> {column: value} mapping for direct lookup of a person's positions
pos_dict = positions_df.to_dict('index')

In [None]:
# Second staff CSV: all staff positions from 2010-2015.
# `na_filter=False` keeps empty cells as '' rather than NaN.
staff2_df = (
    pd.read_csv('drive/MyDrive/CIS520 Project/data set/DP Staff Positions/staff-2.csv', index_col=False, na_filter=False)
    # lowercase "first last" lookup key built from the two name columns
    .assign(name=lambda df: (df['First Name'] + ' ' + df['Last Name']).str.lower())
    .drop(columns=['First Name', 'Last Name', 'DP Experience', 'Notes'])
    # one row per person: the last occurrence wins (assumes the file lists
    # older positions first -- TODO confirm)
    .drop_duplicates(subset=['name'], keep='last')
    .set_index('name')
)

# name -> {column: value} mapping for direct lookup by name
pos_dict_2 = staff2_df.to_dict('index')

In [None]:
# Maps an author name as it appears on an article (the byline) to the name
# under which the same person is listed in the staff CSV files.
#
# Keys are looked up by `getAuthorsPos` after the byline has been stripped
# and lowercased, so every key must be written in lowercase. Values may use
# any capitalisation: `getAuthorsPos` lowercases them before searching the
# staff dictionaries. One key contains the HTML entity `&#039;`
# (presumably because that byline arrives HTML-escaped -- TODO confirm).
ACTUAL_NAMES = {
    "tori sousa": "victoria sousa",
    "isaac lee": "enwook lee",
    "sanjary dureseti": "sanjay dureseti",
    "pat zancolli": "patrick zancolli",
    "nikkita collins": "nikki collins",
    "abigail baggini": "abby baggini",
    "jenn wright": "Jennifer Wright",
    "cathy han": "Kyoung Won (Cathy) Han",
    "alfredo praticò": "Alfredo Pratico'",
    "juan sebastián pinto": "juan Sebastián Pinto-Díaz",
    "tom nowlan": "thomas nowlan",
    "will snow": "william snow",
    "greg robinov": "gregory robinov",
    "will agathis": "william agathis",
    "sam altland": "samuel altland",
    "ben claar": "benjamin claar",
    "joe li": "zhiyao (joe) li",
    "maddy strohm": "madelyn strohm",
    "alexandra getsos": "alex getsos",
    "oscar a. rudenstam": "Oscar rudenstam",
    "christian gilberti": "christian read gilberti",
    "eunice lim": "Chan Mi (Eunice) Lim",
    "mike wisniewski": "michael wisniewski",
    "cherry zhi": "Qiu Yi (Cherry) Zhi",
    "luis ferre sadurni": "Luis Ferré Sadurní",
    "matt fine": "matthew fine",
    "jill moely": "Jillian moely",
    "alessandro van den brink": "Alexander van den brink",
    "oj singh": "Ojasvinee singh",
    "ali s mohammad": "ali mohammad",
    "noa ortiz": "noa Ortiz-Langleben",
    "amanda o'brien": "amanda O’Brien",
    "zach jacobs": "zachary jacobs",
    "christine olagun-samuel": "christine Olaogun",
    "jy_lee": "Jun Youb lee",
    "eason zhao": "Yixin (Eason) zhao",
    "aidan mayer ahearn": "aidan ahearn",
    "dia sotiropoulou": "Dionysia Sotiropoulou",
    "andie pinga": "andrea pinga",
    "chris schiller": "Christopher Schiller",
    "cass dinh": "Cassandra dinh",
    "michael a. keshmiri": "michael keshmiri",
    "m. earl smith": "martin smith",
    "albert chen-feng chou": "chen-feng chou",
    "lucien wang": "Alexander Lucien wang",
    "jessie washington": "jessica washington",
    "lavi ben dor": "lavi ben-dor",
    "dan eder": "daniel eder",
    "ben facey": "benjamin facey",
    "chris proano": "christopher proano",
    "theodore l. caputi": "theodore caputi",
    "matt mantica": "matthew mantica",
    "sergio w. guadix": "sergio guadix",
    "colleen o&#039;malley": "colleen O'Malley",
    "evie artis": "Qiana (Evie) artis",
    "ola osinaike": "Olatunbosun Osinaike",
    "becky demarre": "Rebecca demarre",
    "dan hayes": "daniel hayes",
    "nikki hardison": "Nikki (Christine) hardison",
    "ari goldfine": "ariel goldfine",
    "freda zhao": "Freda (Fang Bin) zhao",
    "yiwen chan": "Yiwen (Rachee) chan",
    "sanjay menghani": "sanjay meghani"
}

# Names that could not be found in either staff dictionary; appended to by
# `getAuthorsPos` as a side effect.
missing_names = []

# recursive function that retrieves the positions of the authors
# of an article
def getAuthorsPos(names):
  """Return the staff positions of every author in `names`.

  Each name is stripped and lowercased, translated through `ACTUAL_NAMES`
  when the byline differs from the staff-file spelling, and then looked up
  in `pos_dict` first and `pos_dict_2` second. An entry containing ' and '
  is treated as several authors and resolved recursively.

  Args:
    names: iterable of author-name strings.

  Returns:
    A flat list with the positions of all authors that were found, in the
    order the authors appear.

  Side effects:
    Every name found in neither dictionary is appended (stripped and
    lowercased) to the module-level `missing_names` list.
  """
  poss_ls = []

  for name in names:
    if ' and ' in name:
      poss_ls += getAuthorsPos(name.split(' and '))
      continue

    name = name.strip().lower()
    # fall back to the byline itself when there is no known alias
    official_name = (ACTUAL_NAMES.get(name) or name).lower()

    poss = pos_dict.get(official_name)
    if poss:
      poss_ls += poss['positions']
      continue

    poss_2 = pos_dict_2.get(official_name)
    if poss_2:
      department = poss_2['Specific Department']
      # `department` is a single string: it must be appended, because
      # `list += str` would add every character as a separate position.
      if department:
        poss_ls.append(department)
      continue

    missing_names.append(name)

  return poss_ls

## Add Author Positions

With the function `getAuthorsPos` ready, we can now add the author positions to `test_df` and `train_df` by applying this function to the `authors` column.

In [None]:
# Reload the splits that already carry author names and tags
# (train first, then test).
train_df, test_df = (
    pd.read_csv(csv_path, index_col=False)
    for csv_path in (
        'drive/MyDrive/CIS520 Project/data set/train_name_tag.csv',
        'drive/MyDrive/CIS520 Project/data set/test_name_tag.csv',
    )
)

# decode the JSON-string columns back into Python lists
def parse_json(df):
  """Decode the `authors` and `tags` columns of `df` from JSON strings.

  The frame is modified in place and also returned.
  """
  for column in ('authors', 'tags'):
    df[column] = df[column].apply(json.loads)
  return df

# decode both splits (train first, then test)
train_df, test_df = parse_json(train_df), parse_json(test_df)

In [None]:
# Count the articles for which no metadata was retrieved, i.e. whose
# `authors` list is empty.
train_count = int(train_df['authors'].apply(len).eq(0).sum())
print(f'{train_count}/{train_df.shape[0]} articles in the train data do not have metadata')

# This message previously said "train data" although it reports the test split.
test_count = int(test_df['authors'].apply(len).eq(0).sum())
print(f'{test_count}/{test_df.shape[0]} articles in the test data do not have metadata')

567/16772 articles in the train data do not have metadata
138/4194 articles in the train data do not have metadata


In [None]:
# apply getAuthorsPos function

# `getAuthorsPos` appends every unmatched name to the global `missing_names`
# list. Empty it first so that re-running this cell does not count the same
# names twice in the frequency table built from it.
missing_names.clear()

test_df['author_positions'] = test_df['authors'].apply(getAuthorsPos)
train_df['author_positions'] = train_df['authors'].apply(getAuthorsPos)

In [None]:
# Tally how often each unmatched author name occurred, most frequent first
from collections import Counter
result = Counter()
result.update(missing_names)
result.most_common()

In [None]:
# shape of the subset of train articles with no author position at all
train_df.loc[train_df['author_positions'].apply(len).eq(0)].shape

(1556, 22)

In [None]:
# shape of the subset of test articles with no author position at all
test_df.loc[test_df['author_positions'].apply(len).eq(0)].shape

(412, 22)

In [None]:
# convert the list columns to JSON strings

def to_json(df):
  """Encode the `authors`, `tags` and `author_positions` columns as JSON.

  The frame is modified in place and also returned.
  """
  for column in ('authors', 'tags', 'author_positions'):
    df[column] = df[column].apply(json.dumps)
  return df

# encode both splits (train first, then test)
train_df, test_df = to_json(train_df), to_json(test_df)

In [None]:
# Write the fully annotated splits to CSV (test first, then train) without
# the row index.
# NOTE(review): the test split goes to the working directory while the train
# split goes to the Drive folder -- confirm this asymmetry is intended.
for frame, out_path in (
    (test_df, 'test_name_pos_tag.csv'),
    (train_df, 'drive/MyDrive/CIS520 Project/data set/train_name_pos_tag.csv'),
):
  frame.to_csv(out_path, index=False)