# Scraping Researcher Ages on Veromi

## Python Setup

In [1]:
from multiprocessing import Pool
import multiprocessing
import pandas as pd
import numpy as np
import requests
from bs4 import BeautifulSoup
import unidecode
from veromi_query import query

## List of Individuals

Let's load in the list of individuals for whom we want to scrape the age. This list already has a unique ID variable per inventor.

In [2]:
ind = pd.read_csv('../data/raw/individuals_info.csv', low_memory=False)

In [3]:
# Dataset specific renaming:
ind.rename(columns={'inventor_id':'id'}, inplace = True)
# If there is no ID per individual, create one now.

In [4]:
ind = ind[['id', 'first_name', 'middle_name', 'last_name', 'city', 'state', 'country']]
ind = ind.drop_duplicates().reset_index(drop=True)

In [5]:
ind.describe(include='all')

Unnamed: 0,id,first_name,middle_name,last_name,city,state,country
count,3551945,3551829,1561853,3551925,3551212,1564618,3551929
unique,2567867,156155,37258,504081,235614,61,256
top,4477144-1,Michael,J.,Lee,Tokyo,CA,US
freq,179,62630,120910,28861,85784,309247,1566090


In [6]:
ind = ind[ind['country']=="US"].reset_index(drop=True)

In [7]:
ind.describe(include='all')

Unnamed: 0,id,first_name,middle_name,last_name,city,state,country
count,1566090,1566043,1143673,1566074,1565405,1564612,1566090
unique,1241107,69676,21621,287966,40499,61,1
top,5766379-2,John,J.,Smith,San Jose,CA,US
freq,51,51352,101215,7999,25232,309245,1566090


### Data Cleanup

In [8]:
string_vars = ['first_name', 'middle_name', 'last_name', 'city', 'state', 'country']
for var in string_vars:
    ind[var] = ind[var].str.upper().str.strip().fillna('')

## TO DO: REMOVE SPECIAL CHARACTERS

### Export to CSV

In [9]:
ind.to_csv('../data/individuals_clean.csv', index=False)

In [10]:
ind.head(10000).to_csv('../data/individuals_clean_sample.csv', index=False)

## Scrape Profiles

__Run `scraper.py` code.__

In [11]:
# !python scraper.py

Depending on the size of your data, you might want to alter the age_scraper code.

The following code works for a limited number of queries.
```
import pandas as pd
from veromi_query import query

# Load in data:
ind = pd.read_csv('../data/ind_clean.csv', low_memory=False)
ind = ind.fillna('')

# Query Veromi:
profiles = query([[0, ind, ind.shape[0]])

# Export to CSV:
profiles.to_csv('../output/scraper_output/individuals_clean_ages.csv', index=False)
```

When running a large number of queries, I adapt the code for parallelization. Here is the example for 3 parallel tasks.
```
from multiprocessing import Pool
import multiprocessing
import pandas as pd
from veromi_query import query

# Load in data:
ind = pd.read_csv('../data/ind_clean.csv', low_memory=False)
ind = ind.fillna('')

# Define Pool:
pool = Pool(processes=3)

# Run query for the job:
profiles0 = pool.apply_async(query, [[0, ind, 1000000]])
profiles1 = pool.apply_async(query, [[1, ind, 1000000]])
profiles2 = pool.apply_async(query, [[2, ind, 1000000]])
profiles0 = profiles0.get()
profiles1 = profiles1.get()
profiles2 = profiles2.get()

# Combine Frames:
frames = [profiles0, profiles1, profiles2]

profiles = pd.concat(frames).reset_index(drop=True)

# Export results as CSV
profiles.to_csv('../output/scraper_output/individuals_clean_ages.csv', index=False)
```
For this project, I used a clustered computing system and 100 parallel tasks.

## Parse Age from Data

In [12]:
# Load in entire dataset
ind_info = pd.read_csv('../output/scraper_output/individuals_clean_ages.csv', low_memory=True)

In [13]:
ind_info.describe(include='all')

Unnamed: 0,id,name_age,cities
count,5943466,5943466,5833892
unique,1107304,3890031,2151107
top,5218538-1,"SMITH, DAVID","HOUSTON, TX"
freq,878,1359,73235


In [14]:
ind_info.shape

(5943466, 3)

In [15]:
ind_info.drop_duplicates().shape

(5083319, 3)

In [16]:
ind_info['name'] = ind_info['name_age'].str.replace(r'\(.+\)', '').str.strip()
ind_info['age'] = ind_info['name_age'].str.extract(r'(\(Age\s\d+\))', expand=True)
ind_info['age'] = pd.to_numeric(ind_info['age'].str.extract(r'(\d+)', expand=False))
del ind_info['name_age']

In [17]:
ind_info.head()

Unnamed: 0,id,cities,name,age
0,3972338-1,"GREENVILLE, DE; PLYMOUTH, WI; ATLANTA, GA; MAR...","SCHMITT, DONALD HOLSTON",61.0
1,3972338-1,"CUMMING, GA; NORTHRIDGE, CA; NORTH HILLS, CA","SCHMITT, DONALD E",83.0
2,3972338-1,"WILMINGTON, NC; ATLANTA, GA","SCHMITT, DONALD BACHE",80.0
3,3972338-1,"SNELLVILLE, GA; LADY LAKE, FL; LITHONIA, GA","DONALD, SCHMITT",
4,3972338-1,"ATLANTA, GA; MARIETTA, GA; SOCIAL CIRCLE, GA; ...","SCHMITT, DONALD W",61.0


### Export Data with All Ages associated to a given Individual

In [18]:
# Temp:
ind_info.to_csv('../output/individual_ages_extensive.csv', index=False)

## To Do: Decide on the most relevant Age per Individual

For IDs that are affiliated to several potential ages, let's use the cities and name to decide on the most probable match.

For now, let's keep the most common age associated to a given Individual\*Location ID. This also gets rid on `NaN` values.

In [19]:
age_lookup = ind_info.groupby(['id', 'age']).size().reset_index().rename(columns={0: 'match_count'})
age_lookup = age_lookup.sort_values(['id', 'match_count'], ascending=[True, False])
age_lookup = age_lookup.drop_duplicates('id', keep='first')
del age_lookup['match_count']

## Merge back onto the Original Data of Individuals\*Locations

In [20]:
ind = pd.read_csv('../data/individuals_clean.csv')

In [21]:
ind = pd.merge(ind, age_lookup, how='left', on='id')

In [22]:
ind.describe(include='all')

Unnamed: 0,id,first_name,middle_name,last_name,city,state,country,age
count,1566090,1566024,1143648,1566019,1565405,1564612,1566090,1326465.0
unique,1241107,69206,21471,286234,39819,61,1,
top,5766379-2,JOHN,J.,SMITH,SAN JOSE,CA,US,
freq,51,51352,101216,7999,25237,309245,1566090,
mean,,,,,,,,58.78266
std,,,,,,,,13.16947
min,,,,,,,,18.0
25%,,,,,,,,50.0
50%,,,,,,,,58.0
75%,,,,,,,,66.0


In [23]:
ind.to_csv('../output/individual_age.csv', index=False)

## Sandbox