# Scraping Researcher Ages on Veromi

## Python Setup

In [2]:
from multiprocessing import Pool
import multiprocessing
import pandas as pd
import numpy as np
import requests
from bs4 import BeautifulSoup
import unidecode
from veromi_query import query

## List of Individuals

Let's load in the list of individuals for whom we want to scrape the age. This list already has a unique ID variable per inventor.

In [3]:
ind = pd.read_csv('../data/raw/individuals_info.csv', low_memory=False)

In [4]:
# Dataset specific renaming:
ind.rename(columns={'inventor_id':'id'}, inplace = True)
# If there is no ID per individual, create one now.

In [5]:
ind = ind[['id', 'first_name', 'middle_name', 'last_name', 'city', 'state', 'country']]
ind = ind.drop_duplicates().reset_index(drop=True)

In [6]:
ind.describe(include='all')

Unnamed: 0,id,first_name,middle_name,last_name,city,state,country
count,3551945,3551829,1561853,3551925,3551212,1564618,3551929
unique,2567867,156155,37258,504081,235614,61,256
top,4477144-1,Michael,J.,Lee,Tokyo,CA,US
freq,179,62630,120910,28861,85784,309247,1566090


In [7]:
ind = ind[ind['country']=="US"].reset_index(drop=True)

In [8]:
ind.describe(include='all')

Unnamed: 0,id,first_name,middle_name,last_name,city,state,country
count,1566090,1566043,1143673,1566074,1565405,1564612,1566090
unique,1241107,69676,21621,287966,40499,61,1
top,5766379-2,John,J.,Smith,San Jose,CA,US
freq,51,51352,101215,7999,25232,309245,1566090


### Data Cleanup

In [9]:
string_vars = ['first_name', 'middle_name', 'last_name', 'city', 'state', 'country']
for var in string_vars:
    ind[var] = ind[var].str.upper().str.strip().fillna('')

### Export to CSV

In [10]:
ind.to_csv('../data/individuals_clean.csv', index=False)

In [11]:
ind.head(10000).to_csv('../data/individuals_clean_sample.csv', index=False)

## Scrape Profiles

__Run `scraper.py` code.__

In [12]:
# !python scraper.py

Depending on the size of your data, you might want to alter the age_scraper code.

The following code works for a limited number of queries.
```
import pandas as pd
from veromi_query import query

# Load in data:
ind = pd.read_csv('../data/ind_clean.csv', low_memory=False)
ind = ind.fillna('')

# Query Veromi:
profiles = query([[0, ind, ind.shape[0]])

# Export to CSV:
profiles.to_csv('../output/scraper_output/individuals_clean_ages.csv', index=False)
```

When running a large number of queries, I adapt the code for parallelization. Here is the example for 3 parallel tasks.
```
from multiprocessing import Pool
import multiprocessing
import pandas as pd
from veromi_query import query

# Load in data:
ind = pd.read_csv('../data/ind_clean.csv', low_memory=False)
ind = ind.fillna('')

# Define Pool:
pool = Pool(processes=3)

# Run query for the job:
profiles0 = pool.apply_async(query, [[0, ind, 1000000]])
profiles1 = pool.apply_async(query, [[1, ind, 1000000]])
profiles2 = pool.apply_async(query, [[2, ind, 1000000]])
profiles0 = profiles0.get()
profiles1 = profiles1.get()
profiles2 = profiles2.get()

# Combine Frames:
frames = [profiles0, profiles1, profiles2]

profiles = pd.concat(frames).reset_index(drop=True)

# Export results as CSV
profiles.to_csv('../output/scraper_output/individuals_clean_ages.csv', index=False)
```
For this project, I used a clustered computing system and 100 parallel tasks.

## Parse Age from Data

In [12]:
# Load in entire dataset
profiles = pd.read_csv('../output/scraper_output/individuals_clean_ages.csv', low_memory=True)

In [13]:
profiles.head()

Unnamed: 0,id,name
0,0.0,"SYFRITT, HAROLD ARTHUR (Age 79)"
1,1.0,"SYFRITT, HAROLD ARTHUR (Age 79)"
2,2.0,"BAILIN, RICHARD C (Age 91)"
3,2.0,"BAILIN, RICHARD"
4,3.0,"VANBRAKEL, RUSSEL A (Age 81)"


In [14]:
profiles['age'] = profiles['name'].str.extract(r'(\(Age\s\d+\))', expand=True)
profiles['age'] = pd.to_numeric(profiles['age'].str.extract(r'(\d+)', expand=False))

For now, let's keep the most common age associated to a given Individual\*Location ID. This also gets rid on `NaN` values.

In [15]:
age_lookup = profiles.groupby(['id', 'age']).size().reset_index().rename(columns={0: 'match_count'})
age_lookup = age_lookup.sort_values(['id', 'match_count'], ascending=[True, False])
age_lookup = age_lookup.drop_duplicates('id', keep='first')

Let's also collect some matching stats.

In [16]:
matching_stats = profiles.groupby(['id'])
matching_stats = pd.DataFrame({'nb_matches': matching_stats.size(),
                               'nb_matches_with_age': matching_stats['age'].count(),
                               'nb_unique_ages': matching_stats['age'].nunique()
                              }).reset_index()

## Merge back onto the Original Data of Individuals\*Locations

In [20]:
ind = pd.read_csv('../data/ind_clean.csv')

In [21]:
ind = pd.merge(ind, age_lookup, how='left', on='id')
ind = pd.merge(ind, matching_stats, how='left', on='id')
ind['nb_matches'].fillna(0, inplace=True)
ind['nb_matches_with_age'].fillna(0, inplace=True)
ind['nb_unique_ages'].fillna(0, inplace=True)

In [22]:
ind.describe(include='all')

Unnamed: 0,id,first_name,middle_name,last_name,city,state,country,age,match_count,nb_matches,nb_matches_with_age,nb_unique_ages
count,2266802,2266711,1637791,2266688,2265349,2264690,2266802,0.0,0.0,2266802.0,2266802.0,2266802.0
unique,1796830,85489,27940,355369,22125,61,1,,,,,
top,5766379-2,JOHN,J.,SMITH,SAN JOSE,CA,US,,,,,
freq,67,78303,142609,12895,33910,447839,2266802,,,,,
mean,,,,,,,,,,0.0,0.0,0.0
std,,,,,,,,,,0.0,0.0,0.0
min,,,,,,,,,,0.0,0.0,0.0
25%,,,,,,,,,,0.0,0.0,0.0
50%,,,,,,,,,,0.0,0.0,0.0
75%,,,,,,,,,,0.0,0.0,0.0


In [23]:
ind['nb_unique_ages_t'] = np.where(ind['nb_unique_ages']<5, ind['nb_unique_ages'].apply(lambda x: str(x)), ">=5")
print("Number of unique ages by inventor:")
print(ind['nb_unique_ages_t'].value_counts(normalize=True))
del ind['nb_unique_ages_t']

Number of unique ages by inventor:
0.0    1.0
Name: nb_unique_ages_t, dtype: float64


### Assign Unique Age to Individual ID

In [None]:
ind_id_age = ind.groupby(['id', 'age'])['match_count'].sum().reset_index()
ind_id_age = ind_id_age.sort_values(['id', 'match_count'], ascending=[True, False])
ind_id_age = ind_id_age.drop_duplicates(['id'], keep='first')

# Merge to unique list of Individual IDs
unique_inds = ind[['id', 'first_name', 'middle_name', 'last_name']].drop_duplicates()
ind_id_age = pd.merge(unique_inds, ind_id_age, how='left', on='inventor_id')

In [None]:
ind_id_age.head()

In [None]:
ind_id_age.describe(include='all')

In [None]:
ind_id_age[['id', 'age']].to_csv('../output/ind_age.csv', index=False)

## Sandbox