# Notebook to download data.

Useful for recreating data that we used in this project.

In [1]:
import os
import requests, json
from pathlib import Path
from urllib.parse import urlparse, unquote
from urllib.request import urlopen
from urllib.request import urlretrieve
import cgi
from requests.exceptions import RequestException
import re
from bs4 import BeautifulSoup
import zipfile
import pandas as pd
from os import listdir
from os.path import isfile, join


In [2]:
# Folder (relative to the notebook) that holds every downloaded file
MYDIR = "data"

In [3]:
def check_directory(directory_name):
    """Make sure ``directory_name`` exists, creating it when it is missing.

    Missing parent folders are created as well. A one-line status message is
    printed saying whether the folder was created or was already there.
    Nothing is returned.
    """
    if os.path.exists(directory_name):
        print(directory_name, "folder already exists.")
        return

    os.makedirs(directory_name)
    print("created folder : ", directory_name)

# Make sure the data directory exists (it is created if missing) before files are written into it.
check_directory(MYDIR)

data folder already exists.


In [4]:
def get_full_file_path(filename):
    """Return ``filename`` joined onto the data directory ``MYDIR``."""
    path_parts = (MYDIR, filename)
    return os.path.join(*path_parts)

def _filename_from_response(url, headers):
    """Choose a safe local file name for a downloaded resource.

    The ``filename`` parameter of the Content-Disposition header is preferred;
    when the header is absent or carries no file name, the last segment of the
    URL path (percent-decoded) is used instead.

    Raises:
        ValueError: if neither source yields a usable file name.
    """
    # email.message is the standard-library replacement for the deprecated
    # cgi.parse_header; imported locally so this helper is self-contained.
    from email.message import Message

    name = None
    disposition = headers.get("Content-Disposition")
    if disposition:
        message = Message()
        message["Content-Disposition"] = disposition
        # get_filename() returns None when the header has no filename parameter.
        name = message.get_filename()
    if not name:
        name = unquote(urlparse(url).path.split("/")[-1])

    # The name is supplied by the remote side, so keep only its final path
    # component to stop it from pointing outside the data directory.
    name = os.path.basename(name.replace("\\", "/"))
    if name in ("", ".", ".."):
        raise ValueError(f"Could not determine a file name for {url}")
    return name


def get_file(url, filename=None):
    """Download ``url`` and save the response body inside the data directory.

    Args:
        url: Address of the resource to download.
        filename: Name to save the file under. When omitted, the name is taken
            from the Content-Disposition header, falling back to the last
            segment of the URL path.

    Returns:
        The path of the saved file, or ``None`` when the request failed
        (connection problem or HTTP error status); the error is printed.

    Raises:
        ValueError: if no file name was given and none could be derived.
    """
    try:
        with requests.get(url) as response:
            # HTTPError subclasses RequestException, so an error status is
            # reported by the handler below instead of being saved as data.
            response.raise_for_status()

            if not filename:
                filename = _filename_from_response(url, response.headers)

            full_file_path = get_full_file_path(filename)
            # Context manager guarantees the handle is flushed and closed.
            with open(full_file_path, "wb") as out_file:
                out_file.write(response.content)
    except RequestException as e:
        print(e)
        return None

    print(f'Saved {full_file_path}')
    return full_file_path



In [5]:
# PatentsView pages that list the downloadable tables (granted and pre-grant)
granted_patent_data_url = 'https://patentsview.org/download/data-download-tables'
pregrant_patent_data_url = 'https://patentsview.org/download/pg-download-tables'

# Local names the two pages are saved under inside the data directory
granted_patent_data_file = 'granted_patent_data.html'
pregrant_patent_data_file = 'pregrant_patent_data.html'

get_file(granted_patent_data_url, granted_patent_data_file)
get_file(pregrant_patent_data_url, pregrant_patent_data_file)



Saved data\granted_patent_data.html
Saved data\pregrant_patent_data.html


'data\\pregrant_patent_data.html'

In [6]:
# Parse the saved granted-patent download page.
# The file is opened in binary mode so BeautifulSoup detects the page's
# encoding itself instead of relying on the platform default text encoding.
with open(get_full_file_path(granted_patent_data_file), 'rb') as fp:
    soup = BeautifulSoup(fp, 'html.parser')

# Keep the href of every element whose link contains "s3"
granted_patent_data_links = [
    link.get('href') for link in soup.find_all(href=re.compile("s3"))
]

In [7]:
# Snapshot of the files already present in the data directory,
# used to skip archives that were downloaded on an earlier run.
onlyfiles = [entry for entry in listdir(MYDIR) if isfile(join(MYDIR, entry))]

for url in granted_patent_data_links:
    print(f'Getting {url}')
    # Archive name as it appears at the end of the URL path
    filename = unquote(urlparse(url).path.split("/")[-1])

    if filename in onlyfiles:
        print("File gotten")
        continue

    print("File Not Gotten")
    # get_file returns the path of the saved archive
    filename = get_file(url)
    # Note: archives are unpacked into the current working directory ('.'), not MYDIR
    with zipfile.ZipFile(filename, 'r') as zip_ref:
        zip_ref.extractall('.')


Getting https://s3.amazonaws.com/data.patentsview.org/download/application.tsv.zip
File gotten
Getting https://s3.amazonaws.com/data.patentsview.org/download/assignee.tsv.zip
File gotten
Getting https://s3.amazonaws.com/data.patentsview.org/download/botanic.tsv.zip
File gotten
Getting https://s3.amazonaws.com/data.patentsview.org/download/cpc_current.tsv.zip
File gotten
Getting https://s3.amazonaws.com/data.patentsview.org/download/cpc_group.tsv.zip
File gotten
Getting https://s3.amazonaws.com/data.patentsview.org/download/cpc_subgroup.tsv.zip
File gotten
Getting https://s3.amazonaws.com/data.patentsview.org/download/cpc_subsection.tsv.zip
File gotten
Getting https://s3.amazonaws.com/data.patentsview.org/download/figures.tsv.zip
File gotten
Getting https://s3.amazonaws.com/data.patentsview.org/download/foreigncitation.tsv.zip
File gotten
Getting https://s3.amazonaws.com/data.patentsview.org/download/foreign_priority.tsv.zip
File gotten
Getting https://s3.amazonaws.com/data.patentsview.

## Create table schemas from the data dictionary

Gives us proper data types and protects us from importing bad data

In [None]:
# The original idea was to generate the schemas straight from the html data
# dictionary page, but the page is a mess. The download is left in here in
# case anyone wants to give parsing it a shot.
url = 'https://patentsview.org/download/data-download-dictionary'
file = 'data-download-dictionary.html'
get_file(url, file)

In [None]:
# patent_datadictionary.csv was created by hand because the html data
# dictionary was not easy to parse. It could be generated from the downloaded
# html page instead, but that may not be worth the effort.
dictionary_path = get_full_file_path('patent_datadictionary.csv')

# The csv is read without a header row, so the columns are named explicitly
df = pd.read_csv(dictionary_path, header=None)
df.columns = ['field', 'type']

### Generate Database table schemas from patent_datadictionary.csv

In [None]:
# Each table name is indicated by having a null type
table_names = df[df['type'].isnull()]
# Just the places of the table-name rows in the df.
# These index labels are used as row positions below, which assumes df still
# has its default 0..n-1 index.
table_names_index = table_names.index.to_list()

# A table's rows run from its own name row up to the next table's name row;
# the last table runs to the end of the frame.
table_end_index = table_names_index[1:] + [len(df.index)]

# Iterating over the list of table names, finding what fields belong to them and then building the CREATE TABLE syntax.
# In an ideal world you would build your tables in here as well, but the statements are just printed so they can be
# run in a SQL IDE.
# TODO: add the table creation here.
for table_start, table_end in zip(table_names_index, table_end_index):
    _df = df.iloc[table_start:table_end, :]

    # First row of the slice holds the table name
    table_name = _df['field'].iloc[0]

    # The row directly after the table name is not emitted as a column.
    # NOTE(review): presumably a per-table header row in the csv - confirm.
    field_rows = _df.iloc[2:]
    column_lines = [
        f'     {field} {field_type}'
        for field, field_type in zip(field_rows['field'], field_rows['type'])
    ]

    # Joining with ",\n" leaves no trailing comma before the closing parenthesis
    create_table_string = f'CREATE TABLE {table_name} (\n' + ',\n'.join(column_lines) + '\n);'
    print(create_table_string)
    print('-- '*50)

    


### Loading the data into MySQL

I used MySQL Workbench to load the data. The data is pretty big, so I wanted to minimize any middlemen (like dataframes). There is probably a cleaner way to load it all from this notebook...

You can either download all the data with this notebook or just go to the UVA Box with all the files and download them from there.

Here is the SQL to load it from Workbench

<code>
SET GLOBAL local_infile = true;


truncate table patent; 


LOAD DATA 
	LOCAL INFILE 'path/to/file/patent.tsv' 
INTO TABLE patent 
COLUMNS 
	TERMINATED BY '\t'
	ENCLOSED BY '"'
    IGNORE 1 LINES;

</code>
