# Prepare OpenCorporates Data with No States

The following code imports PatentsView data and the OpenCorporates API results to merge the two databases and identify organizations that can be found in both networks. OpenCorporates results in this script differ from previous versions because PatentsView states were not provided in the input file and any match between the two databases were matched only by organization name. Organization names in the OpenCorporates API results were cleaned to remove or standardize suffixes to facilitate better merge results between PatentsView and the OpenCorporates API results. This merge was performed to attach the unique assignee-state ID, location ID, city, and state from PatentsView to the OpenCorporates API results, and the data will be utilized to score the results. All records in the output file will be a representative for each organization, meaning all other records were removed via rules determined to maximize the likelihood a 'correct' record was selected. The records in the output file are then scored and normalized between a confidence level of 1-10.

In [1]:
### import the libraries used to process the PatentsView and OC data
import pandas as pd
import numpy as np
import time
import os
import re
import string
import warnings
warnings.filterwarnings('ignore')

### start timer
t0=time.time()

### set the path for the input file and save to variable
res_folder = "../csvResults/"
input_file = "noState_output_cleaned.csv"
a_full=os.path.join(res_folder,input_file)
print(a_full,"\n")

OC_results3=pd.read_csv(a_full)

### end timer and print total time
t1 = time.time()
total = t1-t0
print("Total time is %4f" % (total/60), "mins")

### print general stats and first 5 records for dataset 
display(OC_results3.info(null_counts=True),OC_results3.head())

../csvResults/noState_output_cleaned.csv 

Total time is 0.001878 mins
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2623 entries, 0 to 2622
Data columns (total 55 columns):
 #   Column                                Non-Null Count  Dtype  
---  ------                                --------------  -----  
 0   ID                                    2623 non-null   int64  
 1   assignee_id                           2623 non-null   object 
 2   name                                  2034 non-null   object 
 3   company_number                        2034 non-null   object 
 4   jurisdiction_code                     2034 non-null   object 
 5   incorporation_date                    2034 non-null   object 
 6   dissolution_date                      551 non-null    object 
 7   company_type                          2018 non-null   object 
 8   registry_url                          1680 non-null   object 
 9   branch                                897 non-null    object 
 10  branch_status

None

Unnamed: 0,ID,assignee_id,name,company_number,jurisdiction_code,incorporation_date,dissolution_date,company_type,registry_url,branch,...,agent_street_address,agent_city,agent_state,agent_zipcode,home_company_name,home_company_jurisdiction_code,controlling_entity_name,controlling_entity_jurisdiction_code,list_of_officers,list_of_filing_dates
0,1,5d5ead2f-1ef7-4db6-a6ce-9cdea523f834,,,,,,,,,...,,,,,,,,,[],[]
1,2,8a841c57-22b6-4ad0-ad42-16532c3ab4fc,,,,,,,,,...,,,,,,,,,[],[]
2,3,52e38962-9bfc-4082-b0c8-b7ba4ac04d83,"LENDINGTREE, LLC",68510F,us_ak,1/3/2000,,Limited Liability Company,,F,...,,,,,"LENDINGTREE, LLC",us_de,"LENDINGTREE, LLC",us_de,"['Christopher D. Davies', 'Douglas R Lebda', '...","['2011-07-21', '2011-07-21', '2008-03-17', '20..."
3,3,52e38962-9bfc-4082-b0c8-b7ba4ac04d83,"LENDINGTREE, LLC",1364151-0161,us_ut,6/27/1997,,LLC - Foreign,https://secure.utah.gov/bes/details.html?entit...,F,...,,,,,"LENDINGTREE, LLC",us_de,"LENDINGTREE, LLC",us_de,"['INCORP SERVICES, INC']",[]
4,3,52e38962-9bfc-4082-b0c8-b7ba4ac04d83,"LENDINGTREE, LLC",34570412Q,us_la,8/29/1997,,Limited Liability Company (Non-Louisiana),http://coraweb.sos.la.gov/commercialsearch/Com...,F,...,,,,,"LENDINGTREE, LLC",us_de,"LENDINGTREE, LLC",us_de,"['DOUGLAS LEBDA', 'INCORP SERVICES, INC']","['2017-04-06', '2015-10-18', '2013-01-28', '20..."


In [3]:
### start timer
t0=time.time()

### set the path for the full PatentsView dataset and save to the fullData variable
input_file = "dfMergedFullDataSetNoForeign.csv"
a_full = os.path.join(res_folder,input_file)
print(a_full,"\n")

fullData = pd.read_csv(a_full)

### end timer and print total time
t1 = time.time()
total = t1-t0
print("Total time is %4f" % (total/60), "mins")

### print general stats and first 5 records for dataset
display(fullData.info(null_counts=True),fullData.head())

../csvResults/dfMergedFullDataSetNoForeign.csv 

Total time is 0.008993 mins
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 252010 entries, 0 to 252009
Data columns (total 7 columns):
 #   Column          Non-Null Count   Dtype 
---  ------          --------------   ----- 
 0   ID              252010 non-null  int64 
 1   assignee_id     252010 non-null  object
 2   location_id     252010 non-null  object
 3   organization    252010 non-null  object
 4   city            252010 non-null  object
 5   state           252010 non-null  object
 6   dateOfFirstPat  252010 non-null  object
dtypes: int64(1), object(6)
memory usage: 13.5+ MB


None

Unnamed: 0,ID,assignee_id,location_id,organization,city,state,dateOfFirstPat
0,0,fffe9f1f-cb1c-49ab-b00f-6416e3e3a909,fd8b2b76-cb90-11eb-9615-121df0c29c1e,"Close-In Solutions, LLC",Austin,TX,05/02/2005
1,1,fffe8fea-3d13-4016-9429-93653527efa1,fe449928-09bd-11ec-893a-12de62d610b1,"Vita-Stat Neducak Services, Inc.",St. Petersburg,FL,08/18/1977
2,2,fffe4688-bc00-4626-bd89-28921a62f07f,f76d85c4-09bd-11ec-893a-12de62d610b1,"Emerging Technology Systems, L.L.C.",Akron,OH,07/02/1996
3,3,fffe36bb-6dea-4a8b-8bf5-071cf893ceba,fe1cb1c3-cb8f-11eb-9615-121df0c29c1e,"Valley Business Solutions, LLC",Huntsville,AL,03/21/2019
4,4,fffd9c21-3bb1-4471-b316-d172921e3f83,ec16f9be-cb90-11eb-9615-121df0c29c1e,"RAILIAS HOLDINGS, LLC",San Diego,CA,10/16/2019


# Add Trademark Data to the Fulldata Data Set

In [4]:
### start timer
t0=time.time()

### set the path for the trademark data
res_folder = "../csvResults/"
input_file = "orgTrademarkNumbers.csv"
a_full = os.path.join(res_folder,input_file)
print(a_full,"\n")

tmData = pd.read_csv(a_full)

### end timer and print total time
t1 = time.time()
total = t1-t0
print("Total time is %4f" % (total/60), "mins")

### print general stats and first 5 records for dataset
display(tmData.info(null_counts=True),tmData.head())

../csvResults/orgTrademarkNumbers.csv 

Total time is 0.018816 mins
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 441128 entries, 0 to 441127
Data columns (total 5 columns):
 #   Column        Non-Null Count   Dtype 
---  ------        --------------   ----- 
 0   organization  441128 non-null  object
 1   serial        441128 non-null  object
 2   city          441128 non-null  object
 3   state         441128 non-null  object
 4   ctrlEntity    441128 non-null  object
dtypes: object(5)
memory usage: 16.8+ MB


None

Unnamed: 0,organization,serial,city,state,ctrlEntity
0,!Hey Inc.,[75803548 76015762],North Andover,Massachusetts,Delaware
1,"!Magine This Renovations, Llc",[77689117 77731638],Navarre,Ohio,Ohio
2,"!Maginethis Renovations, Llc",[77689117 77731638],Navarre,Ohio,Ohio
3,"""21"" Brands, Inc.",[71591446 72056354 73478437 73551909],New York,New York,New York
4,"""21"" Club, Inc.",[71611185 72212922 72268705],New York,New York,New York


In [5]:
### start timer
t0=time.time()

### merge the trademark data to the PatentsView data (no foreign orgs)
addTm=fullData.merge(tmData,on=['organization'],how='outer',indicator=True)
addTm1=addTm.loc[(addTm['_merge']=='left_only') | (addTm['_merge']=='both')].iloc[:,:11].reset_index(drop=True)

### rename columns
addTm1.rename(columns={'city_x':'city','state_x':'state','city_y':'tradeCity','state_y':'tradeState'},inplace=True)

### end timer and print total time
t1 = time.time()
total = t1-t0
print("Total time is %4f" % (total/60), "mins")

### print general stats and first 5 records for dataset
display(addTm1.info(null_counts=True),addTm1.head())

Total time is 0.016537 mins
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 269661 entries, 0 to 269660
Data columns (total 11 columns):
 #   Column          Non-Null Count   Dtype  
---  ------          --------------   -----  
 0   ID              269661 non-null  float64
 1   assignee_id     269661 non-null  object 
 2   location_id     269661 non-null  object 
 3   organization    269661 non-null  object 
 4   city            269661 non-null  object 
 5   state           269661 non-null  object 
 6   dateOfFirstPat  269661 non-null  object 
 7   serial          47559 non-null   object 
 8   tradeCity       47559 non-null   object 
 9   tradeState      47559 non-null   object 
 10  ctrlEntity      47559 non-null   object 
dtypes: float64(1), object(10)
memory usage: 22.6+ MB


None

Unnamed: 0,ID,assignee_id,location_id,organization,city,state,dateOfFirstPat,serial,tradeCity,tradeState,ctrlEntity
0,0.0,fffe9f1f-cb1c-49ab-b00f-6416e3e3a909,fd8b2b76-cb90-11eb-9615-121df0c29c1e,"Close-In Solutions, LLC",Austin,TX,05/02/2005,,,,
1,1.0,fffe8fea-3d13-4016-9429-93653527efa1,fe449928-09bd-11ec-893a-12de62d610b1,"Vita-Stat Neducak Services, Inc.",St. Petersburg,FL,08/18/1977,,,,
2,2.0,fffe4688-bc00-4626-bd89-28921a62f07f,f76d85c4-09bd-11ec-893a-12de62d610b1,"Emerging Technology Systems, L.L.C.",Akron,OH,07/02/1996,,,,
3,3.0,fffe36bb-6dea-4a8b-8bf5-071cf893ceba,fe1cb1c3-cb8f-11eb-9615-121df0c29c1e,"Valley Business Solutions, LLC",Huntsville,AL,03/21/2019,,,,
4,4.0,fffd9c21-3bb1-4471-b316-d172921e3f83,ec16f9be-cb90-11eb-9615-121df0c29c1e,"RAILIAS HOLDINGS, LLC",San Diego,CA,10/16/2019,,,,


# Standardize the Organization Names from the PatentsView Data and OpenCorporates Results

In [6]:
### this section cleans the organization and name fields by standardizing the names to facilitate better merge
### results. Skipping this section will yield poor results in any subsequent merges.

### start timer
t0=time.time()

### the pattern variable is utilized to create a 'list' of possible suffixes that should be removed
### from the organization and name fields. This list was constructed by manually inspecting the names
### prior to cleaning and does not represent a comprehensive list
pattern = '|'.join(['Llc','L\.L\.C\.','Inc\.$','Inc$','Ltd','\(.+?\)','Plc','P\.L\.C\.','Pllc','P\.L\.L\.C\.',
                    'Lp\.$','Lp$','Llp$','LP','L\.P\.','LC','L\.C\.','Ag$','Gmbh','SA$','Kg','Pvt','Sa$','BV','Nv$',
                    'Ab$','Pty$','SPA$','S\.P\.A\.','Bv','B\.V\.','B\.v\.','@','\.',','])

### converts the first character in each word to Uppercase and remaining characters to lowercase in 
### the string, followed by removing any whitespace that may exist to the left and right of the strings
addTm1['organization']=addTm1['organization'].str.title()
addTm1['organization']=addTm1['organization'].str.lstrip().str.rstrip()

### some strings must be replaced rather than removed because the resulting organization names would
### not make sense or match incorrectly. For example, Arjang & Co., which is the full name for the
### organization, would become Arjang and would match to multiple records via the merge instead of
### one. This was observed through multiple trials of cleaning the data
addTm1['organization']=addTm1['organization'].str.replace(' & ', ' And ')
addTm1['organization']=addTm1['organization'].str.replace('&', ' And ')
addTm1['organization']=addTm1['organization'].str.replace(' - |-', ' ')
addTm1['organization']=addTm1['organization'].str.replace('+', ' ')
addTm1['organization']=addTm1['organization'].str.replace(' (Co\.$|Co$)', ' Company')
addTm1['organization']=addTm1['organization'].str.replace(' (Corp\.|Corp) ', ' Corporation ')
addTm1['organization']=addTm1['organization'].str.replace('Mfg', 'Manufacturing')
addTm1['organization']=addTm1['organization'].str.replace('Incorporated|Usa', '')

### apply the pattern variable to the organization field and clean the resulting whitespace to the
### left and right of the strings
addTm1['organization']=addTm1['organization'].str.replace(pattern, '')
addTm1['organization']=addTm1['organization'].str.lstrip().str.rstrip()

### convert any remaining names that are not standardized
addTm1['organization']=addTm1['organization'].str.replace(' (Co\.$|Co$)', ' Company')
addTm1['organization']=addTm1['organization'].str.replace(' (Corp\.$|Corp$) ', ' Corporation ')
addTm1['organization']=addTm1['organization'].str.replace(' (Corp\.$|Corp$)', ' Corporation')

### converts the first character in each word to Uppercase and remaining characters to Lowercase in 
### the string, followed by removing any whitespace that may exist to the left and right of the strings
OC_results3['name']=OC_results3['name'].str.title()
OC_results3['name']=OC_results3['name'].str.lstrip().str.rstrip()

### some strings must be replaced rather than removed because the resulting organization names would
### not make sense or match incorrectly. For example, Arjang & Co., which is the full name for the
### organization, would become Arjang and would match to multiple records via the merge instead of
### one. This was observed through multiple trials of cleaning the data
OC_results3['name']=OC_results3['name'].str.replace(' & ', ' And ')
OC_results3['name']=OC_results3['name'].str.replace('&', ' And ')
OC_results3['name']=OC_results3['name'].str.replace(' - |-', ' ')
OC_results3['name']=OC_results3['name'].str.replace('+', ' ')
OC_results3['name']=OC_results3['name'].str.replace(' (Co\.$|Co$)', ' Company')
OC_results3['name']=OC_results3['name'].str.replace(' (Corp\.|Corp) ', ' Corporation ')
OC_results3['name']=OC_results3['name'].str.replace('Mfg', 'Manufacturing')
OC_results3['name']=OC_results3['name'].str.replace('Incorporated|Usa', '')

### apply the pattern variable to the name field and clean the resulting whitespace to the
### left and right of the strings
OC_results3['name']=OC_results3['name'].str.replace(pattern, '')
OC_results3['name']=OC_results3['name'].str.lstrip().str.rstrip()

### convert any remaining names that are not standardized
OC_results3['name']=OC_results3['name'].str.replace(' (Co\.$|Co$)', ' Company')
OC_results3['name']=OC_results3['name'].str.replace(' (Corp\.$|Corp$) ', ' Corporation ')
OC_results3['name']=OC_results3['name'].str.replace(' (Corp\.$|Corp$)', ' Corporation')

### end timer and print total time
t1=time.time()
total=t1-t0
print("Total time is %4f" % (total/60), "mins")

Total time is 0.049019 mins


In [7]:
### start timer
t0=time.time()

### force the state names to be lowercase in the tradeState and cntrlEntity fields
addTm1['tradeState']=addTm1['tradeState'].str.lower()
addTm1['ctrlEntity']=addTm1['ctrlEntity'].str.lower()

#convert states from full names to two-letter abbreviations under the address_state and agent_state features
addTm1.replace({'tradeState':{"california":"ca","connecticut":"ct","massachusetts":"ma","florida":"fl",
                                          "georgia":"ga","washington":"wa","new york":"ny","delaware":"de","tennessee":"tn",
                                          "missouri":"mo","texas":"tx","indiana":"in","minnesota":"mn","pennsylvania":"pa",
                                          "oregon":"or","virginia":"va","illinois":"il","kentucky":"ky","north carolina":"nc",
                                          "new jersey":"nj","colorado":"co","maryland":"md","ohio":"oh","arizona":"az",
                                          "nevada":"nv","utah":"ut","michigan":"mi","new hampshire":"nh","vermont":"vt",
                                          "kansas":"ks","oklahoma":"ok","iowa":"ia","louisiana":"la","rhode island":"ri",
                                          "wisconsin":"wi","hawaii":"hi","montana":"mt","nebraska":"ne",
                                          "district of columbia":"dc","west virginia":"wv","alabama":"al","idaho":"id",
                                          "maine":"me","new mexico":"nm","south carolina":"sc","north dakota":"nd",
                                          "south dakota":"sd","arkansas":"ar","alaska":"ak","wyoming":"wy",
                                          "mississippi":"ms"}},inplace=True)

addTm1.replace({'ctrlEntity':{"california":"ca","connecticut":"ct","massachusetts":"ma","florida":"fl",
                                          "georgia":"ga","washington":"wa","new york":"ny","delaware":"de","tennessee":"tn",
                                          "missouri":"mo","texas":"tx","indiana":"in","minnesota":"mn","pennsylvania":"pa",
                                          "oregon":"or","virginia":"va","illinois":"il","kentucky":"ky","north carolina":"nc",
                                          "new jersey":"nj","colorado":"co","maryland":"md","ohio":"oh","arizona":"az",
                                          "nevada":"nv","utah":"ut","michigan":"mi","new hampshire":"nh","vermont":"vt",
                                          "kansas":"ks","oklahoma":"ok","iowa":"ia","louisiana":"la","rhode island":"ri",
                                          "wisconsin":"wi","hawaii":"hi","montana":"mt","nebraska":"ne",
                                          "district of columbia":"dc","west virginia":"wv","alabama":"al","idaho":"id",
                                          "maine":"me","new mexico":"nm","south carolina":"sc","north dakota":"nd",
                                          "south dakota":"sd","arkansas":"ar","alaska":"ak","wyoming":"wy",
                                          "mississippi":"ms"}},inplace=True)

### convert the state names first letter Uppercase and second letter lowercase
addTm1['tradeState']=addTm1['tradeState'].str.title()
addTm1['ctrlEntity']=addTm1['ctrlEntity'].str.title()
addTm1['state']=addTm1['state'].str.title()

### end timer and print total time
t1 = time.time()
total = t1-t0
print("Total time is %4f" % (total/60), "mins")

### print general stats and first 5 records for dataset 
display(addTm1.info(null_counts=True),addTm1.head())

Total time is 0.011868 mins
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 269661 entries, 0 to 269660
Data columns (total 11 columns):
 #   Column          Non-Null Count   Dtype  
---  ------          --------------   -----  
 0   ID              269661 non-null  float64
 1   assignee_id     269661 non-null  object 
 2   location_id     269661 non-null  object 
 3   organization    269661 non-null  object 
 4   city            269661 non-null  object 
 5   state           269661 non-null  object 
 6   dateOfFirstPat  269661 non-null  object 
 7   serial          47559 non-null   object 
 8   tradeCity       47559 non-null   object 
 9   tradeState      47559 non-null   object 
 10  ctrlEntity      47559 non-null   object 
dtypes: float64(1), object(10)
memory usage: 22.6+ MB


None

Unnamed: 0,ID,assignee_id,location_id,organization,city,state,dateOfFirstPat,serial,tradeCity,tradeState,ctrlEntity
0,0.0,fffe9f1f-cb1c-49ab-b00f-6416e3e3a909,fd8b2b76-cb90-11eb-9615-121df0c29c1e,Close In Solutions,Austin,Tx,05/02/2005,,,,
1,1.0,fffe8fea-3d13-4016-9429-93653527efa1,fe449928-09bd-11ec-893a-12de62d610b1,Vita Stat Neducak Services,St. Petersburg,Fl,08/18/1977,,,,
2,2.0,fffe4688-bc00-4626-bd89-28921a62f07f,f76d85c4-09bd-11ec-893a-12de62d610b1,Emerging Technology Systems,Akron,Oh,07/02/1996,,,,
3,3.0,fffe36bb-6dea-4a8b-8bf5-071cf893ceba,fe1cb1c3-cb8f-11eb-9615-121df0c29c1e,Valley Business Solutions,Huntsville,Al,03/21/2019,,,,
4,4.0,fffd9c21-3bb1-4471-b316-d172921e3f83,ec16f9be-cb90-11eb-9615-121df0c29c1e,Railias Holdings,San Diego,Ca,10/16/2019,,,,


# Extract Alternative and Previous Organization Names and Append to the OpenCorporates Result Dataset

In [8]:
### consecutive regex processes were performed to extract one or more organization names from the
### alternative_names and previous_names columns. When a match is absent, the if statement skips
### that record. If a match is found, the first regex variable is used to find all instances that
### match the pattern, then the second regex variable is utilized to further refine the string to
### reduce it to the organization name. A list is created using the nested for loop for each record.
### If multiple names are present, a nested list is created for that record. After the for loops 
### are finished running, the final list is created and printed for reviewing. This description
### applies for processing both the alternative_names and previous_names columns.

### start timer
t0=time.time()

### converts the first character in each word to Uppercase and remaining characters to Lowercase in 
### the string
OC_results3['alternative_names']=OC_results3['alternative_names'].str.title()
OC_results3['previous_names']=OC_results3['previous_names'].str.title()

### regex and regex1 were applied to initially find a list of organization names, followed by refining
### the names by removing erroneous characters
regex="'Company_Name': '[A-Z].+?'"
regex1="'[A-Z].+': "

### determines the length of the input data and creates 2 empty lists to be filled with the extracted
### organization names
b=len(OC_results3)
sub_finalAlt = []
finalAlt = []

### the for loop is initiated to review each record individually to extract the organization names
### under the alternative_names column
for j in range(b):
    
    ### if the record is empty or nan the record will be skipped
    if pd.isna(OC_results3.iloc[j,20]) is True:
        pass
    
    ### non-empty records have the curly brackets removed from the right and left of the list of
    ### strings. The findall function searches the list for all instances that match the regex
    ### variable and saves the list to the match variable. The length of this list is determined
    ### and provide to the nested loop below
    else:
        a=OC_results3.iloc[j,20][2:-2]
        match = re.findall(regex, a)
        c=len(match)
        
        ### for any length of c greater than 0, this for loop will evaluate each instance for the
        ### organization name by using the regex1 variable to remove characters that are not
        ### required. Once a match is found, the first character in each string is converted to
        ### an uppercase and saved to the sub_finalAlt list. The for loop repeats as many times
        ### that is equal to c
        for i in range(c):
            match1 = re.sub(regex1,"",match[i])
            match1=match1.replace("'","").title()
            
            ### ensures all spaces between strings are a single space
            match1 = re.sub('\s+',' ',match1)
            
            ### replaces specific characters
            match1=re.sub(' & ', ' And ',match1)
            match1=match1.replace('&', ' And ')
            match1=re.sub('Mfg', 'Manufacturing',match1)

            ### convert any names that are not standardized
            match1=re.sub(', Inc\.| Usa, Incorporated| Usa| Inc\.| Inc$| Incorpor$| Incorporated$|Incorporated', '',match1)
            match1=re.sub(', Llc$| Ltd\.| Ltd| Limited| Pty\.| Pty', '',match1)
            match1=re.sub('L\.L\.C\.| Llp$| Llc$|\(.+?\)|,Inc\.', '',match1)
            match1=re.sub(', P\.C\.| P\.C\.| P\. C\.| D\.M\.D\.|D\.D\.S\.| D\. D\. S\.| M\.D\.', '',match1)
            match1=re.sub(' Corporation$| Corporation,$| Corp\.$ | Corp.$| Corp\.$| Corp\.,$', 'Corporation',match1)
            match1=re.sub('Co\.$| Co$', 'Company',match1)
            
            ### removes all punctuation and ensures all spaces were single spaced after processing
            ### the strings
            match1 = match1.translate(str.maketrans('','',string.punctuation))
            match1 = re.sub('\s+',' ',match1)
            
            ### appends the strings to list
            sub_finalAlt.append(match1)
    
    ### after the nested for loop is finished extracting all possible organization names, the
    ### sub_finalAlt list is appended to the finalAlt list. The finalAlt list is updated with 
    ### the alternative names for each record after the for loops are finished and resets the
    ### sub_finalAlt to an empty list
    finalAlt.append(sub_finalAlt)
    sub_finalAlt = []

### end timer and print total time
t1=time.time()
total=t1-t0
print("Total time is %4f" % (total/60), "mins\n")

### prints the finalAlt list for review
print(finalAlt)

Total time is 0.002493 mins

[[], [], [], ['Servicemagic Home Loans', 'RealestateCom Realtors', 'Servicemagic Real Estate', 'Lendingtree PartnersCom', 'Magnifymoney', 'Getsmart', 'RealestateCom', 'Depositaccounts', 'Onlinebanks', 'Ramsey Group A Real EstateCom Company', 'Milecards', 'Student Loan Hero Of Ut', 'GetsmartCom', 'Snapcap'], [], ['RealestateCom Cancelled', 'GetsmartCom', 'Servicemagic Real Estate', 'Servicemagic Home Loans', 'Getsmart', 'RealestateCom Realtors Cancelled', 'Lendingtree PartnersCom', 'Depositaccounts', 'Magnifymoney', 'Milecards', 'Snapcap', 'Student Loan Hero'], ['Getsmart', 'Servicemagic Real Estate', 'GetsmartCom', 'Ramsey Group A Real EstateCom Company', 'Snapcap', 'RealestateCom', 'Servicemagic Home Loans', 'Onlinebanks', 'Lendingtree PartnersCom', 'Depositaccounts', 'Milecards', 'Student Loan Hero Of Ut', 'RealestateCom Realtors', 'Magnifymoney'], [], [], ['Nordstrom Audco', 'Nordstrom Valves'], [], [], [], [], [], [], [], [], ['Minop Company'], [], [], 

In [9]:
### start timer
t0=time.time()

### creates 2 empty lists to be filled with the extracted organization names
sub_finalPre = []
finalPre = []

regex="'Company_Name': '[A-Za-z].+?'"
regex1="'[A-Za-z].+': "

### the for loop is initiated to review each record individually to extract the organization names
### under the previous_names column
for j in range(b):
    
    ### if the record is empty or nan the record will be skipped
    if pd.isna(OC_results3.iloc[j,21]) is True:
        pass
    
    ### non-empty records have the curly brackets removed from the right and left of the list of
    ### strings. The findall function searches the list for all instances that match the regex
    ### variable and saves the list to the match variable. The length of this list is determined
    ### and provide to the nested loop below
    else:
        a=OC_results3.iloc[j,21][2:-2]
        match = re.findall(regex, a)
        c=len(match)

        ### for any length of c greater than 0, this for loop will evaluate each instance for the
        ### organization name by using the regex1 variable to remove characters that are not
        ### required. Once a match is found, the first character in each string is converted to
        ### an uppercase and saved to the sub_finalPre list. The for loop repeats as many times
        ### that is equal to c
        for i in range(c):
            match1 = re.sub(regex1,"",match[i])
            match1=match1.replace("'","").title().lstrip().rstrip()
            
            ### ensures all spaces between strings are a single space
            match1 = re.sub('\s+',' ',match1)
            
            ### replaces specific characters
            match1=re.sub(' & ', ' And ',match1)
            match1=match1.replace('&', ' And ')
            match1=re.sub('Mfg', 'Manufacturing',match1)

            ### convert any names that are not standardized
            match1=re.sub(', Inc\.| Usa, Incorporated| Usa| Inc\.| Inc$| Incorpor$| Incorporated$|Incorporated', '',match1)
            match1=re.sub(', Llc$| Ltd\.| Ltd| Limited| Pty\.| Pty', '',match1)
            match1=re.sub('L\.L\.C\.| Llp$| Llc$|\(.+?\)|,Inc\.', '',match1)
            match1=re.sub(', P\.C\.| P\.C\.| P\. C\.| D\.M\.D\.|D\.D\.S\.| D\. D\. S\.| M\.D\.', '',match1)
            match1=re.sub(' Corporation$| Corporation,$| Corp\.$ | Corp.$| Corp\.$| Corp\.,$', 'Corporation',match1)
            match1=re.sub('Co\.$| Co$', 'Company',match1)
            
            ### removes all punctuation and ensures all spaces were single spaced after processing
            ### the strings
            match1 = match1.translate(str.maketrans('','',string.punctuation))
            match1 = re.sub('\s+',' ',match1)

            sub_finalPre.append(match1)

    ### after the nested for loop is finished extracting all possible organization names, the
    ### sub_finalPre list is appended to the finalAlt list. The finalPre list is updated with 
    ### the previous names for each record after the for loops are finished and resets the
    ### sub_finalPre to an empty list
    finalPre.append(sub_finalPre)
    sub_finalPre = []

### end timer and print total time
t1=time.time()
total=t1-t0
print("Total time is %4f" % (total/60), "mins\n")

### prints the finalPre list for review
print(finalPre)

Total time is 0.002045 mins

[[], [], ['Lendingtree'], ['Creditsource', 'Lendingtree'], ['Lendingtree', 'Creditsource'], [], ['Lendingtree', 'Creditsource'], [], ['Creditsource', 'Lending Tree'], [], [], [], [], [], [], [], [], [], [], [], [], [], ['Carson And Burger', 'Carson Burger And Weekly', 'Cbw Automation'], [], [], [], [], [], [], ['Las'], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], ['Las Enterprises'], [], [], [], [], [], [], [], [], [], [], [], ['Molecular Simulations', 'PolygenCorporation'], [], [], [], [], [], [], ['Powervar Canada', 'Twin City Computers'], [], [], [], [], [], [], [], [], [], ['Jibjab Media'], ['Jibjab Media'], [], [], [], [], [], [], ['Dielectrics Industries', 'Dielectrics'], [], [], [], [], [], [], [], [], [], [], [], [], [], ['Paul Revere TobaccoCorporation'], [], ['Paul Revere Equity Management'], ['Paul Revere Equity Management Company The'], ['Wake Up America The Reds Are Coming Paul RevereJoeCorporation'], [], 

In [10]:
### start timer
t0=time.time()

### append the OC_results dataset with the finalAlt and finalPre lists with a _clean suffix
OC_results3['alternative_names_clean'] = finalAlt
OC_results3['previous_names_clean'] = finalPre

### end timer and print total time
t1=time.time()
total=t1-t0
print("Total time is %4f" % (total/60), "mins")

### print general stats and first 5 records for dataset
display(OC_results3.info(),OC_results3.head())

Total time is 0.000067 mins
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2623 entries, 0 to 2622
Data columns (total 57 columns):
 #   Column                                Non-Null Count  Dtype  
---  ------                                --------------  -----  
 0   ID                                    2623 non-null   int64  
 1   assignee_id                           2623 non-null   object 
 2   name                                  2034 non-null   object 
 3   company_number                        2034 non-null   object 
 4   jurisdiction_code                     2034 non-null   object 
 5   incorporation_date                    2034 non-null   object 
 6   dissolution_date                      551 non-null    object 
 7   company_type                          2018 non-null   object 
 8   registry_url                          1680 non-null   object 
 9   branch                                897 non-null    object 
 10  branch_status                         897 non-null    ob

None

Unnamed: 0,ID,assignee_id,name,company_number,jurisdiction_code,incorporation_date,dissolution_date,company_type,registry_url,branch,...,agent_state,agent_zipcode,home_company_name,home_company_jurisdiction_code,controlling_entity_name,controlling_entity_jurisdiction_code,list_of_officers,list_of_filing_dates,alternative_names_clean,previous_names_clean
0,1,5d5ead2f-1ef7-4db6-a6ce-9cdea523f834,,,,,,,,,...,,,,,,,[],[],[],[]
1,2,8a841c57-22b6-4ad0-ad42-16532c3ab4fc,,,,,,,,,...,,,,,,,[],[],[],[]
2,3,52e38962-9bfc-4082-b0c8-b7ba4ac04d83,Lendingtree,68510F,us_ak,1/3/2000,,Limited Liability Company,,F,...,,,"LENDINGTREE, LLC",us_de,"LENDINGTREE, LLC",us_de,"['Christopher D. Davies', 'Douglas R Lebda', '...","['2011-07-21', '2011-07-21', '2008-03-17', '20...",[],[Lendingtree]
3,3,52e38962-9bfc-4082-b0c8-b7ba4ac04d83,Lendingtree,1364151-0161,us_ut,6/27/1997,,LLC - Foreign,https://secure.utah.gov/bes/details.html?entit...,F,...,,,"LENDINGTREE, LLC",us_de,"LENDINGTREE, LLC",us_de,"['INCORP SERVICES, INC']",[],"[Servicemagic Home Loans, RealestateCom Realto...","[Creditsource, Lendingtree]"
4,3,52e38962-9bfc-4082-b0c8-b7ba4ac04d83,Lendingtree,34570412Q,us_la,8/29/1997,,Limited Liability Company (Non-Louisiana),http://coraweb.sos.la.gov/commercialsearch/Com...,F,...,,,"LENDINGTREE, LLC",us_de,"LENDINGTREE, LLC",us_de,"['DOUGLAS LEBDA', 'INCORP SERVICES, INC']","['2017-04-06', '2015-10-18', '2013-01-28', '20...",[],"[Lendingtree, Creditsource]"


In [11]:
### drop the alternative_names and previous_names columns and rearrange the alternative_names_clean 
### and previous_names_clean features
OC_results4=OC_results3
OC_results4.drop(labels=['alternative_names','previous_names'],axis=1,inplace=True)

thr_col = OC_results4.pop('alternative_names_clean')
for_col = OC_results4.pop('previous_names_clean')

OC_results4.insert(3, 'alternative_names_clean', thr_col)
OC_results4.insert(4, 'previous_names_clean', for_col)

### print general stats and first 5 records for dataset
display(OC_results4.info(),OC_results4.head())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2623 entries, 0 to 2622
Data columns (total 55 columns):
 #   Column                                Non-Null Count  Dtype  
---  ------                                --------------  -----  
 0   ID                                    2623 non-null   int64  
 1   assignee_id                           2623 non-null   object 
 2   name                                  2034 non-null   object 
 3   alternative_names_clean               2623 non-null   object 
 4   previous_names_clean                  2623 non-null   object 
 5   company_number                        2034 non-null   object 
 6   jurisdiction_code                     2034 non-null   object 
 7   incorporation_date                    2034 non-null   object 
 8   dissolution_date                      551 non-null    object 
 9   company_type                          2018 non-null   object 
 10  registry_url                          1680 non-null   object 
 11  branch           

None

Unnamed: 0,ID,assignee_id,name,alternative_names_clean,previous_names_clean,company_number,jurisdiction_code,incorporation_date,dissolution_date,company_type,...,agent_street_address,agent_city,agent_state,agent_zipcode,home_company_name,home_company_jurisdiction_code,controlling_entity_name,controlling_entity_jurisdiction_code,list_of_officers,list_of_filing_dates
0,1,5d5ead2f-1ef7-4db6-a6ce-9cdea523f834,,[],[],,,,,,...,,,,,,,,,[],[]
1,2,8a841c57-22b6-4ad0-ad42-16532c3ab4fc,,[],[],,,,,,...,,,,,,,,,[],[]
2,3,52e38962-9bfc-4082-b0c8-b7ba4ac04d83,Lendingtree,[],[Lendingtree],68510F,us_ak,1/3/2000,,Limited Liability Company,...,,,,,"LENDINGTREE, LLC",us_de,"LENDINGTREE, LLC",us_de,"['Christopher D. Davies', 'Douglas R Lebda', '...","['2011-07-21', '2011-07-21', '2008-03-17', '20..."
3,3,52e38962-9bfc-4082-b0c8-b7ba4ac04d83,Lendingtree,"[Servicemagic Home Loans, RealestateCom Realto...","[Creditsource, Lendingtree]",1364151-0161,us_ut,6/27/1997,,LLC - Foreign,...,,,,,"LENDINGTREE, LLC",us_de,"LENDINGTREE, LLC",us_de,"['INCORP SERVICES, INC']",[]
4,3,52e38962-9bfc-4082-b0c8-b7ba4ac04d83,Lendingtree,[],"[Lendingtree, Creditsource]",34570412Q,us_la,8/29/1997,,Limited Liability Company (Non-Louisiana),...,,,,,"LENDINGTREE, LLC",us_de,"LENDINGTREE, LLC",us_de,"['DOUGLAS LEBDA', 'INCORP SERVICES, INC']","['2017-04-06', '2015-10-18', '2013-01-28', '20..."


In [12]:
### save the new dataset as a checkpoint
res_folder = "../csvResults/"
outpt_file = "OcResults1000AddTradeExtAltPreNames.csv"
a_full = os.path.join(res_folder,outpt_file)

# OC_results4.to_csv(a_full,index=False)

# Extract Additional City and States from the Data Field and Append to the OpenCorporates Dataset

In [14]:
### this library allows the user to remove all punctuation in a string
import string 

### start timer
t0=time.time()

### the regex variables are applied to extract substrings in the data field and sequentially processed
### to remove most of the erroneous data
regex  = "'description': '[A-Za-z0-9].+?'"
regex1 = ": '[A-za-z0-9].+': "

### the b variable sets the length of the input data and the list of states will be used to extract the
### states from the data being processed. The empty lists are populated with the extracted data and will
### be utilized appended to the input dataframe
b=len(OC_results4)
stateList = "Al$|Ak|Az$|Ar$|Ca$|Co$|Ct$|Dc$|De$|Fl$|Ga$|Hi$|Id$|Il$|In$|Ia$|Ky$|Ks$|La$|Me$|Ma$|Md$|Mi$|Mn$|Ms$|Mo$| \
             Mt$|Ne$|Nv$|Nh$|Nj$|Nm$|Ny$|Nc$|Nd$|Oh$|Ok$|Or$|Pa$|Ri$|Sc$|Sd$|Tn$|Tx$|Ut$|Vt$|Va$|Wa$|Wv$|Wi$|Wy$|Fl$"
sub_city = []
city = []
sub_states1 = []
states = []

### this for loop initiates the search for city-state pairs within the data field. If there was no data
### found in the cell (Nan or Na), the if statement enters a blank for that record and moves to the next
### record. When a record was identified to contain data, the regex variable was applied with the findall
### function to extract all data matching the pattern. Lastly, the length of the match variable was
### determined and passed to the nexted for loop
for j in range(b):
    
    ### enters a blank for the record if no data is present
    if pd.isna(OC_results4.iloc[j,33]) is True:
        sub_states1 = []
        sub_city= []

    else:
        ### select the non-empty record and apply the findall function to obtain all matches, then
        ### calculate the length of the resulting string(s)
        a=OC_results4.iloc[j,33]
        match=re.findall(regex,a)
        c=len(match)

        ### each instance of city-state pairs identified in the findall function above were processed and
        ### standardized (e.g., city name two-letter state abbreviation). The following does not represent
        ### a comprehensive list of standardizing city-states
        for i in range(c):
            match1 = re.sub(regex1,"",match[i])
            match1 = re.sub('[0-9]',"",match1)
            match1 = re.sub('[0-9]',"",match1)
            match1 = re.sub('\s+',' ',match1)
            match1 = match1.replace("'","").title()
            match1 = match1.replace('Description: ',"")
            match1 = match1.replace('\\N'," ")
            match1 = match1.replace('\\n'," ")
            match1 = match1.replace('-',"")
            match1 = match1.replace('#',"")
            match1 = re.sub('Us$|USA|Usa|United States Of America|United States|Register Id:',"",match1)
            match1 = re.sub('Suite|Ste',"",match1)
            match1 = re.sub('Http.+\.[Cc]om',"",match1)
            match1 = re.sub(',$',"",match1)
            match1 = re.sub('/|:',"",match1)
            match1 = re.sub('[A-z].+(Department|Dept)\.|[A-z].+Dept',"",match1)
            match1 = re.sub('[A-z].+Bates',"",match1)
            match1 = re.sub('[A-z].+Larocque,',"",match1)
            match1 = re.sub('Apt ([A-z]|[A-z]\.)|Apt\.',"",match1)
            match1 = re.sub('[Pp].+?Box,|[Pp].+?Box ,|Po ,|Box , ',"",match1)
            match1 = re.sub('[A-z].+?Cook,',"",match1)
            match1 = re.sub('[A-z].+?Compliance, ',"",match1)
            match1 = re.sub('Th Floor, |Th Fl|Th Fl,',"",match1)
            match1 = re.sub('[A-z].+?Nova, ',"",match1)
            match1 = re.sub('[A-z].+?Ave,',"",match1)
            match1 = re.sub('[A-z].+?(Center|Ctr),',"",match1)
            match1 = re.sub('[A-z].+(Lane|Ln)',"",match1)            
            match1 = re.sub('(Bldg|Bldg\.) ([A-z]|[A-z],|[A-z]\.)|Bldg\.|Bldg',"",match1)
            match1 = re.sub('(Rd|Rd\.) (Floor,|Fl\.,|Fl,)',"",match1)
            match1 = re.sub('[A-Z].+?(Road|Rd\.|Rd)',"",match1)
            match1 = re.sub('[A-Z].+?Blvd|%Comet Glass Co, |Lcr ',"",match1)
            match1 = re.sub('[A-Z].+?Capitol| , By Corinne M Lude|(B,|B) |C, |^r, |^Z |^z ',"",match1)
            match1 = re.sub('[A-Z].+?Accounting, |[A-Z].+?Siuta, |Santa Helena|Fl , |E Th Saint ',"",match1)          
            match1 = re.sub('N Bay Village Fla',"North Bay Village, Fl",match1)
            match1 = re.sub('Mt\.|Mt',"Mount",match1)
            match1 = re.sub('Ft\.|Ft',"Mount",match1)
            match1 = re.sub('St\. |St ',"Saint ",match1)
            match1 = re.sub('Spgs',"Springs",match1)
            match1 = re.sub('Th Avenue South East, Mpls, Mn',"Minneapolis, Mn",match1)
            match1 = re.sub('No. Adams,, Ma, ',"North Adams, Ma",match1)
            match1 = re.sub('N Palm Beach, Fl',"North Palm Beach, Fl",match1)
            match1 = re.sub('Lk',"Lake",match1)
            match1 = re.sub('.+?(Pkwy|Parkway)|.+?Saint E, ',"",match1)
            match1 = re.sub('^Inc.+?(Place|Pl) |.+?(Place|Pl), |.+?(Plaza|Plz) ',"",match1)
            match1 = re.sub('Inc\., Irving Pl,',"",match1)
            match1 = re.sub('^Inc.+?(Street|St)|Inc.  N Frederick Ave L',"",match1)
            match1 = re.sub('.+?(Street|Street,|St\.,|St,) ',"",match1)
            match1 = re.sub('^(Inc|Inc\.,).+?(Drive|(Dr,|Dr)) |.+?(Highway|Hwy) |.+?(Freeway|Fwy) |.+?(Way|Wy) ',"",match1)
            match1 = re.sub('.+?(Drive|(Dr,|Dr|Dr\.,|Dr\.)|Drive,) |N Mi Saint',"",match1)
            match1 = re.sub('Glenroy|N Central Ave|Ne, |Nw |.+?Counsel, |.+?(Boulevard,|Boulevard) |Loockerman Square ',"",match1)
            match1 = re.sub('Lafox',"La Fox",match1)
            match1 = re.sub('Plymo$',"Plymouth",match1)
            match1 = re.sub('Portl$',"Portland",match1)
            match1 = re.sub('Flore$',"Florence",match1)
            match1 = re.sub('Southfi$',"Southfield",match1)
            match1 = re.sub('.+?(Longwood|Longwood,) Fl',"Longwood Fl",match1)
            match1 = re.sub('.+?p\.O\. Box|.+?(Highway|Hwy), |Inc. > box|P O Box |%.+?, |.+?(Avenue|Ave) ',"",match1)
            match1 = re.sub('^oor |Inc\.,|.+?(Court|Ct\.), |.+?(Avenue|Ave|Ave\.,), |.+?Ave |Inc\.  El Camino Real',"",match1)
            match1 = re.sub('E&A ennedy Space Ctr.,|E&A , Kennedy Space Ctr.,',"Kennedy Space Center,",match1)
            match1 = re.sub('.+?South Bend, In',"South Bend, In",match1)
            match1 = re.sub('.+?indio Ca',"Indio Ca",match1)
            match1 = re.sub('.+?[Ss]ioux Falls, ',"Sioux Falls, ",match1)
            match1 = re.sub('.+?(Plaza|Plz), |North Th Saint |.+?Belmont, ',"",match1)
            match1 = re.sub('.+?Belleville, Il,',"Belleville Il",match1)
            match1 = re.sub('Woodlawn Saint|s,|Spiceland|.+?(Circle|Cir)|Hillsboro Mile|Saint ,|.+?(Way|Wy)|N Court Saint',"",match1)
            match1 = re.sub('W\. Adam|Heritage Hill|Franklin Saint|.+?[Tt]ower |Pecan Saint W|Curti |.+?(Park|Pk) |S\.W Th',"",match1)
            match1 = re.sub('E Th Saint |Harlin Sr|Pmb|N\. Raymond|S Th|.+?(Pike|Pk)|.+?(Court|Ct)|.+?(Department|Dept)',"",match1)
            match1 = re.sub('West Tenth Saint|.+?Broadway|Boradway|Rr Box R|.+?(Route|Rte)|Box S\.|cl,|El Rio Saint',"",match1)
            match1 = re.sub('Village Square|Orange Saint|N Causeway|Plone Et Al|Madison Saint|P O Drawer Www',"",match1)
            match1 = re.sub('Po Bo|Chisholm Pl|.+?Ave\., |Prescott Saint|Titian|Picacho|Rusk Saint|Market Saint',"",match1)
            match1 = re.sub('Dairy Ashford Saint|Reichhold|Wall Saint|Webster Saint Fl Th|Webster Saint',"",match1)
            match1 = re.sub('(West|W) El Camino Real|Mountain (Trail|Trl)|W Th Saint  E|Rocky Mountain Fiber',"",match1)
            match1 = re.sub('Lake Saint Loui Mo',"Saint Louis Mo",match1)
            match1 = re.sub('Redhill Ave|Th Saint  |Th Saint ste|S\. Akard Saint|Sw Third |Rm |.+?(Loop|Loop,) |Alcor Body Work',"",match1)
            match1 = re.sub('.+?(Boulevard|Bvld)|Ave\., |.+?Solution|Braodway|Larkspur|.+?Turnpike|ele ,|Acorn Saint',"",match1)
            match1 = re.sub('.+?(Martius|Martiu) |Brush Saint |Nagog Park|Elm Saint|P\.O\. Box|Peaks Cv|Ne Brazee',"",match1)
            match1 = re.sub('Se, |Highway North|Rd Saint S|El Camino Real|.+?Management|E Eubank|.+?Product ',"",match1)
            match1 = re.sub('El Camino Real|.+?(Tax |Tax,)|Circle,|Northwestern Hgy|Rr |Putnam Saint|CO.+?Hill',"",match1)
            match1 = re.sub('Bissonnet Saint|S[ew] |West Tenth Stree|S E|Camino Del Tomasini|The Alameda',"",match1)
            match1 = re.sub('North Central Expressway|W Tenth Saint|Saint ste|W. Cummings Park|Prosper Saint',"",match1)
            match1 = re.sub('Glenlake Ave|Namco|N\. Central Ave|suite |N. Military Trail|.+?Alton Pl|Ludelle Saint',"",match1)
            match1 = re.sub('S\.W\. Terr|Dtn, Llc|.+?East Marcy|W Micheltorena|N Thompson Saint|CO Jos M Glickstein',"",match1)
            match1 = re.sub('S. Whittle Ave|S Orange Blossom Tr|Rt\.|Big Duke Trl|Von Karman \(\)|Hwy [A-Z] ',"",match1)
            match1 = re.sub('E Main Saint|Station, |Toledo Saint  A|Number |[EWNS] |ste |Woodlawn Av|Knox Av',"",match1)
            match1 = re.sub('depere, Wi',"De Pere, Wi",match1)
            match1 = re.sub('ScifAnnapolis Junction, Md,',"Annapolis Junction Md",match1)
                        
            ### capitalize the first letter in all strings, remove punctuation and whitespace around the
            ### the city-state strings. States were converted to two-letter abbreviations
            match1 = match1.title()
            match1 = match1.translate(str.maketrans('','',string.punctuation))
            match1 = match1.lstrip().rstrip()
            match1 = re.sub("California","Ca",match1)
            match1 = re.sub("Connecticut","Ct",match1)
            match1 = re.sub("Massachusetts|Massachusett","Ma",match1)
            match1 = re.sub("Nebraska","Ne",match1)
            match1 = re.sub("Florida","Fl",match1)
            match1 = re.sub("Georgia","Ga",match1)
            match1 = re.sub("Washington","Wa",match1)
            match1 = re.sub(" New York"," Ny",match1)
            match1 = re.sub("Delaware","De",match1)
            match1 = re.sub("Tennessee","Tn",match1)
            match1 = re.sub("Missouri","Mo",match1)
            match1 = re.sub("Texas","Tx",match1)
            match1 = re.sub("Indiana","In",match1)
            match1 = re.sub("Pennsylvania","Pa",match1)
            match1 = re.sub("Oregon","Or",match1)
            match1 = re.sub("Virginia","Va",match1)
            match1 = re.sub("Illinois|Illinoi","Il",match1)
            match1 = re.sub("Kentucky","Ky",match1)
            match1 = re.sub("North Carolina","Nc",match1)
            match1 = re.sub("New Jersey","Nj",match1)
            match1 = re.sub("Colorado","Co",match1)
            match1 = re.sub("Maryland","Md",match1)
            match1 = re.sub("Ohio","Oh",match1)
            match1 = re.sub("Arizona","Az",match1)
            match1 = re.sub("Nevada","Nv",match1)
            match1 = re.sub("Utah","Ut",match1)
            match1 = re.sub("Michigan","Mi",match1)
            match1 = re.sub("New Hampshire","Nh",match1)
            match1 = re.sub("Vermont","Vt",match1)
            match1 = re.sub("Kansas","Ks",match1)
            match1 = re.sub("Oklahoma","Ok",match1)
            match1 = re.sub("Iowa","Ia",match1)
            match1 = re.sub("Louisiana","La",match1)
            match1 = re.sub("Rhode Island","Ri",match1)
            match1 = re.sub("Wisconsin","Wi",match1)
            match1 = re.sub("Hawaii","Hi",match1)
            match1 = re.sub("Montana","Mt",match1)
            match1 = re.sub("District Of Columbia","Dc",match1)
            match1 = re.sub("West Virginia","Wv",match1)
            match1 = re.sub("Alabama","Al",match1)
            match1 = re.sub("Idaho","Id",match1)
            match1 = re.sub("Maine","Me",match1)
            match1 = re.sub("New Mexico","Nm",match1)
            match1 = re.sub("South Carolina","Sc",match1)
            match1 = re.sub("North Dakota","Nd",match1)
            match1 = re.sub("South Dakota","Sd",match1)
            match1 = re.sub("Arkansas","Ar",match1)
            match1 = re.sub("Alaska","Ak",match1)
            match1 = re.sub("Wyoming","Wy",match1)
            match1 = re.sub("Mississippi","Ms",match1)
            match1 = re.sub("Minnesota","Mn",match1)  
            match1 = re.sub("Virginia","Va",match1)  

            ### final set of cleaning for the city-state pairs prior to adding them to city list and
            ### state list that will be appended to the input dataframe
            match1 = re.sub('Inpoli',"Indianapolis",match1)
            match1 = re.sub('Dalla Tx',"Dallas Tx",match1)
            match1 = re.sub('Fla',"Fl",match1)
            match1 = re.sub('Kipling Memphi Tn|Memphi Tn',"Memphis Tn",match1)
            match1 = re.sub('White Pln Ny',"White Plain Ny",match1)
            match1 = re.sub('X  Hyrum Ut',"Hyrum Ut",match1)
            match1 = re.sub('Saint Loui Mo',"Saint Louis Mo",match1)
            match1 = re.sub('Jenner   Irvine Ca',"Irvine Ca",match1)
            match1 = re.sub('Los Angele Ca|Nh   Los Angeles Ca',"Los Angeles Ca",match1)
            match1 = re.sub('N West Palm Beach Fl',"West Palm Beach Fl",match1)
            match1 = re.sub('Y  Pasadena Tx',"Pasadena Tx",match1)
            match1 = re.sub('S Chicago Il',"Chicago Il",match1)
            match1 = re.sub('J Houston Tx',"Houston Tx",match1)
            match1 = re.sub('R Ny Ny|Saint Ny Ny',"New York Ny",match1)
            match1 = re.sub('Mission Wood K',"Mission Wood Ks",match1)
            match1 = re.sub('Saint Charles Mo|Saint Charle Mo',"Saint Charles Mo",match1)
            match1 = re.sub('Ne Oh',"Oh",match1)
            match1 = re.sub('Brooklyn Ny Ny',"Brooklyn Ny",match1)
            match1 = re.sub('F Austin Tx',"Austin Tx",match1)
            match1 = re.sub('Co Spring Co',"Colorado Springs Co",match1)
            
            ### uses the stateList variable to extract the states from the record. The join function is
            ### applied to prevent brackets added to the data frame in later steps
            sub_states1.append(''.join(re.findall(stateList,match1)))

            ### there are records where only the city is present. This if statement evaluates the
            ### length of the sub_states1 variable. A length equal to 0 corresponds to a record where no
            ### state is associated, only a city name. The string is appended to the sub_city variable.
            ### A length greater than 0 corresponds to a record that has a city-state pair and the
            ### string is sliced to remove the two-letter state and space between the city-state pair.
            ### The result is appended to the sub_city variable
            if len(sub_states1) == 0:
                sub_city.append(match1)
            
            else:
                sub_city.append(match1[:-3])

    ### the city and state is appended to a new list that will be added to the input dataframe. The lists
    ### in the nested for loop above were emptied and recycled for the next round of records.
    states.append(sub_states1)
    city.append(sub_city)
    sub_states1 = []
    sub_city = []

### end timer and print total time
t1=time.time()
total=t1-t0
print("Total time is %4f" % (total/60), "mins")

### add the city and state lists to the input data frame and select the required columns
OC_results4['data_city']=city
OC_results4['data_state']=states

OC_results5=OC_results4.iloc[:,[1,2,3,4,6,7,52,12,42,43,46,47,55,56]]

### print general stats and first 5 records for dataset
display(OC_results5.info(),OC_results5.head())

Total time is 0.012566 mins
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2623 entries, 0 to 2622
Data columns (total 14 columns):
 #   Column                                Non-Null Count  Dtype 
---  ------                                --------------  ----- 
 0   assignee_id                           2623 non-null   object
 1   name                                  2034 non-null   object
 2   alternative_names_clean               2623 non-null   object
 3   previous_names_clean                  2623 non-null   object
 4   jurisdiction_code                     2034 non-null   object
 5   incorporation_date                    2034 non-null   object
 6   controlling_entity_jurisdiction_code  844 non-null    object
 7   branch_status                         897 non-null    object
 8   address_city                          976 non-null    object
 9   address_state                         961 non-null    object
 10  agent_city                            292 non-null    object
 11  ag

None

Unnamed: 0,assignee_id,name,alternative_names_clean,previous_names_clean,jurisdiction_code,incorporation_date,controlling_entity_jurisdiction_code,branch_status,address_city,address_state,agent_city,agent_state,data_city,data_state
0,5d5ead2f-1ef7-4db6-a6ce-9cdea523f834,,[],[],,,,,,,,,[],[]
1,8a841c57-22b6-4ad0-ad42-16532c3ab4fc,,[],[],,,,,,,,,[],[]
2,52e38962-9bfc-4082-b0c8-b7ba4ac04d83,Lendingtree,[],[Lendingtree],us_ak,1/3/2000,us_de,branch of an out-of-jurisdiction company,CHARLOTTE,NC,,,[],[]
3,52e38962-9bfc-4082-b0c8-b7ba4ac04d83,Lendingtree,"[Servicemagic Home Loans, RealestateCom Realto...","[Creditsource, Lendingtree]",us_ut,6/27/1997,us_de,branch of an out-of-jurisdiction company,CHARLOTTE,NC,,,[],[]
4,52e38962-9bfc-4082-b0c8-b7ba4ac04d83,Lendingtree,[],"[Lendingtree, Creditsource]",us_la,8/29/1997,us_de,branch of an out-of-jurisdiction company,BATON ROUGE,LA,,,"[Charlotte, Charlotte]","[Nc, Nc]"


In [15]:
### save the new dataset as a checkpoint
res_folder = "../csvResults/"
outpt_file = "OcResults1000AddTradeExtAltPreNamesExtData.csv"
a_full = os.path.join(res_folder,outpt_file)

# OC_results5.to_csv(a_full,index=False)

# Perform a Fuzzy Match Between the PatentsView Organization Name and the Alternative_Names_Clean and Previous_Names_Clean Fields

In [34]:
### perform a merge between the OC results and the full dataset to attach the PatentsView city and state
### to prepare the data for fuzzy matching between the two datasets
OC_results5.dropna(subset=['name'],inplace=True)

assingeeIDMerge=addTm1.merge(OC_results5,on=['assignee_id'],how='inner')
assingeeIDMerge['city'].replace("St\. ","Saint ",regex=True,inplace=True)
assingeeIDMerge['address_city'].replace("St\. |St ","Saint ",regex=True,inplace=True)

### print general stats and first 5 records for dataset
display(assingeeIDMerge.info(),assingeeIDMerge.head())

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2905 entries, 0 to 2904
Data columns (total 24 columns):
 #   Column                                Non-Null Count  Dtype  
---  ------                                --------------  -----  
 0   ID                                    2905 non-null   float64
 1   assignee_id                           2905 non-null   object 
 2   location_id                           2905 non-null   object 
 3   organization                          2905 non-null   object 
 4   city                                  2905 non-null   object 
 5   state                                 2905 non-null   object 
 6   dateOfFirstPat                        2905 non-null   object 
 7   serial                                798 non-null    object 
 8   tradeCity                             798 non-null    object 
 9   tradeState                            798 non-null    object 
 10  ctrlEntity                            798 non-null    object 
 11  name             

None

Unnamed: 0,ID,assignee_id,location_id,organization,city,state,dateOfFirstPat,serial,tradeCity,tradeState,...,jurisdiction_code,incorporation_date,controlling_entity_jurisdiction_code,branch_status,address_city,address_state,agent_city,agent_state,data_city,data_state
0,875.0,f7b31db8-0a3d-425e-a79f-d84ba6333b60,de5d6510-cb90-11eb-9615-121df0c29c1e,The Egg Factory,Roanoke,Va,01/20/1999,,,,...,us_va,12/22/1997,,,ROANOKE,Virginia,ROANOKE,Virginia,[],[]
1,1284.0,f3e6ae72-e2b1-4165-a0f7-9be773bf1e56,9447f283-cb8e-11eb-9615-121df0c29c1e,Tank Vision,Vineyard Haven,Ma,01/12/2017,,,,...,us_de,4/19/2005,,,,,,,[],[]
2,1284.0,f3e6ae72-e2b1-4165-a0f7-9be773bf1e56,9447f283-cb8e-11eb-9615-121df0c29c1e,Tank Vision,Vineyard Haven,Ma,01/12/2017,,,,...,us_ca,6/23/1994,,,ANAHEIM,CA,,,[Irvine],[Ca]
3,1284.0,f3e6ae72-e2b1-4165-a0f7-9be773bf1e56,9447f283-cb8e-11eb-9615-121df0c29c1e,Tank Vision,Vineyard Haven,Ma,01/12/2017,,,,...,us_ri,1/8/2019,us_de,branch of an out-of-jurisdiction company,VINEYARD HAVEN,MA,PROVIDENCE,RI,[],[]
4,1284.0,f3e6ae72-e2b1-4165-a0f7-9be773bf1e56,9447f283-cb8e-11eb-9615-121df0c29c1e,Tank Vision,Vineyard Haven,Ma,01/12/2017,,,,...,us_de,1/3/2017,,,,,,,[],[]


In [18]:
### convert the city names and states to capitalize the first character in the string
assingeeIDMerge['state']=assingeeIDMerge['state'].str.title()
assingeeIDMerge['address_city']=assingeeIDMerge['address_city'].str.title()
assingeeIDMerge['address_state']=assingeeIDMerge['address_state'].str.lower()
assingeeIDMerge['agent_city']=assingeeIDMerge['agent_city'].str.title()
assingeeIDMerge['agent_state']=assingeeIDMerge['agent_state'].str.lower()

#convert states from full names to two-letter abbreviations under the address_state and agent_state features
assingeeIDMerge.replace({'address_state':{"california":"ca","connecticut":"ct","massachusetts":"ma","florida":"fl",
                                          "georgia":"ga","washington":"wa","new york":"ny","delaware":"de","tennessee":"tn",
                                          "missouri":"mo","texas":"tx","indiana":"in","minnesota":"mn","pennsylvania":"pa",
                                          "oregon":"or","virginia":"va","illinois":"il","kentucky":"ky","north carolina":"nc",
                                          "new jersey":"nj","colorado":"co","maryland":"md","ohio":"oh","arizona":"az",
                                          "nevada":"nv","utah":"ut","michigan":"mi","new hampshire":"nh","vermont":"vt",
                                          "kansas":"ks","oklahoma":"ok","iowa":"ia","louisiana":"la","rhode island":"ri",
                                          "wisconsin":"wi","hawaii":"hi","montana":"mt","nebraska":"ne",
                                          "district of columbia":"dc","west virginia":"wv","alabama":"al","idaho":"id",
                                          "maine":"me","new mexico":"nm","south carolina":"sc","north dakota":"nd",
                                          "south dakota":"sd","arkansas":"ar","alaska":"ak","wyoming":"wy",
                                          "mississippi":"ms"}},inplace=True)

assingeeIDMerge.replace({'agent_state':{"california":"ca","connecticut":"ct","massachusetts":"ma","florida":"fl",
                                          "georgia":"ga","washington":"wa","new york":"ny","delaware":"de","tennessee":"tn",
                                          "missouri":"mo","texas":"tx","indiana":"in","minnesota":"mn","pennsylvania":"pa",
                                          "oregon":"or","virginia":"va","illinois":"il","kentucky":"ky","north carolina":"nc",
                                          "new jersey":"nj","colorado":"co","maryland":"md","ohio":"oh","arizona":"az",
                                          "nevada":"nv","utah":"ut","michigan":"mi","new hampshire":"nh","vermont":"vt",
                                          "kansas":"ks","oklahoma":"ok","iowa":"ia","louisiana":"la","rhode island":"ri",
                                          "wisconsin":"wi","hawaii":"hi","montana":"mt","nebraska":"ne",
                                          "district of columbia":"dc","west virginia":"wv","alabama":"al","idaho":"id",
                                          "maine":"me","new mexico":"nm","south carolina":"sc","north dakota":"nd",
                                          "south dakota":"sd","arkansas":"ar","alaska":"ak","wyoming":"wy",
                                          "mississippi":"ms"}},inplace=True)

### convert the first letter in each string to Uppercase and lowercase the remaining strings
assingeeIDMerge['address_state']=assingeeIDMerge['address_state'].str.title()
assingeeIDMerge['agent_state']=assingeeIDMerge['agent_state'].str.title()

### print general stats and first 5 records for dataset
display(assingeeIDMerge.info(),assingeeIDMerge.head())

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2905 entries, 0 to 2904
Data columns (total 24 columns):
 #   Column                                Non-Null Count  Dtype  
---  ------                                --------------  -----  
 0   ID                                    2905 non-null   float64
 1   assignee_id                           2905 non-null   object 
 2   location_id                           2905 non-null   object 
 3   organization                          2905 non-null   object 
 4   city                                  2905 non-null   object 
 5   state                                 2905 non-null   object 
 6   dateOfFirstPat                        2905 non-null   object 
 7   serial                                798 non-null    object 
 8   tradeCity                             798 non-null    object 
 9   tradeState                            798 non-null    object 
 10  ctrlEntity                            798 non-null    object 
 11  name             

None

Unnamed: 0,ID,assignee_id,location_id,organization,city,state,dateOfFirstPat,serial,tradeCity,tradeState,...,jurisdiction_code,incorporation_date,controlling_entity_jurisdiction_code,branch_status,address_city,address_state,agent_city,agent_state,data_city,data_state
0,875.0,f7b31db8-0a3d-425e-a79f-d84ba6333b60,de5d6510-cb90-11eb-9615-121df0c29c1e,The Egg Factory,Roanoke,Va,01/20/1999,,,,...,us_va,12/22/1997,,,Roanoke,Va,Roanoke,Va,[],[]
1,1284.0,f3e6ae72-e2b1-4165-a0f7-9be773bf1e56,9447f283-cb8e-11eb-9615-121df0c29c1e,Tank Vision,Vineyard Haven,Ma,01/12/2017,,,,...,us_de,4/19/2005,,,,,,,[],[]
2,1284.0,f3e6ae72-e2b1-4165-a0f7-9be773bf1e56,9447f283-cb8e-11eb-9615-121df0c29c1e,Tank Vision,Vineyard Haven,Ma,01/12/2017,,,,...,us_ca,6/23/1994,,,Anaheim,Ca,,,[Irvine],[Ca]
3,1284.0,f3e6ae72-e2b1-4165-a0f7-9be773bf1e56,9447f283-cb8e-11eb-9615-121df0c29c1e,Tank Vision,Vineyard Haven,Ma,01/12/2017,,,,...,us_ri,1/8/2019,us_de,branch of an out-of-jurisdiction company,Vineyard Haven,Ma,Providence,Ri,[],[]
4,1284.0,f3e6ae72-e2b1-4165-a0f7-9be773bf1e56,9447f283-cb8e-11eb-9615-121df0c29c1e,Tank Vision,Vineyard Haven,Ma,01/12/2017,,,,...,us_de,1/3/2017,,,,,,,[],[]


In [19]:
### import the libraries required for fuzzy matching; scoring is between the organization names in PatentsView
### against the name, alternative_names_clean, and previous_names_clean features in the OpenCoporates results
from fuzzywuzzy import fuzz
from fuzzywuzzy import process

### start timer
t0=time.time()

### determine the length of the data frame and create 2 empty lists
aa=len(assingeeIDMerge)
mat1=[]
mat2=[]

for i in range(aa):

    ### try/except is used to bypass cells with an NaN. Removing this will force the user
    ### to deal with exception errors
    try:
        ### calculate the fuzzy score between the organization name in PatentsView and name
        ### from the OpenCorporates results for further evaluation
        q=fuzz.ratio(assingeeIDMerge.iloc[i,3], assingeeIDMerge.iloc[i,11])

        ### most simplest and most representative in the data; if the score is 100, it is
        ### a perfect match, otherwise, the remaining code will resolve score less than 100
        if q == 100:
            mat1.append(q)
            mat2.append(assingeeIDMerge.iloc[i,3])
        
        ### this section resolves all q values not equal to 100
        elif ( q != 100 ):

            ### calculate the length of the data in each row for the alternative_names_clean
            ### column and previous_names_clean column
            if len(assingeeIDMerge.iloc[i,12]) == 0:
                r=0
            
            elif len(assingeeIDMerge.iloc[i,12]) > 0:
                r=len(assingeeIDMerge.iloc[i,12])
            
            if len(assingeeIDMerge.iloc[i,13]) == 0:
                d=0
            
            elif len(assingeeIDMerge.iloc[i,13]) > 0:
                d=len(assingeeIDMerge.iloc[i,13])
                        
            ### the remaining parts of the code uses if statements to step through the many
            ### conditions that may be present in the data. As each condition is satisfied,
            ### the mat1 and mat2 lists are appended with the data; take note that q is the
            ### original score and s, v, e, and f are separate scores that are compared
            ### against q. The first if statement evaluates the lengths of r and q, and if
            ### both are zero, skips the remaining code and appends mat1 and mat2 with the
            ### fuzzy score and organization name from OpenCorporates as the top hit
            if r == 0 and d == 0:
                mat1.append(q)
                mat2.append(assingeeIDMerge['name'][i])
                                         
            ### calculate the score for the alternative_names_clean and previous_names_clean
            ### columns. Once calculated, they are compared against each other and q to
            ### determine the score that is highest. The 'best' score is appened to mat1 and
            ### the name of the organization is appended to mat2. The same scorer is utilized 
            ### as the above but the process.extractOne function retrieves the organization
            ### match with the highest score and saves it as a tuple
            elif r == 1 and d == 1:
                s=process.extractOne(assingeeIDMerge['organization'][i],
                                     assingeeIDMerge['alternative_names_clean'][i],scorer=fuzz.ratio)
                e=process.extractOne(assingeeIDMerge['organization'][i],
                                     assingeeIDMerge['previous_names_clean'][i],scorer=fuzz.ratio)
                
                if s[1] > e[1] and s[1] > q:
                    mat1.append(s[1])
                    mat2.append(s[0])
                
                elif e[1] > s[1] and e[1] > q:
                    mat1.append(e[1])
                    mat2.append(e[0])
                
                elif e[1] == q or s[1] == q:
                    mat1.append(q)
                    mat2.append(assingeeIDMerge['name'][i])
                
                else:
                    mat1.append(q)
                    mat2.append(assingeeIDMerge['name'][i])
            
            ### this section is the same as above, except the code is looking at the
            ### alternative_names_clean column only
            elif r == 1:
                s=process.extractOne(assingeeIDMerge['organization'][i],
                                     assingeeIDMerge['alternative_names_clean'][i],scorer=fuzz.ratio)

                if s[1] > q:
                    mat1.append(s[1])
                    mat2.append(s[0])
                
                elif s[1] == q:
                    mat1.append(q)
                    mat2.append(assingeeIDMerge['name'][i])
                    
                elif s[1] < q:
                    mat1.append(q)
                    mat2.append(assingeeIDMerge['name'][i])
            
            ### this section is the same as above, except the code is looking at the
            ### previous_names_clean column only
            elif d == 1:
                e=process.extractOne(assingeeIDMerge['organization'][i],
                                     assingeeIDMerge['previous_names_clean'][i],scorer=fuzz.ratio)

                if e[1] > q:
                    mat1.append(e[1])
                    mat2.append(e[0])
                
                elif e[1] == q:
                    mat1.append(q)
                    mat2.append(assingeeIDMerge['name'][i])
                    
                elif e[1] < q:
                    mat1.append(q)
                    mat2.append(assingeeIDMerge['name'][i])      
            
            ## the following two sections resolve r and d lengths greater than 1 (i.e.,
            ## records that have more than 1 company names in the alternative_names_clean 
            ## and previous_names_clean columns)
            elif r > 1:
                v=process.extractOne(assingeeIDMerge['organization'][i],
                                     assingeeIDMerge['alternative_names_clean'][i],scorer=fuzz.ratio)                

                if v[1] > q:
                    mat1.append(v[1])
                    mat2.append(v[0])
                    
                elif v[1] == q:
                    mat1.append(q)
                    mat2.append(assingeeIDMerge['name'][i])
                    
                elif v[1] < q:
                    mat1.append(q)
                    mat2.append(assingeeIDMerge['name'][i])
            
            elif d > 1:
                f=process.extractOne(assingeeIDMerge['organization'][i],
                                     assingeeIDMerge['previous_names_clean'][i],scorer=fuzz.ratio)

                if f[1] > q:
                    mat1.append(f[1])
                    mat2.append(f[0])
                    
                elif f[1] == q:
                    mat1.append(q)
                    mat2.append(assingeeIDMerge['name'][i])                    
                    
                elif f[1] < q:
                    mat1.append(q)
                    mat2.append(assingeeIDMerge['name'][i])

    except:
        ### error handling that places an NaN for every cells that does not have a value
        ### in the mat1 and/or mat2 lists
        mat1.append(np.nan)
        mat2.append(np.nan)

### print general stats and first 5 records for dataset
t1=time.time()
total=t1-t0
print("Total time is %4f" % (total/60), "mins")

Total time is 0.010273 mins


In [20]:
### start timer
t0=time.time()

### combines mat1 and mat2 into a single dataframe
finalList=[list(w) for w in zip(mat1, mat2)]
finalListDf=pd.DataFrame(finalList,columns=['scores','names'])

### end timer and print total time
t1=time.time()
total=t1-t0
print("Total time is %4f" % (total/60), "mins")

### print general stats and first 5 records for dataset
display(finalListDf.info(),finalListDf.head())

Total time is 0.000100 mins
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2905 entries, 0 to 2904
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   scores  2905 non-null   int64 
 1   names   2905 non-null   object
dtypes: int64(1), object(1)
memory usage: 45.5+ KB


None

Unnamed: 0,scores,names
0,100,The Egg Factory
1,62,The Vision Tank
2,61,Tank Vision Environmental
3,100,Tank Vision
4,100,Tank Vision


In [21]:
### start timer
t0=time.time()

### appends the original dataframe with the results from the scoring steps in the preceeding code blocks
assingeeIDMerge['nameScores'] = finalListDf['scores']
assingeeIDMerge['matchNames'] = finalListDf['names']

### end timer and print total time
t1=time.time()
total=t1-t0
print("Total time is %4f" % (total/60), "mins")

### print general stats and first 5 records for dataset
display(assingeeIDMerge.info(),assingeeIDMerge.head())

Total time is 0.000017 mins
<class 'pandas.core.frame.DataFrame'>
Int64Index: 2905 entries, 0 to 2904
Data columns (total 26 columns):
 #   Column                                Non-Null Count  Dtype  
---  ------                                --------------  -----  
 0   ID                                    2905 non-null   float64
 1   assignee_id                           2905 non-null   object 
 2   location_id                           2905 non-null   object 
 3   organization                          2905 non-null   object 
 4   city                                  2905 non-null   object 
 5   state                                 2905 non-null   object 
 6   dateOfFirstPat                        2905 non-null   object 
 7   serial                                798 non-null    object 
 8   tradeCity                             798 non-null    object 
 9   tradeState                            798 non-null    object 
 10  ctrlEntity                            798 non-null    ob

None

Unnamed: 0,ID,assignee_id,location_id,organization,city,state,dateOfFirstPat,serial,tradeCity,tradeState,...,controlling_entity_jurisdiction_code,branch_status,address_city,address_state,agent_city,agent_state,data_city,data_state,nameScores,matchNames
0,875.0,f7b31db8-0a3d-425e-a79f-d84ba6333b60,de5d6510-cb90-11eb-9615-121df0c29c1e,The Egg Factory,Roanoke,Va,01/20/1999,,,,...,,,Roanoke,Va,Roanoke,Va,[],[],100,The Egg Factory
1,1284.0,f3e6ae72-e2b1-4165-a0f7-9be773bf1e56,9447f283-cb8e-11eb-9615-121df0c29c1e,Tank Vision,Vineyard Haven,Ma,01/12/2017,,,,...,,,,,,,[],[],62,The Vision Tank
2,1284.0,f3e6ae72-e2b1-4165-a0f7-9be773bf1e56,9447f283-cb8e-11eb-9615-121df0c29c1e,Tank Vision,Vineyard Haven,Ma,01/12/2017,,,,...,,,Anaheim,Ca,,,[Irvine],[Ca],61,Tank Vision Environmental
3,1284.0,f3e6ae72-e2b1-4165-a0f7-9be773bf1e56,9447f283-cb8e-11eb-9615-121df0c29c1e,Tank Vision,Vineyard Haven,Ma,01/12/2017,,,,...,us_de,branch of an out-of-jurisdiction company,Vineyard Haven,Ma,Providence,Ri,[],[],100,Tank Vision
4,1284.0,f3e6ae72-e2b1-4165-a0f7-9be773bf1e56,9447f283-cb8e-11eb-9615-121df0c29c1e,Tank Vision,Vineyard Haven,Ma,01/12/2017,,,,...,,,,,,,[],[],100,Tank Vision


In [22]:
### rearrange and select the features for further evaluation
assingeeIDMerge1=assingeeIDMerge
assingeeIDMerge1.drop(labels=['name','alternative_names_clean','previous_names_clean'],axis=1,inplace=True)

eigh_col = assingeeIDMerge1.pop('nameScores')
nine_col = assingeeIDMerge1.pop('matchNames')

assingeeIDMerge1.insert(8, 'nameScores', eigh_col)
assingeeIDMerge1.insert(9, 'matchNames', nine_col)

### print general stats and first 5 records for dataset
display(assingeeIDMerge1.info(),assingeeIDMerge1.head())

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2905 entries, 0 to 2904
Data columns (total 23 columns):
 #   Column                                Non-Null Count  Dtype  
---  ------                                --------------  -----  
 0   ID                                    2905 non-null   float64
 1   assignee_id                           2905 non-null   object 
 2   location_id                           2905 non-null   object 
 3   organization                          2905 non-null   object 
 4   city                                  2905 non-null   object 
 5   state                                 2905 non-null   object 
 6   dateOfFirstPat                        2905 non-null   object 
 7   serial                                798 non-null    object 
 8   nameScores                            2905 non-null   int64  
 9   matchNames                            2905 non-null   object 
 10  tradeCity                             798 non-null    object 
 11  tradeState       

None

Unnamed: 0,ID,assignee_id,location_id,organization,city,state,dateOfFirstPat,serial,nameScores,matchNames,...,jurisdiction_code,incorporation_date,controlling_entity_jurisdiction_code,branch_status,address_city,address_state,agent_city,agent_state,data_city,data_state
0,875.0,f7b31db8-0a3d-425e-a79f-d84ba6333b60,de5d6510-cb90-11eb-9615-121df0c29c1e,The Egg Factory,Roanoke,Va,01/20/1999,,100,The Egg Factory,...,us_va,12/22/1997,,,Roanoke,Va,Roanoke,Va,[],[]
1,1284.0,f3e6ae72-e2b1-4165-a0f7-9be773bf1e56,9447f283-cb8e-11eb-9615-121df0c29c1e,Tank Vision,Vineyard Haven,Ma,01/12/2017,,62,The Vision Tank,...,us_de,4/19/2005,,,,,,,[],[]
2,1284.0,f3e6ae72-e2b1-4165-a0f7-9be773bf1e56,9447f283-cb8e-11eb-9615-121df0c29c1e,Tank Vision,Vineyard Haven,Ma,01/12/2017,,61,Tank Vision Environmental,...,us_ca,6/23/1994,,,Anaheim,Ca,,,[Irvine],[Ca]
3,1284.0,f3e6ae72-e2b1-4165-a0f7-9be773bf1e56,9447f283-cb8e-11eb-9615-121df0c29c1e,Tank Vision,Vineyard Haven,Ma,01/12/2017,,100,Tank Vision,...,us_ri,1/8/2019,us_de,branch of an out-of-jurisdiction company,Vineyard Haven,Ma,Providence,Ri,[],[]
4,1284.0,f3e6ae72-e2b1-4165-a0f7-9be773bf1e56,9447f283-cb8e-11eb-9615-121df0c29c1e,Tank Vision,Vineyard Haven,Ma,01/12/2017,,100,Tank Vision,...,us_de,1/3/2017,,,,,,,[],[]


In [23]:
### save the new dataset as a checkpoint
res_folder = "../csvResults/"
outpt_file = "OcResults1000AddTradeExtAltPreNamesExtDataFuzzyScores.csv"
a_full = os.path.join(res_folder,outpt_file)

# assingeeIDMerge1.to_csv(a_full,index=False)

# Match the City-States from the OpenCorporates Data Field to the PatentsView Data 

In [24]:
### start timer
t0=time.time()

### determine the length of data frame to set the range of the for loop
assigneeScores8=assingeeIDMerge1
t=len(assigneeScores8)

### set empty lists to be filled with each record
sub_city  = []
sub_state = []
city  = []
state = []

### for loop searches the data_city and data_state fields and matches to the city and states listed
### under the PatentsView data. If a match is identified the city or state is recorded and saved in
### the sub_city or sub_state list and appended to the city and state list, respectively. Any records
### with no data are skipped
for j in range(t):
    ### the try/except is applied to handle exception errors
    try:
        ### check the size, or number, of cities and states. If zero records exist, the record is
        ### skipped. If either equal one, the string is compared to the PatentsView string. If the
        ### size is larger than one, the nested loops for cities and states is used to check all
        ### instances against the PatentsView record
        d=len(assigneeScores8.iloc[j,21])
        c=len(assigneeScores8.iloc[j,22])

        if d == 0:
            pass
        
        elif d == 1:
            if assigneeScores8.iloc[j,21][0] == assigneeScores8.iloc[j,4]:
                sub_city.append(assigneeScores8.iloc[j,21][0])
            
            elif assigneeScores8.iloc[j,21][0] != assigneeScores8.iloc[j,4]:
                sub_city.append("")

        elif d > 1:
            ### if d is greater than one, the nested for loop is applied to check all instances
            ### against the PatentsView record. If a match is found, break is applied to end
            ### the nested for loop
            for k in range(d):
                if assigneeScores8.iloc[j,21][k]==assigneeScores8.iloc[j,4]:
                    sub_city.append(assigneeScores8.iloc[j,21][k])
                    break                    
        
        ### append the city list with matching records in sub_city and empty the sub_city list
        city.append(''.join(sub_city))
        sub_city = []
        
        
        if c == 0:
            pass
        
        elif c == 1:
            if assigneeScores8.iloc[j,22][0] == assigneeScores8.iloc[j,5]:
                sub_state.append(assigneeScores8.iloc[j,22][0])
                
        elif c > 1:
            ### if c is greater than one, the nested for loop is applied to check all instances
            ### against the PatentsView record. If a match is found, break is applied to end
            ### the nested for loop
            for k in range(c):
                if assigneeScores8.iloc[j,22][k] == assigneeScores8.iloc[j,5]:
                    sub_state.append(assigneeScores8.iloc[j,22][k])
                    break
        
        ### append the state list with matching records in sub_state and empty the sub_state list
        state.append(''.join(sub_state))
        sub_state = []
        
    except:
        city.append(np.nan)
        state.append(np.nan)
        
### end timer and print total time
t1=time.time()
total=t1-t0
print("Total time is %4f" % (total/60), "mins")

Total time is 0.006582 mins


In [25]:
### start timer
t0=time.time()

### construct a dataframe with the city and states matching the PatentsView data
dataList=[list(w) for w in zip(city, state)]
dataListDf=pd.DataFrame(dataList,columns=['data_city','data_state']).reset_index(drop=True)

### end timer and print total time
t1=time.time()
total=t1-t0
print("Total time is %4f" % (total/60), "mins")

### print general stats and first 5 records for dataset
display(dataListDf.info(),dataListDf.head())

Total time is 0.000050 mins
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2905 entries, 0 to 2904
Data columns (total 2 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   data_city   2905 non-null   object
 1   data_state  2905 non-null   object
dtypes: object(2)
memory usage: 45.5+ KB


None

Unnamed: 0,data_city,data_state
0,,
1,,
2,,
3,,
4,,


In [26]:
### start timer
t0=time.time()

### appends the original dataframe with the city-states that match the PatentsView data and select
### the columns for further processing
assigneeScores8.drop(labels=['data_city','data_state'],axis=1,inplace=True)

assigneeScores8['data_city'] = dataListDf['data_city']
assigneeScores8['data_state'] = dataListDf['data_state']
assigneeScores8['address_city'].replace("St\. ","Saint ",inplace=True,regex=True)
assigneeScores8['address_city'].replace("Mpls","Minneapolis",inplace=True,regex=True)
assigneeScores8['address_city'].replace("Muskegon, Mi","Muskegon",inplace=True,regex=True)

assigneeScores8['address_city'].replace("Null, |Att.+?Rothman,","",inplace=True,regex=True)

### end timer and print total time
t1=time.time()
total=t1-t0
print("Total time is %4f" % (total/60), "mins")

### print general stats and first 5 records for dataset
display(assigneeScores8.info(),assigneeScores8.head())

Total time is 0.000332 mins
<class 'pandas.core.frame.DataFrame'>
Int64Index: 2905 entries, 0 to 2904
Data columns (total 23 columns):
 #   Column                                Non-Null Count  Dtype  
---  ------                                --------------  -----  
 0   ID                                    2905 non-null   float64
 1   assignee_id                           2905 non-null   object 
 2   location_id                           2905 non-null   object 
 3   organization                          2905 non-null   object 
 4   city                                  2905 non-null   object 
 5   state                                 2905 non-null   object 
 6   dateOfFirstPat                        2905 non-null   object 
 7   serial                                798 non-null    object 
 8   nameScores                            2905 non-null   int64  
 9   matchNames                            2905 non-null   object 
 10  tradeCity                             798 non-null    ob

None

Unnamed: 0,ID,assignee_id,location_id,organization,city,state,dateOfFirstPat,serial,nameScores,matchNames,...,jurisdiction_code,incorporation_date,controlling_entity_jurisdiction_code,branch_status,address_city,address_state,agent_city,agent_state,data_city,data_state
0,875.0,f7b31db8-0a3d-425e-a79f-d84ba6333b60,de5d6510-cb90-11eb-9615-121df0c29c1e,The Egg Factory,Roanoke,Va,01/20/1999,,100,The Egg Factory,...,us_va,12/22/1997,,,Roanoke,Va,Roanoke,Va,,
1,1284.0,f3e6ae72-e2b1-4165-a0f7-9be773bf1e56,9447f283-cb8e-11eb-9615-121df0c29c1e,Tank Vision,Vineyard Haven,Ma,01/12/2017,,62,The Vision Tank,...,us_de,4/19/2005,,,,,,,,
2,1284.0,f3e6ae72-e2b1-4165-a0f7-9be773bf1e56,9447f283-cb8e-11eb-9615-121df0c29c1e,Tank Vision,Vineyard Haven,Ma,01/12/2017,,61,Tank Vision Environmental,...,us_ca,6/23/1994,,,Anaheim,Ca,,,,
3,1284.0,f3e6ae72-e2b1-4165-a0f7-9be773bf1e56,9447f283-cb8e-11eb-9615-121df0c29c1e,Tank Vision,Vineyard Haven,Ma,01/12/2017,,100,Tank Vision,...,us_ri,1/8/2019,us_de,branch of an out-of-jurisdiction company,Vineyard Haven,Ma,Providence,Ri,,
4,1284.0,f3e6ae72-e2b1-4165-a0f7-9be773bf1e56,9447f283-cb8e-11eb-9615-121df0c29c1e,Tank Vision,Vineyard Haven,Ma,01/12/2017,,100,Tank Vision,...,us_de,1/3/2017,,,,,,,,


In [27]:
### start timer
t0=time.time()

### remove the us_ from the states under the jurisdiction_code column
assigneeScores9=assigneeScores8[assigneeScores8['jurisdiction_code'].str.contains('us_',na=False)]
assigneeScores9['subJurisCode']=assigneeScores9['jurisdiction_code'].str.slice(3).str.title()

assigneeScores9['subCntlEntity']=assigneeScores9['controlling_entity_jurisdiction_code'].str.slice(3).str.title()

assigneeScores10=assigneeScores9.iloc[:,[0,1,2,3,4,5,6,7,10,11,12,8,9,23,
                                         24,14,16,17,18,19,20,21,22]].sort_values(by=['ID'])

### end timer and print total time
t1=time.time()
total=t1-t0
print("Total time is %4f" % (total/60), "mins")

### print general stats and first 5 records for dataset
display(assigneeScores10.info(),assigneeScores10.head())

Total time is 0.000216 mins
<class 'pandas.core.frame.DataFrame'>
Int64Index: 2696 entries, 0 to 280
Data columns (total 23 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   ID                  2696 non-null   float64
 1   assignee_id         2696 non-null   object 
 2   location_id         2696 non-null   object 
 3   organization        2696 non-null   object 
 4   city                2696 non-null   object 
 5   state               2696 non-null   object 
 6   dateOfFirstPat      2696 non-null   object 
 7   serial              715 non-null    object 
 8   tradeCity           715 non-null    object 
 9   tradeState          715 non-null    object 
 10  ctrlEntity          715 non-null    object 
 11  nameScores          2696 non-null   int64  
 12  matchNames          2696 non-null   object 
 13  subJurisCode        2696 non-null   object 
 14  subCntlEntity       1298 non-null   object 
 15  incorporation_date  2696 non

None

Unnamed: 0,ID,assignee_id,location_id,organization,city,state,dateOfFirstPat,serial,tradeCity,tradeState,...,subJurisCode,subCntlEntity,incorporation_date,branch_status,address_city,address_state,agent_city,agent_state,data_city,data_state
0,875.0,f7b31db8-0a3d-425e-a79f-d84ba6333b60,de5d6510-cb90-11eb-9615-121df0c29c1e,The Egg Factory,Roanoke,Va,01/20/1999,,,,...,Va,,12/22/1997,,Roanoke,Va,Roanoke,Va,,
1,1284.0,f3e6ae72-e2b1-4165-a0f7-9be773bf1e56,9447f283-cb8e-11eb-9615-121df0c29c1e,Tank Vision,Vineyard Haven,Ma,01/12/2017,,,,...,De,,4/19/2005,,,,,,,
2,1284.0,f3e6ae72-e2b1-4165-a0f7-9be773bf1e56,9447f283-cb8e-11eb-9615-121df0c29c1e,Tank Vision,Vineyard Haven,Ma,01/12/2017,,,,...,Ca,,6/23/1994,,Anaheim,Ca,,,,
3,1284.0,f3e6ae72-e2b1-4165-a0f7-9be773bf1e56,9447f283-cb8e-11eb-9615-121df0c29c1e,Tank Vision,Vineyard Haven,Ma,01/12/2017,,,,...,Ri,De,1/8/2019,branch of an out-of-jurisdiction company,Vineyard Haven,Ma,Providence,Ri,,
4,1284.0,f3e6ae72-e2b1-4165-a0f7-9be773bf1e56,9447f283-cb8e-11eb-9615-121df0c29c1e,Tank Vision,Vineyard Haven,Ma,01/12/2017,,,,...,De,,1/3/2017,,,,,,,


In [28]:
### save the new dataset as a checkpoint
res_folder = "../csvResults/"
outpt_file = "OcResults1000AddTradeExtAltPreNamesExtDataFuzzyScoresDataMatch.csv"
a_full = os.path.join(res_folder,outpt_file)

# assigneeScores10.to_csv(a_full,index=False)

# Count the Total Number of Unique Assignee_Ids

In [26]:
### import the libraries used to process the PatentsView and OC data.
import pandas as pd
import numpy as np
import time
import os
import re
import string
import warnings
warnings.filterwarnings('ignore')

### start timer
t0=time.time()

res_folder = "../csvResults/"
outpt_file = "OcResults1000AddTradeExtAltPreNamesExtDataFuzzyScoresDataMatch.csv"
a_full = os.path.join(res_folder,outpt_file)
print(a_full,"\n")

df=pd.read_csv(a_full)

### end timer and print total time
t1 = time.time()
total = t1-t0
print("Total time is %4f" % (total/60), "mins")

### print general stats and first 5 records for dataset 
display(df.info(null_counts=True),df.head())

../csvResults/OcResults1000AddTradeExtAltPreNamesExtDataFuzzyScoresDataMatch.csv 

Total time is 0.052893 mins
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2696 entries, 0 to 2695
Data columns (total 23 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   ID                  2696 non-null   float64
 1   assignee_id         2696 non-null   object 
 2   location_id         2696 non-null   object 
 3   organization        2696 non-null   object 
 4   city                2696 non-null   object 
 5   state               2696 non-null   object 
 6   dateOfFirstPat      2696 non-null   object 
 7   serial              715 non-null    object 
 8   tradeCity           715 non-null    object 
 9   tradeState          715 non-null    object 
 10  ctrlEntity          715 non-null    object 
 11  nameScores          2696 non-null   int64  
 12  matchNames          2696 non-null   object 
 13  subJurisCode        2696 non-null   object

None

Unnamed: 0,ID,assignee_id,location_id,organization,city,state,dateOfFirstPat,serial,tradeCity,tradeState,...,subJurisCode,subCntlEntity,incorporation_date,branch_status,address_city,address_state,agent_city,agent_state,data_city,data_state
0,875.0,f7b31db8-0a3d-425e-a79f-d84ba6333b60,de5d6510-cb90-11eb-9615-121df0c29c1e,The Egg Factory,Roanoke,Va,01/20/1999,,,,...,Va,,12/22/1997,,Roanoke,Va,Roanoke,Va,,
1,1284.0,f3e6ae72-e2b1-4165-a0f7-9be773bf1e56,9447f283-cb8e-11eb-9615-121df0c29c1e,Tank Vision,Vineyard Haven,Ma,01/12/2017,,,,...,De,,4/19/2005,,,,,,,
2,1284.0,f3e6ae72-e2b1-4165-a0f7-9be773bf1e56,9447f283-cb8e-11eb-9615-121df0c29c1e,Tank Vision,Vineyard Haven,Ma,01/12/2017,,,,...,Ca,,6/23/1994,,Anaheim,Ca,,,,
3,1284.0,f3e6ae72-e2b1-4165-a0f7-9be773bf1e56,9447f283-cb8e-11eb-9615-121df0c29c1e,Tank Vision,Vineyard Haven,Ma,01/12/2017,,,,...,Ri,De,1/8/2019,branch of an out-of-jurisdiction company,Vineyard Haven,Ma,Providence,Ri,,
4,1284.0,f3e6ae72-e2b1-4165-a0f7-9be773bf1e56,9447f283-cb8e-11eb-9615-121df0c29c1e,Tank Vision,Vineyard Haven,Ma,01/12/2017,,,,...,De,,1/3/2017,,,,,,,


In [27]:
### count the number of unique assignee_ids and IDs in the file
print("The number of unique assignee_ids in the input file is",df['assignee_id'].nunique())
print("The number of unique IDs in the input file is",df['ID'].nunique())

The number of unique assignee_ids in the input file is 378
The number of unique IDs in the input file is 510


# Find the Minimum Incorporation Date for each Organization and Each Organization Location

In [28]:
### start timer
t0=time.time()

### convert the incorporation_date data type
df['incorporation_date']= pd.to_datetime(df['incorporation_date'],errors='coerce')
df.rename(columns={'incorporation_date':'minIncDateForOrgLoc'},inplace=True)

### group the records by ID, Organization, and nameScores, then select the minimum date to identify
### the oldest date for the organization
minIncDateOrg=pd.DataFrame(df.groupby(by=['ID','organization','nameScores'],as_index=False)['minIncDateForOrgLoc'].min())
minIncDateOrg['minIncDateForOrgLoc']=minIncDateOrg['minIncDateForOrgLoc'].dt.strftime('%m/%d/%Y')
minIncDateOrg.rename(columns={'minIncDateForOrgLoc':'minIncDateForOrg'},inplace=True)

### end timer and print total time
t1 = time.time()
total = t1-t0
print("Total time is %4f" % (total/60), "mins")

### print general stats and first 5 records for dataset
display(minIncDateOrg.info(),minIncDateOrg.head())

Total time is 0.001446 mins
<class 'pandas.core.frame.DataFrame'>
Int64Index: 1091 entries, 0 to 1090
Data columns (total 4 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   ID                1091 non-null   float64
 1   organization      1091 non-null   object 
 2   nameScores        1091 non-null   int64  
 3   minIncDateForOrg  1085 non-null   object 
dtypes: float64(1), int64(1), object(2)
memory usage: 42.6+ KB


None

Unnamed: 0,ID,organization,nameScores,minIncDateForOrg
0,875.0,The Egg Factory,100,12/22/1997
1,1284.0,Tank Vision,61,06/23/1994
2,1284.0,Tank Vision,62,04/19/2005
3,1284.0,Tank Vision,100,01/03/2017
4,1667.0,Infinibox,100,01/31/2013


In [29]:
### start timer
t0=time.time()

### merge the minIncDateOrg data set with the df data
addedMinIncDates=minIncDateOrg.merge(df,on=['ID','organization','nameScores'],
                                     how='inner').iloc[:,[0,4,5,1,6,7,8,9,10,11,12,2,13,14,15,3,16,17,18,19,20,21,22,23]]

### end timer and print total time
t1 = time.time()
total = t1-t0
print("Total time is %4f" % (total/60), "mins")

### print general stats and first 5 records for dataset
display(addedMinIncDates.info())

Total time is 0.000150 mins
<class 'pandas.core.frame.DataFrame'>
Int64Index: 2696 entries, 0 to 2695
Data columns (total 24 columns):
 #   Column               Non-Null Count  Dtype         
---  ------               --------------  -----         
 0   ID                   2696 non-null   float64       
 1   assignee_id          2696 non-null   object        
 2   location_id          2696 non-null   object        
 3   organization         2696 non-null   object        
 4   city                 2696 non-null   object        
 5   state                2696 non-null   object        
 6   dateOfFirstPat       2696 non-null   object        
 7   serial               715 non-null    object        
 8   tradeCity            715 non-null    object        
 9   tradeState           715 non-null    object        
 10  ctrlEntity           715 non-null    object        
 11  nameScores           2696 non-null   int64         
 12  matchNames           2696 non-null   object        
 13  subJu

None

In [30]:
### verify the number of unique assignee_ids and IDs; the results should equal to the same values
### identified in the df data set
print("The number of unique assignee_ids in the input file is",addedMinIncDates['assignee_id'].nunique())
print("The number of unique IDs in the input file is",addedMinIncDates['ID'].nunique())

The number of unique assignee_ids in the input file is 378
The number of unique IDs in the input file is 510


# Identify Records with Home Controlling Entities

In [31]:
### start timer
t0=time.time()

### create a list to merge against the primary data set; the code selects records that are
### non-empty in the subCntlEntity column AND has a fuzzy score greater than 75
addedMinIncDates1=addedMinIncDates[(addedMinIncDates['subCntlEntity'].notna())]
withCntlEntity=addedMinIncDates1[addedMinIncDates1['nameScores']>75].sort_values(by=['ID','nameScores',
                                                                                     'minIncDateForOrg',
                                                                                     'minIncDateForOrgLoc'],
                                                                                 ascending=[True,False,True,True])

### visual inspection found this city was not spelled correctly
withCntlEntity['address_city']=withCntlEntity['address_city'].str.replace('Lafox','LaFox')

### count the number of unique assignee_ids and IDs
print("The number of unique assignee_ids with controlling entities is",withCntlEntity['assignee_id'].nunique())
print("The number of unique IDs with controlling entities is",withCntlEntity['ID'].nunique(),'\n')

### end timer and print total time
t1 = time.time()
total = t1-t0
print("Total time is %4f" % (total/60), "mins")

### print general stats and first 5 records for dataset
display(withCntlEntity.info(),withCntlEntity.head())

The number of unique assignee_ids with controlling entities is 140
The number of unique IDs with controlling entities is 186 

Total time is 0.000166 mins
<class 'pandas.core.frame.DataFrame'>
Int64Index: 1082 entries, 3 to 2680
Data columns (total 24 columns):
 #   Column               Non-Null Count  Dtype         
---  ------               --------------  -----         
 0   ID                   1082 non-null   float64       
 1   assignee_id          1082 non-null   object        
 2   location_id          1082 non-null   object        
 3   organization         1082 non-null   object        
 4   city                 1082 non-null   object        
 5   state                1082 non-null   object        
 6   dateOfFirstPat       1082 non-null   object        
 7   serial               445 non-null    object        
 8   tradeCity            445 non-null    object        
 9   tradeState           445 non-null    object        
 10  ctrlEntity           445 non-null    object      

None

Unnamed: 0,ID,assignee_id,location_id,organization,city,state,dateOfFirstPat,serial,tradeCity,tradeState,...,subCntlEntity,minIncDateForOrg,minIncDateForOrgLoc,branch_status,address_city,address_state,agent_city,agent_state,data_city,data_state
3,1284.0,f3e6ae72-e2b1-4165-a0f7-9be773bf1e56,9447f283-cb8e-11eb-9615-121df0c29c1e,Tank Vision,Vineyard Haven,Ma,01/12/2017,,,,...,De,01/03/2017,2019-01-08,branch of an out-of-jurisdiction company,Vineyard Haven,Ma,Providence,Ri,,
8,2729.0,e5614631-da49-4351-a6c6-8f81358b767d,fb7257e4-cb8f-11eb-9615-121df0c29c1e,Fanamana,Washington,Dc,01/15/2014,,,,...,De,01/02/2013,2016-09-02,branch of an out-of-jurisdiction company,,,,,,Dc
14,3756.0,dbd08974-b2f8-4753-b720-27b2fdff8b65,419babc4-cb8e-11eb-9615-121df0c29c1e,Pendpac,Fairview,Ok,10/23/2001,,,,...,Az,02/12/2003,2003-08-07,branch of an out-of-jurisdiction company,,,,,,
13,3756.0,dbd08974-b2f8-4753-b720-27b2fdff8b65,419babc4-cb8e-11eb-9615-121df0c29c1e,Pendpac,Fairview,Ok,10/23/2001,,,,...,Az,02/12/2003,2004-04-08,branch of an out-of-jurisdiction company,Justin,Tx,,,,
21,6626.0,bf924c03-bf8a-4fef-b6e7-6b54e3faad33,ce35392a-cb8e-11eb-9615-121df0c29c1e,Atx Telecom Systems,Naperville,Il,12/22/1995,,,,...,De,02/27/1986,1995-03-27,branch of an out-of-jurisdiction company,Naperville,Il,,,,Il


In [32]:
### start timer
t0=time.time()

### label each record to indicate if the state from PatentsView matches the Jurisdiction
### code or other states provided by OpenCorporates - exact match to Juris code or 
### controlling entity (2), match to other states (1), no match (0). The records labeled
### zero must be processed in subsequent steps
withCntlEntity['match']=np.where(withCntlEntity['state']==withCntlEntity['subJurisCode'],2,
                          np.where(withCntlEntity['state']==withCntlEntity['subCntlEntity'],2,
                          np.where(withCntlEntity['ctrlEntity']==withCntlEntity['subCntlEntity'],2,
                          np.where(withCntlEntity['state']==withCntlEntity['address_state'],1,
                          np.where(withCntlEntity['state']==withCntlEntity['agent_state'],1,
                          np.where(withCntlEntity['state']==withCntlEntity['data_state'],1,0))))))

### count the number of unique assignee_ids and IDs
print("The number of unique assignee_ids is",withCntlEntity['assignee_id'].nunique())
print("The number of unique IDs is",withCntlEntity['ID'].nunique())

### end timer and print total time
t1 = time.time()
total = t1-t0
print("Total time is %4f" % (total/60), "mins")

The number of unique assignee_ids is 140
The number of unique IDs is 186
Total time is 0.000033 mins


In [33]:
### start timer
t0=time.time()

### filter for records coded 2 (& greater than 85 fuzzy score), 1 and 0; save each batch as a
### new variable
exactMatch=withCntlEntity.loc[(withCntlEntity['match']==2) & (withCntlEntity['nameScores']>85)]
matchToState=withCntlEntity.loc[withCntlEntity['match']==1]
noMatch=withCntlEntity.loc[withCntlEntity['match']==0]

### concat records that were coded 1 and 2; the remaining records will need to be processed in later steps
mergeExactToState=pd.concat([exactMatch,matchToState],axis=0)

### code the concatenated data set 1 or 0 by matching the city with address_city, agent_city or data_city
mergeExactToState1=mergeExactToState.iloc[:,:24]
mergeExactToState1['cityMatch']=np.where(mergeExactToState1['city']==mergeExactToState1['address_city'],1,
                                np.where(mergeExactToState1['city']==mergeExactToState1['agent_city'],1,
                                np.where(mergeExactToState1['city']==mergeExactToState1['data_city'],1,0)))

### sort values using the new feature along with existing features
mergeExactToState2=mergeExactToState1.sort_values(by=['organization','nameScores','cityMatch',
                                                      'address_city','address_state','agent_city','agent_state',
                                                      'data_city','data_state'],
                                                  ascending=[True,False,False,
                                                             False,False,False,False,False,False])

### there were cases where duplicates exist; the code below identifies the records and keeps the first
### record for each organization
mergeExactToStateRmDups=mergeExactToState2.drop_duplicates(subset=['organization'],keep='first') ### ready to concat

### count the number of unique assignee_ids and IDs in the data set that removed duplicates
print("The number of unique assignee_ids is",mergeExactToStateRmDups['assignee_id'].nunique())
print("The number of unique IDs is",mergeExactToStateRmDups['ID'].nunique(),'\n')

### count the number of unique assignee_ids and IDs in the data set that had no matches with states
print("The number of unique assignee_ids is",noMatch['assignee_id'].nunique())
print("The number of unique IDs is",noMatch['ID'].nunique())

### end timer and print total time
t1 = time.time()
total = t1-t0
print("Total time is %4f" % (total/60), "mins")

The number of unique assignee_ids is 115
The number of unique IDs is 115 

The number of unique assignee_ids is 69
The number of unique IDs is 82
Total time is 0.000266 mins


In [34]:
### start timer
t0=time.time()

### extract the IDs and organization names that already has a match between PatentsView and OpenCorporates
mergeExactToStateIDs=mergeExactToState.iloc[:,0:1].drop_duplicates(keep='first')
mergeExactToStateOrgs=mergeExactToState.iloc[:,3:4].drop_duplicates(keep='first')

### end timer and print total time
t1 = time.time()
total = t1-t0
print("Total time is %4f" % (total/60), "mins")

### print general stats and first 5 records for dataset
display(mergeExactToStateIDs.head(),mergeExactToStateOrgs.head())

Total time is 0.000050 mins


Unnamed: 0,ID
14,3756.0
31,9505.0
76,12510.0
87,16597.0
94,17467.0


Unnamed: 0,organization
14,Pendpac
31,Harris Mud And Chemical
76,Super Cut
87,Precision Process Equipment
94,Del Monte Fresh Fruit Company


In [35]:
### start timer
t0=time.time()

### merge the noMatch data set with the mergeExactToStateIDs data set to construct a data set
### that removes assignee_ids and organizations already identified to have a match
sub_noMatch=noMatch.merge(mergeExactToStateIDs,on=['ID'],how='outer',indicator=True)
sub_noMatch1=sub_noMatch.loc[sub_noMatch['_merge']=='left_only'].iloc[:,:25]
sub_noMatch2=sub_noMatch1.merge(mergeExactToStateOrgs,on=['organization'],how='outer',indicator=True)
sub_noMatch3=sub_noMatch2.loc[sub_noMatch2['_merge']=='left_only'].iloc[:,:25]

### count the number of unique assignee_ids and IDs in the data set
print("The number of unique assignee_ids is",sub_noMatch3['assignee_id'].nunique())
print("The number of unique IDs is",sub_noMatch3['ID'].nunique())

### end timer and print total time
t1 = time.time()
total = t1-t0
print("Total time is %4f" % (total/60), "mins")

The number of unique assignee_ids is 23
The number of unique IDs is 27
Total time is 0.000349 mins


In [36]:
### start timer
t0=time.time()

### sort the data and select the first record
sub_noMatch3Sort=sub_noMatch3.sort_values(by=['organization','nameScores','minIncDateForOrg','minIncDateForOrgLoc'],
                                          ascending=[True,False,True,True])
sub_noMatch3KeepFirst=sub_noMatch3Sort.drop_duplicates(subset=['organization'],keep='first').reset_index(drop=True)

### count the number of unique assignee_ids and IDs in the data set
print("The number of unique assignee_ids is",sub_noMatch3KeepFirst['assignee_id'].nunique())
print("The number of unique IDs is",sub_noMatch3KeepFirst['ID'].nunique(),'\n')

### end timer and print total time
t1 = time.time()
total = t1-t0
print("Total time is %4f" % (total/60), "mins")

### print general stats and first 5 records for dataset
display(sub_noMatch3KeepFirst.info(),sub_noMatch3KeepFirst.head())

The number of unique assignee_ids is 23
The number of unique IDs is 23 

Total time is 0.000100 mins
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 23 entries, 0 to 22
Data columns (total 25 columns):
 #   Column               Non-Null Count  Dtype         
---  ------               --------------  -----         
 0   ID                   23 non-null     float64       
 1   assignee_id          23 non-null     object        
 2   location_id          23 non-null     object        
 3   organization         23 non-null     object        
 4   city                 23 non-null     object        
 5   state                23 non-null     object        
 6   dateOfFirstPat       23 non-null     object        
 7   serial               1 non-null      object        
 8   tradeCity            1 non-null      object        
 9   tradeState           1 non-null      object        
 10  ctrlEntity           1 non-null      object        
 11  nameScores           23 non-null     float64      

None

Unnamed: 0,ID,assignee_id,location_id,organization,city,state,dateOfFirstPat,serial,tradeCity,tradeState,...,minIncDateForOrg,minIncDateForOrgLoc,branch_status,address_city,address_state,agent_city,agent_state,data_city,data_state,match
0,139435.0,9ccbcfb4-7956-4b86-9940-540b4c9580f8,f96393df-cb8e-11eb-9615-121df0c29c1e,Avatar Merger Sub Ii,Minneapolis,Mn,06/25/2014,,,,...,03/27/2015,2016-04-01,branch of an out-of-jurisdiction company,Venice,Ca,,,,,0.0
1,166851.0,7e4dcef7-dad3-446c-a22f-28bcf5813bcb,9d0d36fb-cb8e-11eb-9615-121df0c29c1e,Axiom International Group,Gardnerville,Nv,03/09/2004,,,,...,02/15/2002,2002-05-10,branch of an out-of-jurisdiction company,Hollister,,,,,,0.0
2,8632.0,abc2280b-bbad-4e1c-bd95-0388a3dd6f53,f4effe5f-cb8f-11eb-9615-121df0c29c1e,Blacklight Power,Cranbury,Nj,03/21/1997,,,,...,12/02/1991,1991-12-02,branch of an out-of-jurisdiction company,,,,,,,0.0
3,138003.0,9e574fa8-b207-4e89-aaf9-302a6bffd046,eaae643d-09be-11ec-893a-12de62d610b1,Dermisonics,West Reading,Pa,01/16/2003,,,,...,02/02/2005,2005-02-02,branch of an out-of-jurisdiction company,Irvine,Ca,,,,,0.0
4,66609.0,edc8991f-db93-452d-a2b2-74f19b7254c1,70867e08-cb8e-11eb-9615-121df0c29c1e,Electric Sheep Robotics,Alexandria,Va,08/10/2020,,,,...,04/05/2019,2020-04-03,branch of an out-of-jurisdiction company,Manteca,Ca,Sacramento,Ca,,,0.0


In [37]:
### start timer
t0=time.time()

### combine this new data with the previously concatenated data set and sort the records via ID
combineHomeOrgs=pd.concat([sub_noMatch3KeepFirst,mergeExactToStateRmDups],axis=0).sort_values(by=['ID']).reset_index(drop=True)

### count the number of unique assignee_ids and IDs in the data set
print("The number of unique assignee_ids is",combineHomeOrgs['assignee_id'].nunique())
print("The number of unique IDs is",combineHomeOrgs['ID'].nunique(),'\n')

### end timer and print total time
t1 = time.time()
total = t1-t0
print("Total time is %4f" % (total/60), "mins")

### print general stats and first 5 records for dataset
display(combineHomeOrgs.info(),combineHomeOrgs.head())  ### ready to concat

The number of unique assignee_ids is 138
The number of unique IDs is 138 

Total time is 0.000166 mins
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 138 entries, 0 to 137
Data columns (total 26 columns):
 #   Column               Non-Null Count  Dtype         
---  ------               --------------  -----         
 0   ID                   138 non-null    float64       
 1   assignee_id          138 non-null    object        
 2   location_id          138 non-null    object        
 3   organization         138 non-null    object        
 4   city                 138 non-null    object        
 5   state                138 non-null    object        
 6   dateOfFirstPat       138 non-null    object        
 7   serial               29 non-null     object        
 8   tradeCity            29 non-null     object        
 9   tradeState           29 non-null     object        
 10  ctrlEntity           29 non-null     object        
 11  nameScores           138 non-null    float64  

None

Unnamed: 0,ID,assignee_id,location_id,organization,city,state,dateOfFirstPat,serial,tradeCity,tradeState,...,minIncDateForOrgLoc,branch_status,address_city,address_state,agent_city,agent_state,data_city,data_state,match,cityMatch
0,1284.0,f3e6ae72-e2b1-4165-a0f7-9be773bf1e56,9447f283-cb8e-11eb-9615-121df0c29c1e,Tank Vision,Vineyard Haven,Ma,01/12/2017,,,,...,2019-01-08,branch of an out-of-jurisdiction company,Vineyard Haven,Ma,Providence,Ri,,,,1.0
1,2729.0,e5614631-da49-4351-a6c6-8f81358b767d,fb7257e4-cb8f-11eb-9615-121df0c29c1e,Fanamana,Washington,Dc,01/15/2014,,,,...,2016-09-02,branch of an out-of-jurisdiction company,,,,,,Dc,,0.0
2,3756.0,dbd08974-b2f8-4753-b720-27b2fdff8b65,419babc4-cb8e-11eb-9615-121df0c29c1e,Pendpac,Fairview,Ok,10/23/2001,,,,...,2003-08-07,branch of an out-of-jurisdiction company,,,,,,,,0.0
3,6626.0,bf924c03-bf8a-4fef-b6e7-6b54e3faad33,ce35392a-cb8e-11eb-9615-121df0c29c1e,Atx Telecom Systems,Naperville,Il,12/22/1995,,,,...,1995-03-27,branch of an out-of-jurisdiction company,Naperville,Il,,,,Il,,1.0
4,8632.0,abc2280b-bbad-4e1c-bd95-0388a3dd6f53,f4effe5f-cb8f-11eb-9615-121df0c29c1e,Blacklight Power,Cranbury,Nj,03/21/1997,,,,...,1991-12-02,branch of an out-of-jurisdiction company,,,,,,,0.0,


# Process Records with No Home Controlling Entities

In [38]:
### start timer
t0=time.time()

### create data frames to remove IDs and organizations from the data that did not have a controlling
### entity in the input data
combineHomeOrgsIDs=combineHomeOrgs.iloc[:,0:1]
combineHomeOrgsNames=combineHomeOrgs.iloc[:,3:4]

### end timer and print total time
t1 = time.time()
total = t1-t0
print("Total time is %4f" % (total/60), "mins")

### print general stats and first 5 records for dataset
display(combineHomeOrgsIDs.head(),combineHomeOrgsNames.head())

Total time is 0.000000 mins


Unnamed: 0,ID
0,1284.0
1,2729.0
2,3756.0
3,6626.0
4,8632.0


Unnamed: 0,organization
0,Tank Vision
1,Fanamana
2,Pendpac
3,Atx Telecom Systems
4,Blacklight Power


In [39]:
### start timer
t0=time.time()

### merge the two data sets against addedMinIncDates data set and select the records in the
### left_only side of the merge
addedMinIncDatesRmIDs=addedMinIncDates.merge(combineHomeOrgsIDs,on=['ID'],how='outer',indicator=True)
addedMinIncDatesRmIDs1=addedMinIncDatesRmIDs.loc[addedMinIncDatesRmIDs['_merge']=='left_only'].iloc[:,:24]

addedMinIncDatesRmIDsOrgs=addedMinIncDatesRmIDs1.merge(combineHomeOrgsNames,on=['organization'],how='outer',indicator=True)
addedMinIncDatesRmIDsOrgs1=addedMinIncDatesRmIDsOrgs.loc[addedMinIncDatesRmIDsOrgs['_merge']=='left_only'].iloc[:,:24]

### count the number of unique assignee_ids and IDs in the data set
print("The number of unique assignee_ids is",addedMinIncDatesRmIDsOrgs1['assignee_id'].nunique())
print("The number of unique IDs is",addedMinIncDatesRmIDsOrgs1['ID'].nunique(),'\n')

### end timer and print total time
t1 = time.time()
total = t1-t0
print("Total time is %4f" % (total/60), "mins")

### print general stats and first 5 records for dataset
display(addedMinIncDatesRmIDsOrgs1.info(),addedMinIncDatesRmIDsOrgs1.head())

The number of unique assignee_ids is 240
The number of unique IDs is 326 

Total time is 0.000382 mins
<class 'pandas.core.frame.DataFrame'>
Int64Index: 900 entries, 0 to 1374
Data columns (total 24 columns):
 #   Column               Non-Null Count  Dtype         
---  ------               --------------  -----         
 0   ID                   900 non-null    float64       
 1   assignee_id          900 non-null    object        
 2   location_id          900 non-null    object        
 3   organization         900 non-null    object        
 4   city                 900 non-null    object        
 5   state                900 non-null    object        
 6   dateOfFirstPat       900 non-null    object        
 7   serial               31 non-null     object        
 8   tradeCity            31 non-null     object        
 9   tradeState           31 non-null     object        
 10  ctrlEntity           31 non-null     object        
 11  nameScores           900 non-null    float64 

None

Unnamed: 0,ID,assignee_id,location_id,organization,city,state,dateOfFirstPat,serial,tradeCity,tradeState,...,subCntlEntity,minIncDateForOrg,minIncDateForOrgLoc,branch_status,address_city,address_state,agent_city,agent_state,data_city,data_state
0,875.0,f7b31db8-0a3d-425e-a79f-d84ba6333b60,de5d6510-cb90-11eb-9615-121df0c29c1e,The Egg Factory,Roanoke,Va,01/20/1999,,,,...,,12/22/1997,1997-12-22,,Roanoke,Va,Roanoke,Va,,
1,1667.0,f0447e02-fb80-46f7-a97c-cb4b2f398aeb,4b56fc31-cb8e-11eb-9615-121df0c29c1e,Infinibox,Westwood,Ma,12/28/2012,,,,...,,01/31/2013,2013-01-31,,,,,,,
2,2655.0,e62b4591-f071-4597-b791-35e9075d2af1,f97aecf0-cb90-11eb-9615-121df0c29c1e,Agricultural Aviation Engineering Company,Las Vegas,Nv,10/23/1980,,,,...,,05/01/1958,1958-05-01,,Las Vegas,Nv,,,Las Vegas,Nv
3,2820.0,e47bd68a-a118-4959-a81e-4ce6bb565b1c,f802c426-cb90-11eb-9615-121df0c29c1e,US Wind Farming,Abbott Park,Il,11/03/2008,,,,...,,02/10/2004,2004-02-10,,,,Las Vegas,Nv,,
4,3679.0,dc81aba7-c332-4ae9-bd0e-c84dc9ab9d9d,d9cd82bc-cb8e-11eb-9615-121df0c29c1e,Children'S Hosptial,Columbus,Oh,07/19/2012,,,,...,,06/06/1901,1901-06-06,,,,,,,


In [40]:
### start timer
t0=time.time()

### identify records where the state and subJurisCode states are the same by coding them
### 0 (no match) or 1 (match)
addedMinIncDatesRmIDsOrgs1['match']=np.where(addedMinIncDatesRmIDsOrgs1['state']==addedMinIncDatesRmIDsOrgs1['subJurisCode'],
                                             1,0)

### select records coded with a 1 and fuzzy match greater than 89 then sort the values
match1=addedMinIncDatesRmIDsOrgs1.loc[(addedMinIncDatesRmIDsOrgs1['match']==1) & 
                                      (addedMinIncDatesRmIDsOrgs1['nameScores']>89)]
match1a=match1.sort_values(by=['ID','nameScores','minIncDateForOrgLoc'],
                           ascending=[True,False,True])

### if duplicates exist, select the first record using the ID and organization features
match2=match1a.drop_duplicates(subset=['ID'],keep='first')
match3=match2.drop_duplicates(subset=['organization'],keep='first') ### ready to concat

### count the number of unique assignee_ids and IDs in the data set
print("The number of unique assignee_ids is",match3['assignee_id'].nunique())
print("The number of unique IDs is",match3['ID'].nunique(),'\n')

### create data sets containing the IDs and organization identified to have a match during this step
match1IDs=match3.iloc[:,0:1]
match1Orgs=match3.iloc[:,3:4]

### end timer and print total time
t1 = time.time()
total = t1-t0
print("Total time is %4f" % (total/60), "mins")

### print general stats and first 5 records for dataset
display(match1IDs.head(),match1Orgs.head())

The number of unique assignee_ids is 58
The number of unique IDs is 58 

Total time is 0.000183 mins


Unnamed: 0,ID
0,875.0
11,6434.0
39,12615.0
45,15885.0
57,20020.0


Unnamed: 0,organization
0,The Egg Factory
11,Lindsay/Barnett
39,Georight Industries
45,Sky Trax
57,Drais Pharmaceuticals


In [41]:
### start timer
t0=time.time()

### using the data sets created above, remove the records containing the IDs and organizations from the
### remaining records without a match
match1RmID=addedMinIncDatesRmIDsOrgs1.merge(match1IDs,on=['ID'],how='outer',indicator=True)
match1RmID1=match1RmID.loc[match1RmID['_merge']=='left_only'].iloc[:,:25]

match1RmIDOrg=match1RmID1.merge(match1Orgs,on=['organization'],how='outer',indicator=True)
match1RmIDOrg1=match1RmIDOrg.loc[match1RmIDOrg['_merge']=='left_only'].iloc[:,:25].reset_index(drop=True)

### count the number of unique assignee_ids and IDs in the data set
print("The number of unique assignee_ids is",match1RmIDOrg1['assignee_id'].nunique())
print("The number of unique IDs is",match1RmIDOrg1['ID'].nunique(),'\n')

### end timer and print total time
t1 = time.time()
total = t1-t0
print("Total time is %4f" % (total/60), "mins")

### print general stats and first 5 records for dataset
display(match1RmIDOrg1.info(),match1RmIDOrg1.head())

The number of unique assignee_ids is 182
The number of unique IDs is 253 

Total time is 0.000349 mins
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 713 entries, 0 to 712
Data columns (total 25 columns):
 #   Column               Non-Null Count  Dtype         
---  ------               --------------  -----         
 0   ID                   713 non-null    float64       
 1   assignee_id          713 non-null    object        
 2   location_id          713 non-null    object        
 3   organization         713 non-null    object        
 4   city                 713 non-null    object        
 5   state                713 non-null    object        
 6   dateOfFirstPat       713 non-null    object        
 7   serial               29 non-null     object        
 8   tradeCity            29 non-null     object        
 9   tradeState           29 non-null     object        
 10  ctrlEntity           29 non-null     object        
 11  nameScores           713 non-null    float64  

None

Unnamed: 0,ID,assignee_id,location_id,organization,city,state,dateOfFirstPat,serial,tradeCity,tradeState,...,minIncDateForOrg,minIncDateForOrgLoc,branch_status,address_city,address_state,agent_city,agent_state,data_city,data_state,match
0,1667.0,f0447e02-fb80-46f7-a97c-cb4b2f398aeb,4b56fc31-cb8e-11eb-9615-121df0c29c1e,Infinibox,Westwood,Ma,12/28/2012,,,,...,01/31/2013,2013-01-31,,,,,,,,0.0
1,2655.0,e62b4591-f071-4597-b791-35e9075d2af1,f97aecf0-cb90-11eb-9615-121df0c29c1e,Agricultural Aviation Engineering Company,Las Vegas,Nv,10/23/1980,,,,...,05/01/1958,1958-05-01,,Las Vegas,Nv,,,Las Vegas,Nv,0.0
2,2820.0,e47bd68a-a118-4959-a81e-4ce6bb565b1c,f802c426-cb90-11eb-9615-121df0c29c1e,US Wind Farming,Abbott Park,Il,11/03/2008,,,,...,02/10/2004,2004-02-10,,,,Las Vegas,Nv,,,0.0
3,3679.0,dc81aba7-c332-4ae9-bd0e-c84dc9ab9d9d,d9cd82bc-cb8e-11eb-9615-121df0c29c1e,Children'S Hosptial,Columbus,Oh,07/19/2012,,,,...,06/06/1901,1901-06-06,,,,,,,,0.0
4,3679.0,dc81aba7-c332-4ae9-bd0e-c84dc9ab9d9d,d9cd82bc-cb8e-11eb-9615-121df0c29c1e,Children'S Hosptial,Columbus,Oh,07/19/2012,,,,...,07/01/1969,1969-07-01,,,,Louisville,Ky,,,0.0


In [42]:
### start timer
t0=time.time()

### identify records and code as 1 if the state matches address_state (agent or data) and city matches
### address_city (agent or data); all records with no match is coded with a 0
match1RmIDOrg1['match']=np.where((match1RmIDOrg1['state']==match1RmIDOrg1['address_state']) & 
                                 (match1RmIDOrg1['city']==match1RmIDOrg1['address_city']) &
                                 (match1RmIDOrg1['nameScores']==100),1,
                        np.where((match1RmIDOrg1['state']==match1RmIDOrg1['agent_state']) & 
                                 (match1RmIDOrg1['city']==match1RmIDOrg1['agent_city']) &
                                 (match1RmIDOrg1['nameScores']==100),1,
                        np.where((match1RmIDOrg1['state']==match1RmIDOrg1['data_state']) & 
                                 (match1RmIDOrg1['city']==match1RmIDOrg1['data_city'])&
                                 (match1RmIDOrg1['nameScores']==100),1,0)))

### sort by the new coded feature and existing columns
match1RmIDOrg2=match1RmIDOrg1.sort_values(by=['match','ID','organization','nameScores',
                                               'minIncDateForOrg','minIncDateForOrgLoc'],
                                           ascending=[False,True,True,False,True,True])

### after sorting, drop duplicates and keep the first record using the ID and organization columns
match1RmIDOrg3=match1RmIDOrg2.drop_duplicates(subset=['ID'],keep='first')
match1RmIDOrg4=match1RmIDOrg3.drop_duplicates(subset=['organization'],keep='first').iloc[:,:24].reset_index(drop=True)

### count the number of unique assignee_ids and IDs in the data set
print("The number of unique assignee_ids is",match1RmIDOrg4['assignee_id'].nunique())
print("The number of unique IDs is",match1RmIDOrg4['ID'].nunique(),'\n')

### end timer and print total time
t1 = time.time()
total = t1-t0
print("Total time is %4f" % (total/60), "mins")

### print general stats and first 5 records for dataset
display(match1RmIDOrg4.info(),match1RmIDOrg4.head())

The number of unique assignee_ids is 182
The number of unique IDs is 182 

Total time is 0.000216 mins
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 182 entries, 0 to 181
Data columns (total 24 columns):
 #   Column               Non-Null Count  Dtype         
---  ------               --------------  -----         
 0   ID                   182 non-null    float64       
 1   assignee_id          182 non-null    object        
 2   location_id          182 non-null    object        
 3   organization         182 non-null    object        
 4   city                 182 non-null    object        
 5   state                182 non-null    object        
 6   dateOfFirstPat       182 non-null    object        
 7   serial               6 non-null      object        
 8   tradeCity            6 non-null      object        
 9   tradeState           6 non-null      object        
 10  ctrlEntity           6 non-null      object        
 11  nameScores           182 non-null    float64  

None

Unnamed: 0,ID,assignee_id,location_id,organization,city,state,dateOfFirstPat,serial,tradeCity,tradeState,...,subCntlEntity,minIncDateForOrg,minIncDateForOrgLoc,branch_status,address_city,address_state,agent_city,agent_state,data_city,data_state
0,2655.0,e62b4591-f071-4597-b791-35e9075d2af1,f97aecf0-cb90-11eb-9615-121df0c29c1e,Agricultural Aviation Engineering Company,Las Vegas,Nv,10/23/1980,,,,...,,05/01/1958,1958-05-01,,Las Vegas,Nv,,,Las Vegas,Nv
1,99203.0,c9a20e63-6ebe-48ed-83c0-1ffd503d1fc7,aff40e27-cb8e-11eb-9615-121df0c29c1e,Countrue,El Paso,Tx,11/10/2016,,,,...,,09/07/2012,2012-09-07,,El Paso,Tx,Spokane,Wa,El Paso,Tx
2,165894.0,7f4abbb7-b2cf-48a3-8f22-71733be030c6,ae04ea57-cb90-11eb-9615-121df0c29c1e,Enduro Systems,Omaha,Ne,07/14/1978,[73005630 73228945 73396573 73419193 74111781 ...,Houston,Tx,...,,03/16/1977,1977-03-16,branch of an out-of-jurisdiction company,Omaha,Ne,,,,
3,231604.0,33a3c840-d967-4f9b-88c8-235c347ac328,c3caa9fa-cb8e-11eb-9615-121df0c29c1e,Organistry,Fairfield,Nj,05/04/2012,,,,...,,07/07/2014,2014-07-07,,Fairfield,Nj,Austin,Tx,Fairfield,Nj
4,1667.0,f0447e02-fb80-46f7-a97c-cb4b2f398aeb,4b56fc31-cb8e-11eb-9615-121df0c29c1e,Infinibox,Westwood,Ma,12/28/2012,,,,...,,01/31/2013,2013-01-31,,,,,,,


In [43]:
### start timer
t0=time.time()

### combine data frames into a single data set
dfFinal=pd.concat([combineHomeOrgs,match3,match1RmIDOrg4],axis=0).sort_values(by=['ID']).iloc[:,:24].reset_index(drop=True)
dfFinal['minIncDateForOrgLoc']=dfFinal['minIncDateForOrgLoc'].dt.strftime('%m/%d/%Y')

### count the number of unique assignee_ids and IDs in the data set
print("The number of unique assignee_ids is",dfFinal['assignee_id'].nunique())
print("The number of unique IDs is",dfFinal['ID'].nunique(),'\n')

### end timer and print total time
t1 = time.time()
total = t1-t0
print("Total time is %4f" % (total/60), "mins")

### print general stats and first 5 records for dataset
display(dfFinal.info(),dfFinal.head())

The number of unique assignee_ids is 378
The number of unique IDs is 378 

Total time is 0.000216 mins
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 378 entries, 0 to 377
Data columns (total 24 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   ID                   378 non-null    float64
 1   assignee_id          378 non-null    object 
 2   location_id          378 non-null    object 
 3   organization         378 non-null    object 
 4   city                 378 non-null    object 
 5   state                378 non-null    object 
 6   dateOfFirstPat       378 non-null    object 
 7   serial               36 non-null     object 
 8   tradeCity            36 non-null     object 
 9   tradeState           36 non-null     object 
 10  ctrlEntity           36 non-null     object 
 11  nameScores           378 non-null    float64
 12  matchNames           378 non-null    object 
 13  subJurisCode         378 non-null    

None

Unnamed: 0,ID,assignee_id,location_id,organization,city,state,dateOfFirstPat,serial,tradeCity,tradeState,...,subCntlEntity,minIncDateForOrg,minIncDateForOrgLoc,branch_status,address_city,address_state,agent_city,agent_state,data_city,data_state
0,875.0,f7b31db8-0a3d-425e-a79f-d84ba6333b60,de5d6510-cb90-11eb-9615-121df0c29c1e,The Egg Factory,Roanoke,Va,01/20/1999,,,,...,,12/22/1997,12/22/1997,,Roanoke,Va,Roanoke,Va,,
1,1284.0,f3e6ae72-e2b1-4165-a0f7-9be773bf1e56,9447f283-cb8e-11eb-9615-121df0c29c1e,Tank Vision,Vineyard Haven,Ma,01/12/2017,,,,...,De,01/03/2017,01/08/2019,branch of an out-of-jurisdiction company,Vineyard Haven,Ma,Providence,Ri,,
2,1667.0,f0447e02-fb80-46f7-a97c-cb4b2f398aeb,4b56fc31-cb8e-11eb-9615-121df0c29c1e,Infinibox,Westwood,Ma,12/28/2012,,,,...,,01/31/2013,01/31/2013,,,,,,,
3,2655.0,e62b4591-f071-4597-b791-35e9075d2af1,f97aecf0-cb90-11eb-9615-121df0c29c1e,Agricultural Aviation Engineering Company,Las Vegas,Nv,10/23/1980,,,,...,,05/01/1958,05/01/1958,,Las Vegas,Nv,,,Las Vegas,Nv
4,2729.0,e5614631-da49-4351-a6c6-8f81358b767d,fb7257e4-cb8f-11eb-9615-121df0c29c1e,Fanamana,Washington,Dc,01/15/2014,,,,...,De,01/02/2013,09/02/2016,branch of an out-of-jurisdiction company,,,,,,Dc


In [44]:
### save the new dataset as a checkpoint
res_folder = "../csvResults/"
outpt_file = "OcResults1000WithUniqueAssigneeIDs.csv"
a_full = os.path.join(res_folder,outpt_file)

# dfFinal.to_csv(a_full,index=False)

# Add Coordinates to Locations

In [45]:
### start timer
t0=time.time()

### set the path for the input file and save to variable
res_folder = "../sourceFiles/"
input_file = "location_suppLatLong1.tsv"
a_full=os.path.join(res_folder,input_file)
print(a_full,"\n")

latLong=pd.read_csv(a_full,sep="\t").iloc[:,[1,2,3,5,6]]

### capitalize the first letter in each string for the city and state
latLong['city']=latLong['city'].str.title()
latLong['state']=latLong['state'].str.title()

### end timer and print total time
t1=time.time()
total=t1-t0
print("Total time is %4f" % (total/60), "mins")

### print general stats and first 5 records for dataset
display(latLong.info(null_counts=True),latLong.head())

../sourceFiles/location_suppLatLong1.tsv 

Total time is 0.049371 mins
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32945 entries, 0 to 32944
Data columns (total 5 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   location_id  32945 non-null  object 
 1   city         32945 non-null  object 
 2   state        32945 non-null  object 
 3   latitude     32945 non-null  float64
 4   longitude    32945 non-null  float64
dtypes: float64(2), object(3)
memory usage: 1.3+ MB


None

Unnamed: 0,location_id,city,state,latitude,longitude
0,00006da3-cb90-11eb-9615-121df0c29c1e,Alder,Mt,45.3247,-112.108
1,00047c6a-cb91-11eb-9615-121df0c29c1e,Knowles,Ok,36.8734,-100.193
2,0005c1ab-cb8f-11eb-9615-121df0c29c1e,Court Florency,Ky,38.0393,-84.4862
3,00171e6a-cb90-11eb-9615-121df0c29c1e,Mount Herman,Nj,39.6176,-74.5943
4,001ee951-cb91-11eb-9615-121df0c29c1e,Watauga County,Nc,36.2514,-81.7044


In [46]:
### start timer
t0=time.time()

### set the path for the input file and save to variable
res_folder = "../csvResults/"
input_file = "OcResults1000WithUniqueAssigneeIDs.csv"
a_full=os.path.join(res_folder,input_file)
print(a_full,"\n")

aCheckFinal=pd.read_csv(a_full)

### end timer and print total time
t1 = time.time()
total = t1-t0
print("Total time is %4f" % (total/60), "mins")

### print general stats and first 5 records for dataset 
display(aCheckFinal.info(null_counts=True),aCheckFinal.head())

../csvResults/OcResults1000WithUniqueAssigneeIDs.csv 

Total time is 0.037069 mins
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 378 entries, 0 to 377
Data columns (total 24 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   ID                   378 non-null    float64
 1   assignee_id          378 non-null    object 
 2   location_id          378 non-null    object 
 3   organization         378 non-null    object 
 4   city                 378 non-null    object 
 5   state                378 non-null    object 
 6   dateOfFirstPat       378 non-null    object 
 7   serial               36 non-null     object 
 8   tradeCity            36 non-null     object 
 9   tradeState           36 non-null     object 
 10  ctrlEntity           36 non-null     object 
 11  nameScores           378 non-null    float64
 12  matchNames           378 non-null    object 
 13  subJurisCode         378 non-null    object 
 14  subCntl

None

Unnamed: 0,ID,assignee_id,location_id,organization,city,state,dateOfFirstPat,serial,tradeCity,tradeState,...,subCntlEntity,minIncDateForOrg,minIncDateForOrgLoc,branch_status,address_city,address_state,agent_city,agent_state,data_city,data_state
0,875.0,f7b31db8-0a3d-425e-a79f-d84ba6333b60,de5d6510-cb90-11eb-9615-121df0c29c1e,The Egg Factory,Roanoke,Va,01/20/1999,,,,...,,12/22/1997,12/22/1997,,Roanoke,Va,Roanoke,Va,,
1,1284.0,f3e6ae72-e2b1-4165-a0f7-9be773bf1e56,9447f283-cb8e-11eb-9615-121df0c29c1e,Tank Vision,Vineyard Haven,Ma,01/12/2017,,,,...,De,01/03/2017,01/08/2019,branch of an out-of-jurisdiction company,Vineyard Haven,Ma,Providence,Ri,,
2,1667.0,f0447e02-fb80-46f7-a97c-cb4b2f398aeb,4b56fc31-cb8e-11eb-9615-121df0c29c1e,Infinibox,Westwood,Ma,12/28/2012,,,,...,,01/31/2013,01/31/2013,,,,,,,
3,2655.0,e62b4591-f071-4597-b791-35e9075d2af1,f97aecf0-cb90-11eb-9615-121df0c29c1e,Agricultural Aviation Engineering Company,Las Vegas,Nv,10/23/1980,,,,...,,05/01/1958,05/01/1958,,Las Vegas,Nv,,,Las Vegas,Nv
4,2729.0,e5614631-da49-4351-a6c6-8f81358b767d,fb7257e4-cb8f-11eb-9615-121df0c29c1e,Fanamana,Washington,Dc,01/15/2014,,,,...,De,01/02/2013,09/02/2016,branch of an out-of-jurisdiction company,,,,,,Dc


In [47]:
### start timer
t0=time.time()

### perform an outer merge to attach coordinates with the assigneeScores7 dataframe using the
### location_id, city, and state features as merging columns
cityLatLong=aCheckFinal.merge(latLong,on=['location_id','city','state'],how='outer',indicator=True)

### end timer and print total time
t1=time.time()
total=t1-t0
print("Total time is %4f" % (total/60), "mins")

### print general stats and first 5 records for dataset
display(cityLatLong.info(),cityLatLong.head())

Total time is 0.001197 mins
<class 'pandas.core.frame.DataFrame'>
Int64Index: 33047 entries, 0 to 33046
Data columns (total 27 columns):
 #   Column               Non-Null Count  Dtype   
---  ------               --------------  -----   
 0   ID                   378 non-null    float64 
 1   assignee_id          378 non-null    object  
 2   location_id          33047 non-null  object  
 3   organization         378 non-null    object  
 4   city                 33047 non-null  object  
 5   state                33047 non-null  object  
 6   dateOfFirstPat       378 non-null    object  
 7   serial               36 non-null     object  
 8   tradeCity            36 non-null     object  
 9   tradeState           36 non-null     object  
 10  ctrlEntity           36 non-null     object  
 11  nameScores           378 non-null    float64 
 12  matchNames           378 non-null    object  
 13  subJurisCode         378 non-null    object  
 14  subCntlEntity        143 non-null    objec

None

Unnamed: 0,ID,assignee_id,location_id,organization,city,state,dateOfFirstPat,serial,tradeCity,tradeState,...,branch_status,address_city,address_state,agent_city,agent_state,data_city,data_state,latitude,longitude,_merge
0,875.0,f7b31db8-0a3d-425e-a79f-d84ba6333b60,de5d6510-cb90-11eb-9615-121df0c29c1e,The Egg Factory,Roanoke,Va,01/20/1999,,,,...,,Roanoke,Va,Roanoke,Va,,,37.2738,-79.9602,both
1,1284.0,f3e6ae72-e2b1-4165-a0f7-9be773bf1e56,9447f283-cb8e-11eb-9615-121df0c29c1e,Tank Vision,Vineyard Haven,Ma,01/12/2017,,,,...,branch of an out-of-jurisdiction company,Vineyard Haven,Ma,Providence,Ri,,,41.4543,-70.6038,both
2,1667.0,f0447e02-fb80-46f7-a97c-cb4b2f398aeb,4b56fc31-cb8e-11eb-9615-121df0c29c1e,Infinibox,Westwood,Ma,12/28/2012,,,,...,,,,,,,,42.2187,-71.2026,both
3,2655.0,e62b4591-f071-4597-b791-35e9075d2af1,f97aecf0-cb90-11eb-9615-121df0c29c1e,Agricultural Aviation Engineering Company,Las Vegas,Nv,10/23/1980,,,,...,,Las Vegas,Nv,,,Las Vegas,Nv,36.1716,-115.1391,both
4,201692.0,576d9fad-23a2-444b-b1af-6adc4382a9af,f97aecf0-cb90-11eb-9615-121df0c29c1e,Dp Stud,Las Vegas,Nv,01/31/1997,,,,...,,,,,,,,36.1716,-115.1391,both


In [48]:
### start timer
t0=time.time()

### there was not a 1 to 1 match when setting the how parameter to inner in the merge function in the
### above cell. Therefore, the outer option was used for this parameter and the _merge column was
### used to select the records designated as both or left_only. These selections were concatenated
### to generate the cityLatLong dataset, but with the coordinates for the patentsview city feature
### added to the data
cityLatLong1=pd.concat([cityLatLong.loc[cityLatLong['_merge']=='both'],cityLatLong.loc[cityLatLong['_merge']=='left_only']],
                       axis=0)

### columns were selected for further processing and the lat/long names were renamed
cityLatLong2=cityLatLong1.iloc[:,[0,1,2,3,4,5,24,25,6,7,8,9,10,11,12,13,14,15,16,
                                  18,19,20,21,22,23]].sort_values(by=['ID']).reset_index(drop=True)
cityLatLong2['ID']=cityLatLong2['ID'].astype('int')
cityLatLong2.rename(columns={'latitude':'city_latitude','longitude':'city_longitude'},inplace=True)

### end timer and print total time
t1=time.time()
total=t1-t0
print("Total time is %4f" % (total/60), "mins")

### print general stats and first 5 records for dataset
display(cityLatLong2.info(),cityLatLong2.head())

Total time is 0.000166 mins
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 378 entries, 0 to 377
Data columns (total 25 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   ID                   378 non-null    int32  
 1   assignee_id          378 non-null    object 
 2   location_id          378 non-null    object 
 3   organization         378 non-null    object 
 4   city                 378 non-null    object 
 5   state                378 non-null    object 
 6   city_latitude        374 non-null    float64
 7   city_longitude       374 non-null    float64
 8   dateOfFirstPat       378 non-null    object 
 9   serial               36 non-null     object 
 10  tradeCity            36 non-null     object 
 11  tradeState           36 non-null     object 
 12  ctrlEntity           36 non-null     object 
 13  nameScores           378 non-null    float64
 14  matchNames           378 non-null    object 
 15  subJurisCode

None

Unnamed: 0,ID,assignee_id,location_id,organization,city,state,city_latitude,city_longitude,dateOfFirstPat,serial,...,subJurisCode,subCntlEntity,minIncDateForOrg,minIncDateForOrgLoc,address_city,address_state,agent_city,agent_state,data_city,data_state
0,875,f7b31db8-0a3d-425e-a79f-d84ba6333b60,de5d6510-cb90-11eb-9615-121df0c29c1e,The Egg Factory,Roanoke,Va,37.2738,-79.9602,01/20/1999,,...,Va,,12/22/1997,12/22/1997,Roanoke,Va,Roanoke,Va,,
1,1284,f3e6ae72-e2b1-4165-a0f7-9be773bf1e56,9447f283-cb8e-11eb-9615-121df0c29c1e,Tank Vision,Vineyard Haven,Ma,41.4543,-70.6038,01/12/2017,,...,Ri,De,01/03/2017,01/08/2019,Vineyard Haven,Ma,Providence,Ri,,
2,1667,f0447e02-fb80-46f7-a97c-cb4b2f398aeb,4b56fc31-cb8e-11eb-9615-121df0c29c1e,Infinibox,Westwood,Ma,42.2187,-71.2026,12/28/2012,,...,De,,01/31/2013,01/31/2013,,,,,,
3,2655,e62b4591-f071-4597-b791-35e9075d2af1,f97aecf0-cb90-11eb-9615-121df0c29c1e,Agricultural Aviation Engineering Company,Las Vegas,Nv,36.1716,-115.1391,10/23/1980,,...,Ca,,05/01/1958,05/01/1958,Las Vegas,Nv,,,Las Vegas,Nv
4,2729,e5614631-da49-4351-a6c6-8f81358b767d,fb7257e4-cb8f-11eb-9615-121df0c29c1e,Fanamana,Washington,Dc,38.8937,-76.9879,01/15/2014,,...,Tn,De,01/02/2013,09/02/2016,,,,,,Dc


In [49]:
### start timer
t0=time.time()

### perform an outer merge to attach coordinates with the cityLatLong2 dataframe using the
### city and state features as merging columns. This will add coordinates that correspond
### to the address_city/state features
addLatLong=cityLatLong2.merge(latLong,left_on=['address_city','address_state'],right_on=['city','state'],
                              how='outer',indicator=True)

### dropped features and renamed column names
addLatLong.drop(columns=['city_y','state_y','location_id_y'],inplace=True)
addLatLong.rename(columns={'city_x':'city','state_x':'state','location_id_x':'location_id'},inplace=True)

### end timer and print total time
t1=time.time()
total=t1-t0
print("Total time is %4f" % (total/60), "mins")

### print general stats and first 5 records for dataset
display(addLatLong.info(),addLatLong.head())

Total time is 0.001230 mins
<class 'pandas.core.frame.DataFrame'>
Int64Index: 33165 entries, 0 to 33164
Data columns (total 28 columns):
 #   Column               Non-Null Count  Dtype   
---  ------               --------------  -----   
 0   ID                   379 non-null    float64 
 1   assignee_id          379 non-null    object  
 2   location_id          379 non-null    object  
 3   organization         379 non-null    object  
 4   city                 379 non-null    object  
 5   state                379 non-null    object  
 6   city_latitude        375 non-null    float64 
 7   city_longitude       375 non-null    float64 
 8   dateOfFirstPat       379 non-null    object  
 9   serial               36 non-null     object  
 10  tradeCity            36 non-null     object  
 11  tradeState           36 non-null     object  
 12  ctrlEntity           36 non-null     object  
 13  nameScores           379 non-null    float64 
 14  matchNames           379 non-null    objec

None

Unnamed: 0,ID,assignee_id,location_id,organization,city,state,city_latitude,city_longitude,dateOfFirstPat,serial,...,minIncDateForOrgLoc,address_city,address_state,agent_city,agent_state,data_city,data_state,latitude,longitude,_merge
0,875.0,f7b31db8-0a3d-425e-a79f-d84ba6333b60,de5d6510-cb90-11eb-9615-121df0c29c1e,The Egg Factory,Roanoke,Va,37.2738,-79.9602,01/20/1999,,...,12/22/1997,Roanoke,Va,Roanoke,Va,,,37.2738,-79.9602,both
1,1284.0,f3e6ae72-e2b1-4165-a0f7-9be773bf1e56,9447f283-cb8e-11eb-9615-121df0c29c1e,Tank Vision,Vineyard Haven,Ma,41.4543,-70.6038,01/12/2017,,...,01/08/2019,Vineyard Haven,Ma,Providence,Ri,,,41.4543,-70.6038,both
2,1667.0,f0447e02-fb80-46f7-a97c-cb4b2f398aeb,4b56fc31-cb8e-11eb-9615-121df0c29c1e,Infinibox,Westwood,Ma,42.2187,-71.2026,12/28/2012,,...,01/31/2013,,,,,,,,,left_only
3,2729.0,e5614631-da49-4351-a6c6-8f81358b767d,fb7257e4-cb8f-11eb-9615-121df0c29c1e,Fanamana,Washington,Dc,38.8937,-76.9879,01/15/2014,,...,09/02/2016,,,,,,Dc,,,left_only
4,2820.0,e47bd68a-a118-4959-a81e-4ce6bb565b1c,f802c426-cb90-11eb-9615-121df0c29c1e,US Wind Farming,Abbott Park,Il,41.6027,-87.8557,11/03/2008,,...,02/10/2004,,,Las Vegas,Nv,,,,,left_only


In [50]:
### start timer
t0=time.time()

### there was not a 1 to 1 match when setting the how parameter to inner in the merge function in the
### above cell. Therefore, the outer option was used for this parameter and the _merge column was
### used to select the records designated as both or left_only. These selections were concatenated
### to generate the cityLatLong dataset, but with the coordinates for the patentsview city feature
### added to the data
addLatLong1=pd.concat([addLatLong.loc[addLatLong['_merge']=='both'],addLatLong.loc[addLatLong['_merge']=='left_only']],
                       axis=0).sort_values(by=['ID']).reset_index(drop=True)

addLatLong2=addLatLong1.iloc[:,[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,25,26,21,22,23,24]]
addLatLong2.rename(columns={'latitude':'address_latitude','longitude':'address_longitude'},inplace=True)

addLatLong2['ID']=addLatLong2['ID'].astype(int)

### end timer and print total time
t1=time.time()
total=t1-t0
print("Total time is %4f" % (total/60), "mins")

### print general stats and first 5 records for dataset
display(addLatLong2.info(),addLatLong2.head())

Total time is 0.000133 mins
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 379 entries, 0 to 378
Data columns (total 27 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   ID                   379 non-null    int32  
 1   assignee_id          379 non-null    object 
 2   location_id          379 non-null    object 
 3   organization         379 non-null    object 
 4   city                 379 non-null    object 
 5   state                379 non-null    object 
 6   city_latitude        375 non-null    float64
 7   city_longitude       375 non-null    float64
 8   dateOfFirstPat       379 non-null    object 
 9   serial               36 non-null     object 
 10  tradeCity            36 non-null     object 
 11  tradeState           36 non-null     object 
 12  ctrlEntity           36 non-null     object 
 13  nameScores           379 non-null    float64
 14  matchNames           379 non-null    object 
 15  subJurisCode

None

Unnamed: 0,ID,assignee_id,location_id,organization,city,state,city_latitude,city_longitude,dateOfFirstPat,serial,...,minIncDateForOrg,minIncDateForOrgLoc,address_city,address_state,address_latitude,address_longitude,agent_city,agent_state,data_city,data_state
0,875,f7b31db8-0a3d-425e-a79f-d84ba6333b60,de5d6510-cb90-11eb-9615-121df0c29c1e,The Egg Factory,Roanoke,Va,37.2738,-79.9602,01/20/1999,,...,12/22/1997,12/22/1997,Roanoke,Va,37.2738,-79.9602,Roanoke,Va,,
1,1284,f3e6ae72-e2b1-4165-a0f7-9be773bf1e56,9447f283-cb8e-11eb-9615-121df0c29c1e,Tank Vision,Vineyard Haven,Ma,41.4543,-70.6038,01/12/2017,,...,01/03/2017,01/08/2019,Vineyard Haven,Ma,41.4543,-70.6038,Providence,Ri,,
2,1667,f0447e02-fb80-46f7-a97c-cb4b2f398aeb,4b56fc31-cb8e-11eb-9615-121df0c29c1e,Infinibox,Westwood,Ma,42.2187,-71.2026,12/28/2012,,...,01/31/2013,01/31/2013,,,,,,,,
3,2655,e62b4591-f071-4597-b791-35e9075d2af1,f97aecf0-cb90-11eb-9615-121df0c29c1e,Agricultural Aviation Engineering Company,Las Vegas,Nv,36.1716,-115.1391,10/23/1980,,...,05/01/1958,05/01/1958,Las Vegas,Nv,36.1716,-115.1391,,,Las Vegas,Nv
4,2655,e62b4591-f071-4597-b791-35e9075d2af1,f97aecf0-cb90-11eb-9615-121df0c29c1e,Agricultural Aviation Engineering Company,Las Vegas,Nv,36.1716,-115.1391,10/23/1980,,...,05/01/1958,05/01/1958,Las Vegas,Nv,36.1716,-115.1391,,,Las Vegas,Nv


In [51]:
### start timer
t0=time.time()

### perform an outer merge to attach coordinates with the addLatLong2 dataframe using the
### city and state features as merging columns. This will add coordinates that correspond
### to the agent_city/state features
agtLatLong=addLatLong2.merge(latLong,left_on=['agent_city','agent_state'],right_on=['city','state'],
                              how='outer',indicator=True)

### dropped features and renamed column names
agtLatLong.drop(columns=['city_y','state_y','location_id_y'],inplace=True)
agtLatLong.rename(columns={'city_x':'city','state_x':'state','location_id_x':'location_id'},inplace=True)

### end timer and print total time
t1=time.time()
total=t1-t0
print("Total time is %4f" % (total/60), "mins")

### print general stats and first 5 records for dataset
display(agtLatLong.info(),agtLatLong.head())

Total time is 0.001479 mins
<class 'pandas.core.frame.DataFrame'>
Int64Index: 33286 entries, 0 to 33285
Data columns (total 30 columns):
 #   Column               Non-Null Count  Dtype   
---  ------               --------------  -----   
 0   ID                   382 non-null    float64 
 1   assignee_id          382 non-null    object  
 2   location_id          382 non-null    object  
 3   organization         382 non-null    object  
 4   city                 382 non-null    object  
 5   state                382 non-null    object  
 6   city_latitude        378 non-null    float64 
 7   city_longitude       378 non-null    float64 
 8   dateOfFirstPat       382 non-null    object  
 9   serial               36 non-null     object  
 10  tradeCity            36 non-null     object  
 11  tradeState           36 non-null     object  
 12  ctrlEntity           36 non-null     object  
 13  nameScores           382 non-null    float64 
 14  matchNames           382 non-null    objec

None

Unnamed: 0,ID,assignee_id,location_id,organization,city,state,city_latitude,city_longitude,dateOfFirstPat,serial,...,address_state,address_latitude,address_longitude,agent_city,agent_state,data_city,data_state,latitude,longitude,_merge
0,875.0,f7b31db8-0a3d-425e-a79f-d84ba6333b60,de5d6510-cb90-11eb-9615-121df0c29c1e,The Egg Factory,Roanoke,Va,37.2738,-79.9602,01/20/1999,,...,Va,37.2738,-79.9602,Roanoke,Va,,,37.2738,-79.9602,both
1,1284.0,f3e6ae72-e2b1-4165-a0f7-9be773bf1e56,9447f283-cb8e-11eb-9615-121df0c29c1e,Tank Vision,Vineyard Haven,Ma,41.4543,-70.6038,01/12/2017,,...,Ma,41.4543,-70.6038,Providence,Ri,,,41.8171,-71.4282,both
2,1667.0,f0447e02-fb80-46f7-a97c-cb4b2f398aeb,4b56fc31-cb8e-11eb-9615-121df0c29c1e,Infinibox,Westwood,Ma,42.2187,-71.2026,12/28/2012,,...,,,,,,,,,,left_only
3,2655.0,e62b4591-f071-4597-b791-35e9075d2af1,f97aecf0-cb90-11eb-9615-121df0c29c1e,Agricultural Aviation Engineering Company,Las Vegas,Nv,36.1716,-115.1391,10/23/1980,,...,Nv,36.1716,-115.1391,,,Las Vegas,Nv,,,left_only
4,2655.0,e62b4591-f071-4597-b791-35e9075d2af1,f97aecf0-cb90-11eb-9615-121df0c29c1e,Agricultural Aviation Engineering Company,Las Vegas,Nv,36.1716,-115.1391,10/23/1980,,...,Nv,36.1716,-115.1391,,,Las Vegas,Nv,,,left_only


In [52]:
### start timer
t0=time.time()

### there was not a 1 to 1 match when setting the how parameter to inner in the merge function in the
### above cell. Therefore, the outer option was used for this parameter and the _merge column was
### used to select the records designated as both or left_only. These selections were concatenated
### to generate the cityLatLong dataset, but with the coordinates for the patentsview city feature
### added to the data
agtLatLong1=pd.concat([agtLatLong.loc[agtLatLong['_merge']=='both'],agtLatLong.loc[agtLatLong['_merge']=='left_only']],
                       axis=0).sort_values(by=['ID']).reset_index(drop=True)

### dropped features and renamed column names
agtLatLong2=agtLatLong1.iloc[:,[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,27,28,25,26]]
agtLatLong2.rename(columns={'latitude':'agent_latitude','longitude':'agent_longitude'},inplace=True)

agtLatLong2['ID']=agtLatLong2['ID'].astype(int)

### end timer and print total time
t1=time.time()
total=t1-t0
print("Total time is %4f" % (total/60), "mins")

### print general stats and first 5 records for dataset
display(agtLatLong2.info(),agtLatLong2.head())

Total time is 0.000116 mins
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 382 entries, 0 to 381
Data columns (total 29 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   ID                   382 non-null    int32  
 1   assignee_id          382 non-null    object 
 2   location_id          382 non-null    object 
 3   organization         382 non-null    object 
 4   city                 382 non-null    object 
 5   state                382 non-null    object 
 6   city_latitude        378 non-null    float64
 7   city_longitude       378 non-null    float64
 8   dateOfFirstPat       382 non-null    object 
 9   serial               36 non-null     object 
 10  tradeCity            36 non-null     object 
 11  tradeState           36 non-null     object 
 12  ctrlEntity           36 non-null     object 
 13  nameScores           382 non-null    float64
 14  matchNames           382 non-null    object 
 15  subJurisCode

None

Unnamed: 0,ID,assignee_id,location_id,organization,city,state,city_latitude,city_longitude,dateOfFirstPat,serial,...,address_city,address_state,address_latitude,address_longitude,agent_city,agent_state,agent_latitude,agent_longitude,data_city,data_state
0,875,f7b31db8-0a3d-425e-a79f-d84ba6333b60,de5d6510-cb90-11eb-9615-121df0c29c1e,The Egg Factory,Roanoke,Va,37.2738,-79.9602,01/20/1999,,...,Roanoke,Va,37.2738,-79.9602,Roanoke,Va,37.2738,-79.9602,,
1,1284,f3e6ae72-e2b1-4165-a0f7-9be773bf1e56,9447f283-cb8e-11eb-9615-121df0c29c1e,Tank Vision,Vineyard Haven,Ma,41.4543,-70.6038,01/12/2017,,...,Vineyard Haven,Ma,41.4543,-70.6038,Providence,Ri,41.8171,-71.4282,,
2,1667,f0447e02-fb80-46f7-a97c-cb4b2f398aeb,4b56fc31-cb8e-11eb-9615-121df0c29c1e,Infinibox,Westwood,Ma,42.2187,-71.2026,12/28/2012,,...,,,,,,,,,,
3,2655,e62b4591-f071-4597-b791-35e9075d2af1,f97aecf0-cb90-11eb-9615-121df0c29c1e,Agricultural Aviation Engineering Company,Las Vegas,Nv,36.1716,-115.1391,10/23/1980,,...,Las Vegas,Nv,36.1716,-115.1391,,,,,Las Vegas,Nv
4,2655,e62b4591-f071-4597-b791-35e9075d2af1,f97aecf0-cb90-11eb-9615-121df0c29c1e,Agricultural Aviation Engineering Company,Las Vegas,Nv,36.1716,-115.1391,10/23/1980,,...,Las Vegas,Nv,36.1716,-115.1391,,,,,Las Vegas,Nv


In [53]:
### start timer
t0=time.time()

### perform an outer merge to attach coordinates with the agtLatLong2 dataframe using the
### city and state features as merging columns. This will add coordinates that correspond
### to the data_city/state features
dataLatLong=agtLatLong2.merge(latLong,left_on=['data_city','data_state'],right_on=['city','state'],
                               how='outer',indicator=True)

### dropped features and renamed column names
dataLatLong.drop(columns=['city_y','state_y','location_id_y'],inplace=True)
dataLatLong.rename(columns={'city_x':'city','state_x':'state','location_id_x':'location_id'},inplace=True)

### end timer and print total time
t1=time.time()
total=t1-t0
print("Total time is %4f" % (total/60), "mins")

### print general stats and first 5 records for dataset
display(dataLatLong.info(),dataLatLong.head())

Total time is 0.001330 mins
<class 'pandas.core.frame.DataFrame'>
Int64Index: 33255 entries, 0 to 33254
Data columns (total 32 columns):
 #   Column               Non-Null Count  Dtype   
---  ------               --------------  -----   
 0   ID                   384 non-null    float64 
 1   assignee_id          384 non-null    object  
 2   location_id          384 non-null    object  
 3   organization         384 non-null    object  
 4   city                 384 non-null    object  
 5   state                384 non-null    object  
 6   city_latitude        380 non-null    float64 
 7   city_longitude       380 non-null    float64 
 8   dateOfFirstPat       384 non-null    object  
 9   serial               36 non-null     object  
 10  tradeCity            36 non-null     object  
 11  tradeState           36 non-null     object  
 12  ctrlEntity           36 non-null     object  
 13  nameScores           384 non-null    float64 
 14  matchNames           384 non-null    objec

None

Unnamed: 0,ID,assignee_id,location_id,organization,city,state,city_latitude,city_longitude,dateOfFirstPat,serial,...,address_longitude,agent_city,agent_state,agent_latitude,agent_longitude,data_city,data_state,latitude,longitude,_merge
0,875.0,f7b31db8-0a3d-425e-a79f-d84ba6333b60,de5d6510-cb90-11eb-9615-121df0c29c1e,The Egg Factory,Roanoke,Va,37.2738,-79.9602,01/20/1999,,...,-79.9602,Roanoke,Va,37.2738,-79.9602,,,,,left_only
1,1284.0,f3e6ae72-e2b1-4165-a0f7-9be773bf1e56,9447f283-cb8e-11eb-9615-121df0c29c1e,Tank Vision,Vineyard Haven,Ma,41.4543,-70.6038,01/12/2017,,...,-70.6038,Providence,Ri,41.8171,-71.4282,,,,,left_only
2,1667.0,f0447e02-fb80-46f7-a97c-cb4b2f398aeb,4b56fc31-cb8e-11eb-9615-121df0c29c1e,Infinibox,Westwood,Ma,42.2187,-71.2026,12/28/2012,,...,,,,,,,,,,left_only
3,2820.0,e47bd68a-a118-4959-a81e-4ce6bb565b1c,f802c426-cb90-11eb-9615-121df0c29c1e,US Wind Farming,Abbott Park,Il,41.6027,-87.8557,11/03/2008,,...,,Las Vegas,Nv,36.1716,-115.1391,,,,,left_only
4,2820.0,e47bd68a-a118-4959-a81e-4ce6bb565b1c,f802c426-cb90-11eb-9615-121df0c29c1e,US Wind Farming,Abbott Park,Il,41.6027,-87.8557,11/03/2008,,...,,Las Vegas,Nv,36.1716,-115.1391,,,,,left_only


In [54]:
### start timer
t0=time.time()

### there was not a 1 to 1 match when setting the how parameter to inner in the merge function in the
### above cell. Therefore, the outer option was used for this parameter and the _merge column was
### used to select the records designated as both or left_only. These selections were concatenated
### to generate the cityLatLong dataset, but with the coordinates for the patentsview city feature
### added to the data
dataLatLong1=pd.concat([dataLatLong.loc[dataLatLong['_merge']=='both'],dataLatLong.loc[dataLatLong['_merge']=='left_only']],
                        axis=0).sort_values(by=['ID']).reset_index(drop=True)

### dropped features and renamed column names
dataLatLong2=dataLatLong1.iloc[:,[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30]]
dataLatLong2.rename(columns={'latitude':'data_latitude','longitude':'data_longitude'},inplace=True)

dataLatLong2['ID']=dataLatLong2['ID'].astype(int)

### end timer and print total time
t1=time.time()
total=t1-t0
print("Total time is %4f" % (total/60), "mins")

### print general stats and first 5 records for dataset
display(dataLatLong2.info(),dataLatLong2.head())

Total time is 0.000116 mins
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 384 entries, 0 to 383
Data columns (total 31 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   ID                   384 non-null    int32  
 1   assignee_id          384 non-null    object 
 2   location_id          384 non-null    object 
 3   organization         384 non-null    object 
 4   city                 384 non-null    object 
 5   state                384 non-null    object 
 6   city_latitude        380 non-null    float64
 7   city_longitude       380 non-null    float64
 8   dateOfFirstPat       384 non-null    object 
 9   serial               36 non-null     object 
 10  tradeCity            36 non-null     object 
 11  tradeState           36 non-null     object 
 12  ctrlEntity           36 non-null     object 
 13  nameScores           384 non-null    float64
 14  matchNames           384 non-null    object 
 15  subJurisCode

None

Unnamed: 0,ID,assignee_id,location_id,organization,city,state,city_latitude,city_longitude,dateOfFirstPat,serial,...,address_latitude,address_longitude,agent_city,agent_state,agent_latitude,agent_longitude,data_city,data_state,data_latitude,data_longitude
0,875,f7b31db8-0a3d-425e-a79f-d84ba6333b60,de5d6510-cb90-11eb-9615-121df0c29c1e,The Egg Factory,Roanoke,Va,37.2738,-79.9602,01/20/1999,,...,37.2738,-79.9602,Roanoke,Va,37.2738,-79.9602,,,,
1,1284,f3e6ae72-e2b1-4165-a0f7-9be773bf1e56,9447f283-cb8e-11eb-9615-121df0c29c1e,Tank Vision,Vineyard Haven,Ma,41.4543,-70.6038,01/12/2017,,...,41.4543,-70.6038,Providence,Ri,41.8171,-71.4282,,,,
2,1667,f0447e02-fb80-46f7-a97c-cb4b2f398aeb,4b56fc31-cb8e-11eb-9615-121df0c29c1e,Infinibox,Westwood,Ma,42.2187,-71.2026,12/28/2012,,...,,,,,,,,,,
3,2655,e62b4591-f071-4597-b791-35e9075d2af1,f97aecf0-cb90-11eb-9615-121df0c29c1e,Agricultural Aviation Engineering Company,Las Vegas,Nv,36.1716,-115.1391,10/23/1980,,...,36.1716,-115.1391,,,,,Las Vegas,Nv,36.1716,-115.1391
4,2655,e62b4591-f071-4597-b791-35e9075d2af1,f97aecf0-cb90-11eb-9615-121df0c29c1e,Agricultural Aviation Engineering Company,Las Vegas,Nv,36.1716,-115.1391,10/23/1980,,...,36.1716,-115.1391,,,,,Las Vegas,Nv,36.1716,-115.1391


In [55]:
### save the new dataset as a checkpoint
res_folder = "../csvResults/"
outpt_file = "OcResults1000WithUniqueAssigneeIDsAddCoor.csv"
a_full = os.path.join(res_folder,outpt_file)

# dataLatLong2.to_csv(a_full,index=False)

# Calculate Distances Between PatentsView Cities and OpenCorporate Cities

In [57]:
### import the required library to calculate distances
from geopy import distance

### start timer
t0=time.time()

### the distance between cities are calculated by utilizing the city_lat/city_long and the add_lat/add_long,
### agt_lat/agt_long, and data_lat/data_long features.
cityAddrCor=[]
cityAgtCor=[]
cityDataCor=[]

### each for loop below calculates the distance between the city found in PatentsView and the
### cities from the address_city, agent_city, and data_city. The try/except is included to deal
### with exception errors
for p in range(len(dataLatLong2)):
    
    try:
        cityCor = (dataLatLong2.iloc[p,6], dataLatLong2.iloc[p,7])
        cityAddressCor = (dataLatLong2.iloc[p,21], dataLatLong2.iloc[p,22])

        cityAddrCor.append(distance.distance(cityCor, cityAddressCor).miles)
    
    except:
        cityAddrCor.append(np.nan)
        
for p in range(len(dataLatLong2)):
    
    try:
        cityCor = (dataLatLong2.iloc[p,6], dataLatLong2.iloc[p,7])
        cityAgentCor = (dataLatLong2.iloc[p,25], dataLatLong2.iloc[p,26])

        cityAgtCor.append(distance.distance(cityCor, cityAgentCor).miles)
    
    except:
        cityAgtCor.append(np.nan)

for p in range(len(dataLatLong2)):
    
    try:
        cityCor = (dataLatLong2.iloc[p,6], dataLatLong2.iloc[p,7])
        cityDatCor = (dataLatLong2.iloc[p,29], dataLatLong2.iloc[p,30])

        cityDataCor.append(distance.distance(cityCor, cityDatCor).miles)
    
    except:
        cityDataCor.append(np.nan)
        
### add the distance to the input dataframe and round the value
cityAddrCor1=[round(num, 1) for num in cityAddrCor]
cityAgtCor1=[round(num1, 1) for num1 in cityAgtCor]
cityDataCor1=[round(num1, 1) for num1 in cityDataCor]

dataLatLong2['cityToAddrDistance'] = cityAddrCor1
dataLatLong2['cityToAgtDistance'] = cityAgtCor1
dataLatLong2['cityToDataDistance'] = cityDataCor1

### end timer and print total time
t1=time.time()
total=t1-t0
print("Total time is %4f" % (total/60), "mins")

### print general stats and first 5 records for dataset
display(dataLatLong2.info(),dataLatLong2.head())

Total time is 0.002676 mins
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 384 entries, 0 to 383
Data columns (total 34 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   ID                   384 non-null    int32  
 1   assignee_id          384 non-null    object 
 2   location_id          384 non-null    object 
 3   organization         384 non-null    object 
 4   city                 384 non-null    object 
 5   state                384 non-null    object 
 6   city_latitude        380 non-null    float64
 7   city_longitude       380 non-null    float64
 8   dateOfFirstPat       384 non-null    object 
 9   serial               36 non-null     object 
 10  tradeCity            36 non-null     object 
 11  tradeState           36 non-null     object 
 12  ctrlEntity           36 non-null     object 
 13  nameScores           384 non-null    float64
 14  matchNames           384 non-null    object 
 15  subJurisCode

None

Unnamed: 0,ID,assignee_id,location_id,organization,city,state,city_latitude,city_longitude,dateOfFirstPat,serial,...,agent_state,agent_latitude,agent_longitude,data_city,data_state,data_latitude,data_longitude,cityToAddrDistance,cityToAgtDistance,cityToDataDistance
0,875,f7b31db8-0a3d-425e-a79f-d84ba6333b60,de5d6510-cb90-11eb-9615-121df0c29c1e,The Egg Factory,Roanoke,Va,37.2738,-79.9602,01/20/1999,,...,Va,37.2738,-79.9602,,,,,0.0,0.0,
1,1284,f3e6ae72-e2b1-4165-a0f7-9be773bf1e56,9447f283-cb8e-11eb-9615-121df0c29c1e,Tank Vision,Vineyard Haven,Ma,41.4543,-70.6038,01/12/2017,,...,Ri,41.8171,-71.4282,,,,,0.0,49.5,
2,1667,f0447e02-fb80-46f7-a97c-cb4b2f398aeb,4b56fc31-cb8e-11eb-9615-121df0c29c1e,Infinibox,Westwood,Ma,42.2187,-71.2026,12/28/2012,,...,,,,,,,,,,
3,2655,e62b4591-f071-4597-b791-35e9075d2af1,f97aecf0-cb90-11eb-9615-121df0c29c1e,Agricultural Aviation Engineering Company,Las Vegas,Nv,36.1716,-115.1391,10/23/1980,,...,,,,Las Vegas,Nv,36.1716,-115.1391,0.0,,0.0
4,2655,e62b4591-f071-4597-b791-35e9075d2af1,f97aecf0-cb90-11eb-9615-121df0c29c1e,Agricultural Aviation Engineering Company,Las Vegas,Nv,36.1716,-115.1391,10/23/1980,,...,,,,Las Vegas,Nv,36.1716,-115.1391,0.0,,0.0


# Calculate Difference Between Incorporation and First Patent Granted Dates

In [58]:
### import the required library to calculate time differences
from datetime import timedelta

### start timer
t0=time.time()

### the time between the first patent granted and incorporation date is calculated and appended to the
### final dataset before scoring
dataLatLong2['dateOfFirstPat'] = pd.to_datetime(dataLatLong2['dateOfFirstPat'])
dataLatLong2['minIncDateForOrg'] = pd.to_datetime(dataLatLong2['minIncDateForOrg'],errors='coerce')

dataLatLong2['dateDiff']=abs(dataLatLong2['minIncDateForOrg'] - dataLatLong2['dateOfFirstPat'])/ timedelta(days=365)
dataLatLong2['dateDiff']=[round(num, 2) for num in dataLatLong2['dateDiff']]

### end timer and print total time
t1=time.time()
total=t1-t0
print("Total time is %4f" % (total/60), "mins")

### print general stats and first 5 records for dataset
display(dataLatLong2.info(),dataLatLong2.head())

Total time is 0.000050 mins
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 384 entries, 0 to 383
Data columns (total 35 columns):
 #   Column               Non-Null Count  Dtype         
---  ------               --------------  -----         
 0   ID                   384 non-null    int32         
 1   assignee_id          384 non-null    object        
 2   location_id          384 non-null    object        
 3   organization         384 non-null    object        
 4   city                 384 non-null    object        
 5   state                384 non-null    object        
 6   city_latitude        380 non-null    float64       
 7   city_longitude       380 non-null    float64       
 8   dateOfFirstPat       384 non-null    datetime64[ns]
 9   serial               36 non-null     object        
 10  tradeCity            36 non-null     object        
 11  tradeState           36 non-null     object        
 12  ctrlEntity           36 non-null     object        
 13  nameSco

None

Unnamed: 0,ID,assignee_id,location_id,organization,city,state,city_latitude,city_longitude,dateOfFirstPat,serial,...,agent_latitude,agent_longitude,data_city,data_state,data_latitude,data_longitude,cityToAddrDistance,cityToAgtDistance,cityToDataDistance,dateDiff
0,875,f7b31db8-0a3d-425e-a79f-d84ba6333b60,de5d6510-cb90-11eb-9615-121df0c29c1e,The Egg Factory,Roanoke,Va,37.2738,-79.9602,1999-01-20,,...,37.2738,-79.9602,,,,,0.0,0.0,,1.08
1,1284,f3e6ae72-e2b1-4165-a0f7-9be773bf1e56,9447f283-cb8e-11eb-9615-121df0c29c1e,Tank Vision,Vineyard Haven,Ma,41.4543,-70.6038,2017-01-12,,...,41.8171,-71.4282,,,,,0.0,49.5,,0.02
2,1667,f0447e02-fb80-46f7-a97c-cb4b2f398aeb,4b56fc31-cb8e-11eb-9615-121df0c29c1e,Infinibox,Westwood,Ma,42.2187,-71.2026,2012-12-28,,...,,,,,,,,,,0.09
3,2655,e62b4591-f071-4597-b791-35e9075d2af1,f97aecf0-cb90-11eb-9615-121df0c29c1e,Agricultural Aviation Engineering Company,Las Vegas,Nv,36.1716,-115.1391,1980-10-23,,...,,,Las Vegas,Nv,36.1716,-115.1391,0.0,,0.0,22.5
4,2655,e62b4591-f071-4597-b791-35e9075d2af1,f97aecf0-cb90-11eb-9615-121df0c29c1e,Agricultural Aviation Engineering Company,Las Vegas,Nv,36.1716,-115.1391,1980-10-23,,...,,,Las Vegas,Nv,36.1716,-115.1391,0.0,,0.0,22.5


# Find and Drop Duplicates

In [59]:
### start timer
t0=time.time()

dataLatLong3=dataLatLong2.drop_duplicates(subset=['assignee_id'],keep='first')

### end timer and print total time
t1=time.time()
total=t1-t0
print("Total time is %4f" % (total/60), "mins")

display(dataLatLong3.info(),dataLatLong3.head())

Total time is 0.000017 mins
<class 'pandas.core.frame.DataFrame'>
Int64Index: 378 entries, 0 to 383
Data columns (total 35 columns):
 #   Column               Non-Null Count  Dtype         
---  ------               --------------  -----         
 0   ID                   378 non-null    int32         
 1   assignee_id          378 non-null    object        
 2   location_id          378 non-null    object        
 3   organization         378 non-null    object        
 4   city                 378 non-null    object        
 5   state                378 non-null    object        
 6   city_latitude        374 non-null    float64       
 7   city_longitude       374 non-null    float64       
 8   dateOfFirstPat       378 non-null    datetime64[ns]
 9   serial               36 non-null     object        
 10  tradeCity            36 non-null     object        
 11  tradeState           36 non-null     object        
 12  ctrlEntity           36 non-null     object        
 13  nameSco

None

Unnamed: 0,ID,assignee_id,location_id,organization,city,state,city_latitude,city_longitude,dateOfFirstPat,serial,...,agent_latitude,agent_longitude,data_city,data_state,data_latitude,data_longitude,cityToAddrDistance,cityToAgtDistance,cityToDataDistance,dateDiff
0,875,f7b31db8-0a3d-425e-a79f-d84ba6333b60,de5d6510-cb90-11eb-9615-121df0c29c1e,The Egg Factory,Roanoke,Va,37.2738,-79.9602,1999-01-20,,...,37.2738,-79.9602,,,,,0.0,0.0,,1.08
1,1284,f3e6ae72-e2b1-4165-a0f7-9be773bf1e56,9447f283-cb8e-11eb-9615-121df0c29c1e,Tank Vision,Vineyard Haven,Ma,41.4543,-70.6038,2017-01-12,,...,41.8171,-71.4282,,,,,0.0,49.5,,0.02
2,1667,f0447e02-fb80-46f7-a97c-cb4b2f398aeb,4b56fc31-cb8e-11eb-9615-121df0c29c1e,Infinibox,Westwood,Ma,42.2187,-71.2026,2012-12-28,,...,,,,,,,,,,0.09
3,2655,e62b4591-f071-4597-b791-35e9075d2af1,f97aecf0-cb90-11eb-9615-121df0c29c1e,Agricultural Aviation Engineering Company,Las Vegas,Nv,36.1716,-115.1391,1980-10-23,,...,,,Las Vegas,Nv,36.1716,-115.1391,0.0,,0.0,22.5
7,2729,e5614631-da49-4351-a6c6-8f81358b767d,fb7257e4-cb8f-11eb-9615-121df0c29c1e,Fanamana,Washington,Dc,38.8937,-76.9879,2014-01-15,,...,,,,Dc,,,,,,1.04


In [60]:
### determine the number of unique assignee_ids in the data
dataLatLong3['assignee_id'].nunique()

378

In [61]:
### save the new dataset as a checkpoint
res_folder = "../csvResults/"
outpt_file = "OcResults1000WithUniqueAssigneeIDsAddCoorPreparedForScoring.csv"
a_full = os.path.join(res_folder,outpt_file)

# dataLatLong3.to_csv(a_full,index=False)