# Prepare OpenCorporates Data with No States

The following code imports PatentsView data and the OpenCorporates API results to merge the two databases and identify organizations that can be found in both networks. OpenCorporates results in this script differ from previous versions because PatentsView states were not provided in the input file and any match between the two databases were matched only by organization name. Organization names in the OpenCorporates API results were cleaned to remove or standardize suffixes to facilitate better merge results between PatentsView and the OpenCorporates API results. This merge was performed to attach the unique assignee-state ID, location ID, city, and state from PatentsView to the OpenCorporates API results, and the data will be utilized to score the results. Scored records were normalized between a confidence level of 1-10.

In [1]:
### import the libraries used to process the PatentsView and OC data.
import pandas as pd
import numpy as np
import time
import os
import re
import string
import warnings
warnings.filterwarnings('ignore')

### start timer
t0=time.time()

### set the path for the input file and save to variable
res_folder = "../csvResults/"
input_file = "noState_output_cleaned.csv"
a_full=os.path.join(res_folder,input_file)
print(a_full,"\n")

OC_results3=pd.read_csv(a_full)

### end timer and print total time
t1 = time.time()
total = t1-t0
print("Total time is %4f" % (total/60), "mins")

### print general stats and first 5 records for dataset 
display(OC_results3.info(null_counts=True),OC_results3.head())

../csvResults/noState_output_cleaned.csv 

Total time is 0.001428 mins
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2623 entries, 0 to 2622
Data columns (total 55 columns):
 #   Column                                Non-Null Count  Dtype  
---  ------                                --------------  -----  
 0   ID                                    2623 non-null   int64  
 1   assignee_id                           2623 non-null   object 
 2   name                                  2034 non-null   object 
 3   company_number                        2034 non-null   object 
 4   jurisdiction_code                     2034 non-null   object 
 5   incorporation_date                    2034 non-null   object 
 6   dissolution_date                      551 non-null    object 
 7   company_type                          2018 non-null   object 
 8   registry_url                          1680 non-null   object 
 9   branch                                897 non-null    object 
 10  branch_status

None

Unnamed: 0,ID,assignee_id,name,company_number,jurisdiction_code,incorporation_date,dissolution_date,company_type,registry_url,branch,...,agent_street_address,agent_city,agent_state,agent_zipcode,home_company_name,home_company_jurisdiction_code,controlling_entity_name,controlling_entity_jurisdiction_code,list_of_officers,list_of_filing_dates
0,1,5d5ead2f-1ef7-4db6-a6ce-9cdea523f834,,,,,,,,,...,,,,,,,,,[],[]
1,2,8a841c57-22b6-4ad0-ad42-16532c3ab4fc,,,,,,,,,...,,,,,,,,,[],[]
2,3,52e38962-9bfc-4082-b0c8-b7ba4ac04d83,"LENDINGTREE, LLC",68510F,us_ak,1/3/2000,,Limited Liability Company,,F,...,,,,,"LENDINGTREE, LLC",us_de,"LENDINGTREE, LLC",us_de,"['Christopher D. Davies', 'Douglas R Lebda', '...","['2011-07-21', '2011-07-21', '2008-03-17', '20..."
3,3,52e38962-9bfc-4082-b0c8-b7ba4ac04d83,"LENDINGTREE, LLC",1364151-0161,us_ut,6/27/1997,,LLC - Foreign,https://secure.utah.gov/bes/details.html?entit...,F,...,,,,,"LENDINGTREE, LLC",us_de,"LENDINGTREE, LLC",us_de,"['INCORP SERVICES, INC']",[]
4,3,52e38962-9bfc-4082-b0c8-b7ba4ac04d83,"LENDINGTREE, LLC",34570412Q,us_la,8/29/1997,,Limited Liability Company (Non-Louisiana),http://coraweb.sos.la.gov/commercialsearch/Com...,F,...,,,,,"LENDINGTREE, LLC",us_de,"LENDINGTREE, LLC",us_de,"['DOUGLAS LEBDA', 'INCORP SERVICES, INC']","['2017-04-06', '2015-10-18', '2013-01-28', '20..."


In [2]:
### start timer
t0=time.time()

### set the path for the full PatentsView dataset and save to variable. This data will be used
### to assign the ID, location_id, cities, and states to the assigneesWithNoOCRecord variable
res_folder = "../csvResults/"

### removed and replace with organizations that are US-based only
# input_file = "dfMergedFullDataSet.csv"

input_file = "dfMergedFullDataSetNoForeign.csv"
a_full = os.path.join(res_folder,input_file)
print(a_full,"\n")

fullData = pd.read_csv(a_full)

### end timer and print total time
t1 = time.time()
total = t1-t0
print("Total time is %4f" % (total/60), "mins")

### print general stats and first 5 records for dataset
display(fullData.info(null_counts=True),fullData.head())

../csvResults/dfMergedFullDataSetNoForeign.csv 

Total time is 0.007159 mins
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 252010 entries, 0 to 252009
Data columns (total 7 columns):
 #   Column          Non-Null Count   Dtype 
---  ------          --------------   ----- 
 0   ID              252010 non-null  int64 
 1   assignee_id     252010 non-null  object
 2   location_id     252010 non-null  object
 3   organization    252010 non-null  object
 4   city            252010 non-null  object
 5   state           252010 non-null  object
 6   dateOfFirstPat  252010 non-null  object
dtypes: int64(1), object(6)
memory usage: 13.5+ MB


None

Unnamed: 0,ID,assignee_id,location_id,organization,city,state,dateOfFirstPat
0,0,fffe9f1f-cb1c-49ab-b00f-6416e3e3a909,fd8b2b76-cb90-11eb-9615-121df0c29c1e,"Close-In Solutions, LLC",Austin,TX,05/02/2005
1,1,fffe8fea-3d13-4016-9429-93653527efa1,fe449928-09bd-11ec-893a-12de62d610b1,"Vita-Stat Neducak Services, Inc.",St. Petersburg,FL,08/18/1977
2,2,fffe4688-bc00-4626-bd89-28921a62f07f,f76d85c4-09bd-11ec-893a-12de62d610b1,"Emerging Technology Systems, L.L.C.",Akron,OH,07/02/1996
3,3,fffe36bb-6dea-4a8b-8bf5-071cf893ceba,fe1cb1c3-cb8f-11eb-9615-121df0c29c1e,"Valley Business Solutions, LLC",Huntsville,AL,03/21/2019
4,4,fffd9c21-3bb1-4471-b316-d172921e3f83,ec16f9be-cb90-11eb-9615-121df0c29c1e,"RAILIAS HOLDINGS, LLC",San Diego,CA,10/16/2019


# Add Trademark Data to the Fulldata Data Set

In [3]:
### start timer
t0=time.time()

### set the path for the full PatentsView dataset and save to variable. This data will be used
### to assign the ID, location_id, cities, and states to the assigneesWithNoOCRecord variable
res_folder = "../csvResults/"

### removed and replace with organizations that are US-based only
# input_file = "dfMergedFullDataSet.csv"

input_file = "orgTrademarkNumbers.csv"
a_full = os.path.join(res_folder,input_file)
print(a_full,"\n")

tmData = pd.read_csv(a_full)

### end timer and print total time
t1 = time.time()
total = t1-t0
print("Total time is %4f" % (total/60), "mins")

### print general stats and first 5 records for dataset
display(tmData.info(null_counts=True),tmData.head())

../csvResults/orgTrademarkNumbers.csv 

Total time is 0.014014 mins
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 441128 entries, 0 to 441127
Data columns (total 5 columns):
 #   Column        Non-Null Count   Dtype 
---  ------        --------------   ----- 
 0   organization  441128 non-null  object
 1   serial        441128 non-null  object
 2   city          441128 non-null  object
 3   state         441128 non-null  object
 4   ctrlEntity    441128 non-null  object
dtypes: object(5)
memory usage: 16.8+ MB


None

Unnamed: 0,organization,serial,city,state,ctrlEntity
0,!Hey Inc.,[75803548 76015762],North Andover,Massachusetts,Delaware
1,"!Magine This Renovations, Llc",[77689117 77731638],Navarre,Ohio,Ohio
2,"!Maginethis Renovations, Llc",[77689117 77731638],Navarre,Ohio,Ohio
3,"""21"" Brands, Inc.",[71591446 72056354 73478437 73551909],New York,New York,New York
4,"""21"" Club, Inc.",[71611185 72212922 72268705],New York,New York,New York


In [4]:
### start timer
t0=time.time()

addTm=fullData.merge(tmData,on=['organization'],how='outer',indicator=True)
addTm1=addTm.loc[(addTm['_merge']=='left_only') | (addTm['_merge']=='both')].iloc[:,:11].reset_index(drop=True)

addTm1.rename(columns={'city_x':'city','state_x':'state',
                       'city_y':'tradeCity','state_y':'tradeState'},inplace=True)

### end timer and print total time
t1 = time.time()
total = t1-t0
print("Total time is %4f" % (total/60), "mins")

### print general stats and first 5 records for dataset
display(addTm1.info(null_counts=True),addTm1.head())

Total time is 0.012000 mins
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 269661 entries, 0 to 269660
Data columns (total 11 columns):
 #   Column          Non-Null Count   Dtype  
---  ------          --------------   -----  
 0   ID              269661 non-null  float64
 1   assignee_id     269661 non-null  object 
 2   location_id     269661 non-null  object 
 3   organization    269661 non-null  object 
 4   city            269661 non-null  object 
 5   state           269661 non-null  object 
 6   dateOfFirstPat  269661 non-null  object 
 7   serial          47559 non-null   object 
 8   tradeCity       47559 non-null   object 
 9   tradeState      47559 non-null   object 
 10  ctrlEntity      47559 non-null   object 
dtypes: float64(1), object(10)
memory usage: 22.6+ MB


None

Unnamed: 0,ID,assignee_id,location_id,organization,city,state,dateOfFirstPat,serial,tradeCity,tradeState,ctrlEntity
0,0.0,fffe9f1f-cb1c-49ab-b00f-6416e3e3a909,fd8b2b76-cb90-11eb-9615-121df0c29c1e,"Close-In Solutions, LLC",Austin,TX,05/02/2005,,,,
1,1.0,fffe8fea-3d13-4016-9429-93653527efa1,fe449928-09bd-11ec-893a-12de62d610b1,"Vita-Stat Neducak Services, Inc.",St. Petersburg,FL,08/18/1977,,,,
2,2.0,fffe4688-bc00-4626-bd89-28921a62f07f,f76d85c4-09bd-11ec-893a-12de62d610b1,"Emerging Technology Systems, L.L.C.",Akron,OH,07/02/1996,,,,
3,3.0,fffe36bb-6dea-4a8b-8bf5-071cf893ceba,fe1cb1c3-cb8f-11eb-9615-121df0c29c1e,"Valley Business Solutions, LLC",Huntsville,AL,03/21/2019,,,,
4,4.0,fffd9c21-3bb1-4471-b316-d172921e3f83,ec16f9be-cb90-11eb-9615-121df0c29c1e,"RAILIAS HOLDINGS, LLC",San Diego,CA,10/16/2019,,,,


# Standardize the Organization Names from the PatentsView Data and OpenCorporates Results

In [5]:
### this section cleans the organization and name fields by standardizing the names for the
### organizations between the PatentsView and OpenCorporates data to facilitate better merge
### results. Skipping this section will yield poor results in any subsequent merges.

### start timer
t0=time.time()

### the pattern variable is utilized to create a 'list' of possible suffixes that should be removed
### from the organization and name fields. This list was constructed by manually inspecting the names
### prior to cleaning and does not represent a comprehensive list
pattern = '|'.join(['Llc','L\.L\.C\.','Inc\.$','Inc$','Ltd','\(.+?\)','Plc','P\.L\.C\.','Pllc','P\.L\.L\.C\.',
                    'Lp\.$','Lp$','Llp$','LP','L\.P\.','LC','L\.C\.','Ag$','Gmbh','SA$','Kg','Pvt','Sa$','BV','Nv$',
                    'Ab$','Pty$','SPA$','S\.P\.A\.','Bv','B\.V\.','B\.v\.','@','\.',','])

### converts the first character in each word to Uppercase and remaining characters to Lowercase in 
### the string, followed by removing any whitespace that may exist to the left and right of the strings
addTm1['organization']=addTm1['organization'].str.title()
addTm1['organization']=addTm1['organization'].str.lstrip().str.rstrip()

### some strings must be replaced rather than removed because the resulting organization names would
### not make sense or match incorrectly. For example, Arjang & Co., which is the full name for the
### organization, would become Arjang and would match to multiple records via the merge instead of
### one. This was observed through multiple trials of cleaning the data
addTm1['organization']=addTm1['organization'].str.replace(' & ', ' And ')
addTm1['organization']=addTm1['organization'].str.replace('&', ' And ')
addTm1['organization']=addTm1['organization'].str.replace(' - |-', ' ')
addTm1['organization']=addTm1['organization'].str.replace('+', ' ')
addTm1['organization']=addTm1['organization'].str.replace(' (Co\.$|Co$)', ' Company')
addTm1['organization']=addTm1['organization'].str.replace(' (Corp\.|Corp) ', ' Corporation ')
addTm1['organization']=addTm1['organization'].str.replace('Mfg', 'Manufacturing')
addTm1['organization']=addTm1['organization'].str.replace('Incorporated|Usa', '')

### apply the pattern variable to the organization field and clean the resulting whitespace to the
### left and right of the strings
addTm1['organization']=addTm1['organization'].str.replace(pattern, '')
addTm1['organization']=addTm1['organization'].str.lstrip().str.rstrip()

### convert any remaining names that are not standardized
addTm1['organization']=addTm1['organization'].str.replace(' (Co\.$|Co$)', ' Company')
addTm1['organization']=addTm1['organization'].str.replace(' (Corp\.$|Corp$) ', ' Corporation ')
addTm1['organization']=addTm1['organization'].str.replace(' (Corp\.$|Corp$)', ' Corporation')

### converts the first character in each word to Uppercase and remaining characters to Lowercase in 
### the string, followed by removing any whitespace that may exist to the left and right of the strings
OC_results3['name']=OC_results3['name'].str.title()
OC_results3['name']=OC_results3['name'].str.lstrip().str.rstrip()

### some strings must be replaced rather than removed because the resulting organization names would
### not make sense or match incorrectly. For example, Arjang & Co., which is the full name for the
### organization, would become Arjang and would match to multiple records via the merge instead of
### one. This was observed through multiple trials of cleaning the data
OC_results3['name']=OC_results3['name'].str.replace(' & ', ' And ')
OC_results3['name']=OC_results3['name'].str.replace('&', ' And ')
OC_results3['name']=OC_results3['name'].str.replace(' - |-', ' ')
OC_results3['name']=OC_results3['name'].str.replace('+', ' ')
OC_results3['name']=OC_results3['name'].str.replace(' (Co\.$|Co$)', ' Company')
OC_results3['name']=OC_results3['name'].str.replace(' (Corp\.|Corp) ', ' Corporation ')
OC_results3['name']=OC_results3['name'].str.replace('Mfg', 'Manufacturing')
OC_results3['name']=OC_results3['name'].str.replace('Incorporated|Usa', '')

### apply the pattern variable to the name field and clean the resulting whitespace to the
### left and right of the strings
OC_results3['name']=OC_results3['name'].str.replace(pattern, '')
OC_results3['name']=OC_results3['name'].str.lstrip().str.rstrip()

### convert any remaining names that are not standardized
OC_results3['name']=OC_results3['name'].str.replace(' (Co\.$|Co$)', ' Company')
OC_results3['name']=OC_results3['name'].str.replace(' (Corp\.$|Corp$) ', ' Corporation ')
OC_results3['name']=OC_results3['name'].str.replace(' (Corp\.$|Corp$)', ' Corporation')

### end timer and print total time
t1=time.time()
total=t1-t0
print("Total time is %4f" % (total/60), "mins")

Total time is 0.064498 mins


In [6]:
### start timer
t0=time.time()

addTm1['tradeState']=addTm1['tradeState'].str.lower()
addTm1['ctrlEntity']=addTm1['ctrlEntity'].str.lower()

#convert states from full names to two-letter abbreviations under the address_state and agent_state features
addTm1.replace({'tradeState':{"california":"ca","connecticut":"ct","massachusetts":"ma","florida":"fl",
                                          "georgia":"ga","washington":"wa","new york":"ny","delaware":"de","tennessee":"tn",
                                          "missouri":"mo","texas":"tx","indiana":"in","minnesota":"mn","pennsylvania":"pa",
                                          "oregon":"or","virginia":"va","illinois":"il","kentucky":"ky","north carolina":"nc",
                                          "new jersey":"nj","colorado":"co","maryland":"md","ohio":"oh","arizona":"az",
                                          "nevada":"nv","utah":"ut","michigan":"mi","new hampshire":"nh","vermont":"vt",
                                          "kansas":"ks","oklahoma":"ok","iowa":"ia","louisiana":"la","rhode island":"ri",
                                          "wisconsin":"wi","hawaii":"hi","montana":"mt","nebraska":"ne",
                                          "district of columbia":"dc","west virginia":"wv","alabama":"al","idaho":"id",
                                          "maine":"me","new mexico":"nm","south carolina":"sc","north dakota":"nd",
                                          "south dakota":"sd","arkansas":"ar","alaska":"ak","wyoming":"wy",
                                          "mississippi":"ms"}},inplace=True)

addTm1.replace({'ctrlEntity':{"california":"ca","connecticut":"ct","massachusetts":"ma","florida":"fl",
                                          "georgia":"ga","washington":"wa","new york":"ny","delaware":"de","tennessee":"tn",
                                          "missouri":"mo","texas":"tx","indiana":"in","minnesota":"mn","pennsylvania":"pa",
                                          "oregon":"or","virginia":"va","illinois":"il","kentucky":"ky","north carolina":"nc",
                                          "new jersey":"nj","colorado":"co","maryland":"md","ohio":"oh","arizona":"az",
                                          "nevada":"nv","utah":"ut","michigan":"mi","new hampshire":"nh","vermont":"vt",
                                          "kansas":"ks","oklahoma":"ok","iowa":"ia","louisiana":"la","rhode island":"ri",
                                          "wisconsin":"wi","hawaii":"hi","montana":"mt","nebraska":"ne",
                                          "district of columbia":"dc","west virginia":"wv","alabama":"al","idaho":"id",
                                          "maine":"me","new mexico":"nm","south carolina":"sc","north dakota":"nd",
                                          "south dakota":"sd","arkansas":"ar","alaska":"ak","wyoming":"wy",
                                          "mississippi":"ms"}},inplace=True)

addTm1['tradeState']=addTm1['tradeState'].str.title()
addTm1['ctrlEntity']=addTm1['ctrlEntity'].str.title()
addTm1['state']=addTm1['state'].str.title()

### end timer and print total time
t1 = time.time()
total = t1-t0
print("Total time is %4f" % (total/60), "mins")

### print general stats and first 5 records for dataset 
display(addTm1.info(null_counts=True),addTm1.head())

Total time is 0.008512 mins
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 269661 entries, 0 to 269660
Data columns (total 11 columns):
 #   Column          Non-Null Count   Dtype  
---  ------          --------------   -----  
 0   ID              269661 non-null  float64
 1   assignee_id     269661 non-null  object 
 2   location_id     269661 non-null  object 
 3   organization    269661 non-null  object 
 4   city            269661 non-null  object 
 5   state           269661 non-null  object 
 6   dateOfFirstPat  269661 non-null  object 
 7   serial          47559 non-null   object 
 8   tradeCity       47559 non-null   object 
 9   tradeState      47559 non-null   object 
 10  ctrlEntity      47559 non-null   object 
dtypes: float64(1), object(10)
memory usage: 22.6+ MB


None

Unnamed: 0,ID,assignee_id,location_id,organization,city,state,dateOfFirstPat,serial,tradeCity,tradeState,ctrlEntity
0,0.0,fffe9f1f-cb1c-49ab-b00f-6416e3e3a909,fd8b2b76-cb90-11eb-9615-121df0c29c1e,Close In Solutions,Austin,Tx,05/02/2005,,,,
1,1.0,fffe8fea-3d13-4016-9429-93653527efa1,fe449928-09bd-11ec-893a-12de62d610b1,Vita Stat Neducak Services,St. Petersburg,Fl,08/18/1977,,,,
2,2.0,fffe4688-bc00-4626-bd89-28921a62f07f,f76d85c4-09bd-11ec-893a-12de62d610b1,Emerging Technology Systems,Akron,Oh,07/02/1996,,,,
3,3.0,fffe36bb-6dea-4a8b-8bf5-071cf893ceba,fe1cb1c3-cb8f-11eb-9615-121df0c29c1e,Valley Business Solutions,Huntsville,Al,03/21/2019,,,,
4,4.0,fffd9c21-3bb1-4471-b316-d172921e3f83,ec16f9be-cb90-11eb-9615-121df0c29c1e,Railias Holdings,San Diego,Ca,10/16/2019,,,,


# Extract Alternative and Previous Organization Names and Append to the OpenCorporates Result Dataset

In [7]:
### consecutive regex processes were performed to extract one or more organization names from the
### alternative_names and previous_names columns. When a match is absent, the if statement skips
### that record. If a match is found, the first regex variable is used to find all instances that
### match the pattern, then the second regex variable is utilized to further refine the string to
### reduce it to the organization name. A list is created using the nested for loop for each record.
### If multiple names are present, a nested list is created for that record. After the for loops 
### are finished running, the final list is created and printed for reviewing. This description
### applies for processing both the alternative_names and previous_names columns.

### start timer
t0=time.time()

### converts the first character in each word to Uppercase and remaining characters to Lowercase in 
### the string
OC_results3['alternative_names']=OC_results3['alternative_names'].str.title()
OC_results3['previous_names']=OC_results3['previous_names'].str.title()

### regex and regex1 were applied to initially find a list of organization names, followed by refining
### the names by removing erroneous characters
regex="'Company_Name': '[A-Z].+?'"
regex1="'[A-Z].+': "

### determines the length of the input data and creates 2 empty lists to be filled with the extracted
### organization names
b=len(OC_results3)
sub_finalAlt = []
finalAlt = []

### the for loop is initiated to review each record individually to extract the organization names
### under the alternative_names column
for j in range(b):
    
    ### if the record is empty or nan the record will be skipped
    if pd.isna(OC_results3.iloc[j,20]) is True:
        pass
    
    ### non-empty records have the curly brackets removed from the right and left of the list of
    ### strings. The findall function searches the list for all instances that match the regex
    ### variable and saves the list to the match variable. The length of this list is determined
    ### and provide to the nested loop below
    else:
        a=OC_results3.iloc[j,20][2:-2]
        match = re.findall(regex, a)
        c=len(match)
        
        ### for any length of c greater than 0, this for loop will evaluate each instance for the
        ### organization name by using the regex1 variable to remove characters that are not
        ### required. Once a match is found, the first character in each string is converted to
        ### an uppercase and saved to the sub_finalAlt list. The for loop repeats as many times
        ### that is equal to c
        for i in range(c):
            match1 = re.sub(regex1,"",match[i])
            match1=match1.replace("'","").title()
            
            ### ensures all spaces between strings are a single space
            match1 = re.sub('\s+',' ',match1)
            
            ### replaces specific characters
            match1=re.sub(' & ', ' And ',match1)
            match1=match1.replace('&', ' And ')
            match1=re.sub('Mfg', 'Manufacturing',match1)

            ### convert any names that are not standardized
            match1=re.sub(', Inc\.| Usa, Incorporated| Usa| Inc\.| Inc$| Incorpor$| Incorporated$|Incorporated', '',match1)
            match1=re.sub(', Llc$| Ltd\.| Ltd| Limited| Pty\.| Pty', '',match1)
            match1=re.sub('L\.L\.C\.| Llp$| Llc$|\(.+?\)|,Inc\.', '',match1)
            match1=re.sub(', P\.C\.| P\.C\.| P\. C\.| D\.M\.D\.|D\.D\.S\.| D\. D\. S\.| M\.D\.', '',match1)
            match1=re.sub(' Corporation$| Corporation,$| Corp\.$ | Corp.$| Corp\.$| Corp\.,$', 'Corporation',match1)
            match1=re.sub('Co\.$| Co$', 'Company',match1)
            
            ### removes all punctuation and ensures all spaces were single spaced after processing
            ### the strings
            match1 = match1.translate(str.maketrans('','',string.punctuation))
            match1 = re.sub('\s+',' ',match1)
            
            ### appends the strings to list
            sub_finalAlt.append(match1)
    
    ### after the nested for loop is finished extracting all possible organization names, the
    ### sub_finalAlt list is appended to the finalAlt list. The finalAlt list is updated with 
    ### the alternative names for each record after the for loops are finished and resets the
    ### sub_finalAlt to an empty list
    finalAlt.append(sub_finalAlt)
    sub_finalAlt = []

### end timer and print total time
t1=time.time()
total=t1-t0
print("Total time is %4f" % (total/60), "mins\n")

### prints the finalAlt list for review
print(finalAlt)

Total time is 0.001644 mins

[[], [], [], ['Servicemagic Home Loans', 'RealestateCom Realtors', 'Servicemagic Real Estate', 'Lendingtree PartnersCom', 'Magnifymoney', 'Getsmart', 'RealestateCom', 'Depositaccounts', 'Onlinebanks', 'Ramsey Group A Real EstateCom Company', 'Milecards', 'Student Loan Hero Of Ut', 'GetsmartCom', 'Snapcap'], [], ['RealestateCom Cancelled', 'GetsmartCom', 'Servicemagic Real Estate', 'Servicemagic Home Loans', 'Getsmart', 'RealestateCom Realtors Cancelled', 'Lendingtree PartnersCom', 'Depositaccounts', 'Magnifymoney', 'Milecards', 'Snapcap', 'Student Loan Hero'], ['Getsmart', 'Servicemagic Real Estate', 'GetsmartCom', 'Ramsey Group A Real EstateCom Company', 'Snapcap', 'RealestateCom', 'Servicemagic Home Loans', 'Onlinebanks', 'Lendingtree PartnersCom', 'Depositaccounts', 'Milecards', 'Student Loan Hero Of Ut', 'RealestateCom Realtors', 'Magnifymoney'], [], [], ['Nordstrom Audco', 'Nordstrom Valves'], [], [], [], [], [], [], [], [], ['Minop Company'], [], [], 

In [8]:
### start timer
t0=time.time()

### creates 2 empty lists to be filled with the extracted organization names
sub_finalPre = []
finalPre = []

regex="'Company_Name': '[A-Za-z].+?'"
regex1="'[A-Za-z].+': "

### the for loop is initiated to review each record individually to extract the organization names
### under the previous_names column
for j in range(b):
    
    ### if the record is empty or nan the record will be skipped
    if pd.isna(OC_results3.iloc[j,21]) is True:
        pass
    
    ### non-empty records have the curly brackets removed from the right and left of the list of
    ### strings. The findall function searches the list for all instances that match the regex
    ### variable and saves the list to the match variable. The length of this list is determined
    ### and provide to the nested loop below
    else:
        a=OC_results3.iloc[j,21][2:-2]
        match = re.findall(regex, a)
        c=len(match)

        ### for any length of c greater than 0, this for loop will evaluate each instance for the
        ### organization name by using the regex1 variable to remove characters that are not
        ### required. Once a match is found, the first character in each string is converted to
        ### an uppercase and saved to the sub_finalPre list. The for loop repeats as many times
        ### that is equal to c
        for i in range(c):
            match1 = re.sub(regex1,"",match[i])
            match1=match1.replace("'","").title().lstrip().rstrip()
            
            ### ensures all spaces between strings are a single space
            match1 = re.sub('\s+',' ',match1)
            
            ### replaces specific characters
            match1=re.sub(' & ', ' And ',match1)
            match1=match1.replace('&', ' And ')
            match1=re.sub('Mfg', 'Manufacturing',match1)

            ### convert any names that are not standardized
            match1=re.sub(', Inc\.| Usa, Incorporated| Usa| Inc\.| Inc$| Incorpor$| Incorporated$|Incorporated', '',match1)
            match1=re.sub(', Llc$| Ltd\.| Ltd| Limited| Pty\.| Pty', '',match1)
            match1=re.sub('L\.L\.C\.| Llp$| Llc$|\(.+?\)|,Inc\.', '',match1)
            match1=re.sub(', P\.C\.| P\.C\.| P\. C\.| D\.M\.D\.|D\.D\.S\.| D\. D\. S\.| M\.D\.', '',match1)
            match1=re.sub(' Corporation$| Corporation,$| Corp\.$ | Corp.$| Corp\.$| Corp\.,$', 'Corporation',match1)
            match1=re.sub('Co\.$| Co$', 'Company',match1)
            
            ### removes all punctuation and ensures all spaces were single spaced after processing
            ### the strings
            match1 = match1.translate(str.maketrans('','',string.punctuation))
            match1 = re.sub('\s+',' ',match1)

            sub_finalPre.append(match1)

    ### after the nested for loop is finished extracting all possible organization names, the
    ### sub_finalPre list is appended to the finalAlt list. The finalPre list is updated with 
    ### the previous names for each record after the for loops are finished and resets the
    ### sub_finalPre to an empty list
    finalPre.append(sub_finalPre)
    sub_finalPre = []

### end timer and print total time
t1=time.time()
total=t1-t0
print("Total time is %4f" % (total/60), "mins\n")

### prints the finalPre list for review
print(finalPre)

Total time is 0.001247 mins

[[], [], ['Lendingtree'], ['Creditsource', 'Lendingtree'], ['Lendingtree', 'Creditsource'], [], ['Lendingtree', 'Creditsource'], [], ['Creditsource', 'Lending Tree'], [], [], [], [], [], [], [], [], [], [], [], [], [], ['Carson And Burger', 'Carson Burger And Weekly', 'Cbw Automation'], [], [], [], [], [], [], ['Las'], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], ['Las Enterprises'], [], [], [], [], [], [], [], [], [], [], [], ['Molecular Simulations', 'PolygenCorporation'], [], [], [], [], [], [], ['Powervar Canada', 'Twin City Computers'], [], [], [], [], [], [], [], [], [], ['Jibjab Media'], ['Jibjab Media'], [], [], [], [], [], [], ['Dielectrics Industries', 'Dielectrics'], [], [], [], [], [], [], [], [], [], [], [], [], [], ['Paul Revere TobaccoCorporation'], [], ['Paul Revere Equity Management'], ['Paul Revere Equity Management Company The'], ['Wake Up America The Reds Are Coming Paul RevereJoeCorporation'], [], 

In [9]:
### start timer
t0=time.time()

### append the OC_results dataset with the finalAlt and finalPre lists with a _clean suffix
OC_results3['alternative_names_clean'] = finalAlt
OC_results3['previous_names_clean'] = finalPre

### end timer and print total time
t1=time.time()
total=t1-t0
print("Total time is %4f" % (total/60), "mins")

### print general stats and first 5 records for dataset
display(OC_results3.info(),OC_results3.head())

Total time is 0.000050 mins
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2623 entries, 0 to 2622
Data columns (total 57 columns):
 #   Column                                Non-Null Count  Dtype  
---  ------                                --------------  -----  
 0   ID                                    2623 non-null   int64  
 1   assignee_id                           2623 non-null   object 
 2   name                                  2034 non-null   object 
 3   company_number                        2034 non-null   object 
 4   jurisdiction_code                     2034 non-null   object 
 5   incorporation_date                    2034 non-null   object 
 6   dissolution_date                      551 non-null    object 
 7   company_type                          2018 non-null   object 
 8   registry_url                          1680 non-null   object 
 9   branch                                897 non-null    object 
 10  branch_status                         897 non-null    ob

None

Unnamed: 0,ID,assignee_id,name,company_number,jurisdiction_code,incorporation_date,dissolution_date,company_type,registry_url,branch,...,agent_state,agent_zipcode,home_company_name,home_company_jurisdiction_code,controlling_entity_name,controlling_entity_jurisdiction_code,list_of_officers,list_of_filing_dates,alternative_names_clean,previous_names_clean
0,1,5d5ead2f-1ef7-4db6-a6ce-9cdea523f834,,,,,,,,,...,,,,,,,[],[],[],[]
1,2,8a841c57-22b6-4ad0-ad42-16532c3ab4fc,,,,,,,,,...,,,,,,,[],[],[],[]
2,3,52e38962-9bfc-4082-b0c8-b7ba4ac04d83,Lendingtree,68510F,us_ak,1/3/2000,,Limited Liability Company,,F,...,,,"LENDINGTREE, LLC",us_de,"LENDINGTREE, LLC",us_de,"['Christopher D. Davies', 'Douglas R Lebda', '...","['2011-07-21', '2011-07-21', '2008-03-17', '20...",[],[Lendingtree]
3,3,52e38962-9bfc-4082-b0c8-b7ba4ac04d83,Lendingtree,1364151-0161,us_ut,6/27/1997,,LLC - Foreign,https://secure.utah.gov/bes/details.html?entit...,F,...,,,"LENDINGTREE, LLC",us_de,"LENDINGTREE, LLC",us_de,"['INCORP SERVICES, INC']",[],"[Servicemagic Home Loans, RealestateCom Realto...","[Creditsource, Lendingtree]"
4,3,52e38962-9bfc-4082-b0c8-b7ba4ac04d83,Lendingtree,34570412Q,us_la,8/29/1997,,Limited Liability Company (Non-Louisiana),http://coraweb.sos.la.gov/commercialsearch/Com...,F,...,,,"LENDINGTREE, LLC",us_de,"LENDINGTREE, LLC",us_de,"['DOUGLAS LEBDA', 'INCORP SERVICES, INC']","['2017-04-06', '2015-10-18', '2013-01-28', '20...",[],"[Lendingtree, Creditsource]"


In [10]:
### drop the alternative_names and previous_names columns and rearrange the alternative_names_clean 
### and previous_names_clean features
OC_results4=OC_results3
OC_results4.drop(labels=['alternative_names','previous_names'],axis=1,inplace=True)

thr_col = OC_results4.pop('alternative_names_clean')
for_col = OC_results4.pop('previous_names_clean')

OC_results4.insert(3, 'alternative_names_clean', thr_col)
OC_results4.insert(4, 'previous_names_clean', for_col)

### print general stats and first 5 records for dataset
display(OC_results4.info(),OC_results4.head())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2623 entries, 0 to 2622
Data columns (total 55 columns):
 #   Column                                Non-Null Count  Dtype  
---  ------                                --------------  -----  
 0   ID                                    2623 non-null   int64  
 1   assignee_id                           2623 non-null   object 
 2   name                                  2034 non-null   object 
 3   alternative_names_clean               2623 non-null   object 
 4   previous_names_clean                  2623 non-null   object 
 5   company_number                        2034 non-null   object 
 6   jurisdiction_code                     2034 non-null   object 
 7   incorporation_date                    2034 non-null   object 
 8   dissolution_date                      551 non-null    object 
 9   company_type                          2018 non-null   object 
 10  registry_url                          1680 non-null   object 
 11  branch           

None

Unnamed: 0,ID,assignee_id,name,alternative_names_clean,previous_names_clean,company_number,jurisdiction_code,incorporation_date,dissolution_date,company_type,...,agent_street_address,agent_city,agent_state,agent_zipcode,home_company_name,home_company_jurisdiction_code,controlling_entity_name,controlling_entity_jurisdiction_code,list_of_officers,list_of_filing_dates
0,1,5d5ead2f-1ef7-4db6-a6ce-9cdea523f834,,[],[],,,,,,...,,,,,,,,,[],[]
1,2,8a841c57-22b6-4ad0-ad42-16532c3ab4fc,,[],[],,,,,,...,,,,,,,,,[],[]
2,3,52e38962-9bfc-4082-b0c8-b7ba4ac04d83,Lendingtree,[],[Lendingtree],68510F,us_ak,1/3/2000,,Limited Liability Company,...,,,,,"LENDINGTREE, LLC",us_de,"LENDINGTREE, LLC",us_de,"['Christopher D. Davies', 'Douglas R Lebda', '...","['2011-07-21', '2011-07-21', '2008-03-17', '20..."
3,3,52e38962-9bfc-4082-b0c8-b7ba4ac04d83,Lendingtree,"[Servicemagic Home Loans, RealestateCom Realto...","[Creditsource, Lendingtree]",1364151-0161,us_ut,6/27/1997,,LLC - Foreign,...,,,,,"LENDINGTREE, LLC",us_de,"LENDINGTREE, LLC",us_de,"['INCORP SERVICES, INC']",[]
4,3,52e38962-9bfc-4082-b0c8-b7ba4ac04d83,Lendingtree,[],"[Lendingtree, Creditsource]",34570412Q,us_la,8/29/1997,,Limited Liability Company (Non-Louisiana),...,,,,,"LENDINGTREE, LLC",us_de,"LENDINGTREE, LLC",us_de,"['DOUGLAS LEBDA', 'INCORP SERVICES, INC']","['2017-04-06', '2015-10-18', '2013-01-28', '20..."


In [11]:
### save the new dataset as a checkpoint
res_folder = "../csvResults/"
outpt_file = "OcResults1000ExtAltPreNamesNew.csv"
a_full = os.path.join(res_folder,outpt_file)

# OC_results4.to_csv(a_full,index=False)

# Extract Additional City and States from the Data Field and Append to the OpenCorporates Dataset

In [12]:
### this library allows the user to remove all punctuation in a string
import string 

### start timer
t0=time.time()

### the regex variables are applied to extract substrings in the data field and sequentially processed
### to remove most of the erroneous data
regex  = "'description': '[A-Za-z0-9].+?'"
regex1 = ": '[A-za-z0-9].+': "

### the b variable sets the length of the input data and the list of states will be used to extract the
### states from the data being processed. The empty lists are populated with the extracted data and will
### be utilized appended to the input dataframe
b=len(OC_results4)
stateList = "Al$|Ak|Az$|Ar$|Ca$|Co$|Ct$|Dc$|De$|Fl$|Ga$|Hi$|Id$|Il$|In$|Ia$|Ky$|Ks$|La$|Me$|Ma$|Md$|Mi$|Mn$|Ms$|Mo$| \
             Mt$|Ne$|Nv$|Nh$|Nj$|Nm$|Ny$|Nc$|Nd$|Oh$|Ok$|Or$|Pa$|Ri$|Sc$|Sd$|Tn$|Tx$|Ut$|Vt$|Va$|Wa$|Wv$|Wi$|Wy$|Fl$"
sub_city = []
city = []
sub_states1 = []
states = []

### this for loop initiates the search for city-state pairs within the data field. If there was no data
### found in the cell (Nan or Na), the if statement enters a blank for that record and moves to the next
### record. When a record was identified to contain data, the regex variable was applied with the findall
### function to extract all data matching the pattern. Lastly, the length of the match variable was
### determined and passed to the nexted for loop
for j in range(b):
    
    ### enters a blank for the record if no data is present
    if pd.isna(OC_results4.iloc[j,33]) is True:
        sub_states1 = []
        sub_city= []

    else:
        ### select the non-empty record and apply the findall function to obtain all matches, then
        ### calculate the length of the resulting string(s)
        a=OC_results4.iloc[j,33]
        match=re.findall(regex,a)
        c=len(match)

        ### each instance of city-state pairs identified in the findall function above were processed and
        ### standardized (e.g., city name two-letter state abbreviation). The following does not represent
        ### a comprehensive list of standardizing city-states
        for i in range(c):
            match1 = re.sub(regex1,"",match[i])
            match1 = re.sub('[0-9]',"",match1)
            match1 = re.sub('[0-9]',"",match1)
            match1 = re.sub('\s+',' ',match1)
            match1 = match1.replace("'","").title()
            match1 = match1.replace('Description: ',"")
            match1 = match1.replace('\\N'," ")
            match1 = match1.replace('\\n'," ")
            match1 = match1.replace('-',"")
            match1 = match1.replace('#',"")
            match1 = re.sub('Us$|USA|Usa|United States Of America|United States|Register Id:',"",match1)
            match1 = re.sub('Suite|Ste',"",match1)
            match1 = re.sub('Http.+\.[Cc]om',"",match1)
            match1 = re.sub(',$',"",match1)
            match1 = re.sub('/|:',"",match1)
            match1 = re.sub('[A-z].+(Department|Dept)\.|[A-z].+Dept',"",match1)
            match1 = re.sub('[A-z].+Bates',"",match1)
            match1 = re.sub('[A-z].+Larocque,',"",match1)
            match1 = re.sub('Apt ([A-z]|[A-z]\.)|Apt\.',"",match1)
            match1 = re.sub('[Pp].+?Box,|[Pp].+?Box ,|Po ,|Box , ',"",match1)
            match1 = re.sub('[A-z].+?Cook,',"",match1)
            match1 = re.sub('[A-z].+?Compliance, ',"",match1)
            match1 = re.sub('Th Floor, |Th Fl|Th Fl,',"",match1)
            match1 = re.sub('[A-z].+?Nova, ',"",match1)
            match1 = re.sub('[A-z].+?Ave,',"",match1)
            match1 = re.sub('[A-z].+?(Center|Ctr),',"",match1)
            match1 = re.sub('[A-z].+(Lane|Ln)',"",match1)            
            match1 = re.sub('(Bldg|Bldg\.) ([A-z]|[A-z],|[A-z]\.)|Bldg\.|Bldg',"",match1)
            match1 = re.sub('(Rd|Rd\.) (Floor,|Fl\.,|Fl,)',"",match1)
            match1 = re.sub('[A-Z].+?(Road|Rd\.|Rd)',"",match1)
            match1 = re.sub('[A-Z].+?Blvd|%Comet Glass Co, |Lcr ',"",match1)
            match1 = re.sub('[A-Z].+?Capitol| , By Corinne M Lude|(B,|B) |C, |^r, |^Z |^z ',"",match1)
            match1 = re.sub('[A-Z].+?Accounting, |[A-Z].+?Siuta, |Santa Helena|Fl , |E Th Saint ',"",match1)          
            match1 = re.sub('N Bay Village Fla',"North Bay Village, Fl",match1)
            match1 = re.sub('Mt\.|Mt',"Mount",match1)
            match1 = re.sub('Ft\.|Ft',"Mount",match1)
            match1 = re.sub('St\. |St ',"Saint ",match1)
            match1 = re.sub('Spgs',"Springs",match1)
            match1 = re.sub('Th Avenue South East, Mpls, Mn',"Minneapolis, Mn",match1)
            match1 = re.sub('No. Adams,, Ma, ',"North Adams, Ma",match1)
            match1 = re.sub('N Palm Beach, Fl',"North Palm Beach, Fl",match1)
            match1 = re.sub('Lk',"Lake",match1)
            match1 = re.sub('.+?(Pkwy|Parkway)|.+?Saint E, ',"",match1)
            match1 = re.sub('^Inc.+?(Place|Pl) |.+?(Place|Pl), |.+?(Plaza|Plz) ',"",match1)
            match1 = re.sub('Inc\., Irving Pl,',"",match1)
            match1 = re.sub('^Inc.+?(Street|St)|Inc.  N Frederick Ave L',"",match1)
            match1 = re.sub('.+?(Street|Street,|St\.,|St,) ',"",match1)
            match1 = re.sub('^(Inc|Inc\.,).+?(Drive|(Dr,|Dr)) |.+?(Highway|Hwy) |.+?(Freeway|Fwy) |.+?(Way|Wy) ',"",match1)
            match1 = re.sub('.+?(Drive|(Dr,|Dr|Dr\.,|Dr\.)|Drive,) |N Mi Saint',"",match1)
            match1 = re.sub('Glenroy|N Central Ave|Ne, |Nw |.+?Counsel, |.+?(Boulevard,|Boulevard) |Loockerman Square ',"",match1)
            match1 = re.sub('Lafox',"La Fox",match1)
            match1 = re.sub('Plymo$',"Plymouth",match1)
            match1 = re.sub('Portl$',"Portland",match1)
            match1 = re.sub('Flore$',"Florence",match1)
            match1 = re.sub('Southfi$',"Southfield",match1)
            match1 = re.sub('.+?(Longwood|Longwood,) Fl',"Longwood Fl",match1)
            match1 = re.sub('.+?p\.O\. Box|.+?(Highway|Hwy), |Inc. > box|P O Box |%.+?, |.+?(Avenue|Ave) ',"",match1)
            match1 = re.sub('^oor |Inc\.,|.+?(Court|Ct\.), |.+?(Avenue|Ave|Ave\.,), |.+?Ave |Inc\.  El Camino Real',"",match1)
            match1 = re.sub('E&A ennedy Space Ctr.,|E&A , Kennedy Space Ctr.,',"Kennedy Space Center,",match1)
            match1 = re.sub('.+?South Bend, In',"South Bend, In",match1)
            match1 = re.sub('.+?indio Ca',"Indio Ca",match1)
            match1 = re.sub('.+?[Ss]ioux Falls, ',"Sioux Falls, ",match1)
            match1 = re.sub('.+?(Plaza|Plz), |North Th Saint |.+?Belmont, ',"",match1)
            match1 = re.sub('.+?Belleville, Il,',"Belleville Il",match1)
            match1 = re.sub('Woodlawn Saint|s,|Spiceland|.+?(Circle|Cir)|Hillsboro Mile|Saint ,|.+?(Way|Wy)|N Court Saint',"",match1)
            match1 = re.sub('W\. Adam|Heritage Hill|Franklin Saint|.+?[Tt]ower |Pecan Saint W|Curti |.+?(Park|Pk) |S\.W Th',"",match1)
            match1 = re.sub('E Th Saint |Harlin Sr|Pmb|N\. Raymond|S Th|.+?(Pike|Pk)|.+?(Court|Ct)|.+?(Department|Dept)',"",match1)
            match1 = re.sub('West Tenth Saint|.+?Broadway|Boradway|Rr Box R|.+?(Route|Rte)|Box S\.|cl,|El Rio Saint',"",match1)
            match1 = re.sub('Village Square|Orange Saint|N Causeway|Plone Et Al|Madison Saint|P O Drawer Www',"",match1)
            match1 = re.sub('Po Bo|Chisholm Pl|.+?Ave\., |Prescott Saint|Titian|Picacho|Rusk Saint|Market Saint',"",match1)
            match1 = re.sub('Dairy Ashford Saint|Reichhold|Wall Saint|Webster Saint Fl Th|Webster Saint',"",match1)
            match1 = re.sub('(West|W) El Camino Real|Mountain (Trail|Trl)|W Th Saint  E|Rocky Mountain Fiber',"",match1)
            match1 = re.sub('Lake Saint Loui Mo',"Saint Louis Mo",match1)
            match1 = re.sub('Redhill Ave|Th Saint  |Th Saint ste|S\. Akard Saint|Sw Third |Rm |.+?(Loop|Loop,) |Alcor Body Work',"",match1)
            match1 = re.sub('.+?(Boulevard|Bvld)|Ave\., |.+?Solution|Braodway|Larkspur|.+?Turnpike|ele ,|Acorn Saint',"",match1)
            match1 = re.sub('.+?(Martius|Martiu) |Brush Saint |Nagog Park|Elm Saint|P\.O\. Box|Peaks Cv|Ne Brazee',"",match1)
            match1 = re.sub('Se, |Highway North|Rd Saint S|El Camino Real|.+?Management|E Eubank|.+?Product ',"",match1)
            match1 = re.sub('El Camino Real|.+?(Tax |Tax,)|Circle,|Northwestern Hgy|Rr |Putnam Saint|CO.+?Hill',"",match1)
            match1 = re.sub('Bissonnet Saint|S[ew] |West Tenth Stree|S E|Camino Del Tomasini|The Alameda',"",match1)
            match1 = re.sub('North Central Expressway|W Tenth Saint|Saint ste|W. Cummings Park|Prosper Saint',"",match1)
            match1 = re.sub('Glenlake Ave|Namco|N\. Central Ave|suite |N. Military Trail|.+?Alton Pl|Ludelle Saint',"",match1)
            match1 = re.sub('S\.W\. Terr|Dtn, Llc|.+?East Marcy|W Micheltorena|N Thompson Saint|CO Jos M Glickstein',"",match1)
            match1 = re.sub('S. Whittle Ave|S Orange Blossom Tr|Rt\.|Big Duke Trl|Von Karman \(\)|Hwy [A-Z] ',"",match1)
            match1 = re.sub('E Main Saint|Station, |Toledo Saint  A|Number |[EWNS] |ste |Woodlawn Av|Knox Av',"",match1)
            match1 = re.sub('depere, Wi',"De Pere, Wi",match1)
            match1 = re.sub('ScifAnnapolis Junction, Md,',"Annapolis Junction Md",match1)
                        
            ### capitalize the first letter in all strings, remove punctuation and whitespace around the
            ### the city-state strings. States were converted to two-letter abbreviations
            match1 = match1.title()
            match1 = match1.translate(str.maketrans('','',string.punctuation))
            match1 = match1.lstrip().rstrip()
            match1 = re.sub("California","Ca",match1)
            match1 = re.sub("Connecticut","Ct",match1)
            match1 = re.sub("Massachusetts|Massachusett","Ma",match1)
            match1 = re.sub("Nebraska","Ne",match1)
            match1 = re.sub("Florida","Fl",match1)
            match1 = re.sub("Georgia","Ga",match1)
            match1 = re.sub("Washington","Wa",match1)
            match1 = re.sub(" New York"," Ny",match1)
            match1 = re.sub("Delaware","De",match1)
            match1 = re.sub("Tennessee","Tn",match1)
            match1 = re.sub("Missouri","Mo",match1)
            match1 = re.sub("Texas","Tx",match1)
            match1 = re.sub("Indiana","In",match1)
            match1 = re.sub("Pennsylvania","Pa",match1)
            match1 = re.sub("Oregon","Or",match1)
            match1 = re.sub("Virginia","Va",match1)
            match1 = re.sub("Illinois|Illinoi","Il",match1)
            match1 = re.sub("Kentucky","Ky",match1)
            match1 = re.sub("North Carolina","Nc",match1)
            match1 = re.sub("New Jersey","Nj",match1)
            match1 = re.sub("Colorado","Co",match1)
            match1 = re.sub("Maryland","Md",match1)
            match1 = re.sub("Ohio","Oh",match1)
            match1 = re.sub("Arizona","Az",match1)
            match1 = re.sub("Nevada","Nv",match1)
            match1 = re.sub("Utah","Ut",match1)
            match1 = re.sub("Michigan","Mi",match1)
            match1 = re.sub("New Hampshire","Nh",match1)
            match1 = re.sub("Vermont","Vt",match1)
            match1 = re.sub("Kansas","Ks",match1)
            match1 = re.sub("Oklahoma","Ok",match1)
            match1 = re.sub("Iowa","Ia",match1)
            match1 = re.sub("Louisiana","La",match1)
            match1 = re.sub("Rhode Island","Ri",match1)
            match1 = re.sub("Wisconsin","Wi",match1)
            match1 = re.sub("Hawaii","Hi",match1)
            match1 = re.sub("Montana","Mt",match1)
            match1 = re.sub("District Of Columbia","Dc",match1)
            match1 = re.sub("West Virginia","Wv",match1)
            match1 = re.sub("Alabama","Al",match1)
            match1 = re.sub("Idaho","Id",match1)
            match1 = re.sub("Maine","Me",match1)
            match1 = re.sub("New Mexico","Nm",match1)
            match1 = re.sub("South Carolina","Sc",match1)
            match1 = re.sub("North Dakota","Nd",match1)
            match1 = re.sub("South Dakota","Sd",match1)
            match1 = re.sub("Arkansas","Ar",match1)
            match1 = re.sub("Alaska","Ak",match1)
            match1 = re.sub("Wyoming","Wy",match1)
            match1 = re.sub("Mississippi","Ms",match1)
            match1 = re.sub("Minnesota","Mn",match1)  
            match1 = re.sub("Virginia","Va",match1)  

            ### final set of cleaning for the city-state pairs prior to adding them to city list and
            ### state list that will be appended to the input dataframe
            match1 = re.sub('Inpoli',"Indianapolis",match1)
            match1 = re.sub('Dalla Tx',"Dallas Tx",match1)
            match1 = re.sub('Fla',"Fl",match1)
            match1 = re.sub('Kipling Memphi Tn|Memphi Tn',"Memphis Tn",match1)
            match1 = re.sub('White Pln Ny',"White Plain Ny",match1)
            match1 = re.sub('X  Hyrum Ut',"Hyrum Ut",match1)
            match1 = re.sub('Saint Loui Mo',"Saint Louis Mo",match1)
            match1 = re.sub('Jenner   Irvine Ca',"Irvine Ca",match1)
            match1 = re.sub('Los Angele Ca|Nh   Los Angeles Ca',"Los Angeles Ca",match1)
            match1 = re.sub('N West Palm Beach Fl',"West Palm Beach Fl",match1)
            match1 = re.sub('Y  Pasadena Tx',"Pasadena Tx",match1)
            match1 = re.sub('S Chicago Il',"Chicago Il",match1)
            match1 = re.sub('J Houston Tx',"Houston Tx",match1)
            match1 = re.sub('R Ny Ny|Saint Ny Ny',"New York Ny",match1)
            match1 = re.sub('Mission Wood K',"Mission Wood Ks",match1)
            match1 = re.sub('Saint Charles Mo|Saint Charle Mo',"Saint Charles Mo",match1)
            match1 = re.sub('Ne Oh',"Oh",match1)
            match1 = re.sub('Brooklyn Ny Ny',"Brooklyn Ny",match1)
            match1 = re.sub('F Austin Tx',"Austin Tx",match1)
            match1 = re.sub('Co Spring Co',"Colorado Springs Co",match1)
            
            ### uses the stateList variable to extract the states from the record. The join function is
            ### applied to prevent brackets added to the data frame in later steps
            sub_states1.append(''.join(re.findall(stateList,match1)))

            ### there are records where only the city is present. This if statement evaluates the
            ### length of the sub_states1 variable. A length equal to 0 corresponds to a record where no
            ### state is associated, only a city name. The string is appended to the sub_city variable.
            ### A length greater than 0 corresponds to a record that has a city-state pair and the
            ### string is sliced to remove the two-letter state and space between the city-state pair.
            ### The result is appended to the sub_city variable
            if len(sub_states1) == 0:
                sub_city.append(match1)
            
            else:
                sub_city.append(match1[:-3])

    ### the city and state is appended to a new list that will be added to the input dataframe. The lists
    ### in the nested for loop above were emptied and recycled for the next round of records.
    states.append(sub_states1)
    city.append(sub_city)
    sub_states1 = []
    sub_city = []

### end timer and print total time
t1=time.time()
total=t1-t0
print("Total time is %4f" % (total/60), "mins")

### add the city and state lists to the input data frame and select the required columns
OC_results4['data_city']=city
OC_results4['data_state']=states

OC_results5=OC_results4.iloc[:,[1,2,3,4,6,7,52,12,42,43,46,47,55,56]]

### print general stats and first 5 records for dataset
display(OC_results5.info(),OC_results5.head())

Total time is 0.008344 mins
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2623 entries, 0 to 2622
Data columns (total 14 columns):
 #   Column                                Non-Null Count  Dtype 
---  ------                                --------------  ----- 
 0   assignee_id                           2623 non-null   object
 1   name                                  2034 non-null   object
 2   alternative_names_clean               2623 non-null   object
 3   previous_names_clean                  2623 non-null   object
 4   jurisdiction_code                     2034 non-null   object
 5   incorporation_date                    2034 non-null   object
 6   controlling_entity_jurisdiction_code  844 non-null    object
 7   branch_status                         897 non-null    object
 8   address_city                          976 non-null    object
 9   address_state                         961 non-null    object
 10  agent_city                            292 non-null    object
 11  ag

None

Unnamed: 0,assignee_id,name,alternative_names_clean,previous_names_clean,jurisdiction_code,incorporation_date,controlling_entity_jurisdiction_code,branch_status,address_city,address_state,agent_city,agent_state,data_city,data_state
0,5d5ead2f-1ef7-4db6-a6ce-9cdea523f834,,[],[],,,,,,,,,[],[]
1,8a841c57-22b6-4ad0-ad42-16532c3ab4fc,,[],[],,,,,,,,,[],[]
2,52e38962-9bfc-4082-b0c8-b7ba4ac04d83,Lendingtree,[],[Lendingtree],us_ak,1/3/2000,us_de,branch of an out-of-jurisdiction company,CHARLOTTE,NC,,,[],[]
3,52e38962-9bfc-4082-b0c8-b7ba4ac04d83,Lendingtree,"[Servicemagic Home Loans, RealestateCom Realto...","[Creditsource, Lendingtree]",us_ut,6/27/1997,us_de,branch of an out-of-jurisdiction company,CHARLOTTE,NC,,,[],[]
4,52e38962-9bfc-4082-b0c8-b7ba4ac04d83,Lendingtree,[],"[Lendingtree, Creditsource]",us_la,8/29/1997,us_de,branch of an out-of-jurisdiction company,BATON ROUGE,LA,,,"[Charlotte, Charlotte]","[Nc, Nc]"


In [13]:
### save the new dataset as a checkpoint
res_folder = "../csvResults/"
outpt_file = "OcResults1000ExtAltPreNamesDataCityStatesNew.csv"
a_full = os.path.join(res_folder,outpt_file)

# OC_results5.to_csv(a_full,index=False)

# Perform a Fuzzy Match Between the PatentsView Organization Name and the Alternative_Names_Clean and Previous_Names_Clean Fields

In [14]:
### perform a merge between the OC results and the full dataset to attach the PatentsView city and state
### to prepare the data for fuzzy matching between the two datasets
OC_results5.dropna(subset=['name'],inplace=True)

assingeeIDMerge=addTm1.merge(OC_results5,on=['assignee_id'],how='inner')
assingeeIDMerge['city'].replace("St\. ","Saint ",regex=True,inplace=True)
assingeeIDMerge['address_city'].replace("St\. |St ","Saint ",regex=True,inplace=True)

### print general stats and first 5 records for dataset
display(assingeeIDMerge.info(),assingeeIDMerge.head())

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2905 entries, 0 to 2904
Data columns (total 24 columns):
 #   Column                                Non-Null Count  Dtype  
---  ------                                --------------  -----  
 0   ID                                    2905 non-null   float64
 1   assignee_id                           2905 non-null   object 
 2   location_id                           2905 non-null   object 
 3   organization                          2905 non-null   object 
 4   city                                  2905 non-null   object 
 5   state                                 2905 non-null   object 
 6   dateOfFirstPat                        2905 non-null   object 
 7   serial                                798 non-null    object 
 8   tradeCity                             798 non-null    object 
 9   tradeState                            798 non-null    object 
 10  ctrlEntity                            798 non-null    object 
 11  name             

None

Unnamed: 0,ID,assignee_id,location_id,organization,city,state,dateOfFirstPat,serial,tradeCity,tradeState,...,jurisdiction_code,incorporation_date,controlling_entity_jurisdiction_code,branch_status,address_city,address_state,agent_city,agent_state,data_city,data_state
0,875.0,f7b31db8-0a3d-425e-a79f-d84ba6333b60,de5d6510-cb90-11eb-9615-121df0c29c1e,The Egg Factory,Roanoke,Va,01/20/1999,,,,...,us_va,12/22/1997,,,ROANOKE,Virginia,ROANOKE,Virginia,[],[]
1,1284.0,f3e6ae72-e2b1-4165-a0f7-9be773bf1e56,9447f283-cb8e-11eb-9615-121df0c29c1e,Tank Vision,Vineyard Haven,Ma,01/12/2017,,,,...,us_de,4/19/2005,,,,,,,[],[]
2,1284.0,f3e6ae72-e2b1-4165-a0f7-9be773bf1e56,9447f283-cb8e-11eb-9615-121df0c29c1e,Tank Vision,Vineyard Haven,Ma,01/12/2017,,,,...,us_ca,6/23/1994,,,ANAHEIM,CA,,,[Irvine],[Ca]
3,1284.0,f3e6ae72-e2b1-4165-a0f7-9be773bf1e56,9447f283-cb8e-11eb-9615-121df0c29c1e,Tank Vision,Vineyard Haven,Ma,01/12/2017,,,,...,us_ri,1/8/2019,us_de,branch of an out-of-jurisdiction company,VINEYARD HAVEN,MA,PROVIDENCE,RI,[],[]
4,1284.0,f3e6ae72-e2b1-4165-a0f7-9be773bf1e56,9447f283-cb8e-11eb-9615-121df0c29c1e,Tank Vision,Vineyard Haven,Ma,01/12/2017,,,,...,us_de,1/3/2017,,,,,,,[],[]


In [15]:
### convert the city names and states to capitalize the first character in the string
assingeeIDMerge['state']=assingeeIDMerge['state'].str.title()
assingeeIDMerge['address_city']=assingeeIDMerge['address_city'].str.title()
assingeeIDMerge['address_state']=assingeeIDMerge['address_state'].str.lower()
assingeeIDMerge['agent_city']=assingeeIDMerge['agent_city'].str.title()
assingeeIDMerge['agent_state']=assingeeIDMerge['agent_state'].str.lower()

#convert states from full names to two-letter abbreviations under the address_state and agent_state features
assingeeIDMerge.replace({'address_state':{"california":"ca","connecticut":"ct","massachusetts":"ma","florida":"fl",
                                          "georgia":"ga","washington":"wa","new york":"ny","delaware":"de","tennessee":"tn",
                                          "missouri":"mo","texas":"tx","indiana":"in","minnesota":"mn","pennsylvania":"pa",
                                          "oregon":"or","virginia":"va","illinois":"il","kentucky":"ky","north carolina":"nc",
                                          "new jersey":"nj","colorado":"co","maryland":"md","ohio":"oh","arizona":"az",
                                          "nevada":"nv","utah":"ut","michigan":"mi","new hampshire":"nh","vermont":"vt",
                                          "kansas":"ks","oklahoma":"ok","iowa":"ia","louisiana":"la","rhode island":"ri",
                                          "wisconsin":"wi","hawaii":"hi","montana":"mt","nebraska":"ne",
                                          "district of columbia":"dc","west virginia":"wv","alabama":"al","idaho":"id",
                                          "maine":"me","new mexico":"nm","south carolina":"sc","north dakota":"nd",
                                          "south dakota":"sd","arkansas":"ar","alaska":"ak","wyoming":"wy",
                                          "mississippi":"ms"}},inplace=True)

assingeeIDMerge.replace({'agent_state':{"california":"ca","connecticut":"ct","massachusetts":"ma","florida":"fl",
                                          "georgia":"ga","washington":"wa","new york":"ny","delaware":"de","tennessee":"tn",
                                          "missouri":"mo","texas":"tx","indiana":"in","minnesota":"mn","pennsylvania":"pa",
                                          "oregon":"or","virginia":"va","illinois":"il","kentucky":"ky","north carolina":"nc",
                                          "new jersey":"nj","colorado":"co","maryland":"md","ohio":"oh","arizona":"az",
                                          "nevada":"nv","utah":"ut","michigan":"mi","new hampshire":"nh","vermont":"vt",
                                          "kansas":"ks","oklahoma":"ok","iowa":"ia","louisiana":"la","rhode island":"ri",
                                          "wisconsin":"wi","hawaii":"hi","montana":"mt","nebraska":"ne",
                                          "district of columbia":"dc","west virginia":"wv","alabama":"al","idaho":"id",
                                          "maine":"me","new mexico":"nm","south carolina":"sc","north dakota":"nd",
                                          "south dakota":"sd","arkansas":"ar","alaska":"ak","wyoming":"wy",
                                          "mississippi":"ms"}},inplace=True)

### convert the first letter in each string to Uppercase and lowercase the remaining strings
assingeeIDMerge['address_state']=assingeeIDMerge['address_state'].str.title()
assingeeIDMerge['agent_state']=assingeeIDMerge['agent_state'].str.title()

### print general stats and first 5 records for dataset
display(assingeeIDMerge.info(),assingeeIDMerge.head())

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2905 entries, 0 to 2904
Data columns (total 24 columns):
 #   Column                                Non-Null Count  Dtype  
---  ------                                --------------  -----  
 0   ID                                    2905 non-null   float64
 1   assignee_id                           2905 non-null   object 
 2   location_id                           2905 non-null   object 
 3   organization                          2905 non-null   object 
 4   city                                  2905 non-null   object 
 5   state                                 2905 non-null   object 
 6   dateOfFirstPat                        2905 non-null   object 
 7   serial                                798 non-null    object 
 8   tradeCity                             798 non-null    object 
 9   tradeState                            798 non-null    object 
 10  ctrlEntity                            798 non-null    object 
 11  name             

None

Unnamed: 0,ID,assignee_id,location_id,organization,city,state,dateOfFirstPat,serial,tradeCity,tradeState,...,jurisdiction_code,incorporation_date,controlling_entity_jurisdiction_code,branch_status,address_city,address_state,agent_city,agent_state,data_city,data_state
0,875.0,f7b31db8-0a3d-425e-a79f-d84ba6333b60,de5d6510-cb90-11eb-9615-121df0c29c1e,The Egg Factory,Roanoke,Va,01/20/1999,,,,...,us_va,12/22/1997,,,Roanoke,Va,Roanoke,Va,[],[]
1,1284.0,f3e6ae72-e2b1-4165-a0f7-9be773bf1e56,9447f283-cb8e-11eb-9615-121df0c29c1e,Tank Vision,Vineyard Haven,Ma,01/12/2017,,,,...,us_de,4/19/2005,,,,,,,[],[]
2,1284.0,f3e6ae72-e2b1-4165-a0f7-9be773bf1e56,9447f283-cb8e-11eb-9615-121df0c29c1e,Tank Vision,Vineyard Haven,Ma,01/12/2017,,,,...,us_ca,6/23/1994,,,Anaheim,Ca,,,[Irvine],[Ca]
3,1284.0,f3e6ae72-e2b1-4165-a0f7-9be773bf1e56,9447f283-cb8e-11eb-9615-121df0c29c1e,Tank Vision,Vineyard Haven,Ma,01/12/2017,,,,...,us_ri,1/8/2019,us_de,branch of an out-of-jurisdiction company,Vineyard Haven,Ma,Providence,Ri,[],[]
4,1284.0,f3e6ae72-e2b1-4165-a0f7-9be773bf1e56,9447f283-cb8e-11eb-9615-121df0c29c1e,Tank Vision,Vineyard Haven,Ma,01/12/2017,,,,...,us_de,1/3/2017,,,,,,,[],[]


In [16]:
### import the libraries required for fuzzy matching; scoring is between the organization names in PatentsView
### against the name, alternative_names_clean, and previous_names_clean features in the OpenCoporates results
from fuzzywuzzy import fuzz
from fuzzywuzzy import process

### start timer
t0=time.time()

### determine the length of the data frame and create 2 empty lists
aa=len(assingeeIDMerge)
mat1=[]
mat2=[]

for i in range(aa):

    ### try/except is used to bypass cells with an NaN. Removing this will force the user
    ### to deal with exception errors
    try:
        ### calculate the fuzzy score between the organization name in PatentsView and name
        ### from the OpenCorporates results for further evaluation
        q=fuzz.ratio(assingeeIDMerge.iloc[i,3], assingeeIDMerge.iloc[i,11])

        ### most simplest and most representative in the data; if the score is 100, it is
        ### a perfect match, otherwise, the remaining code will resolve score less than 100
        if q == 100:
            mat1.append(q)
            mat2.append(assingeeIDMerge.iloc[i,3])
        
        ### this section resolves all q values not equal to 100
        elif ( q != 100 ):

            ### calculate the length of the data in each row for the alternative_names_clean
            ### column and previous_names_clean column
            if len(assingeeIDMerge.iloc[i,12]) == 0:
                r=0
            
            elif len(assingeeIDMerge.iloc[i,12]) > 0:
                r=len(assingeeIDMerge.iloc[i,12])
            
            if len(assingeeIDMerge.iloc[i,13]) == 0:
                d=0
            
            elif len(assingeeIDMerge.iloc[i,13]) > 0:
                d=len(assingeeIDMerge.iloc[i,13])
                        
            ### the remaining parts of the code uses if statements to step through the many
            ### conditions that may be present in the data. As each condition is satisfied,
            ### the mat1 and mat2 lists are appended with the data; take note that q is the
            ### original score and s, v, e, and f are separate scores that are compared
            ### against q. The first if statement evaluates the lengths of r and q, and if
            ### both are zero, skips the remaining code and appends mat1 and mat2 with the
            ### fuzzy score and organization name from OpenCorporates as the top hit
            if r == 0 and d == 0:
                mat1.append(q)
                mat2.append(assingeeIDMerge['name'][i])
                                         
            ### calculate the score for the alternative_names_clean and previous_names_clean
            ### columns. Once calculated, they are compared against each other and q to
            ### determine the score that is highest. The 'best' score is appened to mat1 and
            ### the name of the organization is appended to mat2. The same scorer is utilized 
            ### as the above but the process.extractOne function retrieves the organization
            ### match with the highest score and saves it as a tuple
            elif r == 1 and d == 1:
                s=process.extractOne(assingeeIDMerge['organization'][i],
                                     assingeeIDMerge['alternative_names_clean'][i],scorer=fuzz.ratio)
                e=process.extractOne(assingeeIDMerge['organization'][i],
                                     assingeeIDMerge['previous_names_clean'][i],scorer=fuzz.ratio)
                
                if s[1] > e[1] and s[1] > q:
                    mat1.append(s[1])
                    mat2.append(s[0])
                
                elif e[1] > s[1] and e[1] > q:
                    mat1.append(e[1])
                    mat2.append(e[0])
                
                elif e[1] == q or s[1] == q:
                    mat1.append(q)
                    mat2.append(assingeeIDMerge['name'][i])
                
                else:
                    mat1.append(q)
                    mat2.append(assingeeIDMerge['name'][i])
            
            ### this section is the same as above, except the code is looking at the
            ### alternative_names_clean column only
            elif r == 1:
                s=process.extractOne(assingeeIDMerge['organization'][i],
                                     assingeeIDMerge['alternative_names_clean'][i],scorer=fuzz.ratio)

                if s[1] > q:
                    mat1.append(s[1])
                    mat2.append(s[0])
                
                elif s[1] == q:
                    mat1.append(q)
                    mat2.append(assingeeIDMerge['name'][i])
                    
                elif s[1] < q:
                    mat1.append(q)
                    mat2.append(assingeeIDMerge['name'][i])
            
            ### this section is the same as above, except the code is looking at the
            ### previous_names_clean column only
            elif d == 1:
                e=process.extractOne(assingeeIDMerge['organization'][i],
                                     assingeeIDMerge['previous_names_clean'][i],scorer=fuzz.ratio)

                if e[1] > q:
                    mat1.append(e[1])
                    mat2.append(e[0])
                
                elif e[1] == q:
                    mat1.append(q)
                    mat2.append(assingeeIDMerge['name'][i])
                    
                elif e[1] < q:
                    mat1.append(q)
                    mat2.append(assingeeIDMerge['name'][i])      
            
            ## the following two sections resolve r and d lengths greater than 1 (i.e.,
            ## records that have more than 1 company names in the alternative_names_clean 
            ## and previous_names_clean columns)
            elif r > 1:
                v=process.extractOne(assingeeIDMerge['organization'][i],
                                     assingeeIDMerge['alternative_names_clean'][i],scorer=fuzz.ratio)                

                if v[1] > q:
                    mat1.append(v[1])
                    mat2.append(v[0])
                    
                elif v[1] == q:
                    mat1.append(q)
                    mat2.append(assingeeIDMerge['name'][i])
                    
                elif v[1] < q:
                    mat1.append(q)
                    mat2.append(assingeeIDMerge['name'][i])
            
            elif d > 1:
                f=process.extractOne(assingeeIDMerge['organization'][i],
                                     assingeeIDMerge['previous_names_clean'][i],scorer=fuzz.ratio)

                if f[1] > q:
                    mat1.append(f[1])
                    mat2.append(f[0])
                    
                elif f[1] == q:
                    mat1.append(q)
                    mat2.append(assingeeIDMerge['name'][i])                    
                    
                elif f[1] < q:
                    mat1.append(q)
                    mat2.append(assingeeIDMerge['name'][i])

    except:
        ### error handling that places an NaN for every cells that does not have a value
        ### in the mat1 and/or mat2 lists
        mat1.append(np.nan)
        mat2.append(np.nan)

### print general stats and first 5 records for dataset
t1=time.time()
total=t1-t0
print("Total time is %4f" % (total/60), "mins")

Total time is 0.006194 mins


In [17]:
### start timer
t0=time.time()

### combines mat1 and mat2 into a single dataframe
finalList=[list(w) for w in zip(mat1, mat2)]
finalListDf=pd.DataFrame(finalList,columns=['scores','names'])

### end timer and print total time
t1=time.time()
total=t1-t0
print("Total time is %4f" % (total/60), "mins")

### print general stats and first 5 records for dataset
display(finalListDf.info(),finalListDf.head())

Total time is 0.000034 mins
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2905 entries, 0 to 2904
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   scores  2905 non-null   int64 
 1   names   2905 non-null   object
dtypes: int64(1), object(1)
memory usage: 45.5+ KB


None

Unnamed: 0,scores,names
0,100,The Egg Factory
1,62,The Vision Tank
2,61,Tank Vision Environmental
3,100,Tank Vision
4,100,Tank Vision


In [18]:
### start timer
t0=time.time()

### appends the original dataframe with the results from the scoring steps in the preceeding code blocks
assingeeIDMerge['nameScores'] = finalListDf['scores']
assingeeIDMerge['matchNames'] = finalListDf['names']

### end timer and print total time
t1=time.time()
total=t1-t0
print("Total time is %4f" % (total/60), "mins")

### print general stats and first 5 records for dataset
display(assingeeIDMerge.info(),assingeeIDMerge.head())

Total time is 0.000017 mins
<class 'pandas.core.frame.DataFrame'>
Int64Index: 2905 entries, 0 to 2904
Data columns (total 26 columns):
 #   Column                                Non-Null Count  Dtype  
---  ------                                --------------  -----  
 0   ID                                    2905 non-null   float64
 1   assignee_id                           2905 non-null   object 
 2   location_id                           2905 non-null   object 
 3   organization                          2905 non-null   object 
 4   city                                  2905 non-null   object 
 5   state                                 2905 non-null   object 
 6   dateOfFirstPat                        2905 non-null   object 
 7   serial                                798 non-null    object 
 8   tradeCity                             798 non-null    object 
 9   tradeState                            798 non-null    object 
 10  ctrlEntity                            798 non-null    ob

None

Unnamed: 0,ID,assignee_id,location_id,organization,city,state,dateOfFirstPat,serial,tradeCity,tradeState,...,controlling_entity_jurisdiction_code,branch_status,address_city,address_state,agent_city,agent_state,data_city,data_state,nameScores,matchNames
0,875.0,f7b31db8-0a3d-425e-a79f-d84ba6333b60,de5d6510-cb90-11eb-9615-121df0c29c1e,The Egg Factory,Roanoke,Va,01/20/1999,,,,...,,,Roanoke,Va,Roanoke,Va,[],[],100,The Egg Factory
1,1284.0,f3e6ae72-e2b1-4165-a0f7-9be773bf1e56,9447f283-cb8e-11eb-9615-121df0c29c1e,Tank Vision,Vineyard Haven,Ma,01/12/2017,,,,...,,,,,,,[],[],62,The Vision Tank
2,1284.0,f3e6ae72-e2b1-4165-a0f7-9be773bf1e56,9447f283-cb8e-11eb-9615-121df0c29c1e,Tank Vision,Vineyard Haven,Ma,01/12/2017,,,,...,,,Anaheim,Ca,,,[Irvine],[Ca],61,Tank Vision Environmental
3,1284.0,f3e6ae72-e2b1-4165-a0f7-9be773bf1e56,9447f283-cb8e-11eb-9615-121df0c29c1e,Tank Vision,Vineyard Haven,Ma,01/12/2017,,,,...,us_de,branch of an out-of-jurisdiction company,Vineyard Haven,Ma,Providence,Ri,[],[],100,Tank Vision
4,1284.0,f3e6ae72-e2b1-4165-a0f7-9be773bf1e56,9447f283-cb8e-11eb-9615-121df0c29c1e,Tank Vision,Vineyard Haven,Ma,01/12/2017,,,,...,,,,,,,[],[],100,Tank Vision


In [19]:
### rearrange and select the features for further evaluation
assingeeIDMerge1=assingeeIDMerge
assingeeIDMerge1.drop(labels=['name','alternative_names_clean','previous_names_clean'],axis=1,inplace=True)

eigh_col = assingeeIDMerge1.pop('nameScores')
nine_col = assingeeIDMerge1.pop('matchNames')

assingeeIDMerge1.insert(8, 'nameScores', eigh_col)
assingeeIDMerge1.insert(9, 'matchNames', nine_col)

### print general stats and first 5 records for dataset
display(assingeeIDMerge1.info(),assingeeIDMerge1.head())

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2905 entries, 0 to 2904
Data columns (total 23 columns):
 #   Column                                Non-Null Count  Dtype  
---  ------                                --------------  -----  
 0   ID                                    2905 non-null   float64
 1   assignee_id                           2905 non-null   object 
 2   location_id                           2905 non-null   object 
 3   organization                          2905 non-null   object 
 4   city                                  2905 non-null   object 
 5   state                                 2905 non-null   object 
 6   dateOfFirstPat                        2905 non-null   object 
 7   serial                                798 non-null    object 
 8   nameScores                            2905 non-null   int64  
 9   matchNames                            2905 non-null   object 
 10  tradeCity                             798 non-null    object 
 11  tradeState       

None

Unnamed: 0,ID,assignee_id,location_id,organization,city,state,dateOfFirstPat,serial,nameScores,matchNames,...,jurisdiction_code,incorporation_date,controlling_entity_jurisdiction_code,branch_status,address_city,address_state,agent_city,agent_state,data_city,data_state
0,875.0,f7b31db8-0a3d-425e-a79f-d84ba6333b60,de5d6510-cb90-11eb-9615-121df0c29c1e,The Egg Factory,Roanoke,Va,01/20/1999,,100,The Egg Factory,...,us_va,12/22/1997,,,Roanoke,Va,Roanoke,Va,[],[]
1,1284.0,f3e6ae72-e2b1-4165-a0f7-9be773bf1e56,9447f283-cb8e-11eb-9615-121df0c29c1e,Tank Vision,Vineyard Haven,Ma,01/12/2017,,62,The Vision Tank,...,us_de,4/19/2005,,,,,,,[],[]
2,1284.0,f3e6ae72-e2b1-4165-a0f7-9be773bf1e56,9447f283-cb8e-11eb-9615-121df0c29c1e,Tank Vision,Vineyard Haven,Ma,01/12/2017,,61,Tank Vision Environmental,...,us_ca,6/23/1994,,,Anaheim,Ca,,,[Irvine],[Ca]
3,1284.0,f3e6ae72-e2b1-4165-a0f7-9be773bf1e56,9447f283-cb8e-11eb-9615-121df0c29c1e,Tank Vision,Vineyard Haven,Ma,01/12/2017,,100,Tank Vision,...,us_ri,1/8/2019,us_de,branch of an out-of-jurisdiction company,Vineyard Haven,Ma,Providence,Ri,[],[]
4,1284.0,f3e6ae72-e2b1-4165-a0f7-9be773bf1e56,9447f283-cb8e-11eb-9615-121df0c29c1e,Tank Vision,Vineyard Haven,Ma,01/12/2017,,100,Tank Vision,...,us_de,1/3/2017,,,,,,,[],[]


In [20]:
### save the new dataset as a checkpoint
res_folder = "../csvResults/"
outpt_file = "OcResults1000ExtAltPreNamesDataCityStatesFuzzyNameScoresNew.csv"
a_full = os.path.join(res_folder,outpt_file)

# assingeeIDMerge1.to_csv(a_full,index=False)

In [21]:
### below selects the max fuzzy score for each assignee, but that should not happen now if we are
### looking for controlling jurisdictions

# Match the City-States from the OpenCorporates Data Field to the PatentsView Data 

In [22]:
### start timer
t0=time.time()

### determine the length of data frame to set the range of the for loop
assigneeScores8=assingeeIDMerge1
t=len(assigneeScores8)

### set empty lists to be filled with each record
sub_city  = []
sub_state = []
city  = []
state = []

### for loop searches the data_city and data_state fields and matches to the city and states listed
### under the PatentsView data. If a match is identified the city or state is recorded and saved in
### the sub_city or sub_state list and appended to the city and state list, respectively. Any records
### with no data are skipped
for j in range(t):
    ### the try/except is applied to handle exception errors
    try:
        ### check the size, or number, of cities and states. If zero records exist, the record is
        ### skipped. If either equal one, the string is compared to the PatentsView string. If the
        ### size is larger than one, the nested loops for cities and states is used to check all
        ### instances against the PatentsView record
        d=len(assigneeScores8.iloc[j,21])
        c=len(assigneeScores8.iloc[j,22])

        if d == 0:
            pass
        
        elif d == 1:
            if assigneeScores8.iloc[j,21][0] == assigneeScores8.iloc[j,4]:
                sub_city.append(assigneeScores8.iloc[j,21][0])
            
            elif assigneeScores8.iloc[j,21][0] != assigneeScores8.iloc[j,4]:
                sub_city.append("")

        elif d > 1:
            ### if d is greater than one, the nested for loop is applied to check all instances
            ### against the PatentsView record. If a match is found, break is applied to end
            ### the nested for loop
            for k in range(d):
                if assigneeScores8.iloc[j,21][k]==assigneeScores8.iloc[j,4]:
                    sub_city.append(assigneeScores8.iloc[j,21][k])
                    break                    
        
        ### append the city list with matching records in sub_city and empty the sub_city list
        city.append(''.join(sub_city))
        sub_city = []
        
        
        if c == 0:
            pass
        
        elif c == 1:
            if assigneeScores8.iloc[j,22][0] == assigneeScores8.iloc[j,5]:
                sub_state.append(assigneeScores8.iloc[j,22][0])
                
        elif c > 1:
            ### if c is greater than one, the nested for loop is applied to check all instances
            ### against the PatentsView record. If a match is found, break is applied to end
            ### the nested for loop
            for k in range(c):
                if assigneeScores8.iloc[j,22][k] == assigneeScores8.iloc[j,5]:
                    sub_state.append(assigneeScores8.iloc[j,22][k])
                    break
        
        ### append the state list with matching records in sub_state and empty the sub_state list
        state.append(''.join(sub_state))
        sub_state = []
        
    except:
        city.append(np.nan)
        state.append(np.nan)
        
### end timer and print total time
t1=time.time()
total=t1-t0
print("Total time is %4f" % (total/60), "mins")

Total time is 0.004412 mins


In [23]:
### start timer
t0=time.time()

### construct a dataframe with the city and states matching the PatentsView data
dataList=[list(w) for w in zip(city, state)]
dataListDf=pd.DataFrame(dataList,columns=['data_city','data_state']).reset_index(drop=True)

### end timer and print total time
t1=time.time()
total=t1-t0
print("Total time is %4f" % (total/60), "mins")

### print general stats and first 5 records for dataset
display(dataListDf.info(),dataListDf.head())

Total time is 0.000033 mins
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2905 entries, 0 to 2904
Data columns (total 2 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   data_city   2905 non-null   object
 1   data_state  2905 non-null   object
dtypes: object(2)
memory usage: 45.5+ KB


None

Unnamed: 0,data_city,data_state
0,,
1,,
2,,
3,,
4,,


In [24]:
### start timer
t0=time.time()

### appends the original dataframe with the city-states that match the PatentsView data and select
### the columns for further processing
assigneeScores8.drop(labels=['data_city','data_state'],axis=1,inplace=True)

assigneeScores8['data_city'] = dataListDf['data_city']
assigneeScores8['data_state'] = dataListDf['data_state']
assigneeScores8['address_city'].replace("St\. ","Saint ",inplace=True,regex=True)
assigneeScores8['address_city'].replace("Mpls","Minneapolis",inplace=True,regex=True)
assigneeScores8['address_city'].replace("Muskegon, Mi","Muskegon",inplace=True,regex=True)

assigneeScores8['address_city'].replace("Null, |Att.+?Rothman,","",inplace=True,regex=True)

### end timer and print total time
t1=time.time()
total=t1-t0
print("Total time is %4f" % (total/60), "mins")

### print general stats and first 5 records for dataset
display(assigneeScores8.info(),assigneeScores8.head())

Total time is 0.000248 mins
<class 'pandas.core.frame.DataFrame'>
Int64Index: 2905 entries, 0 to 2904
Data columns (total 23 columns):
 #   Column                                Non-Null Count  Dtype  
---  ------                                --------------  -----  
 0   ID                                    2905 non-null   float64
 1   assignee_id                           2905 non-null   object 
 2   location_id                           2905 non-null   object 
 3   organization                          2905 non-null   object 
 4   city                                  2905 non-null   object 
 5   state                                 2905 non-null   object 
 6   dateOfFirstPat                        2905 non-null   object 
 7   serial                                798 non-null    object 
 8   nameScores                            2905 non-null   int64  
 9   matchNames                            2905 non-null   object 
 10  tradeCity                             798 non-null    ob

None

Unnamed: 0,ID,assignee_id,location_id,organization,city,state,dateOfFirstPat,serial,nameScores,matchNames,...,jurisdiction_code,incorporation_date,controlling_entity_jurisdiction_code,branch_status,address_city,address_state,agent_city,agent_state,data_city,data_state
0,875.0,f7b31db8-0a3d-425e-a79f-d84ba6333b60,de5d6510-cb90-11eb-9615-121df0c29c1e,The Egg Factory,Roanoke,Va,01/20/1999,,100,The Egg Factory,...,us_va,12/22/1997,,,Roanoke,Va,Roanoke,Va,,
1,1284.0,f3e6ae72-e2b1-4165-a0f7-9be773bf1e56,9447f283-cb8e-11eb-9615-121df0c29c1e,Tank Vision,Vineyard Haven,Ma,01/12/2017,,62,The Vision Tank,...,us_de,4/19/2005,,,,,,,,
2,1284.0,f3e6ae72-e2b1-4165-a0f7-9be773bf1e56,9447f283-cb8e-11eb-9615-121df0c29c1e,Tank Vision,Vineyard Haven,Ma,01/12/2017,,61,Tank Vision Environmental,...,us_ca,6/23/1994,,,Anaheim,Ca,,,,
3,1284.0,f3e6ae72-e2b1-4165-a0f7-9be773bf1e56,9447f283-cb8e-11eb-9615-121df0c29c1e,Tank Vision,Vineyard Haven,Ma,01/12/2017,,100,Tank Vision,...,us_ri,1/8/2019,us_de,branch of an out-of-jurisdiction company,Vineyard Haven,Ma,Providence,Ri,,
4,1284.0,f3e6ae72-e2b1-4165-a0f7-9be773bf1e56,9447f283-cb8e-11eb-9615-121df0c29c1e,Tank Vision,Vineyard Haven,Ma,01/12/2017,,100,Tank Vision,...,us_de,1/3/2017,,,,,,,,


In [30]:
### start timer
t0=time.time()

### remove non-US based organizations and construct a new feature containing the state for the record
### from the jurisdiction_code column
assigneeScores9=assigneeScores8[assigneeScores8['jurisdiction_code'].str.contains('us_',na=False)]
assigneeScores9['subJurisCode']=assigneeScores9['jurisdiction_code'].str.slice(3).str.title()

# assigneeScores3=assigneeScores2[assigneeScores2['controlling_entity_jurisdiction_code'].str.contains('us_',na=False)]
assigneeScores9['subCntlEntity']=assigneeScores9['controlling_entity_jurisdiction_code'].str.slice(3).str.title()

assigneeScores10=assigneeScores9.iloc[:,[0,1,2,3,4,5,6,7,10,11,12,8,9,23,
                                         24,14,16,17,18,19,20,21,22]].sort_values(by=['ID'])

### end timer and print total time
t1=time.time()
total=t1-t0
print("Total time is %4f" % (total/60), "mins")

### print general stats and first 5 records for dataset
display(assigneeScores10.info(),assigneeScores10.head())

Total time is 0.000167 mins
<class 'pandas.core.frame.DataFrame'>
Int64Index: 2696 entries, 0 to 280
Data columns (total 23 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   ID                  2696 non-null   float64
 1   assignee_id         2696 non-null   object 
 2   location_id         2696 non-null   object 
 3   organization        2696 non-null   object 
 4   city                2696 non-null   object 
 5   state               2696 non-null   object 
 6   dateOfFirstPat      2696 non-null   object 
 7   serial              715 non-null    object 
 8   tradeCity           715 non-null    object 
 9   tradeState          715 non-null    object 
 10  ctrlEntity          715 non-null    object 
 11  nameScores          2696 non-null   int64  
 12  matchNames          2696 non-null   object 
 13  subJurisCode        2696 non-null   object 
 14  subCntlEntity       1298 non-null   object 
 15  incorporation_date  2696 non

None

Unnamed: 0,ID,assignee_id,location_id,organization,city,state,dateOfFirstPat,serial,tradeCity,tradeState,...,subJurisCode,subCntlEntity,incorporation_date,branch_status,address_city,address_state,agent_city,agent_state,data_city,data_state
0,875.0,f7b31db8-0a3d-425e-a79f-d84ba6333b60,de5d6510-cb90-11eb-9615-121df0c29c1e,The Egg Factory,Roanoke,Va,01/20/1999,,,,...,Va,,12/22/1997,,Roanoke,Va,Roanoke,Va,,
1,1284.0,f3e6ae72-e2b1-4165-a0f7-9be773bf1e56,9447f283-cb8e-11eb-9615-121df0c29c1e,Tank Vision,Vineyard Haven,Ma,01/12/2017,,,,...,De,,4/19/2005,,,,,,,
2,1284.0,f3e6ae72-e2b1-4165-a0f7-9be773bf1e56,9447f283-cb8e-11eb-9615-121df0c29c1e,Tank Vision,Vineyard Haven,Ma,01/12/2017,,,,...,Ca,,6/23/1994,,Anaheim,Ca,,,,
3,1284.0,f3e6ae72-e2b1-4165-a0f7-9be773bf1e56,9447f283-cb8e-11eb-9615-121df0c29c1e,Tank Vision,Vineyard Haven,Ma,01/12/2017,,,,...,Ri,De,1/8/2019,branch of an out-of-jurisdiction company,Vineyard Haven,Ma,Providence,Ri,,
4,1284.0,f3e6ae72-e2b1-4165-a0f7-9be773bf1e56,9447f283-cb8e-11eb-9615-121df0c29c1e,Tank Vision,Vineyard Haven,Ma,01/12/2017,,,,...,De,,1/3/2017,,,,,,,


In [31]:
# assigneeScores10.to_csv("../csvResults/aReview.csv",index=False)

In [28]:
### start timer
t0=time.time()

assigneeScoresLoc=pd.DataFrame(assigneeScores8.groupby(by=['organization'])['subJurisCode'].unique())
assigneeScoresLoc.rename(columns={"subJurisCode":"orgLoc"},inplace=True)

assigneeScoresLoc1=assigneeScoresLoc.merge(assigneeScores8,on=['organization'],how='inner')

for q in range(len(assigneeScoresLoc1)):
    assigneeScoresLoc1.iloc[q,1]=','.join(assigneeScoresLoc1.iloc[q,1])

assigneeScoresLoc2=assigneeScoresLoc1.iloc[:,[2,3,4,0,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,1]].sort_values(by=['ID'])

### end timer and print total time
t1=time.time()
total=t1-t0
print("Total time is %4f" % (total/60), "mins")

### print general stats and first 5 records for dataset
display(assigneeScoresLoc2.info(),assigneeScoresLoc2.head())

KeyError: 'Column not found: subJurisCode'

In [57]:
### save the new dataset as a checkpoint
res_folder = "../csvResults/"
outpt_file = "OcResults1000ExtAltPreNamesDataCityStatesFuzzyNameScoresPreFilteredLabelStatesMatchCityStatesNew.csv"
a_full = os.path.join(res_folder,outpt_file)

# assigneeScoresLoc2.to_csv(a_full,index=False)

# Extract the Earliest First Patent for Each Assignee and Replace the Existing Date with the Extract Date

In [45]:
### start timer
t0=time.time()

assigneeScores3['incorporation_date'] = pd.to_datetime(assigneeScores3['incorporation_date'])
assigneeScores3['incorporation_date'] = assigneeScores3['incorporation_date'].dt.strftime('%m/%d/%Y')
assigneeScores3['incorporation_date'] = pd.to_datetime(assigneeScores3['incorporation_date'])

assigneeScores4=assigneeScores3.groupby(by=['assignee_id'],as_index=False)
assigneeScores5=assigneeScores4.agg(min_date=('incorporation_date',np.min))

### end timer and print total time
t1=time.time()
total=t1-t0
print("Total time is %4f" % (total/60), "mins")

display(assigneeScores5.info(),assigneeScores5.head())

Total time is 0.001252 mins
<class 'pandas.core.frame.DataFrame'>
Int64Index: 377 entries, 0 to 376
Data columns (total 2 columns):
 #   Column       Non-Null Count  Dtype         
---  ------       --------------  -----         
 0   assignee_id  377 non-null    object        
 1   min_date     377 non-null    datetime64[ns]
dtypes: datetime64[ns](1), object(1)
memory usage: 8.8+ KB


None

Unnamed: 0,assignee_id,min_date
0,00059b74-f414-43cb-a36e-e30fb41c9285,1979-02-15
1,004053ef-9a48-4d23-9fd9-93ff47b7c299,2019-04-26
2,00de4104-b930-4719-a790-18e59618ac05,2020-09-02
3,0116426f-e05d-4c31-bed1-5e811c29400a,2000-02-17
4,01756cf3-dcd4-4332-a3ff-a240e090c08f,2007-10-09


In [46]:
### start timer
t0=time.time()

assigneeScores6=assigneeScores5.merge(assigneeScores3,left_on=['assignee_id'],
                                      right_on=['assignee_id'],how='inner')
assigneeScores7=assigneeScores6.iloc[:,[2,0,3,4,5,6,7,8,9,10,11,12,1,14,15,16,17,18,19]].sort_values(by=['ID'])
assigneeScores7.rename(columns={'min_date':'minIncDate'},inplace=True)

### end timer and print total time
t1=time.time()
total=t1-t0
print("Total time is %4f" % (total/60), "mins")

display(assigneeScores7.info(),assigneeScores7.head())

Total time is 0.000133 mins
<class 'pandas.core.frame.DataFrame'>
Int64Index: 1523 entries, 1460 to 542
Data columns (total 19 columns):
 #   Column          Non-Null Count  Dtype         
---  ------          --------------  -----         
 0   ID              1523 non-null   int64         
 1   assignee_id     1523 non-null   object        
 2   location_id     1523 non-null   object        
 3   organization    1523 non-null   object        
 4   city            1523 non-null   object        
 5   state           1523 non-null   object        
 6   dateOfFirstPat  1523 non-null   object        
 7   nameScores      1523 non-null   int64         
 8   matchNames      1523 non-null   object        
 9   subJurisCode    1523 non-null   object        
 10  subCntlEntity   852 non-null    object        
 11  branch_status   915 non-null    object        
 12  minIncDate      1523 non-null   datetime64[ns]
 13  address_city    761 non-null    object        
 14  address_state   754 non-nu

None

Unnamed: 0,ID,assignee_id,location_id,organization,city,state,dateOfFirstPat,nameScores,matchNames,subJurisCode,subCntlEntity,branch_status,minIncDate,address_city,address_state,agent_city,agent_state,data_city,data_state
1460,875,f7b31db8-0a3d-425e-a79f-d84ba6333b60,de5d6510-cb90-11eb-9615-121df0c29c1e,The Egg Factory,Roanoke,Va,01/20/1999,100,The Egg Factory,Va,,,1997-12-22,Roanoke,Va,Roanoke,Va,[],[]
1456,1284,f3e6ae72-e2b1-4165-a0f7-9be773bf1e56,9447f283-cb8e-11eb-9615-121df0c29c1e,Tank Vision,Vineyard Haven,Ma,01/12/2017,100,Tank Vision,Ri,De,branch of an out-of-jurisdiction company,2017-01-03,Vineyard Haven,Ma,Providence,Ri,[],[]
1455,1284,f3e6ae72-e2b1-4165-a0f7-9be773bf1e56,9447f283-cb8e-11eb-9615-121df0c29c1e,Tank Vision,Vineyard Haven,Ma,01/12/2017,100,Tank Vision,De,,,2017-01-03,,,,,[],[]
1428,1667,f0447e02-fb80-46f7-a97c-cb4b2f398aeb,4b56fc31-cb8e-11eb-9615-121df0c29c1e,Infinibox,Westwood,Ma,12/28/2012,100,Infinibox,De,,,2013-01-31,,,,,[],[]
1415,2655,e62b4591-f071-4597-b791-35e9075d2af1,f97aecf0-cb90-11eb-9615-121df0c29c1e,Agricultural Aviation Engineering Company,Las Vegas,Nv,10/23/1980,100,Agricultural Aviation Engineering Company,Ca,,,1958-05-01,Las Vegas,Nv,,,[Las Vegas],[Nv]


In [48]:
### save the new dataset as a checkpoint
res_folder = "../csvResults/"
outpt_file = "OcResults1000ExtAltPreNamesDataCityStatesFuzzyNameScoresPreFilteredNew.csv"
a_full = os.path.join(res_folder,outpt_file)

# assigneeScores7.to_csv(a_full,index=False)

In [49]:
### determine the number of unique assignee_ids in the data
assigneeScores7['assignee_id'].nunique()

377

In [41]:
# ### group the data by assignee_id and select the record with the highest fuzzy score
# assigneeScores=pd.DataFrame(assingeeIDMerge1.groupby(by=['assignee_id'],as_index=False)['nameScores'].max())

# ### print general stats and first 5 records for dataset
# display(assigneeScores.info(),assigneeScores.head())

<class 'pandas.core.frame.DataFrame'>
Int64Index: 387 entries, 0 to 386
Data columns (total 2 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   assignee_id  387 non-null    object
 1   nameScores   387 non-null    int64 
dtypes: int64(1), object(1)
memory usage: 9.1+ KB


None

Unnamed: 0,assignee_id,nameScores
0,00059b74-f414-43cb-a36e-e30fb41c9285,100
1,004053ef-9a48-4d23-9fd9-93ff47b7c299,100
2,00de4104-b930-4719-a790-18e59618ac05,100
3,0116426f-e05d-4c31-bed1-5e811c29400a,100
4,01756cf3-dcd4-4332-a3ff-a240e090c08f,100


In [42]:
# ### start timer
# t0=time.time()

# ### perform an inner merge with the previous data frame constructed by the groupby function with the assingeeIDMerge1
# ### data using the assignee_id and nameScores as matching features, select the columns to keep, and sort the data
# ### by the ID
# assigneeScores1=assigneeScores.merge(assingeeIDMerge1,on=['assignee_id','nameScores'],
#                                      how='inner').iloc[:,[2,0,3,4,5,6,7,1,9,8,10,11,12,13,14,15,16,17,18]]

# ### end timer and print total time
# t1=time.time()
# total=t1-t0
# print("Total time is %4f" % (total/60), "mins")

# ### print general stats and first 5 records for dataset
# display(assigneeScores1.info(),assigneeScores1.head())

Total time is 0.000098 mins
<class 'pandas.core.frame.DataFrame'>
Int64Index: 1564 entries, 0 to 1563
Data columns (total 19 columns):
 #   Column                                Non-Null Count  Dtype 
---  ------                                --------------  ----- 
 0   ID                                    1564 non-null   int64 
 1   assignee_id                           1564 non-null   object
 2   location_id                           1564 non-null   object
 3   organization                          1564 non-null   object
 4   city                                  1564 non-null   object
 5   state                                 1564 non-null   object
 6   dateOfFirstPat                        1564 non-null   object
 7   nameScores                            1564 non-null   int64 
 8   jurisdiction_code                     1564 non-null   object
 9   matchNames                            1564 non-null   object
 10  incorporation_date                    1564 non-null   object
 11  co

None

Unnamed: 0,ID,assignee_id,location_id,organization,city,state,dateOfFirstPat,nameScores,jurisdiction_code,matchNames,incorporation_date,controlling_entity_jurisdiction_code,branch_status,address_city,address_state,agent_city,agent_state,data_city,data_state
0,49989,00059b74-f414-43cb-a36e-e30fb41c9285,fe67d7f9-cb8f-11eb-9615-121df0c29c1e,Qbe Holdings,New York,Ny,10/15/2009,100,us_pa,Qbe Holdings,4/25/2012,us_de,branch of an out-of-jurisdiction company,,,,,[],[]
1,49989,00059b74-f414-43cb-a36e-e30fb41c9285,fe67d7f9-cb8f-11eb-9615-121df0c29c1e,Qbe Holdings,New York,Ny,10/15/2009,100,us_de,Qbe Holdings,2/15/1979,,,,,,,[],[]
2,49989,00059b74-f414-43cb-a36e-e30fb41c9285,fe67d7f9-cb8f-11eb-9615-121df0c29c1e,Qbe Holdings,New York,Ny,10/15/2009,100,us_ny,Qbe Holdings,3/6/1979,us_de,branch of an out-of-jurisdiction company,New York,Ny,,,[New York],[Ny]
3,49989,00059b74-f414-43cb-a36e-e30fb41c9285,fe67d7f9-cb8f-11eb-9615-121df0c29c1e,Qbe Holdings,New York,Ny,10/15/2009,100,us_fl,Qbe Holdings,1/17/2008,us_de,branch of an out-of-jurisdiction company,,,,,"[New York, New York]","[Ny, Ny]"
4,256004,004053ef-9a48-4d23-9fd9-93ff47b7c299,ff4c2272-cb8e-11eb-9615-121df0c29c1e,Cora Aero,Mountain View,Ca,07/26/2017,100,us_de,Cora Aero,4/26/2019,,,,,,,[],[]


In [43]:
# ### determine the number of unique assignee_ids in the data
# assigneeScores1['assignee_id'].nunique()

387

# Label Each Record with a State Match and Branch Status Code

In [50]:
### start timer
t0=time.time()

### feature construction was done by comparing the state feature from PatentsView to the state under
### the subJurisCode feature. If the states match, the record was given a code of zero; if the states
### do not match AND the subJurisCode state was not Delaware, the record was given a code of one; and
### if any record had a subJurisCode equal to Delaware, the record was given a code of two
assigneeScores7['stateMatch']=np.where(assigneeScores7['state'] == assigneeScores7['subJurisCode'],0,
                                       np.where((assigneeScores7['state'] != assigneeScores7['subJurisCode']) &
                                                (assigneeScores7['subJurisCode'] != 'De'),1,2))

assigneeScores7['bchStatus']=np.where(assigneeScores7['branch_status'] != "branch of an out-of-jurisdiction company",0,
                                      np.where((assigneeScores7['branch_status'] == "branch of an out-of-jurisdiction company") &
                                                (assigneeScores7['subCntlEntity'] != 'De'),1,2))                                    

assigneeScores8=assigneeScores7.iloc[:,[0,1,2,3,4,5,6,7,8,9,10,19,20,12,13,14,15,16,17,18]].reset_index(drop=True)

### end timer and print total time
t1=time.time()
total=t1-t0
print("Total time is %4f" % (total/60), "mins")

### print general stats and first 5 records for dataset
display(assigneeScores8.info(),assigneeScores8.head())

Total time is 0.000116 mins
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1523 entries, 0 to 1522
Data columns (total 20 columns):
 #   Column          Non-Null Count  Dtype         
---  ------          --------------  -----         
 0   ID              1523 non-null   int64         
 1   assignee_id     1523 non-null   object        
 2   location_id     1523 non-null   object        
 3   organization    1523 non-null   object        
 4   city            1523 non-null   object        
 5   state           1523 non-null   object        
 6   dateOfFirstPat  1523 non-null   object        
 7   nameScores      1523 non-null   int64         
 8   matchNames      1523 non-null   object        
 9   subJurisCode    1523 non-null   object        
 10  subCntlEntity   852 non-null    object        
 11  stateMatch      1523 non-null   int32         
 12  bchStatus       1523 non-null   int32         
 13  minIncDate      1523 non-null   datetime64[ns]
 14  address_city    761 non-null

None

Unnamed: 0,ID,assignee_id,location_id,organization,city,state,dateOfFirstPat,nameScores,matchNames,subJurisCode,subCntlEntity,stateMatch,bchStatus,minIncDate,address_city,address_state,agent_city,agent_state,data_city,data_state
0,875,f7b31db8-0a3d-425e-a79f-d84ba6333b60,de5d6510-cb90-11eb-9615-121df0c29c1e,The Egg Factory,Roanoke,Va,01/20/1999,100,The Egg Factory,Va,,0,0,1997-12-22,Roanoke,Va,Roanoke,Va,[],[]
1,1284,f3e6ae72-e2b1-4165-a0f7-9be773bf1e56,9447f283-cb8e-11eb-9615-121df0c29c1e,Tank Vision,Vineyard Haven,Ma,01/12/2017,100,Tank Vision,Ri,De,1,2,2017-01-03,Vineyard Haven,Ma,Providence,Ri,[],[]
2,1284,f3e6ae72-e2b1-4165-a0f7-9be773bf1e56,9447f283-cb8e-11eb-9615-121df0c29c1e,Tank Vision,Vineyard Haven,Ma,01/12/2017,100,Tank Vision,De,,2,0,2017-01-03,,,,,[],[]
3,1667,f0447e02-fb80-46f7-a97c-cb4b2f398aeb,4b56fc31-cb8e-11eb-9615-121df0c29c1e,Infinibox,Westwood,Ma,12/28/2012,100,Infinibox,De,,2,0,2013-01-31,,,,,[],[]
4,2655,e62b4591-f071-4597-b791-35e9075d2af1,f97aecf0-cb90-11eb-9615-121df0c29c1e,Agricultural Aviation Engineering Company,Las Vegas,Nv,10/23/1980,100,Agricultural Aviation Engineering Company,Ca,,1,0,1958-05-01,Las Vegas,Nv,,,[Las Vegas],[Nv]


In [51]:
### calculate the number for each category under the stateMatch column (e.g., 0, 1, or 2)
print("# of records with matching states:     ", assigneeScores8['stateMatch'].value_counts()[0])
print("# of records with non-matching states: ", assigneeScores8['stateMatch'].value_counts()[1])
print("# of records with matching to Delaware:", assigneeScores8['stateMatch'].value_counts()[2])
print("The total number of records:           ", assigneeScores8['stateMatch'].count())

# of records with matching states:      156
# of records with non-matching states:  1200
# of records with matching to Delaware: 167
The total number of records:            1523


In [52]:
### save the new dataset as a checkpoint
res_folder = "../csvResults/"
outpt_file = "OcResults1000ExtAltPreNamesDataCityStatesFuzzyNameScoresPreFilteredLabelStatesNew.csv"
a_full = os.path.join(res_folder,outpt_file)

# assigneeScores8.to_csv(a_full,index=False)

# Add Coordinates to Locations

In [34]:
### start timer
t0=time.time()

### set the path for the input file and save to variable
res_folder = "../sourceFiles/"
input_file = "location_suppLatLong1.tsv"
a_full=os.path.join(res_folder,input_file)
print(a_full,"\n")

latLong=pd.read_csv(a_full,sep="\t").iloc[:,[1,2,3,5,6]]

### capitalize the first letter in each string for the city and state
latLong['city']=latLong['city'].str.title()
latLong['state']=latLong['state'].str.title()

### end timer and print total time
t1=time.time()
total=t1-t0
print("Total time is %4f" % (total/60), "mins")

### print general stats and first 5 records for dataset
display(latLong.info(null_counts=True),latLong.head())

../sourceFiles/location_suppLatLong1.tsv 

Total time is 0.000900 mins
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32945 entries, 0 to 32944
Data columns (total 5 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   location_id  32945 non-null  object 
 1   city         32945 non-null  object 
 2   state        32945 non-null  object 
 3   latitude     32945 non-null  float64
 4   longitude    32945 non-null  float64
dtypes: float64(2), object(3)
memory usage: 1.3+ MB


None

Unnamed: 0,location_id,city,state,latitude,longitude
0,00006da3-cb90-11eb-9615-121df0c29c1e,Alder,Mt,45.3247,-112.108
1,00047c6a-cb91-11eb-9615-121df0c29c1e,Knowles,Ok,36.8734,-100.193
2,0005c1ab-cb8f-11eb-9615-121df0c29c1e,Court Florency,Ky,38.0393,-84.4862
3,00171e6a-cb90-11eb-9615-121df0c29c1e,Mount Herman,Nj,39.6176,-74.5943
4,001ee951-cb91-11eb-9615-121df0c29c1e,Watauga County,Nc,36.2514,-81.7044


In [35]:
### start timer
t0=time.time()

### perform an outer merge to attach coordinates with the assigneeScores7 dataframe using the
### location_id, city, and state features as merging columns
cityLatLong=assigneeScoresLoc2.merge(latLong,on=['location_id','city','state'],how='outer',indicator=True)

### end timer and print total time
t1=time.time()
total=t1-t0
print("Total time is %4f" % (total/60), "mins")

### print general stats and first 5 records for dataset
display(cityLatLong.info(),cityLatLong.head())

Total time is 0.001084 mins
<class 'pandas.core.frame.DataFrame'>
Int64Index: 34133 entries, 0 to 34132
Data columns (total 24 columns):
 #   Column          Non-Null Count  Dtype         
---  ------          --------------  -----         
 0   ID              1558 non-null   float64       
 1   assignee_id     1558 non-null   object        
 2   location_id     34133 non-null  object        
 3   organization    1558 non-null   object        
 4   city            34107 non-null  object        
 5   state           34109 non-null  object        
 6   dateOfFirstPat  1558 non-null   object        
 7   nameScores      1558 non-null   float64       
 8   matchNames      1558 non-null   object        
 9   subJurisCode    1558 non-null   object        
 10  subCntlEntity   870 non-null    object        
 11  stateMatch      1558 non-null   float64       
 12  bchStatus       1558 non-null   float64       
 13  minIncDate      1558 non-null   datetime64[ns]
 14  address_city    780 non-nu

None

Unnamed: 0,ID,assignee_id,location_id,organization,city,state,dateOfFirstPat,nameScores,matchNames,subJurisCode,...,address_city,address_state,agent_city,agent_state,data_city,data_state,orgLoc,latitude,longitude,_merge
0,875.0,f7b31db8-0a3d-425e-a79f-d84ba6333b60,de5d6510-cb90-11eb-9615-121df0c29c1e,The Egg Factory,Roanoke,Va,1/30/2001,100.0,The Egg Factory,Va,...,Roanoke,Va,Roanoke,Va,,,Va,37.2738,-79.9602,both
1,1284.0,f3e6ae72-e2b1-4165-a0f7-9be773bf1e56,9447f283-cb8e-11eb-9615-121df0c29c1e,Tank Vision,Vineyard Haven,Ma,10/16/2018,100.0,Tank Vision,Ri,...,Vineyard Haven,Ma,Providence,Ri,,,"Ri,De",41.4543,-70.6038,both
2,1284.0,f3e6ae72-e2b1-4165-a0f7-9be773bf1e56,9447f283-cb8e-11eb-9615-121df0c29c1e,Tank Vision,Vineyard Haven,Ma,10/16/2018,100.0,Tank Vision,De,...,,,,,,,"Ri,De",41.4543,-70.6038,both
3,1667.0,f0447e02-fb80-46f7-a97c-cb4b2f398aeb,4b56fc31-cb8e-11eb-9615-121df0c29c1e,Infinibox,Westwood,Ma,6/27/2017,100.0,Infinibox,De,...,,,,,,,De,42.2187,-71.2026,both
4,2655.0,e62b4591-f071-4597-b791-35e9075d2af1,f97aecf0-cb90-11eb-9615-121df0c29c1e,Agricultural Aviation Engineering Company,Las Vegas,Nv,3/15/1983,100.0,Agricultural Aviation Engineering Company,Ca,...,Las Vegas,Nv,,,Las Vegas,Nv,Ca,36.1716,-115.1391,both


In [36]:
### start timer
t0=time.time()

### there was not a 1 to 1 match when setting the how parameter to inner in the merge function in the
### above cell. Therefore, the outer option was used for this parameter and the _merge column was
### used to select the records designated as both or left_only. These selections were concatenated
### to generate the cityLatLong dataset, but with the coordinates for the patentsview city feature
### added to the data
cityLatLong1=pd.concat([cityLatLong.loc[cityLatLong['_merge']=='both'],cityLatLong.loc[cityLatLong['_merge']=='left_only']],
                       axis=0)

### columns were selected for further processing and the lat/long names were renamed
cityLatLong2=cityLatLong1.iloc[:,[0,1,2,3,4,5,21,22,6,7,8,9,10,
                                  11,12,13,14,15,16,17,18,19,20]].sort_values(by=['ID']).reset_index(drop=True)
cityLatLong2['ID']=cityLatLong2['ID'].astype('int')
cityLatLong2.rename(columns={'latitude':'city_latitude','longitude':'city_longitude'},inplace=True)

### end timer and print total time
t1=time.time()
total=t1-t0
print("Total time is %4f" % (total/60), "mins")

### print general stats and first 5 records for dataset
display(cityLatLong2.info(),cityLatLong2.head())

Total time is 0.000150 mins
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1558 entries, 0 to 1557
Data columns (total 23 columns):
 #   Column          Non-Null Count  Dtype         
---  ------          --------------  -----         
 0   ID              1558 non-null   int32         
 1   assignee_id     1558 non-null   object        
 2   location_id     1558 non-null   object        
 3   organization    1558 non-null   object        
 4   city            1532 non-null   object        
 5   state           1534 non-null   object        
 6   city_latitude   1486 non-null   float64       
 7   city_longitude  1486 non-null   float64       
 8   dateOfFirstPat  1558 non-null   object        
 9   nameScores      1558 non-null   float64       
 10  matchNames      1558 non-null   object        
 11  subJurisCode    1558 non-null   object        
 12  subCntlEntity   870 non-null    object        
 13  stateMatch      1558 non-null   float64       
 14  bchStatus       1558 non-nul

None

Unnamed: 0,ID,assignee_id,location_id,organization,city,state,city_latitude,city_longitude,dateOfFirstPat,nameScores,...,stateMatch,bchStatus,minIncDate,address_city,address_state,agent_city,agent_state,data_city,data_state,orgLoc
0,875,f7b31db8-0a3d-425e-a79f-d84ba6333b60,de5d6510-cb90-11eb-9615-121df0c29c1e,The Egg Factory,Roanoke,Va,37.2738,-79.9602,1/30/2001,100.0,...,0.0,0.0,1997-12-22,Roanoke,Va,Roanoke,Va,,,Va
1,1284,f3e6ae72-e2b1-4165-a0f7-9be773bf1e56,9447f283-cb8e-11eb-9615-121df0c29c1e,Tank Vision,Vineyard Haven,Ma,41.4543,-70.6038,10/16/2018,100.0,...,1.0,2.0,2017-01-03,Vineyard Haven,Ma,Providence,Ri,,,"Ri,De"
2,1284,f3e6ae72-e2b1-4165-a0f7-9be773bf1e56,9447f283-cb8e-11eb-9615-121df0c29c1e,Tank Vision,Vineyard Haven,Ma,41.4543,-70.6038,10/16/2018,100.0,...,2.0,0.0,2017-01-03,,,,,,,"Ri,De"
3,1667,f0447e02-fb80-46f7-a97c-cb4b2f398aeb,4b56fc31-cb8e-11eb-9615-121df0c29c1e,Infinibox,Westwood,Ma,42.2187,-71.2026,6/27/2017,100.0,...,2.0,0.0,2013-01-31,,,,,,,De
4,2655,e62b4591-f071-4597-b791-35e9075d2af1,f97aecf0-cb90-11eb-9615-121df0c29c1e,Agricultural Aviation Engineering Company,Las Vegas,Nv,36.1716,-115.1391,3/15/1983,100.0,...,1.0,0.0,1958-05-01,Las Vegas,Nv,,,Las Vegas,Nv,Ca


In [37]:
### start timer
t0=time.time()

### perform an outer merge to attach coordinates with the cityLatLong2 dataframe using the
### city and state features as merging columns. This will add coordinates that correspond
### to the address_city/state features
addLatLong=cityLatLong2.merge(latLong,left_on=['address_city','address_state'],right_on=['city','state'],
                              how='outer',indicator=True)

### dropped features and renamed column names
addLatLong.drop(columns=['city_y','state_y','location_id_y'],inplace=True)
addLatLong.rename(columns={'city_x':'city','state_x':'state','location_id_x':'location_id'},inplace=True)

### end timer and print total time
t1=time.time()
total=t1-t0
print("Total time is %4f" % (total/60), "mins")

### print general stats and first 5 records for dataset
display(addLatLong.info(),addLatLong.head())

Total time is 0.001182 mins
<class 'pandas.core.frame.DataFrame'>
Int64Index: 34262 entries, 0 to 34261
Data columns (total 26 columns):
 #   Column          Non-Null Count  Dtype         
---  ------          --------------  -----         
 0   ID              1573 non-null   float64       
 1   assignee_id     1573 non-null   object        
 2   location_id     1573 non-null   object        
 3   organization    1573 non-null   object        
 4   city            1547 non-null   object        
 5   state           1549 non-null   object        
 6   city_latitude   1501 non-null   float64       
 7   city_longitude  1501 non-null   float64       
 8   dateOfFirstPat  1573 non-null   object        
 9   nameScores      1573 non-null   float64       
 10  matchNames      1573 non-null   object        
 11  subJurisCode    1573 non-null   object        
 12  subCntlEntity   880 non-null    object        
 13  stateMatch      1573 non-null   float64       
 14  bchStatus       1573 non-n

None

Unnamed: 0,ID,assignee_id,location_id,organization,city,state,city_latitude,city_longitude,dateOfFirstPat,nameScores,...,address_city,address_state,agent_city,agent_state,data_city,data_state,orgLoc,latitude,longitude,_merge
0,875.0,f7b31db8-0a3d-425e-a79f-d84ba6333b60,de5d6510-cb90-11eb-9615-121df0c29c1e,The Egg Factory,Roanoke,Va,37.2738,-79.9602,1/30/2001,100.0,...,Roanoke,Va,Roanoke,Va,,,Va,37.2738,-79.9602,both
1,1284.0,f3e6ae72-e2b1-4165-a0f7-9be773bf1e56,9447f283-cb8e-11eb-9615-121df0c29c1e,Tank Vision,Vineyard Haven,Ma,41.4543,-70.6038,10/16/2018,100.0,...,Vineyard Haven,Ma,Providence,Ri,,,"Ri,De",41.4543,-70.6038,both
2,1284.0,f3e6ae72-e2b1-4165-a0f7-9be773bf1e56,9447f283-cb8e-11eb-9615-121df0c29c1e,Tank Vision,Vineyard Haven,Ma,41.4543,-70.6038,10/16/2018,100.0,...,,,,,,,"Ri,De",,,left_only
3,1667.0,f0447e02-fb80-46f7-a97c-cb4b2f398aeb,4b56fc31-cb8e-11eb-9615-121df0c29c1e,Infinibox,Westwood,Ma,42.2187,-71.2026,6/27/2017,100.0,...,,,,,,,De,,,left_only
4,2729.0,e5614631-da49-4351-a6c6-8f81358b767d,fb7257e4-cb8f-11eb-9615-121df0c29c1e,Fanamana,Washington,Dc,38.8937,-76.9879,7/26/2016,100.0,...,,,,,,,"De,Tn",,,left_only


In [38]:
### start timer
t0=time.time()

### there was not a 1 to 1 match when setting the how parameter to inner in the merge function in the
### above cell. Therefore, the outer option was used for this parameter and the _merge column was
### used to select the records designated as both or left_only. These selections were concatenated
### to generate the cityLatLong dataset, but with the coordinates for the patentsview city feature
### added to the data
addLatLong1=pd.concat([addLatLong.loc[addLatLong['_merge']=='both'],addLatLong.loc[addLatLong['_merge']=='left_only']],
                       axis=0).sort_values(by=['ID']).reset_index(drop=True)

addLatLong2=addLatLong1.iloc[:,[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,23,24,18,19,20,21,22]]
addLatLong2.rename(columns={'latitude':'address_latitude','longitude':'address_longitude'},inplace=True)

addLatLong2['ID']=addLatLong2['ID'].astype(int)

### end timer and print total time
t1=time.time()
total=t1-t0
print("Total time is %4f" % (total/60), "mins")

### print general stats and first 5 records for dataset
display(addLatLong2.info(),addLatLong2.head())

Total time is 0.000183 mins
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1573 entries, 0 to 1572
Data columns (total 25 columns):
 #   Column             Non-Null Count  Dtype         
---  ------             --------------  -----         
 0   ID                 1573 non-null   int32         
 1   assignee_id        1573 non-null   object        
 2   location_id        1573 non-null   object        
 3   organization       1573 non-null   object        
 4   city               1547 non-null   object        
 5   state              1549 non-null   object        
 6   city_latitude      1501 non-null   float64       
 7   city_longitude     1501 non-null   float64       
 8   dateOfFirstPat     1573 non-null   object        
 9   nameScores         1573 non-null   float64       
 10  matchNames         1573 non-null   object        
 11  subJurisCode       1573 non-null   object        
 12  subCntlEntity      880 non-null    object        
 13  stateMatch         1573 non-null   

None

Unnamed: 0,ID,assignee_id,location_id,organization,city,state,city_latitude,city_longitude,dateOfFirstPat,nameScores,...,minIncDate,address_city,address_state,address_latitude,address_longitude,agent_city,agent_state,data_city,data_state,orgLoc
0,875,f7b31db8-0a3d-425e-a79f-d84ba6333b60,de5d6510-cb90-11eb-9615-121df0c29c1e,The Egg Factory,Roanoke,Va,37.2738,-79.9602,1/30/2001,100.0,...,1997-12-22,Roanoke,Va,37.2738,-79.9602,Roanoke,Va,,,Va
1,1284,f3e6ae72-e2b1-4165-a0f7-9be773bf1e56,9447f283-cb8e-11eb-9615-121df0c29c1e,Tank Vision,Vineyard Haven,Ma,41.4543,-70.6038,10/16/2018,100.0,...,2017-01-03,Vineyard Haven,Ma,41.4543,-70.6038,Providence,Ri,,,"Ri,De"
2,1284,f3e6ae72-e2b1-4165-a0f7-9be773bf1e56,9447f283-cb8e-11eb-9615-121df0c29c1e,Tank Vision,Vineyard Haven,Ma,41.4543,-70.6038,10/16/2018,100.0,...,2017-01-03,,,,,,,,,"Ri,De"
3,1667,f0447e02-fb80-46f7-a97c-cb4b2f398aeb,4b56fc31-cb8e-11eb-9615-121df0c29c1e,Infinibox,Westwood,Ma,42.2187,-71.2026,6/27/2017,100.0,...,2013-01-31,,,,,,,,,De
4,2655,e62b4591-f071-4597-b791-35e9075d2af1,f97aecf0-cb90-11eb-9615-121df0c29c1e,Agricultural Aviation Engineering Company,Las Vegas,Nv,36.1716,-115.1391,3/15/1983,100.0,...,1958-05-01,Las Vegas,Nv,36.1716,-115.1391,,,Las Vegas,Nv,Ca


In [39]:
### start timer
t0=time.time()

### perform an outer merge to attach coordinates with the addLatLong2 dataframe using the
### city and state features as merging columns. This will add coordinates that correspond
### to the agent_city/state features
agtLatLong=addLatLong2.merge(latLong,left_on=['agent_city','agent_state'],right_on=['city','state'],
                              how='outer',indicator=True)

### dropped features and renamed column names
agtLatLong.drop(columns=['city_y','state_y','location_id_y'],inplace=True)
agtLatLong.rename(columns={'city_x':'city','state_x':'state','location_id_x':'location_id'},inplace=True)

### end timer and print total time
t1=time.time()
total=t1-t0
print("Total time is %4f" % (total/60), "mins")

### print general stats and first 5 records for dataset
display(agtLatLong.info(),agtLatLong.head())

Total time is 0.001022 mins
<class 'pandas.core.frame.DataFrame'>
Int64Index: 34444 entries, 0 to 34443
Data columns (total 28 columns):
 #   Column             Non-Null Count  Dtype         
---  ------             --------------  -----         
 0   ID                 1576 non-null   float64       
 1   assignee_id        1576 non-null   object        
 2   location_id        1576 non-null   object        
 3   organization       1576 non-null   object        
 4   city               1550 non-null   object        
 5   state              1552 non-null   object        
 6   city_latitude      1504 non-null   float64       
 7   city_longitude     1504 non-null   float64       
 8   dateOfFirstPat     1576 non-null   object        
 9   nameScores         1576 non-null   float64       
 10  matchNames         1576 non-null   object        
 11  subJurisCode       1576 non-null   object        
 12  subCntlEntity      880 non-null    object        
 13  stateMatch         1576 non-null 

None

Unnamed: 0,ID,assignee_id,location_id,organization,city,state,city_latitude,city_longitude,dateOfFirstPat,nameScores,...,address_latitude,address_longitude,agent_city,agent_state,data_city,data_state,orgLoc,latitude,longitude,_merge
0,875.0,f7b31db8-0a3d-425e-a79f-d84ba6333b60,de5d6510-cb90-11eb-9615-121df0c29c1e,The Egg Factory,Roanoke,Va,37.2738,-79.9602,1/30/2001,100.0,...,37.2738,-79.9602,Roanoke,Va,,,Va,37.2738,-79.9602,both
1,1284.0,f3e6ae72-e2b1-4165-a0f7-9be773bf1e56,9447f283-cb8e-11eb-9615-121df0c29c1e,Tank Vision,Vineyard Haven,Ma,41.4543,-70.6038,10/16/2018,100.0,...,41.4543,-70.6038,Providence,Ri,,,"Ri,De",41.8171,-71.4282,both
2,1284.0,f3e6ae72-e2b1-4165-a0f7-9be773bf1e56,9447f283-cb8e-11eb-9615-121df0c29c1e,Tank Vision,Vineyard Haven,Ma,41.4543,-70.6038,10/16/2018,100.0,...,,,,,,,"Ri,De",,,left_only
3,1667.0,f0447e02-fb80-46f7-a97c-cb4b2f398aeb,4b56fc31-cb8e-11eb-9615-121df0c29c1e,Infinibox,Westwood,Ma,42.2187,-71.2026,6/27/2017,100.0,...,,,,,,,De,,,left_only
4,2655.0,e62b4591-f071-4597-b791-35e9075d2af1,f97aecf0-cb90-11eb-9615-121df0c29c1e,Agricultural Aviation Engineering Company,Las Vegas,Nv,36.1716,-115.1391,3/15/1983,100.0,...,36.1716,-115.1391,,,Las Vegas,Nv,Ca,,,left_only


In [40]:
### start timer
t0=time.time()

### there was not a 1 to 1 match when setting the how parameter to inner in the merge function in the
### above cell. Therefore, the outer option was used for this parameter and the _merge column was
### used to select the records designated as both or left_only. These selections were concatenated
### to generate the cityLatLong dataset, but with the coordinates for the patentsview city feature
### added to the data
agtLatLong1=pd.concat([agtLatLong.loc[agtLatLong['_merge']=='both'],agtLatLong.loc[agtLatLong['_merge']=='left_only']],
                       axis=0).sort_values(by=['ID']).reset_index(drop=True)

### dropped features and renamed column names
agtLatLong2=agtLatLong1.iloc[:,[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,25,26,22,23,24]]
agtLatLong2.rename(columns={'latitude':'agent_latitude','longitude':'agent_longitude'},inplace=True)

agtLatLong2['ID']=agtLatLong2['ID'].astype(int)

### end timer and print total time
t1=time.time()
total=t1-t0
print("Total time is %4f" % (total/60), "mins")

### print general stats and first 5 records for dataset
display(agtLatLong2.info(),agtLatLong2.head())

Total time is 0.000151 mins
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1576 entries, 0 to 1575
Data columns (total 27 columns):
 #   Column             Non-Null Count  Dtype         
---  ------             --------------  -----         
 0   ID                 1576 non-null   int32         
 1   assignee_id        1576 non-null   object        
 2   location_id        1576 non-null   object        
 3   organization       1576 non-null   object        
 4   city               1550 non-null   object        
 5   state              1552 non-null   object        
 6   city_latitude      1504 non-null   float64       
 7   city_longitude     1504 non-null   float64       
 8   dateOfFirstPat     1576 non-null   object        
 9   nameScores         1576 non-null   float64       
 10  matchNames         1576 non-null   object        
 11  subJurisCode       1576 non-null   object        
 12  subCntlEntity      880 non-null    object        
 13  stateMatch         1576 non-null   

None

Unnamed: 0,ID,assignee_id,location_id,organization,city,state,city_latitude,city_longitude,dateOfFirstPat,nameScores,...,address_state,address_latitude,address_longitude,agent_city,agent_state,agent_latitude,agent_longitude,data_city,data_state,orgLoc
0,875,f7b31db8-0a3d-425e-a79f-d84ba6333b60,de5d6510-cb90-11eb-9615-121df0c29c1e,The Egg Factory,Roanoke,Va,37.2738,-79.9602,1/30/2001,100.0,...,Va,37.2738,-79.9602,Roanoke,Va,37.2738,-79.9602,,,Va
1,1284,f3e6ae72-e2b1-4165-a0f7-9be773bf1e56,9447f283-cb8e-11eb-9615-121df0c29c1e,Tank Vision,Vineyard Haven,Ma,41.4543,-70.6038,10/16/2018,100.0,...,,,,,,,,,,"Ri,De"
2,1284,f3e6ae72-e2b1-4165-a0f7-9be773bf1e56,9447f283-cb8e-11eb-9615-121df0c29c1e,Tank Vision,Vineyard Haven,Ma,41.4543,-70.6038,10/16/2018,100.0,...,Ma,41.4543,-70.6038,Providence,Ri,41.8171,-71.4282,,,"Ri,De"
3,1667,f0447e02-fb80-46f7-a97c-cb4b2f398aeb,4b56fc31-cb8e-11eb-9615-121df0c29c1e,Infinibox,Westwood,Ma,42.2187,-71.2026,6/27/2017,100.0,...,,,,,,,,,,De
4,2655,e62b4591-f071-4597-b791-35e9075d2af1,f97aecf0-cb90-11eb-9615-121df0c29c1e,Agricultural Aviation Engineering Company,Las Vegas,Nv,36.1716,-115.1391,3/15/1983,100.0,...,Nv,36.1716,-115.1391,,,,,Las Vegas,Nv,Ca


In [41]:
### start timer
t0=time.time()

### perform an outer merge to attach coordinates with the agtLatLong2 dataframe using the
### city and state features as merging columns. This will add coordinates that correspond
### to the data_city/state features
dataLatLong=agtLatLong2.merge(latLong,left_on=['data_city','data_state'],right_on=['city','state'],
                               how='outer',indicator=True)

### dropped features and renamed column names
dataLatLong.drop(columns=['city_y','state_y','location_id_y'],inplace=True)
dataLatLong.rename(columns={'city_x':'city','state_x':'state','location_id_x':'location_id'},inplace=True)

### end timer and print total time
t1=time.time()
total=t1-t0
print("Total time is %4f" % (total/60), "mins")

### print general stats and first 5 records for dataset
display(dataLatLong.info(),dataLatLong.head())

Total time is 0.001419 mins
<class 'pandas.core.frame.DataFrame'>
Int64Index: 34431 entries, 0 to 34430
Data columns (total 30 columns):
 #   Column             Non-Null Count  Dtype         
---  ------             --------------  -----         
 0   ID                 1578 non-null   float64       
 1   assignee_id        1578 non-null   object        
 2   location_id        1578 non-null   object        
 3   organization       1578 non-null   object        
 4   city               1552 non-null   object        
 5   state              1554 non-null   object        
 6   city_latitude      1506 non-null   float64       
 7   city_longitude     1506 non-null   float64       
 8   dateOfFirstPat     1578 non-null   object        
 9   nameScores         1578 non-null   float64       
 10  matchNames         1578 non-null   object        
 11  subJurisCode       1578 non-null   object        
 12  subCntlEntity      880 non-null    object        
 13  stateMatch         1578 non-null 

None

Unnamed: 0,ID,assignee_id,location_id,organization,city,state,city_latitude,city_longitude,dateOfFirstPat,nameScores,...,agent_city,agent_state,agent_latitude,agent_longitude,data_city,data_state,orgLoc,latitude,longitude,_merge
0,875.0,f7b31db8-0a3d-425e-a79f-d84ba6333b60,de5d6510-cb90-11eb-9615-121df0c29c1e,The Egg Factory,Roanoke,Va,37.2738,-79.9602,1/30/2001,100.0,...,Roanoke,Va,37.2738,-79.9602,,,Va,,,left_only
1,1284.0,f3e6ae72-e2b1-4165-a0f7-9be773bf1e56,9447f283-cb8e-11eb-9615-121df0c29c1e,Tank Vision,Vineyard Haven,Ma,41.4543,-70.6038,10/16/2018,100.0,...,,,,,,,"Ri,De",,,left_only
2,1284.0,f3e6ae72-e2b1-4165-a0f7-9be773bf1e56,9447f283-cb8e-11eb-9615-121df0c29c1e,Tank Vision,Vineyard Haven,Ma,41.4543,-70.6038,10/16/2018,100.0,...,Providence,Ri,41.8171,-71.4282,,,"Ri,De",,,left_only
3,1667.0,f0447e02-fb80-46f7-a97c-cb4b2f398aeb,4b56fc31-cb8e-11eb-9615-121df0c29c1e,Infinibox,Westwood,Ma,42.2187,-71.2026,6/27/2017,100.0,...,,,,,,,De,,,left_only
4,2729.0,e5614631-da49-4351-a6c6-8f81358b767d,fb7257e4-cb8f-11eb-9615-121df0c29c1e,Fanamana,Washington,Dc,38.8937,-76.9879,7/26/2016,100.0,...,,,,,,,"De,Tn",,,left_only


In [42]:
### start timer
t0=time.time()

### there was not a 1 to 1 match when setting the how parameter to inner in the merge function in the
### above cell. Therefore, the outer option was used for this parameter and the _merge column was
### used to select the records designated as both or left_only. These selections were concatenated
### to generate the cityLatLong dataset, but with the coordinates for the patentsview city feature
### added to the data
dataLatLong1=pd.concat([dataLatLong.loc[dataLatLong['_merge']=='both'],dataLatLong.loc[dataLatLong['_merge']=='left_only']],
                        axis=0).sort_values(by=['ID']).reset_index(drop=True)

### dropped features and renamed column names
dataLatLong2=dataLatLong1.iloc[:,[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,27,28,26]]
dataLatLong2.rename(columns={'latitude':'data_latitude','longitude':'data_longitude'},inplace=True)

dataLatLong2['ID']=dataLatLong2['ID'].astype(int)

### end timer and print total time
t1=time.time()
total=t1-t0
print("Total time is %4f" % (total/60), "mins")

### print general stats and first 5 records for dataset
display(dataLatLong2.info(),dataLatLong2.head())

Total time is 0.000233 mins
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1578 entries, 0 to 1577
Data columns (total 29 columns):
 #   Column             Non-Null Count  Dtype         
---  ------             --------------  -----         
 0   ID                 1578 non-null   int32         
 1   assignee_id        1578 non-null   object        
 2   location_id        1578 non-null   object        
 3   organization       1578 non-null   object        
 4   city               1552 non-null   object        
 5   state              1554 non-null   object        
 6   city_latitude      1506 non-null   float64       
 7   city_longitude     1506 non-null   float64       
 8   dateOfFirstPat     1578 non-null   object        
 9   nameScores         1578 non-null   float64       
 10  matchNames         1578 non-null   object        
 11  subJurisCode       1578 non-null   object        
 12  subCntlEntity      880 non-null    object        
 13  stateMatch         1578 non-null   

None

Unnamed: 0,ID,assignee_id,location_id,organization,city,state,city_latitude,city_longitude,dateOfFirstPat,nameScores,...,address_longitude,agent_city,agent_state,agent_latitude,agent_longitude,data_city,data_state,data_latitude,data_longitude,orgLoc
0,875,f7b31db8-0a3d-425e-a79f-d84ba6333b60,de5d6510-cb90-11eb-9615-121df0c29c1e,The Egg Factory,Roanoke,Va,37.2738,-79.9602,1/30/2001,100.0,...,-79.9602,Roanoke,Va,37.2738,-79.9602,,,,,Va
1,1284,f3e6ae72-e2b1-4165-a0f7-9be773bf1e56,9447f283-cb8e-11eb-9615-121df0c29c1e,Tank Vision,Vineyard Haven,Ma,41.4543,-70.6038,10/16/2018,100.0,...,-70.6038,Providence,Ri,41.8171,-71.4282,,,,,"Ri,De"
2,1284,f3e6ae72-e2b1-4165-a0f7-9be773bf1e56,9447f283-cb8e-11eb-9615-121df0c29c1e,Tank Vision,Vineyard Haven,Ma,41.4543,-70.6038,10/16/2018,100.0,...,,,,,,,,,,"Ri,De"
3,1667,f0447e02-fb80-46f7-a97c-cb4b2f398aeb,4b56fc31-cb8e-11eb-9615-121df0c29c1e,Infinibox,Westwood,Ma,42.2187,-71.2026,6/27/2017,100.0,...,,,,,,,,,,De
4,2655,e62b4591-f071-4597-b791-35e9075d2af1,f97aecf0-cb90-11eb-9615-121df0c29c1e,Agricultural Aviation Engineering Company,Las Vegas,Nv,36.1716,-115.1391,3/15/1983,100.0,...,-115.1391,,,,,Las Vegas,Nv,36.1716,-115.1391,Ca


In [43]:
### save the new dataset as a checkpoint
res_folder = "../csvResults/"
outpt_file = "OcResults1000ExtAltPreNamesDataCityStatesFuzzyNameScoresPreFilteredLabelStatesMatchCityStatesAddCoor.csv"
a_full = os.path.join(res_folder,outpt_file)

# dataLatLong2.to_csv(a_full,index=False)

# Calculate Distances Between PatentsView Cities and OpenCorporate Cities

In [44]:
### import the required library to calculate distances
from geopy import distance

### start timer
t0=time.time()

### the distance between cities are calculated by utilizing the city_lat/city_long and the add_lat/add_long,
### agt_lat/agt_long, and data_lat/data_long features.
cityAddrCor=[]
cityAgtCor=[]
cityDataCor=[]

### each for loop below calculates the distance between the city found in PatentsView and the
### cities from the address_city, agent_city, and data_city. The try/except is included to deal
### with exception errors
for p in range(len(dataLatLong2)):
    
    try:
        cityCor = (dataLatLong2.iloc[p,6], dataLatLong2.iloc[p,7])
        cityAddressCor = (dataLatLong2.iloc[p,18], dataLatLong2.iloc[p,19])

        cityAddrCor.append(distance.distance(cityCor, cityAddressCor).miles)
    
    except:
        cityAddrCor.append(np.nan)
        
for p in range(len(dataLatLong2)):
    
    try:
        cityCor = (dataLatLong2.iloc[p,6], dataLatLong2.iloc[p,7])
        cityAgentCor = (dataLatLong2.iloc[p,22], dataLatLong2.iloc[p,23])

        cityAgtCor.append(distance.distance(cityCor, cityAgentCor).miles)
    
    except:
        cityAgtCor.append(np.nan)

for p in range(len(dataLatLong2)):
    
    try:
        cityCor = (dataLatLong2.iloc[p,6], dataLatLong2.iloc[p,7])
        cityDatCor = (dataLatLong2.iloc[p,26], dataLatLong2.iloc[p,27])

        cityDataCor.append(distance.distance(cityCor, cityDatCor).miles)
    
    except:
        cityDataCor.append(np.nan)
        
### add the distance to the input dataframe and round the value
cityAddrCor1=[round(num, 1) for num in cityAddrCor]
cityAgtCor1=[round(num1, 1) for num1 in cityAgtCor]
cityDataCor1=[round(num1, 1) for num1 in cityDataCor]

dataLatLong2['cityToAddrDistance'] = cityAddrCor1
dataLatLong2['cityToAgtDistance'] = cityAgtCor1
dataLatLong2['cityToDataDistance'] = cityDataCor1

### end timer and print total time
t1=time.time()
total=t1-t0
print("Total time is %4f" % (total/60), "mins")

### print general stats and first 5 records for dataset
display(dataLatLong2.info(),dataLatLong2.head())

Total time is 0.007813 mins
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1578 entries, 0 to 1577
Data columns (total 32 columns):
 #   Column              Non-Null Count  Dtype         
---  ------              --------------  -----         
 0   ID                  1578 non-null   int32         
 1   assignee_id         1578 non-null   object        
 2   location_id         1578 non-null   object        
 3   organization        1578 non-null   object        
 4   city                1552 non-null   object        
 5   state               1554 non-null   object        
 6   city_latitude       1506 non-null   float64       
 7   city_longitude      1506 non-null   float64       
 8   dateOfFirstPat      1578 non-null   object        
 9   nameScores          1578 non-null   float64       
 10  matchNames          1578 non-null   object        
 11  subJurisCode        1578 non-null   object        
 12  subCntlEntity       880 non-null    object        
 13  stateMatch          

None

Unnamed: 0,ID,assignee_id,location_id,organization,city,state,city_latitude,city_longitude,dateOfFirstPat,nameScores,...,agent_latitude,agent_longitude,data_city,data_state,data_latitude,data_longitude,orgLoc,cityToAddrDistance,cityToAgtDistance,cityToDataDistance
0,875,f7b31db8-0a3d-425e-a79f-d84ba6333b60,de5d6510-cb90-11eb-9615-121df0c29c1e,The Egg Factory,Roanoke,Va,37.2738,-79.9602,1/30/2001,100.0,...,37.2738,-79.9602,,,,,Va,0.0,0.0,
1,1284,f3e6ae72-e2b1-4165-a0f7-9be773bf1e56,9447f283-cb8e-11eb-9615-121df0c29c1e,Tank Vision,Vineyard Haven,Ma,41.4543,-70.6038,10/16/2018,100.0,...,41.8171,-71.4282,,,,,"Ri,De",0.0,49.5,
2,1284,f3e6ae72-e2b1-4165-a0f7-9be773bf1e56,9447f283-cb8e-11eb-9615-121df0c29c1e,Tank Vision,Vineyard Haven,Ma,41.4543,-70.6038,10/16/2018,100.0,...,,,,,,,"Ri,De",,,
3,1667,f0447e02-fb80-46f7-a97c-cb4b2f398aeb,4b56fc31-cb8e-11eb-9615-121df0c29c1e,Infinibox,Westwood,Ma,42.2187,-71.2026,6/27/2017,100.0,...,,,,,,,De,,,
4,2655,e62b4591-f071-4597-b791-35e9075d2af1,f97aecf0-cb90-11eb-9615-121df0c29c1e,Agricultural Aviation Engineering Company,Las Vegas,Nv,36.1716,-115.1391,3/15/1983,100.0,...,,,Las Vegas,Nv,36.1716,-115.1391,Ca,0.0,,0.0


# Calculate Difference Between Incorporation and First Patent Granted Dates

In [45]:
### import the required library to calculate time differences
from datetime import timedelta

### start timer
t0=time.time()

### the time between the first patent granted and incorporation date is calculated and appended to the
### final dataset before scoring

dataLatLong2['dateOfFirstPat'] = pd.to_datetime(dataLatLong2['dateOfFirstPat'])
dataLatLong2['minIncDate'] = pd.to_datetime(dataLatLong2['minIncDate'],errors='coerce')

dataLatLong2['dateDiff']=abs(dataLatLong2['minIncDate'] - dataLatLong2['dateOfFirstPat'])/ timedelta(days=365)
dataLatLong2['dateDiff']=[round(num, 2) for num in dataLatLong2['dateDiff']]

### end timer and print total time
t1=time.time()
total=t1-t0
print("Total time is %4f" % (total/60), "mins")

### print general stats and first 5 records for dataset
display(dataLatLong2.info(),dataLatLong2.head())

Total time is 0.000400 mins
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1578 entries, 0 to 1577
Data columns (total 33 columns):
 #   Column              Non-Null Count  Dtype         
---  ------              --------------  -----         
 0   ID                  1578 non-null   int32         
 1   assignee_id         1578 non-null   object        
 2   location_id         1578 non-null   object        
 3   organization        1578 non-null   object        
 4   city                1552 non-null   object        
 5   state               1554 non-null   object        
 6   city_latitude       1506 non-null   float64       
 7   city_longitude      1506 non-null   float64       
 8   dateOfFirstPat      1578 non-null   datetime64[ns]
 9   nameScores          1578 non-null   float64       
 10  matchNames          1578 non-null   object        
 11  subJurisCode        1578 non-null   object        
 12  subCntlEntity       880 non-null    object        
 13  stateMatch          

None

Unnamed: 0,ID,assignee_id,location_id,organization,city,state,city_latitude,city_longitude,dateOfFirstPat,nameScores,...,agent_longitude,data_city,data_state,data_latitude,data_longitude,orgLoc,cityToAddrDistance,cityToAgtDistance,cityToDataDistance,dateDiff
0,875,f7b31db8-0a3d-425e-a79f-d84ba6333b60,de5d6510-cb90-11eb-9615-121df0c29c1e,The Egg Factory,Roanoke,Va,37.2738,-79.9602,2001-01-30,100.0,...,-79.9602,,,,,Va,0.0,0.0,,3.11
1,1284,f3e6ae72-e2b1-4165-a0f7-9be773bf1e56,9447f283-cb8e-11eb-9615-121df0c29c1e,Tank Vision,Vineyard Haven,Ma,41.4543,-70.6038,2018-10-16,100.0,...,-71.4282,,,,,"Ri,De",0.0,49.5,,1.78
2,1284,f3e6ae72-e2b1-4165-a0f7-9be773bf1e56,9447f283-cb8e-11eb-9615-121df0c29c1e,Tank Vision,Vineyard Haven,Ma,41.4543,-70.6038,2018-10-16,100.0,...,,,,,,"Ri,De",,,,1.78
3,1667,f0447e02-fb80-46f7-a97c-cb4b2f398aeb,4b56fc31-cb8e-11eb-9615-121df0c29c1e,Infinibox,Westwood,Ma,42.2187,-71.2026,2017-06-27,100.0,...,,,,,,De,,,,4.41
4,2655,e62b4591-f071-4597-b791-35e9075d2af1,f97aecf0-cb90-11eb-9615-121df0c29c1e,Agricultural Aviation Engineering Company,Las Vegas,Nv,36.1716,-115.1391,1983-03-15,100.0,...,,Las Vegas,Nv,36.1716,-115.1391,Ca,0.0,,0.0,24.89


In [46]:
### start timer
t0=time.time()

### drop records with no city or state in the PatentsView data
dataLatLong2.dropna(subset=['city','state'],inplace=True)

### end timer and print total time
t1=time.time()
total=t1-t0
print("Total time is %4f" % (total/60), "mins")

### print general stats and first 5 records for dataset
display(dataLatLong2.info(),dataLatLong2.head())

Total time is 0.000067 mins
<class 'pandas.core.frame.DataFrame'>
Int64Index: 1551 entries, 0 to 1577
Data columns (total 33 columns):
 #   Column              Non-Null Count  Dtype         
---  ------              --------------  -----         
 0   ID                  1551 non-null   int32         
 1   assignee_id         1551 non-null   object        
 2   location_id         1551 non-null   object        
 3   organization        1551 non-null   object        
 4   city                1551 non-null   object        
 5   state               1551 non-null   object        
 6   city_latitude       1506 non-null   float64       
 7   city_longitude      1506 non-null   float64       
 8   dateOfFirstPat      1551 non-null   datetime64[ns]
 9   nameScores          1551 non-null   float64       
 10  matchNames          1551 non-null   object        
 11  subJurisCode        1551 non-null   object        
 12  subCntlEntity       863 non-null    object        
 13  stateMatch          

None

Unnamed: 0,ID,assignee_id,location_id,organization,city,state,city_latitude,city_longitude,dateOfFirstPat,nameScores,...,agent_longitude,data_city,data_state,data_latitude,data_longitude,orgLoc,cityToAddrDistance,cityToAgtDistance,cityToDataDistance,dateDiff
0,875,f7b31db8-0a3d-425e-a79f-d84ba6333b60,de5d6510-cb90-11eb-9615-121df0c29c1e,The Egg Factory,Roanoke,Va,37.2738,-79.9602,2001-01-30,100.0,...,-79.9602,,,,,Va,0.0,0.0,,3.11
1,1284,f3e6ae72-e2b1-4165-a0f7-9be773bf1e56,9447f283-cb8e-11eb-9615-121df0c29c1e,Tank Vision,Vineyard Haven,Ma,41.4543,-70.6038,2018-10-16,100.0,...,-71.4282,,,,,"Ri,De",0.0,49.5,,1.78
2,1284,f3e6ae72-e2b1-4165-a0f7-9be773bf1e56,9447f283-cb8e-11eb-9615-121df0c29c1e,Tank Vision,Vineyard Haven,Ma,41.4543,-70.6038,2018-10-16,100.0,...,,,,,,"Ri,De",,,,1.78
3,1667,f0447e02-fb80-46f7-a97c-cb4b2f398aeb,4b56fc31-cb8e-11eb-9615-121df0c29c1e,Infinibox,Westwood,Ma,42.2187,-71.2026,2017-06-27,100.0,...,,,,,,De,,,,4.41
4,2655,e62b4591-f071-4597-b791-35e9075d2af1,f97aecf0-cb90-11eb-9615-121df0c29c1e,Agricultural Aviation Engineering Company,Las Vegas,Nv,36.1716,-115.1391,1983-03-15,100.0,...,,Las Vegas,Nv,36.1716,-115.1391,Ca,0.0,,0.0,24.89


# Obtain Value Counts for Matching and Non-Matching States and Branch Status

In [47]:
print("The number of state-to-state matches equal:", dataLatLong2.stateMatch.value_counts().sort_index()[0])
print("The number of state-to-other-state but not DE matches equal:", dataLatLong2.stateMatch.value_counts().sort_index()[1])
print("The number of state-to-DE matches equal:", dataLatLong2.stateMatch.value_counts().sort_index()[2])

The number of state-to-state matches equal: 154
The number of state-to-other-state but not DE matches equal: 1229
The number of state-to-DE matches equal: 168


In [48]:
dataLatLong2.bchStatus.value_counts().sort_index()
print("The number of records with no branch status is:", dataLatLong2.bchStatus.value_counts().sort_index()[0])
print("The number of records with with a branch status is:", 
      dataLatLong2.bchStatus.value_counts().sort_index()[1]+dataLatLong2.bchStatus.value_counts().sort_index()[2])

The number of records with no branch status is: 625
The number of records with with a branch status is: 926


# Find and Drop Duplicates

In [49]:
### start timer
t0=time.time()

dataLatLong3=dataLatLong2.drop_duplicates(keep='first')

### end timer and print total time
t1=time.time()
total=t1-t0
print("Total time is %4f" % (total/60), "mins")

display(dataLatLong3.info(),dataLatLong3.head())

Total time is 0.000117 mins
<class 'pandas.core.frame.DataFrame'>
Int64Index: 1516 entries, 0 to 1577
Data columns (total 33 columns):
 #   Column              Non-Null Count  Dtype         
---  ------              --------------  -----         
 0   ID                  1516 non-null   int32         
 1   assignee_id         1516 non-null   object        
 2   location_id         1516 non-null   object        
 3   organization        1516 non-null   object        
 4   city                1516 non-null   object        
 5   state               1516 non-null   object        
 6   city_latitude       1472 non-null   float64       
 7   city_longitude      1472 non-null   float64       
 8   dateOfFirstPat      1516 non-null   datetime64[ns]
 9   nameScores          1516 non-null   float64       
 10  matchNames          1516 non-null   object        
 11  subJurisCode        1516 non-null   object        
 12  subCntlEntity       853 non-null    object        
 13  stateMatch          

None

Unnamed: 0,ID,assignee_id,location_id,organization,city,state,city_latitude,city_longitude,dateOfFirstPat,nameScores,...,agent_longitude,data_city,data_state,data_latitude,data_longitude,orgLoc,cityToAddrDistance,cityToAgtDistance,cityToDataDistance,dateDiff
0,875,f7b31db8-0a3d-425e-a79f-d84ba6333b60,de5d6510-cb90-11eb-9615-121df0c29c1e,The Egg Factory,Roanoke,Va,37.2738,-79.9602,2001-01-30,100.0,...,-79.9602,,,,,Va,0.0,0.0,,3.11
1,1284,f3e6ae72-e2b1-4165-a0f7-9be773bf1e56,9447f283-cb8e-11eb-9615-121df0c29c1e,Tank Vision,Vineyard Haven,Ma,41.4543,-70.6038,2018-10-16,100.0,...,-71.4282,,,,,"Ri,De",0.0,49.5,,1.78
2,1284,f3e6ae72-e2b1-4165-a0f7-9be773bf1e56,9447f283-cb8e-11eb-9615-121df0c29c1e,Tank Vision,Vineyard Haven,Ma,41.4543,-70.6038,2018-10-16,100.0,...,,,,,,"Ri,De",,,,1.78
3,1667,f0447e02-fb80-46f7-a97c-cb4b2f398aeb,4b56fc31-cb8e-11eb-9615-121df0c29c1e,Infinibox,Westwood,Ma,42.2187,-71.2026,2017-06-27,100.0,...,,,,,,De,,,,4.41
4,2655,e62b4591-f071-4597-b791-35e9075d2af1,f97aecf0-cb90-11eb-9615-121df0c29c1e,Agricultural Aviation Engineering Company,Las Vegas,Nv,36.1716,-115.1391,1983-03-15,100.0,...,,Las Vegas,Nv,36.1716,-115.1391,Ca,0.0,,0.0,24.89


In [50]:
### determine the number of unique assignee_ids in the data
dataLatLong3['assignee_id'].nunique()

384

In [51]:
### save the new dataset as a checkpoint
res_folder = "../csvResults/"
outpt_file = "OcResults1000PreparedForScoring.csv"
a_full = os.path.join(res_folder,outpt_file)

# dataLatLong3.to_csv(a_full,index=False)