## Scoring Algorithm

In [1]:
import pandas as pd
import numpy as np
import time
import os
from datetime import timedelta
import warnings
warnings.filterwarnings('ignore')

### start timer
t0 = time.time()

### set the path for the input file and save to variable
res_folder = "../csvResults/"
input_file = "OCResults0_274491_PreparedForScoring.csv"
a_full = os.path.join(res_folder, input_file)
print(a_full, "\n")

### load the records that were prepared for scoring and parse the date columns.
### errors='coerce' turns unparseable incorporation dates into NaT instead of raising;
### dateOfFirstPat is parsed strictly, so a malformed value there stops the run
df = pd.read_csv(a_full)
df['dateOfFirstPat'] = pd.to_datetime(df['dateOfFirstPat'])
df['minIncDate'] = pd.to_datetime(df['minIncDate'], errors='coerce')
df['minIncDateLoc'] = pd.to_datetime(df['minIncDateLoc'], errors='coerce')

### end timer and print total time
t1 = time.time()
total = t1 - t0
print("Total time is %4f" % (total/60), "mins")

### print general stats and first 5 records for dataset.
### df.info() prints its report and returns None, so it is called on its own instead of
### being passed to display() (which rendered a stray "None"). show_counts replaces the
### null_counts keyword, which newer pandas releases no longer accept
df.info(show_counts=True)
display(df.head())

../csvResults/OCResults0_274491_PreparedForScoring.csv 

Total time is 0.010391 mins
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 108609 entries, 0 to 108608
Data columns (total 30 columns):
 #   Column              Non-Null Count   Dtype         
---  ------              --------------   -----         
 0   ID                  108609 non-null  int64         
 1   assignee_id         108609 non-null  object        
 2   location_id         108609 non-null  object        
 3   organization        108609 non-null  object        
 4   city                108609 non-null  object        
 5   state               108609 non-null  object        
 6   city_latitude       108016 non-null  float64       
 7   city_longitude      108016 non-null  float64       
 8   dateOfFirstPat      108609 non-null  datetime64[ns]
 9   nameScores          108533 non-null  float64       
 10  matchNames          108533 non-null  object        
 11  subJurisCode        108609 non-null  object        
 12  m

None

Unnamed: 0,ID,assignee_id,location_id,organization,city,state,city_latitude,city_longitude,dateOfFirstPat,nameScores,...,agent_latitude,agent_longitude,data_city,data_state,data_latitude,data_longitude,cityToAddrDistance,cityToAgtDistance,cityToDataDistance,dateDiff
0,50000,fffffbe5-22f1-41de-9dad-66fe6a86a176,f54d6149-cb8e-11eb-9615-121df0c29c1e,Nlb Water,Denver,Co,39.7647,-104.955,2019-02-26,100.0,...,,,Denver,Co,39.7647,-104.955,,,0.0,0.271233
1,50001,ffffed49-446e-4c9e-b195-f3aad9f49757,fe67264c-cb8f-11eb-9615-121df0c29c1e,Glucan Biorenewables,Saint Louis,Mo,38.6399,-90.5188,2013-12-20,100.0,...,,,,,,,0.0,,,1.706849
2,0,fffe9f1f-cb1c-49ab-b00f-6416e3e3a909,fd8b2b76-cb90-11eb-9615-121df0c29c1e,Close In Solutions,Austin,Tx,30.3076,-97.7126,2005-05-02,100.0,...,,,Round Rock,Tx,30.5229,-97.6776,15.0,,15.0,4.906849
3,50003,fffe78c5-5266-4cdb-991b-b60ba440e04b,5991f79b-cb8e-11eb-9615-121df0c29c1e,Ethos International,Scottsburg,In,38.6878,-85.784,1997-01-30,100.0,...,,,Salem,In,38.602,-86.0996,18.1,,18.1,0.846575
4,50005,fffe51a8-c1af-40f0-a68d-78a06f26d67d,de09fd42-cb8e-11eb-9615-121df0c29c1e,Polylactane,Milwaukee,Wi,43.0563,-87.9323,1988-11-01,100.0,...,,,,,,,,,,1.890411


In [2]:
### number of distinct values in each identifier column of the loaded data
for key_col in ('assignee_id', 'ID'):
    print(df[key_col].nunique())

108609
108609


In [3]:
def compute_total_scores(records):
    """Return the additive match score for every row of ``records``.

    The score is the sum of five independent components:

    1. Name similarity (``nameScores``, a 0-100 fuzzy-match percentage),
       tempered by the length of ``organization`` so that short names earn
       less than long ones:

       ===============  ======  ======  =======  =======  =====
       nameScores       len<5   5-7     8-9      10-14    >=15
       ===============  ======  ======  =======  =======  =====
       == 100           0       3       5        5        5
       95 to <100       0       1       1        2        3
       90 to <95        -1      1       1        2        3
       87 to <90        -1      0       0        1        2
       < 87             -7 regardless of length
       missing          -10
       ===============  ======  ======  =======  =======  =====

    2. Location agreement: +5 when ``state`` equals any of ``address_state``,
       ``agent_state`` or ``data_state``; a further +5 when, in addition,
       ``city`` equals any of ``address_city``, ``agent_city`` or
       ``data_city``. A city match without a state match earns nothing.
       (Jurisdiction scoring was removed.)

    3. Date penalty: -5 when ``dateOfFirstPat`` is earlier than ``minIncDate``
       and ``dateDiff`` is greater than 3.

    4. Distance: based on the smallest non-negative value among
       ``cityToAddrDistance``, ``cityToAgtDistance`` and
       ``cityToDataDistance``: <10 -> +5, <50 -> +4, <100 -> +2, <200 -> +1,
       >=200 -> -2, all missing -> 0.

    5. Date gap (``dateDiff``): <=15 -> +5, <=25 -> +4, <=30 -> +3,
       <=35 -> +2, >35 -> +1, missing -> 0.

    Missing values never match and never fall into a band, so they contribute
    0 to components 2-5.

    Parameters
    ----------
    records : pandas.DataFrame
        Must contain the columns named above. Distances and ``dateDiff`` are
        compared as numbers; their units are not visible here (presumably
        miles/km and years -- TODO confirm against the preparation notebook).

    Returns
    -------
    pandas.Series
        Integer score per row, indexed like ``records``.
    """
    ### ---- 1. name similarity, weighted by organization-name length -------------
    name_score = records['nameScores']
    org_len = records['organization'].str.len()

    def length_points(tiers, shortest):
        ### tiers are (minimum length, points) from longest to shortest; names below
        ### every minimum (or with no usable length) get the `shortest` points
        return np.select([org_len >= floor for floor, _ in tiers],
                         [points for _, points in tiers],
                         default=shortest)

    exact_pts = length_points([(8, 5), (5, 3)], shortest=0)              # nameScores == 100
    near_pts = length_points([(15, 3), (10, 2), (5, 1)], shortest=0)     # 95 to <100
    fair_pts = length_points([(15, 3), (10, 2), (5, 1)], shortest=-1)    # 90 to <95
    weak_pts = length_points([(15, 2), (10, 1), (5, 0)], shortest=-1)    # 87 to <90

    ### np.select takes the first condition that holds, so each "< upper" test only
    ### sees rows the earlier tests rejected. Scores above 100 are not expected; they
    ### contribute 0 here rather than inheriting the previous row's score as the old
    ### loop did
    name_points = np.select(
        [name_score.isna(), name_score < 87, name_score < 90,
         name_score < 95, name_score < 100, name_score == 100],
        [-10, -7, weak_pts, fair_pts, near_pts, exact_pts],
        default=0,
    )

    ### ---- 2. state / city agreement -------------------------------------------
    ### .eq() is False whenever either side is missing, like == on NaN
    state_match = (records['state'].eq(records['address_state'])
                   | records['state'].eq(records['agent_state'])
                   | records['state'].eq(records['data_state']))
    city_match = (records['city'].eq(records['address_city'])
                  | records['city'].eq(records['agent_city'])
                  | records['city'].eq(records['data_city']))
    location_points = 5 * state_match + 5 * (state_match & city_match)

    ### ---- 3. penalty: first patent filed well before incorporation -------------
    ### comparisons against NaT are False, so rows without minIncDate are not penalized
    filed_before_inc = records['dateOfFirstPat'] < records['minIncDate']
    date_penalty = np.where(filed_before_inc & (records['dateDiff'] > 3), -5, 0)

    ### ---- 4. distance between the patent city and the closest known location ---
    ### the original checked the bands from nearest to farthest with "any of the three
    ### distances", which is the band of the smallest non-negative distance
    distances = records[['cityToAddrDistance', 'cityToAgtDistance', 'cityToDataDistance']]
    nearest = distances.where(distances >= 0).min(axis=1)
    distance_points = np.select(
        [nearest < 10, nearest < 50, nearest < 100, nearest < 200, nearest >= 200],
        [5, 4, 2, 1, -2],
        default=0,
    )

    ### ---- 5. gap between first patent and incorporation ------------------------
    date_gap = records['dateDiff']
    gap_points = np.select(
        [date_gap <= 15, date_gap <= 25, date_gap <= 30, date_gap <= 35, date_gap > 35],
        [5, 4, 3, 2, 1],
        default=0,
    )

    total = name_points + location_points + date_penalty + distance_points + gap_points
    return pd.Series(total, index=records.index).astype(int)


### one score per record, in row order (kept as a list for the cell that attaches it to df)
totalScore = compute_total_scores(df).tolist()

In [4]:
### attach the per-record totals produced by the scoring cell
df['totalScore'] = totalScore

### df.info() prints its summary and returns None; passing it to display() rendered a
### stray "None", so it is called on its own
df.info()
display(df.head())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 108609 entries, 0 to 108608
Data columns (total 31 columns):
 #   Column              Non-Null Count   Dtype         
---  ------              --------------   -----         
 0   ID                  108609 non-null  int64         
 1   assignee_id         108609 non-null  object        
 2   location_id         108609 non-null  object        
 3   organization        108609 non-null  object        
 4   city                108609 non-null  object        
 5   state               108609 non-null  object        
 6   city_latitude       108016 non-null  float64       
 7   city_longitude      108016 non-null  float64       
 8   dateOfFirstPat      108609 non-null  datetime64[ns]
 9   nameScores          108533 non-null  float64       
 10  matchNames          108533 non-null  object        
 11  subJurisCode        108609 non-null  object        
 12  minIncDate          108520 non-null  datetime64[ns]
 13  minIncDateLoc       108520 no

None

Unnamed: 0,ID,assignee_id,location_id,organization,city,state,city_latitude,city_longitude,dateOfFirstPat,nameScores,...,agent_longitude,data_city,data_state,data_latitude,data_longitude,cityToAddrDistance,cityToAgtDistance,cityToDataDistance,dateDiff,totalScore
0,50000,fffffbe5-22f1-41de-9dad-66fe6a86a176,f54d6149-cb8e-11eb-9615-121df0c29c1e,Nlb Water,Denver,Co,39.7647,-104.955,2019-02-26,100.0,...,,Denver,Co,39.7647,-104.955,,,0.0,0.271233,25
1,50001,ffffed49-446e-4c9e-b195-f3aad9f49757,fe67264c-cb8f-11eb-9615-121df0c29c1e,Glucan Biorenewables,Saint Louis,Mo,38.6399,-90.5188,2013-12-20,100.0,...,,,,,,0.0,,,1.706849,25
2,0,fffe9f1f-cb1c-49ab-b00f-6416e3e3a909,fd8b2b76-cb90-11eb-9615-121df0c29c1e,Close In Solutions,Austin,Tx,30.3076,-97.7126,2005-05-02,100.0,...,,Round Rock,Tx,30.5229,-97.6776,15.0,,15.0,4.906849,14
3,50003,fffe78c5-5266-4cdb-991b-b60ba440e04b,5991f79b-cb8e-11eb-9615-121df0c29c1e,Ethos International,Scottsburg,In,38.6878,-85.784,1997-01-30,100.0,...,,Salem,In,38.602,-86.0996,18.1,,18.1,0.846575,19
4,50005,fffe51a8-c1af-40f0-a68d-78a06f26d67d,de09fd42-cb8e-11eb-9615-121df0c29c1e,Polylactane,Milwaukee,Wi,43.0563,-87.9323,1988-11-01,100.0,...,,,,,,,,,1.890411,10


In [5]:
### re-check the distinct identifier counts after totalScore was added
for key_col in ('assignee_id', 'ID'):
    print(df[key_col].nunique())

108609
108609


In [6]:
### build (and echo) the destination path for the scored dataset
res_folder, a_file = "../csvResults/", "OCResults0_274491_ScoredData.csv"
a_full = os.path.join(res_folder, a_file)
print(a_full)

### the write itself is disabled; uncomment to export
# df.to_csv(a_full,index=False)

../csvResults/OCResults0_274491_ScoredData.csv


# Calculate Confidence Scores

In [7]:
### rescale totalScore linearly (min-max) so the lowest total maps to 1 and the highest to 10
score_min = df['totalScore'].min()
score_span = df['totalScore'].max() - score_min

if score_span == 0:
    ### every record has the same total, so the formula would divide by zero and fill
    ### the column with NaN; put all records on the bottom of the scale instead
    df['confidenceScore'] = 1.0
else:
    df['confidenceScore'] = ((10-1) * ((df['totalScore'] - score_min) / score_span)) + 1

### round to two decimals with Python's round(), exactly as before, because the banding
### cell compares these rounded values against whole-number boundaries
df['confidenceScore'] = [round(num1, 2) for num1 in df['confidenceScore']]

### df.info() prints and returns None, so it is not passed to display()
df.info()
display(df.head())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 108609 entries, 0 to 108608
Data columns (total 32 columns):
 #   Column              Non-Null Count   Dtype         
---  ------              --------------   -----         
 0   ID                  108609 non-null  int64         
 1   assignee_id         108609 non-null  object        
 2   location_id         108609 non-null  object        
 3   organization        108609 non-null  object        
 4   city                108609 non-null  object        
 5   state               108609 non-null  object        
 6   city_latitude       108016 non-null  float64       
 7   city_longitude      108016 non-null  float64       
 8   dateOfFirstPat      108609 non-null  datetime64[ns]
 9   nameScores          108533 non-null  float64       
 10  matchNames          108533 non-null  object        
 11  subJurisCode        108609 non-null  object        
 12  minIncDate          108520 non-null  datetime64[ns]
 13  minIncDateLoc       108520 no

None

Unnamed: 0,ID,assignee_id,location_id,organization,city,state,city_latitude,city_longitude,dateOfFirstPat,nameScores,...,data_city,data_state,data_latitude,data_longitude,cityToAddrDistance,cityToAgtDistance,cityToDataDistance,dateDiff,totalScore,confidenceScore
0,50000,fffffbe5-22f1-41de-9dad-66fe6a86a176,f54d6149-cb8e-11eb-9615-121df0c29c1e,Nlb Water,Denver,Co,39.7647,-104.955,2019-02-26,100.0,...,Denver,Co,39.7647,-104.955,,,0.0,0.271233,25,10.0
1,50001,ffffed49-446e-4c9e-b195-f3aad9f49757,fe67264c-cb8f-11eb-9615-121df0c29c1e,Glucan Biorenewables,Saint Louis,Mo,38.6399,-90.5188,2013-12-20,100.0,...,,,,,0.0,,,1.706849,25,10.0
2,0,fffe9f1f-cb1c-49ab-b00f-6416e3e3a909,fd8b2b76-cb90-11eb-9615-121df0c29c1e,Close In Solutions,Austin,Tx,30.3076,-97.7126,2005-05-02,100.0,...,Round Rock,Tx,30.5229,-97.6776,15.0,,15.0,4.906849,14,7.32
3,50003,fffe78c5-5266-4cdb-991b-b60ba440e04b,5991f79b-cb8e-11eb-9615-121df0c29c1e,Ethos International,Scottsburg,In,38.6878,-85.784,1997-01-30,100.0,...,Salem,In,38.602,-86.0996,18.1,,18.1,0.846575,19,8.54
4,50005,fffe51a8-c1af-40f0-a68d-78a06f26d67d,de09fd42-cb8e-11eb-9615-121df0c29c1e,Polylactane,Milwaukee,Wi,43.0563,-87.9323,1988-11-01,100.0,...,,,,,,,,1.890411,10,6.35


In [8]:
col = 'confidenceScore'

### bands 1-9 are the half-open intervals [band, band+1); band 10 holds only a
### confidence of exactly 10 (the maximum of the rescaled range)
conditions = [(df[col] >= band) & (df[col] < band + 1) for band in range(1, 10)]
conditions.append(df[col] == 10)
choices = list(range(1, 11))

### values that fall in no band (outside 1-10, or NaN) are left as NaN, which is why
### the resulting column is float
df["score"] = np.select(conditions, choices, default=np.nan)
# df.drop(columns=['confidenceScore'],inplace=True)
df1 = df.sort_values(by=['ID']).reset_index(drop=True)

### df1.info() prints and returns None, so it is not passed to display()
df1.info()
display(df1.head())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 108609 entries, 0 to 108608
Data columns (total 33 columns):
 #   Column              Non-Null Count   Dtype         
---  ------              --------------   -----         
 0   ID                  108609 non-null  int64         
 1   assignee_id         108609 non-null  object        
 2   location_id         108609 non-null  object        
 3   organization        108609 non-null  object        
 4   city                108609 non-null  object        
 5   state               108609 non-null  object        
 6   city_latitude       108016 non-null  float64       
 7   city_longitude      108016 non-null  float64       
 8   dateOfFirstPat      108609 non-null  datetime64[ns]
 9   nameScores          108533 non-null  float64       
 10  matchNames          108533 non-null  object        
 11  subJurisCode        108609 non-null  object        
 12  minIncDate          108520 non-null  datetime64[ns]
 13  minIncDateLoc       108520 no

None

Unnamed: 0,ID,assignee_id,location_id,organization,city,state,city_latitude,city_longitude,dateOfFirstPat,nameScores,...,data_state,data_latitude,data_longitude,cityToAddrDistance,cityToAgtDistance,cityToDataDistance,dateDiff,totalScore,confidenceScore,score
0,0,fffe9f1f-cb1c-49ab-b00f-6416e3e3a909,fd8b2b76-cb90-11eb-9615-121df0c29c1e,Close In Solutions,Austin,Tx,30.3076,-97.7126,2005-05-02,100.0,...,Tx,30.5229,-97.6776,15.0,,15.0,4.906849,14,7.32,7.0
1,3,fffe36bb-6dea-4a8b-8bf5-071cf893ceba,fe1cb1c3-cb8f-11eb-9615-121df0c29c1e,Valley Business Solutions,Huntsville,Al,34.7015,-86.5766,2019-03-21,100.0,...,Al,34.7015,-86.5766,,,0.0,11.389041,25,10.0,10.0
2,4,fffd9c21-3bb1-4471-b316-d172921e3f83,ec16f9be-cb90-11eb-9615-121df0c29c1e,Railias Holdings,San Diego,Ca,32.8247,-117.152,2019-10-16,100.0,...,Ca,32.8247,-117.152,,0.0,0.0,1.89863,25,10.0,10.0
3,6,fffa8520-f0c2-431f-a64a-e16f3af0a896,fc25b086-cb8f-11eb-9615-121df0c29c1e,Fairdale Orthodontic Company,Cincinnati,Oh,39.1366,-84.5135,1986-06-23,100.0,...,,,,,,,18.690411,9,6.11,6.0
4,7,fff83fa2-1d91-479a-93c7-a5835815c2f9,70867e08-cb8e-11eb-9615-121df0c29c1e,Compsci Resources,Alexandria,Va,38.8148,-77.0902,2009-08-10,100.0,...,,,,0.0,0.0,,11.909589,25,10.0,10.0


In [9]:
### distinct identifier counts in the ID-sorted frame
for key_col in ('assignee_id', 'ID'):
    print(df1[key_col].nunique())

108609
108609


In [10]:
### number of records in each score band, listed from the highest band down
score_counts = df1['score'].value_counts()
score_counts.sort_index(ascending=False)

10.0    44706
9.0     15965
8.0     18190
7.0      8665
6.0     10866
5.0      7212
4.0      1460
3.0      1053
2.0       461
1.0        31
Name: score, dtype: int64

In [14]:
### positions of the columns kept for the review extract (the latitude/longitude
### columns visible at positions 6 and 7 are among those skipped)
keep_positions = [0, 1, 2, 3, 4, 5, 8, 9, 10, 11, 12, 13, 14, 15, 18, 19,
                  22, 23, 26, 27, 28, 29, 30, 31, 32]
df2 = df1.sort_values(by=['ID'], ascending=True).iloc[:, keep_positions]
# df2.to_csv("../csvResults/OCResults0_274491_ScoredDataConfLevels.csv",index=False)

### band 1 is kept whole and every higher band is sampled at 100 records with a fixed
### seed; groupby.sample raises if a band in df3 has fewer than 100 rows
df3 = df2[df2['score'] > 1]
df4 = df2[df2['score'] == 1]
sampled_bands = df3.groupby(by=['score']).sample(n=100, random_state=42)
pd.concat([sampled_bands, df4]).sort_values(by=['ID'], ascending=True)
#.to_csv("../csvResults/aOCResults0_274491_ScoredDataConfLevels.csv",index=False)

Unnamed: 0,ID,assignee_id,location_id,organization,city,state,dateOfFirstPat,nameScores,matchNames,subJurisCode,...,agent_state,data_city,data_state,cityToAddrDistance,cityToAgtDistance,cityToDataDistance,dateDiff,totalScore,confidenceScore,score
31,48,ff9fcd18-8bc8-43ad-b17a-035c6a7bc2b0,fe46641e-cb8f-11eb-9615-121df0c29c1e,Cestusline,Portland,Or,2011-03-22,100.0,Cestusline,Or,...,,Portland,Or,,,0.0,0.706849,25,10.00,10.0
69,110,ff0aa675-cb5c-49c7-a6e7-f1a8cb6d0339,d60f4b76-09bd-11ec-893a-12de62d610b1,Human 2,Pembroke Pines,Fl,2008-04-24,45.0,Accord Human Resources 2,Fl,...,,Oklahoma City,Ok,,,1212.1,12.830137,-4,2.95,2.0
638,1084,f5ee9a92-4a51-41c5-bb1d-40c30ef63993,f0c45f9a-cb90-11eb-9615-121df0c29c1e,Newport Media,Lake Forest,Ca,2005-10-17,100.0,Newport Media,Ca,...,,Chandler,Az,337.8,,337.8,0.726027,8,5.86,5.0
719,1214,f4acc01c-000a-4fc7-a4fb-53eb19a13672,ffe7bef3-cb8e-11eb-9615-121df0c29c1e,Enterprise Partners Ii,La Jolla,Ca,1994-05-27,80.0,Enterprise Management Partners Ii,Ca,...,,,,64.1,,,4.517808,5,5.14,5.0
770,1300,f3c27218-1e83-4521-b676-5d321de0ac95,3d4d0beb-cb8e-11eb-9615-121df0c29c1e,Goshen Rubber Company,Goshen,In,1975-01-23,100.0,Goshen Rubber Company,In,...,,Cleveland,Oh,216.1,,216.1,46.106849,4,4.89,4.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
108518,273430,0dc8952c-bc33-4a82-bd65-b9583524ca04,f0a3aa63-cb90-11eb-9615-121df0c29c1e,Whirlpool,Benton Habor,Mi,1973-08-30,100.0,Whirlpool,Mi,...,Mi,East Lansing,Mi,53.4,53.4,53.1,17.978082,16,7.81,7.0
108522,273458,0d93fdc2-1ccf-4aa3-b787-f34fa52a76bd,ef8618f8-cb90-11eb-9615-121df0c29c1e,Union Carbide Chemicals And Plastics Technology,Wilmington,De,1971-12-01,100.0,Union Carbide Chemicals And Plastics Technology,De,...,,,,,,,16.986301,4,4.89,4.0
108531,273503,0bf96d08-fe99-4bbb-ae0d-52bc76f5b051,de09fd42-cb8e-11eb-9615-121df0c29c1e,Milwaukee Electric Tool,Milwaukee,Wi,1974-04-25,100.0,Milwaukee Electric Tool,Wi,...,,,,,,,47.084932,6,5.38,5.0
108544,273631,0a2a0a60-7f89-4ad6-975e-9d2d24fe54a9,41da5e01-cb8e-11eb-9615-121df0c29c1e,Chef'N,Seatlle,Wa,1989-03-30,75.0,The Chef'N,Wa,...,,Seattle,Wa,,,,4.123288,3,4.65,4.0


In [None]:
# import seaborn as sns

# df5_100=df5.loc[df5['nameScores']==100]

In [None]:
# sns.set(rc = {'figure.figsize':(10,8)})
# sns.set_style("white")

# sns.histplot(data=df5_100, x="totalScore")

In [None]:
# sns.set(rc = {'figure.figsize':(10,8)})
# sns.set_style("white")

# sns.histplot(data=df5_100, x="totalScore")