## Scoring Algorithm

In [1]:
import pandas as pd
import numpy as np
import time
import os
from datetime import timedelta
import warnings
warnings.filterwarnings('ignore')

### start timer
t0=time.time()

### set the path for the input file and save to variable
res_folder = "../csvResults/"
# input_file = "readyForScoring.csv"
# input_file = "readyForScoring2.csv"
input_file = "readyForScoring5.csv"

input_directory=os.path.join(res_folder,input_file)
print(input_directory,"\n")

### import the output file from OC API matching and select columns
df=pd.read_csv(input_directory)
df['incorporation_date'] = pd.to_datetime(df['incorporation_date'],errors='coerce')
df['dateFiledMin'] = pd.to_datetime(df['dateFiledMin'],errors='coerce')
df['record_date'] = pd.to_datetime(df['record_date'],errors='coerce')

### end timer and print total time
t1=time.time()
total=t1-t0
print("Total time is %.3f" % (total/60), "mins\n")
print("The number of unique assignee IDs are:",df.assignee_id.nunique())
print("The number of unique IDs are:",df.ID.nunique(),"\n")

### print general stats and first 5 records for dataset
display(df.info(null_counts=True),df.head())

../csvResults/readyForScoring5.csv 

Total time is 0.000 mins

The number of unique assignee IDs are: 462
The number of unique IDs are: 462 

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 994 entries, 0 to 993
Data columns (total 33 columns):
 #   Column              Non-Null Count  Dtype         
---  ------              --------------  -----         
 0   ID                  994 non-null    int64         
 1   assignee_id         994 non-null    object        
 2   location            994 non-null    object        
 3   organization        994 non-null    object        
 4   city                994 non-null    object        
 5   state               994 non-null    object        
 6   latitude            994 non-null    float64       
 7   longitude           994 non-null    float64       
 8   dateFiledMin        994 non-null    datetime64[ns]
 9   patent              994 non-null    object        
 10  assignee            907 non-null    object        
 11  assignor            

None

Unnamed: 0,ID,assignee_id,location,organization,city,state,latitude,longitude,dateFiledMin,patent,...,latitude_add,longitude_add,agent_city,agent_state,latitude_agt,longitude_agt,dateDiff,cityToAddrDistance,cityToAgtDistance,cityToDataDistance
0,933,012b6757-2d16-4c6d-8aa7-fefbe6328e1c,5d0d1602-16c8-11ed-9b5f-1234bde3cd05,Anemergonics,Arvada,Co,39.8006,-105.081,2008-02-08,8161698,...,,,,,,,2.69,,,
1,1189,0179132f-48af-4c06-aa5c-f6ef9a011372,cf479b1d-16c7-11ed-9b5f-1234bde3cd05,Mirofam,Thornton,Co,39.8696,-104.985,2008-03-28,7841821,...,,,,,,,-0.403,,,
2,1189,0179132f-48af-4c06-aa5c-f6ef9a011372,cf479b1d-16c7-11ed-9b5f-1234bde3cd05,Mirofam,Thornton,Co,39.8696,-104.985,2008-03-28,7841821,...,,,,,,,0.249,,,
3,1189,0179132f-48af-4c06-aa5c-f6ef9a011372,cf479b1d-16c7-11ed-9b5f-1234bde3cd05,Mirofam,Thornton,Co,39.8696,-104.985,2008-03-28,7841821,...,,,,,,,-0.403,,,
4,16862,01db098e-3394-40a0-bb4c-350431b9685b,d96be5a2-16c7-11ed-9b5f-1234bde3cd05,Menogenix,Aurora,Co,39.7405,-104.831,2009-07-22,9216206,...,,,,,,,-0.907,,,1.9


In [2]:
### start timer
t0=time.time()

totalScore=[]
d=len(df)
x=''

### score the fuzzy match percentages

for s in range(d):
    ### scores all records with a fuzzy match score with a 100%; account for organization
    ### name lengths (i.e., shorter names are scored lower than longer names)
    
    if df['nameScores'][s] == 100:

        if len(df['organization'][s]) < 5:
            x=0
        
        elif len(df['organization'][s]) >= 5:
            x=5
    
    ### visual inspection of the data indicates a discrete group between 95% and 100%,
    ### resulting in the next set. Name lengths are again accounted for and shorter
    ### names/scores are downweighted even more
    
    elif 90 <= df['nameScores'][s] < 100:
        
        if len(df['organization'][s]) < 5:
            x=0
        
        elif 5 <= len(df['organization'][s]) < 10:
            x=1
            
        elif 10 <= len(df['organization'][s]) < 15:
            x=2
            
        elif len(df['organization'][s]) >= 15:
            x=5

    ### many of the fuzzy matches in this range are wrong, but there are a couple correct
    ### that should not be discounted. While the weights are not as high as the previous
    ### sections, correct matches will be given better scores than the next section
    
    elif 87 <= df['nameScores'][s] < 90:
        
        if len(df['organization'][s]) < 5:
            x=-1
        
        elif 5 <= len(df['organization'][s]) < 10:
            x=1
            
        elif 10 <= len(df['organization'][s]) < 15:
            x=2
            
        elif len(df['organization'][s]) >= 15:
            x=4

    ### very few are correct, but there are some misspellings that were not collected during
    ### the cleaning phase. Many of the records will be weighted down, but those that are
    ### longer in length will be given higher scores than names that are shorter
           
    elif df['nameScores'][s] < 87:
        x=-7
        
    elif np.isnan(df['nameScores'][s]):
        x=-10

    ### scoring the different features that contain state information for each
    ### organization. The jurisdictionScore feature was created by extracting
    ### the state from the jurisdiction_code field. The jurisdiction_code
    ### feature is the primary metric used to match patentsview and OC records
    ### and therefore, given a larger weight. The stateAddScore is given the
    ### second highest weight because it is the primary address that is listed
    ### in an OC record. stateAgtScore is given the least amount of weight
    ### because the agent may not always be located at the registered address 
    ### for the organization.
    
    if df['state'][s] == df['stateMatch'][s]:
        x=x+5
        
    elif df['state'][s] == df['address_state'][s]:
        x=x+5
        
    elif df['state'][s] == df['agent_state'][s]:
        x=x+5
        
    if df['city'][s] == df['cityMatch'][s]:
        x=x+5
        a=1
    
    elif df['city'][s] == df['address_city'][s]:
        x=x+5
        a=1
    
    elif df['city'][s] == df['agent_city'][s]:
        x=x+5
        a=1
        
    else:
        a=0
        
    ### the code below penalizes records with a first patent applied for date that is older
    ### than the incorporation date for that organization
    
    if df['dateFiledMin'][s] < df['incorporation_date'][s]:
        
        if df['dateDiff'][s] >= -5 and df['dateDiff'][s] < 0:
            x=x-1
            
        elif df['dateDiff'][s] >= -10 and df['dateDiff'][s] < -5:
            x=x-3
            
        elif df['dateDiff'][s] < -10:
            x=x-5
            
    ### the address_city feature is weighted more than the agent_city column for similar
    ### reasons stated in the states section above. Cities less than 4 characters long
    ### are penalized and gradually score better as the character length increase. Moreover,
    ### the score from fuzzy matching is used to create groups as shown below. Fuzzy
    ### scores below 90% are weighted negatively
    

    if a == 1:
        pass
    
    elif 0 <= df['cityToAddrDistance'][s] < 50 or 0 <= df['cityToAgtDistance'][s] < 50 or 0 <= df['cityToDataDistance'][s] < 50:
        x=x+5
        
    elif 50 <= df['cityToAddrDistance'][s] < 100 or 50 <= df['cityToAgtDistance'][s] < 100 or 50 <= df['cityToDataDistance'][s] < 100:
        x=x+2
        
    elif 100 <= df['cityToAddrDistance'][s] < 200 or 100 <= df['cityToAgtDistance'][s] < 200 or 100 <= df['cityToDataDistance'][s] < 200:
        x=x+1
    
    elif df['cityToAddrDistance'][s] >= 200 or df['cityToAgtDistance'][s] >= 200 or df['cityToDataDistance'][s] >= 200:
        x=x-2
    
    elif np.isnan(df['cityToDataDistance'][s]):
        x=x+0
         
    
    try:
        if df['dateDiff'][s] <= 15:
            x=x+5

        elif 15 < df['dateDiff'][s] <= 25:
            x=x+4
        
        elif 25 < df['dateDiff'][s] <= 30:
            x=x+3
            
        elif 30 < df['dateDiff'][s] <= 35:
            x=x+2
            
        elif df['dateDiff'][s] > 35:
            x=x+1

    except:
        print('cannot compare dates')
    
    totalScore.append(x)
    a=0
    
### end timer and print total time
t1=time.time()
total=t1-t0
print("Total time is %.3f" % (total/60), "mins\n")

Total time is 0.001 mins



In [3]:
### start timer
t0=time.time()

df['totalScore'] = totalScore

### end timer and print total time
t1=time.time()
total=t1-t0
print("Total time is %.3f" % (total/60), "mins\n")
print("The number of unique assignee IDs are:",df.assignee.nunique())
print("The number of unique IDs are:",df.ID.nunique(),"\n")

### print general stats and first 5 records for dataset
display(df.info(),df.head())

Total time is 0.000 mins

The number of unique assignee IDs are: 626
The number of unique IDs are: 462 

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 994 entries, 0 to 993
Data columns (total 34 columns):
 #   Column              Non-Null Count  Dtype         
---  ------              --------------  -----         
 0   ID                  994 non-null    int64         
 1   assignee_id         994 non-null    object        
 2   location            994 non-null    object        
 3   organization        994 non-null    object        
 4   city                994 non-null    object        
 5   state               994 non-null    object        
 6   latitude            994 non-null    float64       
 7   longitude           994 non-null    float64       
 8   dateFiledMin        994 non-null    datetime64[ns]
 9   patent              994 non-null    object        
 10  assignee            907 non-null    object        
 11  assignor            907 non-null    object        
 12  r

None

Unnamed: 0,ID,assignee_id,location,organization,city,state,latitude,longitude,dateFiledMin,patent,...,longitude_add,agent_city,agent_state,latitude_agt,longitude_agt,dateDiff,cityToAddrDistance,cityToAgtDistance,cityToDataDistance,totalScore
0,933,012b6757-2d16-4c6d-8aa7-fefbe6328e1c,5d0d1602-16c8-11ed-9b5f-1234bde3cd05,Anemergonics,Arvada,Co,39.8006,-105.081,2008-02-08,8161698,...,,,,,,2.69,,,,15
1,1189,0179132f-48af-4c06-aa5c-f6ef9a011372,cf479b1d-16c7-11ed-9b5f-1234bde3cd05,Mirofam,Thornton,Co,39.8696,-104.985,2008-03-28,7841821,...,,,,,,-0.403,,,,14
2,1189,0179132f-48af-4c06-aa5c-f6ef9a011372,cf479b1d-16c7-11ed-9b5f-1234bde3cd05,Mirofam,Thornton,Co,39.8696,-104.985,2008-03-28,7841821,...,,,,,,0.249,,,,15
3,1189,0179132f-48af-4c06-aa5c-f6ef9a011372,cf479b1d-16c7-11ed-9b5f-1234bde3cd05,Mirofam,Thornton,Co,39.8696,-104.985,2008-03-28,7841821,...,,,,,,-0.403,,,,14
4,16862,01db098e-3394-40a0-bb4c-350431b9685b,d96be5a2-16c7-11ed-9b5f-1234bde3cd05,Menogenix,Aurora,Co,39.7405,-104.831,2009-07-22,9216206,...,,,,,,-0.907,,,1.9,19


In [4]:
# df.to_csv("../csvResults/scoredOCResults5.csv",index=False)

# Calculate Confidence Scores

In [5]:
### start timer
t0=time.time()

df['confidenceScore']=((10-1)*((df['totalScore']-min(df['totalScore']))/
                               (max(df['totalScore'])-min(df['totalScore']))))+1

df['confidenceScore']=[round(num1, 2) for num1 in df['confidenceScore']]

### end timer and print total time
t1=time.time()
total=t1-t0
print("Total time is %.3f" % (total/60), "mins\n")
print("The number of unique assignee IDs are:",df.assignee.nunique())
print("The number of unique IDs are:",df.ID.nunique(),"\n")

### print general stats and first 5 records for dataset
display(df.info(),df.head())

Total time is 0.000 mins

The number of unique assignee IDs are: 626
The number of unique IDs are: 462 

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 994 entries, 0 to 993
Data columns (total 35 columns):
 #   Column              Non-Null Count  Dtype         
---  ------              --------------  -----         
 0   ID                  994 non-null    int64         
 1   assignee_id         994 non-null    object        
 2   location            994 non-null    object        
 3   organization        994 non-null    object        
 4   city                994 non-null    object        
 5   state               994 non-null    object        
 6   latitude            994 non-null    float64       
 7   longitude           994 non-null    float64       
 8   dateFiledMin        994 non-null    datetime64[ns]
 9   patent              994 non-null    object        
 10  assignee            907 non-null    object        
 11  assignor            907 non-null    object        
 12  r

None

Unnamed: 0,ID,assignee_id,location,organization,city,state,latitude,longitude,dateFiledMin,patent,...,agent_city,agent_state,latitude_agt,longitude_agt,dateDiff,cityToAddrDistance,cityToAgtDistance,cityToDataDistance,totalScore,confidenceScore
0,933,012b6757-2d16-4c6d-8aa7-fefbe6328e1c,5d0d1602-16c8-11ed-9b5f-1234bde3cd05,Anemergonics,Arvada,Co,39.8006,-105.081,2008-02-08,8161698,...,,,,,2.69,,,,15,7.86
1,1189,0179132f-48af-4c06-aa5c-f6ef9a011372,cf479b1d-16c7-11ed-9b5f-1234bde3cd05,Mirofam,Thornton,Co,39.8696,-104.985,2008-03-28,7841821,...,,,,,-0.403,,,,14,7.43
2,1189,0179132f-48af-4c06-aa5c-f6ef9a011372,cf479b1d-16c7-11ed-9b5f-1234bde3cd05,Mirofam,Thornton,Co,39.8696,-104.985,2008-03-28,7841821,...,,,,,0.249,,,,15,7.86
3,1189,0179132f-48af-4c06-aa5c-f6ef9a011372,cf479b1d-16c7-11ed-9b5f-1234bde3cd05,Mirofam,Thornton,Co,39.8696,-104.985,2008-03-28,7841821,...,,,,,-0.403,,,,14,7.43
4,16862,01db098e-3394-40a0-bb4c-350431b9685b,d96be5a2-16c7-11ed-9b5f-1234bde3cd05,Menogenix,Aurora,Co,39.7405,-104.831,2009-07-22,9216206,...,,,,,-0.907,,,1.9,19,9.57


In [6]:
### start timer
t0=time.time()

df['score'] = df['confidenceScore'].apply(np.floor)
df1=df.sort_values(by=['ID','score','dateDiff','record_date'],ascending=[True,False,False,False]).reset_index(drop=True)

### end timer and print total time
t1=time.time()
total=t1-t0
print("Total time is %.3f" % (total/60), "mins\n")

### print general stats and first 5 records for dataset
display(df1.info(),df1.head())

Total time is 0.000 mins

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 994 entries, 0 to 993
Data columns (total 36 columns):
 #   Column              Non-Null Count  Dtype         
---  ------              --------------  -----         
 0   ID                  994 non-null    int64         
 1   assignee_id         994 non-null    object        
 2   location            994 non-null    object        
 3   organization        994 non-null    object        
 4   city                994 non-null    object        
 5   state               994 non-null    object        
 6   latitude            994 non-null    float64       
 7   longitude           994 non-null    float64       
 8   dateFiledMin        994 non-null    datetime64[ns]
 9   patent              994 non-null    object        
 10  assignee            907 non-null    object        
 11  assignor            907 non-null    object        
 12  record_date         907 non-null    datetime64[ns]
 13  nameScores          990 

None

Unnamed: 0,ID,assignee_id,location,organization,city,state,latitude,longitude,dateFiledMin,patent,...,agent_state,latitude_agt,longitude_agt,dateDiff,cityToAddrDistance,cityToAgtDistance,cityToDataDistance,totalScore,confidenceScore,score
0,933,012b6757-2d16-4c6d-8aa7-fefbe6328e1c,5d0d1602-16c8-11ed-9b5f-1234bde3cd05,Anemergonics,Arvada,Co,39.8006,-105.081,2008-02-08,8161698,...,,,,2.69,,,,15,7.86,7.0
1,1189,0179132f-48af-4c06-aa5c-f6ef9a011372,cf479b1d-16c7-11ed-9b5f-1234bde3cd05,Mirofam,Thornton,Co,39.8696,-104.985,2008-03-28,7841821,...,,,,0.249,,,,15,7.86,7.0
2,1189,0179132f-48af-4c06-aa5c-f6ef9a011372,cf479b1d-16c7-11ed-9b5f-1234bde3cd05,Mirofam,Thornton,Co,39.8696,-104.985,2008-03-28,7841821,...,,,,-0.403,,,,14,7.43,7.0
3,1189,0179132f-48af-4c06-aa5c-f6ef9a011372,cf479b1d-16c7-11ed-9b5f-1234bde3cd05,Mirofam,Thornton,Co,39.8696,-104.985,2008-03-28,7841821,...,,,,-0.403,,,,14,7.43,7.0
4,16862,01db098e-3394-40a0-bb4c-350431b9685b,d96be5a2-16c7-11ed-9b5f-1234bde3cd05,Menogenix,Aurora,Co,39.7405,-104.831,2009-07-22,9216206,...,,,,5.386,,,1.9,20,10.0,10.0


In [7]:
### start timer
t0=time.time()

df2=df1.drop_duplicates(subset=['ID'],keep='first')

### end timer and print total time
t1=time.time()
total=t1-t0
print("Total time is %.3f" % (total/60), "mins\n")
print("The number of unique assignee IDs are:",df2.assignee_id.nunique())
print("The number of unique IDs are:",df2.ID.nunique(),"\n")

### print general stats and first 5 records for dataset
display(df2.info(),df2.head())

Total time is 0.000 mins

The number of unique assignee IDs are: 462
The number of unique IDs are: 462 

<class 'pandas.core.frame.DataFrame'>
Int64Index: 462 entries, 0 to 992
Data columns (total 36 columns):
 #   Column              Non-Null Count  Dtype         
---  ------              --------------  -----         
 0   ID                  462 non-null    int64         
 1   assignee_id         462 non-null    object        
 2   location            462 non-null    object        
 3   organization        462 non-null    object        
 4   city                462 non-null    object        
 5   state               462 non-null    object        
 6   latitude            462 non-null    float64       
 7   longitude           462 non-null    float64       
 8   dateFiledMin        462 non-null    datetime64[ns]
 9   patent              462 non-null    object        
 10  assignee            395 non-null    object        
 11  assignor            395 non-null    object        
 12  r

None

Unnamed: 0,ID,assignee_id,location,organization,city,state,latitude,longitude,dateFiledMin,patent,...,agent_state,latitude_agt,longitude_agt,dateDiff,cityToAddrDistance,cityToAgtDistance,cityToDataDistance,totalScore,confidenceScore,score
0,933,012b6757-2d16-4c6d-8aa7-fefbe6328e1c,5d0d1602-16c8-11ed-9b5f-1234bde3cd05,Anemergonics,Arvada,Co,39.8006,-105.081,2008-02-08,8161698,...,,,,2.69,,,,15,7.86,7.0
1,1189,0179132f-48af-4c06-aa5c-f6ef9a011372,cf479b1d-16c7-11ed-9b5f-1234bde3cd05,Mirofam,Thornton,Co,39.8696,-104.985,2008-03-28,7841821,...,,,,0.249,,,,15,7.86,7.0
4,16862,01db098e-3394-40a0-bb4c-350431b9685b,d96be5a2-16c7-11ed-9b5f-1234bde3cd05,Menogenix,Aurora,Co,39.7405,-104.831,2009-07-22,9216206,...,,,,5.386,,,1.9,20,10.0,10.0
7,16867,01dd4861-b537-4f33-a995-b8dcc1effa69,c1ad42d2-16c7-11ed-9b5f-1234bde3cd05,Moxion Power,Mill Valley,Ca,37.906,-122.545,2021-06-01,11283273,...,Ca,37.7292,-123.047,1.258,0.2,30.1,0.2,20,10.0,10.0
8,16936,01ef4c0c-2ff6-4c09-8781-385a68ff3220,17f7f348-16c8-11ed-9b5f-1234bde3cd05,Swicherz,Santa Monica,Ca,34.0195,-118.491,2003-01-31,7147399,...,,,,3.019,11.7,,,20,10.0,10.0


In [8]:
# df2.to_csv("../csvResults/reviewScoredResults5.csv",index=False)

In [9]:
### +/- 5 yrs
# df2['score'].value_counts().sort_index(ascending=False)

10.0    239
9.0      40
8.0      21
7.0     121
6.0       9
4.0      16
3.0       5
2.0      10
1.0       1
Name: score, dtype: int64

In [8]:
### +/- 5 yrs
# df2['score'].value_counts().sort_index(ascending=False)

10.0    229
9.0      50
8.0      21
7.0     121
6.0       9
4.0      16
3.0       5
2.0      10
1.0       1
Name: score, dtype: int64

In [8]:
### +/- 2 yrs
# df2['score'].value_counts().sort_index(ascending=False)

10.0    229
9.0      38
8.0      19
7.0     115
6.0      10
4.0      14
3.0       6
2.0      10
1.0       1
Name: score, dtype: int64

In [8]:
### og file
# df2['score'].value_counts().sort_index(ascending=False)

10.0    223
9.0      25
8.0      18
7.0     104
6.0      10
5.0       1
4.0      12
3.0       4
2.0       3
Name: score, dtype: int64

In [8]:
df2.groupby(by=['score'])['match_num'].value_counts().sort_index(ascending=False)

score  match_num
10.0   2.0           20
       1.0          209
9.0    2.0            2
       1.0           48
8.0    2.0            3
       1.0           18
7.0    2.0           14
       1.0          107
6.0    1.0            9
4.0    2.0            1
       1.0           15
3.0    2.0            1
       1.0            4
2.0    2.0            1
       1.0            9
1.0    2.0            1
Name: match_num, dtype: int64

In [15]:
df2.groupby(by=['score'])['match_num'].value_counts().sort_index(ascending=False)

score  match_num
10.0   2.0           121
       1.0          1821
9.0    2.0            22
       1.0           133
8.0    2.0            11
       1.0           147
7.0    2.0            52
       1.0           718
6.0    2.0             6
       1.0            50
5.0    2.0            16
       1.0            88
4.0    2.0             3
       1.0            10
3.0    2.0            22
       1.0            73
2.0    2.0             6
       1.0            24
1.0    2.0             1
       1.0             6
Name: match_num, dtype: int64

In [9]:
df2['score'].value_counts().sort_index(ascending=False)

10.0    3705
9.0      291
8.0     1515
7.0      311
6.0      105
5.0      189
4.0       58
3.0      123
2.0       55
1.0        7
Name: score, dtype: int64

In [9]:
df2['score'].value_counts().sort_index(ascending=False)

10.0    3689
9.0      280
8.0     1497
7.0      310
6.0      101
5.0      203
4.0       71
3.0      135
2.0       63
1.0       10
Name: score, dtype: int64

In [8]:
df2['score'].value_counts().sort_index(ascending=False)

10.0    3663
9.0      294
8.0     1491
7.0      310
6.0      116
5.0      206
4.0       71
3.0      135
2.0       63
1.0       10
Name: score, dtype: int64

In [8]:
df2['score'].value_counts().sort_index(ascending=False)

10.0    3574
9.0      408
8.0      690
7.0      141
6.0     1187
5.0      144
4.0       76
3.0       33
2.0       96
1.0       10
Name: score, dtype: int64

In [9]:
df2['score'].value_counts().sort_index(ascending=False)

10.0    2709
9.0      267
8.0     1009
7.0      682
6.0      308
5.0     1104
4.0      138
3.0       34
2.0       98
1.0       10
Name: score, dtype: int64

In [8]:
df2['score'].value_counts().sort_index(ascending=False)

10.0    2640
9.0      270
8.0      966
7.0      674
6.0      364
5.0     1099
4.0      147
3.0       40
2.0      137
1.0       22
Name: score, dtype: int64

In [10]:
df2['score'].value_counts().sort_index(ascending=False)

10.0    2378
9.0      408
8.0      896
7.0      757
6.0      383
5.0     1152
4.0      164
3.0       43
2.0      156
1.0       22
Name: score, dtype: int64

In [9]:
df2['score'].value_counts().sort_index(ascending=False)

10.0    2105
9.0      590
8.0      912
7.0      749
6.0     1162
5.0      396
4.0      160
3.0      107
2.0      156
1.0       22
Name: score, dtype: int64

In [14]:
# df2['score'].value_counts().sort_index(ascending=False)

10.0    2178
9.0      714
8.0      921
7.0      847
6.0      737
5.0     1642
4.0      959
3.0      498
2.0      880
1.0      213
Name: score, dtype: int64

In [None]:
# import seaborn as sns

# df5_100=df5.loc[df5['nameScores']==100]

In [None]:
# sns.set(rc = {'figure.figsize':(10,8)})
# sns.set_style("white")

# sns.histplot(data=df5_100, x="totalScore")

In [None]:
# sns.set(rc = {'figure.figsize':(10,8)})
# sns.set_style("white")

# sns.histplot(data=df5_100, x="totalScore")