# 5. Find Cosine Similarity

## Load Files

In [1]:
import pandas as pd
import numpy as np

from sklearn.metrics.pairwise import cosine_similarity

import pickle
import time

import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
with open ('../pickles/df.pkl', 'rb') as pkl:
    df = pickle.load(pkl)

In [3]:
with open ('../pickles/ben_cols.pkl', 'rb') as pkl:
    ben_cols = pickle.load(pkl)
    
with open ('../pickles/ben_cols_dum.pkl', 'rb') as pkl:
    ben_cols_dum = pickle.load(pkl)

In [4]:
with open ('../pickles/rate_cols.pkl', 'rb') as pkl:
    rate_cols = pickle.load(pkl)

In [5]:
list(df.columns)

['BusinessYear',
 'StateCode',
 'RateEffectiveDate',
 'RateExpirationDate',
 'PlanId',
 'RatingAreaId',
 'Tobacco',
 'Age',
 'IndividualRate',
 'IndividualTobaccoRate',
 'Couple',
 'PrimarySubscriberAndOneDependent',
 'PrimarySubscriberAndTwoDependents',
 'PrimarySubscriberAndThreeOrMoreDependents',
 'CoupleAndOneDependent',
 'CoupleAndTwoDependents',
 'CoupleAndThreeOrMoreDependents',
 'RowNumber',
 'Dental Care, Basic - Child',
 'Dental Care, Major - Child',
 'Orthodontia - Child',
 'Dental, Accidental - Adult',
 'Dental Care, Basic - Adult',
 'Dental Care, Major - Adult',
 'Dental Care, Routine - Adult',
 'Orthodontia - Adult',
 'Delivery and All Inpatient Services for Maternity Care',
 'Durable Medical Equipment',
 'Emergency Room Services',
 'Emergency Transportation/Ambulance',
 'Eyeglasses - Child',
 'Drugs, Generic',
 'Home Health Care Services',
 'Hospice Services',
 'Imaging (CT/PET Scans, MRIs)',
 'Inpatient Hospital Services (e.g., Hospital Stay)',
 'Inpatient Physician and

## Add a unique key to each plan, called `'ben_key'`

In [173]:
df['ben_key'] = df.groupby(ben_cols_dum).ngroup()

In [395]:
ben_key_cols_dum = ben_cols_dum

In [396]:
ben_key_cols_dum.append('ben_key')

Drop duplicate rows (based on the benefits columns `ben_cols_dum`) to view the vectors that are unique.

In [367]:
no_dupes = df[ben_key_cols_dum].set_index('ben_key').drop_duplicates()

In [368]:
no_dupes.head(3)

Unnamed: 0_level_0,"Dental Care, Basic - Child","Dental Care, Major - Child",Orthodontia - Child,"Dental, Accidental - Adult","Dental Care, Basic - Adult","Dental Care, Major - Adult","Dental Care, Routine - Adult",Orthodontia - Adult,Delivery and All Inpatient Services for Maternity Care,Durable Medical Equipment,...,Endodontics - Adult,Habilitation - Acquired Brain Injury,Dental Cleanings - Adult,Surgical Extraction - Adult,Surgical Extraction - Child,Cosmetic Orthodontia,"Renal Dialysis, End Stage",Post-cochlear implant aural therapy,X-Rays and Exams - Adult,"Dental Care, Minor - Adult"
ben_key,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
21,1,1,1,0,1,1,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
174,1,1,1,1,1,1,1,1,1,1,...,0,0,0,0,0,0,0,0,0,0
17,1,1,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [120]:
no_dupes.shape

(215, 207)

## Find cosine similarity using `ben_cols_dum`

To analyze the resulting dataframe of unique rows, let's look at the cosine similarity between rows:

In [333]:
cos_mat = cosine_similarity(no_dupes, no_dupes)

In [198]:
cos_df = pd.DataFrame(cos_mat, columns=no_dupes.index, index=no_dupes.index)

In [324]:
cos_df.head(3)

ben_key,21,174,17,98,94,88,90,91,89,101,...,47,52,48,53,161,93,50,162,45,109
ben_key,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
21,1.0,0.284747,0.707107,0.679366,0.866025,0.283069,0.277778,0.272772,0.277778,0.489898,...,0.165145,0.163663,0.163663,0.162221,0.294884,0.866025,0.165145,0.29277,0.165145,0.313625
174,0.284747,1.0,0.201347,0.25793,0.328798,0.789912,0.806784,0.823315,0.806784,0.185996,...,0.783741,0.792246,0.792246,0.800664,0.895653,0.287698,0.783741,0.889232,0.783741,0.833503
17,0.707107,0.201347,1.0,0.480384,0.612372,0.240192,0.235702,0.231455,0.235702,0.34641,...,0.23355,0.231455,0.231455,0.229416,0.208514,0.612372,0.23355,0.20702,0.23355,0.221766


The benefit key (`'ben_key'`) of each row can be obtained using the `.name()` function.

In [325]:
cos_df.iloc[2,:].name

17

The below returns the `'ben_key'`s that are most similar to vector created by the user's input:

In [341]:
my_plan = cos_df.iloc[2,:].sort_values(ascending=False).head(11)[1:]

In [342]:
print("Ben_key #s most similar to ben_key #", cos_df.iloc[2,:].name, ":", list(my_plan.index))

Ben_key #s most similar to ben_key # 17 : [26, 18, 19, 27, 20, 84, 21, 10, 92, 22]


And, we can look up that ben_key in our original dataframe.
However, we want to look up these values in a filtered dataframe.

In [343]:
df[df['ben_key'] == my_plan.index[0]].head(3)

Unnamed: 0,BusinessYear,StateCode,RateEffectiveDate,RateExpirationDate,PlanId,RatingAreaId,Tobacco,Age,IndividualRate,IndividualTobaccoRate,...,X-Rays and Exams - Adult,"Dental Care, Minor - Adult",IsNoticeRequiredForPregnancy,IsReferralRequiredForSpecialist,ChildOnlyOffering,WellnessProgramOffered,DiseaseManagementProgramsOffered,OutOfCountryCoverage,NationalNetwork,ben_key
122464,2016,AZ,2016-01-01,2016-12-31,86830AZ0050001,Rating Area 1,No Preference,0-20,31.0,,...,0,0,,,Allows Child-Only,,,Yes,Yes,26
122465,2016,AZ,2016-01-01,2016-12-31,86830AZ0050001,Rating Area 1,No Preference,21,9999.0,,...,0,0,,,Allows Child-Only,,,Yes,Yes,26
122466,2016,AZ,2016-01-01,2016-12-31,86830AZ0050001,Rating Area 1,No Preference,22,9999.0,,...,0,0,,,Allows Child-Only,,,Yes,Yes,26


## Filter dataframe

In [354]:
df[(df['Age'] == '40') & (df['StateCode'] == 'AZ' )].head(3)

Unnamed: 0,BusinessYear,StateCode,RateEffectiveDate,RateExpirationDate,PlanId,RatingAreaId,Tobacco,Age,IndividualRate,IndividualTobaccoRate,...,X-Rays and Exams - Adult,"Dental Care, Minor - Adult",IsNoticeRequiredForPregnancy,IsReferralRequiredForSpecialist,ChildOnlyOffering,WellnessProgramOffered,DiseaseManagementProgramsOffered,OutOfCountryCoverage,NationalNetwork,ben_key
77846,2016,AZ,2016-01-01,2016-12-31,12303AZ0010001,Rating Area 1,No Preference,40,0.0,,...,0,0,,,Allows Child-Only,,,No,No,94
77892,2016,AZ,2016-01-01,2016-12-31,12303AZ0010001,Rating Area 2,No Preference,40,0.0,,...,0,0,,,Allows Child-Only,,,No,No,94
77938,2016,AZ,2016-01-01,2016-12-31,12303AZ0010001,Rating Area 3,No Preference,40,0.0,,...,0,0,,,Allows Child-Only,,,No,No,94


## Generate a unique vector

In [397]:
customplan = pd.DataFrame(data=0, columns=[col for col in ben_key_cols_dum if col != 'ben_key'], index=['customplan'])

In [398]:
customplan.columns

Index(['Dental Care, Basic - Child', 'Dental Care, Major - Child',
       'Orthodontia - Child', 'Dental, Accidental - Adult',
       'Dental Care, Basic - Adult', 'Dental Care, Major - Adult',
       'Dental Care, Routine - Adult', 'Orthodontia - Adult',
       'Delivery and All Inpatient Services for Maternity Care',
       'Durable Medical Equipment',
       ...
       'Endodontics - Adult', 'Habilitation - Acquired Brain Injury',
       'Dental Cleanings - Adult', 'Surgical Extraction - Adult',
       'Surgical Extraction - Child', 'Cosmetic Orthodontia',
       'Renal Dialysis, End Stage', 'Post-cochlear implant aural therapy',
       'X-Rays and Exams - Adult', 'Dental Care, Minor - Adult'],
      dtype='object', length=207)

In [399]:
customplan['Dental Care, Basic - Child'] = 1

In [400]:
customplan.head()

Unnamed: 0,"Dental Care, Basic - Child","Dental Care, Major - Child",Orthodontia - Child,"Dental, Accidental - Adult","Dental Care, Basic - Adult","Dental Care, Major - Adult","Dental Care, Routine - Adult",Orthodontia - Adult,Delivery and All Inpatient Services for Maternity Care,Durable Medical Equipment,...,Endodontics - Adult,Habilitation - Acquired Brain Injury,Dental Cleanings - Adult,Surgical Extraction - Adult,Surgical Extraction - Child,Cosmetic Orthodontia,"Renal Dialysis, End Stage",Post-cochlear implant aural therapy,X-Rays and Exams - Adult,"Dental Care, Minor - Adult"
customplan,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [401]:
no_dupes.shape, customplan.shape

((215, 207), (1, 207))

In [419]:
custom_cos_mat = cosine_similarity(customplan, no_dupes)

In [420]:
custom_cos_df = pd.DataFrame(custom_cos_mat, columns=no_dupes.index, index=customplan.index)

In [421]:
custom_cos_df

ben_key,21,174,17,98,94,88,90,91,89,101,...,47,52,48,53,161,93,50,162,45,109
customplan,0.408248,0.116248,0.57735,0.27735,0.353553,0.138675,0.136083,0.133631,0.136083,0.2,...,0.13484,0.133631,0.133631,0.132453,0.120386,0.353553,0.13484,0.119523,0.13484,0.128037


In [434]:
custom_my_plan = custom_cos_df.iloc[0,:].sort_values(ascending=False).head(11)[1:]

In [437]:
print("Ben_keys most similar to", custom_cos_df.iloc[0,:].name, ":", list(custom_my_plan.index))

Ben_keys most similar to customplan : [17, 3, 10, 26, 18, 27, 19, 9, 21, 84]


In [440]:
df[df['ben_key'] == custom_my_plan.index[0]].head(3)

Unnamed: 0,BusinessYear,StateCode,RateEffectiveDate,RateExpirationDate,PlanId,RatingAreaId,Tobacco,Age,IndividualRate,IndividualTobaccoRate,...,X-Rays and Exams - Adult,"Dental Care, Minor - Adult",IsNoticeRequiredForPregnancy,IsReferralRequiredForSpecialist,ChildOnlyOffering,WellnessProgramOffered,DiseaseManagementProgramsOffered,OutOfCountryCoverage,NationalNetwork,ben_key
638,2016,AK,2016-01-01,2016-03-31,21989AK0110001,Rating Area 1,No Preference,Family Option,0.0,,...,0,0,,,Allows Child-Only,,,No,Yes,17
639,2016,AK,2016-04-01,2016-06-30,21989AK0110001,Rating Area 1,No Preference,Family Option,0.0,,...,0,0,,,Allows Child-Only,,,No,Yes,17
640,2016,AK,2016-07-01,2016-09-30,21989AK0110001,Rating Area 1,No Preference,Family Option,0.0,,...,0,0,,,Allows Child-Only,,,No,Yes,17
