In [1]:
#import libs
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import math

from sklearn.metrics.pairwise import cosine_similarity

from sklearn.preprocessing import Normalizer, normalize

In [2]:
# Load the drivers dataset (a tab-separated file) used throughout this notebook.

driver_df = pd.read_csv('data_1024.csv', sep='\t')

In [3]:
# Preview the first rows (up to 20) of the freshly loaded data.
driver_df.head(20)

Unnamed: 0,Driver_ID,Distance_Feature,Speeding_Feature
0,3423311935,71.24,28.0
1,3423313212,52.53,25.0
2,3423313724,64.54,27.0
3,3423311373,55.69,22.0
4,3423310999,54.58,25.0
5,3423313857,41.91,10.0
6,3423312432,58.64,20.0
7,3423311434,52.02,8.0
8,3423311328,31.25,34.0
9,3423312488,44.31,19.0


In [4]:
# Keep only the two numeric feature columns (Distance_Feature and
# Speeding_Feature); every other column is dropped from the working frame.

driver_df = driver_df.loc[:, ['Distance_Feature', 'Speeding_Feature']]

In [5]:
# Use pandas' built-in describe() method to get summary statistics
# (count, mean, std, min, quartiles, max) for each feature column.

driver_df.describe()

Unnamed: 0,Distance_Feature,Speeding_Feature
count,4000.0,4000.0
mean,76.041522,10.721
std,53.469563,13.708543
min,15.52,0.0
25%,45.2475,4.0
50%,53.33,6.0
75%,65.6325,9.0
max,244.79,100.0


In [6]:
# Cast every feature column to 32-bit floats before the numeric work below,
# then display the resulting dtypes to confirm the conversion.

driver_df = driver_df.astype('float32')
driver_df.dtypes

Distance_Feature    float32
Speeding_Feature    float32
dtype: object

In [7]:
# Function to calculate Cosine Similarity
# d1 and d2 are input vectors

def cal_cosine_similarity(d1, d2):
    """Return the cosine similarity between two numeric vectors.

    cosine_similarity = (d1 . d2) / (||d1|| * ||d2||)

    Parameters
    ----------
    d1, d2 : array-like of numbers
        Vectors of equal length (list, tuple, NumPy array or pandas Series).
        All components are used, not only the first two.

    Returns
    -------
    float
        Cosine of the angle between the vectors, in [-1, 1] up to
        floating-point rounding.

    Raises
    ------
    ValueError
        If the two vectors do not have the same number of components.
    ZeroDivisionError
        If either vector has zero magnitude; the similarity is undefined.
    """
    # Convert to flat float64 arrays. This avoids integer-key positional
    # lookups (d1[0]) on label-indexed pandas Series, which are deprecated,
    # and stops float32 inputs from losing precision in the products.
    v1 = np.asarray(d1, dtype=np.float64).ravel()
    v2 = np.asarray(d2, dtype=np.float64).ravel()

    if v1.shape != v2.shape:
        raise ValueError(
            "cosine similarity needs vectors of equal length, got "
            f"{v1.size} and {v2.size}"
        )

    magnitude = math.sqrt(float(np.dot(v1, v1))) * math.sqrt(float(np.dot(v2, v2)))
    if magnitude == 0.0:
        # Fail the same way for every input type instead of returning a
        # silent NaN for NumPy scalars and raising only for Python numbers.
        raise ZeroDivisionError(
            "cosine similarity is undefined for a zero-magnitude vector"
        )

    dot_product = float(np.dot(v1, v2))

    return dot_product / magnitude

In [8]:
# Pick three example drivers to compare with cosine similarity:
# the rows labelled 0, 1 and 8 (hand-picked rather than randomly sampled).

d1, d2, d3 = (driver_df.loc[row_label] for row_label in (0, 1, 8))

In [9]:
# Now check how similar d1 and d2 are using cosine similarity
# (a value close to 1 means the two vectors point in nearly the same direction).
cal_cosine_similarity(d1, d2)

0.9975714725306971

In [None]:
# d1 and d2 are very similar: their cosine similarity is about 0.998, i.e. very close to 1

In [10]:
# Now check how similar d1 and d3 are using cosine similarity

cal_cosine_similarity(d1, d3)

0.8991255469171173

In [13]:
# Normalize the whole dataframe with scikit-learn's normalize(), using the
# L2 norm: every row is rescaled so that its L2 norm equals 1, i.e. for a row
# (x, y) we get sqrt(x^2 + y^2) = 1 afterwards.

normalized_df = pd.DataFrame(
    data=normalize(driver_df, norm='l2'),
    columns=driver_df.columns,
)

In [14]:
# Preview the first rows of the L2-normalized features.
normalized_df.head()

Unnamed: 0,Distance_Feature,Speeding_Feature
0,0.930694,0.365798
1,0.902956,0.429733
2,0.922526,0.385934
3,0.930058,0.367414
4,0.909165,0.416437


In [15]:
# Calculate the magnitude (L2 norm) of every row to verify that, after
# applying L2 normalization, it equals 1.
# The norm is sqrt(x^2 + y^2); the square root matters, because without it the
# column would hold the *squared* magnitude. Only the two feature columns are
# selected, so re-running this cell never folds an existing 'Magnitude' column
# into the result.

normalized_df['Magnitude'] = np.linalg.norm(
    normalized_df[['Distance_Feature', 'Speeding_Feature']].to_numpy(),
    axis=1,
)

In [16]:
# Display the first 10 rows: the Magnitude of each row is equal to one
normalized_df.head(10)

Unnamed: 0,Distance_Feature,Speeding_Feature,Magnitude
0,0.930694,0.365798,1.0
1,0.902956,0.429733,1.0
2,0.922526,0.385934,1.0
3,0.930058,0.367414,1.0
4,0.909165,0.416437,1.0
5,0.972694,0.232091,1.0
6,0.946465,0.322805,1.0
7,0.98838,0.152,1.0
8,0.676705,0.736255,1.0
9,0.91907,0.394095,1.0
