In [1]:
# Installing Libraries (if not installed)
#!pip3 install matplotlib
#!pip3 install scikit-learn
#!pip3 install seaborn
#!pip3 install numpy
#!pip3 install pandas

# Importing Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [7]:
df_1 = pd.read_csv('data/dataset_1.csv', sep=';')

df_1.head() # Show first 5 rows
df_1


Unnamed: 0,Date,exercise_hours,stress_level
0,2013-01-01,1,4
1,2013-02-03,2,2
2,2013-03-05,3,3
3,2013-04-07,1,2
4,2013-05-09,3,3
5,2013-06-10,1,4
6,2013-07-12,2,2
7,2013-08-15,3,3
8,2013-09-16,1,2
9,2013-10-20,3,3


In [11]:
df_2 = pd.read_csv('data/rain_temp.csv', sep=';')

df_2.head() # Show first 5 rows
df_2

Unnamed: 0,Date,rain_mm,temperature_cel
0,1-2013,45.4013,25.2206
1,2-2013,22.9427,26.3958
2,3-2013,153.986,27.4341
3,4-2013,149.26,24.6741
4,5-2013,69.8086,25.7978
5,6-2013,27.181,23.214
6,7-2013,27.0799,23.8115
7,8-2013,58.4273,23.5112
8,9-2013,46.3291,25.3242
9,10-2013,44.4752,24.9766


## Idea for estimating similar data sets
We take a sample of the data sets (quantiles). We make the assumption that for similar data sets the difference between their values should be almost equal (since this is what determines the number of gradual patterns that can be generated). A data set with many differences yields many gradual patterns.

In [48]:
# Calculate Quantiles
quo_1 = df_1.exercise_hours.quantile([0,0.25,0.5,0.75,1])
quo_2 = df_2.rain_mm.quantile([0,0.25,0.5,0.75,1])

# Retrieve indices
def idxquantile(s, q=0.5, *args, **kwargs):
    qv = s.quantile(q, *args, **kwargs)
    return (s.sort_values()[::-1] <= qv).idxmax()
#idxquantile(s)

print("Actual Differences")
print(quo_1)
print("\n")
print(quo_2)


# Nomalize the quotients between values 0 to 1
quo_1 = quo_1 / np.max(quo_1)
quo_2 = quo_2 / np.max(quo_2)

print("\nNormalized Differences")
print(quo_1)
print("\n")
print(quo_2)

Actual Differences
0.00    1.0
0.25    1.0
0.50    2.0
0.75    3.0
1.00    3.0
Name: exercise_hours, dtype: float64


0.00     22.942700
0.25     27.922900
0.50     51.701800
0.75     75.973425
1.00    153.986000
Name: rain_mm, dtype: float64

Normalized Differences
0.00    0.333333
0.25    0.333333
0.50    0.666667
0.75    1.000000
1.00    1.000000
Name: exercise_hours, dtype: float64


0.00    0.148992
0.25    0.181334
0.50    0.335756
0.75    0.493379
1.00    1.000000
Name: rain_mm, dtype: float64


In [47]:
# Subtract subsequent values
sub_1 = np.diff(quo_1[::-1])[::-1]
sub_2 = np.diff(quo_2[::-1])[::-1]

sub1 = abs(sub_1)
sub2 = abs(sub_2)

print(sub1)
print(sub2)

[0.         0.33333333 0.33333333 0.        ]
[0.0323419  0.15442248 0.15762228 0.50662122]


## Computing similarity
Adopted from [Neo4J Docs](https://neo4j.com/docs/graph-data-science/current/algorithms/knn/)

* *We observe (below) that **Cosine similarity** and **Pearson's similarity** give the best results.*

### 1. Scalar numeric property
When the property is a scalar number, the similarity is computed as one divided by one plus the absolute difference between the values:

![scalar similarity](data/knn-scalar-similarity.svg)


In [49]:
# Compute scalar similarity (NOT RELEVANT)
sca_sim = 1 / (1 + abs(np.subtract(sub1, sub2)))
sca_sim

array([0.96867133, 0.84824056, 0.85054912, 0.66373684])

### 2. Cosine similarity
Cosine similarity is computed using the following formula:

![cosine similarity](data/cosine-similarity.png)

Values range between -1 and 1, where -1 is perfectly dissimilar and 1 is perfectly similar.


In [41]:
# Compute cosine similarity (MOST RELEVANT)
from numpy import dot
from numpy.linalg import norm

cos_sim = dot(sub1, sub_2)/(norm(sub1)*norm(sub_2))
cos_sim

-0.3986171697030636

### 3. Pearson's similarity
Pearson similarity is computed using the following formula:

![pearson's similarity](data/pearson-similarity.png)

Values range between -1 and 1, where -1 is perfectly dissimilar and 1 is perfectly similar.



In [42]:
# Compute pearson's similarity
pea_sim = np.corrcoef(sub1, sub2)
print(pea_sim)
print("\n")
pea_sim[0][1]

[[ 1.         -0.32046455]
 [-0.32046455  1.        ]]




-0.32046454903429467

### 4. Euclidean similarity

Euclidean distance is computed using the following formula:

![euclidean's similarity](data/euclidean.png)

In [43]:
# Compute Euclidean similarity
from scipy.spatial import distance

euc_sim = distance.euclidean(sub1, sub2)
euc_sim

0.5662106712964972