## The dataset used in this task is sparse. We will compute the cosine similarity and several distance metrics (Manhattan, Euclidean and Minkowski) between countries.

In [35]:
import pandas as pd
import numpy as np
from scipy.spatial.distance import cityblock, euclidean, minkowski
from sklearn.metrics.pairwise import cosine_similarity
from tabulate import tabulate 

In [36]:
# Load the happiness-index table from the CSV file, relative to the notebook's working directory.
df = pd.read_csv(filepath_or_buffer="dataset/world_happines_index.csv")

In [37]:
# Preview the first five rows of the loaded frame.
df.head(n=5)

Unnamed: 0,Country,Switzerland,Iceland,Denmark,Norway,Canada,Finland,Netherlands,Sweden,New Zealand,...,Chad,Guinea,Ivory Coast,Burkina Faso,Afghanistan,Rwanda,Benin,Syria,Burundi,Togo
0,Happiness_Score,7.587,7.561,7.527,7.522,7.427,7.406,7.378,7.364,7.286,...,3.667,3.656,3.655,3.587,3.575,3.465,3.34,3.006,2.905,2.839
1,Standard_Error,0.03411,0.04884,0.03328,0.0388,0.03553,0.0314,0.02799,0.03157,0.03371,...,0.0383,0.0359,0.05141,0.04324,0.03084,0.03464,0.03656,0.05015,0.08658,0.06727
2,GDP,1.39651,1.30232,1.32548,1.459,1.32629,1.29025,1.32944,1.33171,1.25018,...,0.34193,0.17417,0.46534,0.25812,0.31982,0.22208,0.28665,0.6632,0.0153,0.20868
3,Family,1.34951,1.40223,1.36058,1.33095,1.32261,1.31826,1.28017,1.28907,1.31967,...,0.76062,0.46475,0.77115,0.85188,0.30285,0.7737,0.35386,0.47489,0.41587,0.13995
4,Health,0.0,0.0,0.87464,0.0,0.90563,0.0,0.89284,0.91087,0.90837,...,0.1501,0.24009,0.15185,0.0,0.30335,0.42864,0.0,0.72193,0.22396,0.0


### Before moving further, let's check for null values.

In [38]:
# Count the missing (NaN) entries in each column.
df.isna().sum()

Country        0
Switzerland    0
Iceland        0
Denmark        0
Norway         0
              ..
Rwanda         0
Benin          0
Syria          0
Burundi        0
Togo           0
Length: 159, dtype: int64

### There are no null values in the dataset, but we can observe that there are a lot of 0s present. It is better to replace the zeros with NaN: counting them then becomes easier, and the 0s need to be replaced with suitable values anyway.

In [39]:
# Mark every 0 as missing so that isnull()/dropna() can detect it.
# Use np.nan: the np.NaN alias was removed in NumPy 2.0 and raises AttributeError there.
df = df.replace(0, np.nan)

### Now let's check again for null values.

In [40]:
# Count the missing (NaN) entries in each column.
df.isna().sum()

Country        0
Switzerland    1
Iceland        2
Denmark        0
Norway         2
              ..
Rwanda         0
Benin          2
Syria          0
Burundi        1
Togo           2
Length: 159, dtype: int64

### Now we will drop every row (i.e. every feature) that contains at least one null value.

In [41]:
# Keep only the rows that have no missing entry in any column, then show the result.
df = df.dropna(axis="index", how="any")
df

Unnamed: 0,Country,Switzerland,Iceland,Denmark,Norway,Canada,Finland,Netherlands,Sweden,New Zealand,...,Chad,Guinea,Ivory Coast,Burkina Faso,Afghanistan,Rwanda,Benin,Syria,Burundi,Togo
0,Happiness_Score,7.587,7.561,7.527,7.522,7.427,7.406,7.378,7.364,7.286,...,3.667,3.656,3.655,3.587,3.575,3.465,3.34,3.006,2.905,2.839
1,Standard_Error,0.03411,0.04884,0.03328,0.0388,0.03553,0.0314,0.02799,0.03157,0.03371,...,0.0383,0.0359,0.05141,0.04324,0.03084,0.03464,0.03656,0.05015,0.08658,0.06727
2,GDP,1.39651,1.30232,1.32548,1.459,1.32629,1.29025,1.32944,1.33171,1.25018,...,0.34193,0.17417,0.46534,0.25812,0.31982,0.22208,0.28665,0.6632,0.0153,0.20868
3,Family,1.34951,1.40223,1.36058,1.33095,1.32261,1.31826,1.28017,1.28907,1.31967,...,0.76062,0.46475,0.77115,0.85188,0.30285,0.7737,0.35386,0.47489,0.41587,0.13995
5,Freedom,0.66557,0.62877,0.64938,0.66973,0.63297,0.64169,0.61576,0.6598,0.63938,...,0.23501,0.37725,0.46866,0.39493,0.23414,0.59201,0.4845,0.15684,0.1185,0.36453
7,Generosity,0.29678,0.4363,0.34139,0.34699,0.45811,0.23351,0.4761,0.36262,0.47501,...,0.18386,0.28657,0.20165,0.21747,0.3651,0.22628,0.1826,0.47179,0.19727,0.16681
8,Dystopia_Residual,2.51738,2.70201,2.49204,2.46531,2.45176,2.61955,2.4657,2.37119,2.26425,...,1.94296,1.99172,1.41723,1.46494,1.9521,0.67042,1.63328,0.32858,1.83302,1.56726


### Now that the dataset is cleaned, we can move ahead. This dataset is an index of happiness for all the countries. We can use similarity and distance metrics to see which countries are similar in terms of happiness. For example, we will work with the Bangladesh and Mexico columns to check their similarity.



In [42]:
# Feature vectors of the two countries being compared.
# Bracket indexing is used instead of attribute access because it works for any
# column label (e.g. names containing spaces).
x = df["Bangladesh"]
y = df["Mexico"]

# cosine_similarity operates on 2-D inputs and returns a matrix; for a single
# pair that matrix is 1x1, so take element [0, 0] to obtain a plain scalar.
# (Implicitly converting a 1x1 array to a float is deprecated since NumPy 1.25.)
similiarty = cosine_similarity([x], [y])[0, 0]

# Distance metrics between the same two vectors.
manhattan = cityblock(x, y)     # L1 distance
euclid = euclidean(x, y)        # L2 distance
minko = minkowski(x, y, p=5)    # Minkowski distance of order p=5

In [43]:
# Cosine similarity between the Bangladesh and Mexico columns (same computation
# as in the previous cell). cosine_similarity returns a 1x1 matrix for a single
# pair, so [0, 0] extracts the scalar value instead of keeping a nested array.
similiarty = cosine_similarity([df["Bangladesh"]], [df["Mexico"]])[0, 0]

In [44]:
# One (metric name, value) row per similarity/distance measure computed above.
table = [
    ("Cosine Similarity", similiarty),
    ("Manhattan Distance", manhattan),
    ("Euclidean Distance", euclid),
    ("Minkowski Distance", minko),
]
header = ["Metric", "Value"]

# Render the rows as a boxed text table.
print(tabulate(tabular_data=table, headers=header, tablefmt="fancy_grid"))

╒════════════════════╤═════════╕
│ Metric             │   Value │
╞════════════════════╪═════════╡
│ Cosine Similarity  │ 0.99741 │
├────────────────────┼─────────┤
│ Manhattan Distance │ 4.84001 │
├────────────────────┼─────────┤
│ Euclidean Distance │ 2.8326  │
├────────────────────┼─────────┤
│ Minkowski Distance │ 2.50133 │
╘════════════════════╧═════════╛


### From the table we can see that the cosine similarity between the two countries is very high (about 0.997). The different distances between them are also fairly small.