In [29]:
import pandas as pd
import numpy as np
import os
import calendar
from pathlib import Path # to interact with file system.
import networkx as nx
from pandas.plotting import scatter_matrix, parallel_coordinates
import seaborn as sns # Making statistical graphs on top of what matplot can do 
from sklearn import preprocessing
from sklearn.metrics import jaccard_score
import matplotlib.pylab as plt # %matplotlib inline renders the figure in a notebook

In [3]:
df = pd.read_csv("dataBAN620-finalproj.csv")
df.head()

Unnamed: 0,Indicator,ValueType,ParentLocationCode,ParentLocation,Location type,SpatialDimValueCode,Location,Period type,Period,IsLatestYear,...,FactValueUoM,FactValueNumericLowPrefix,FactValueNumericLow,FactValueNumericHighPrefix,FactValueNumericHigh,Value,FactValueTranslationID,FactComments,Language,DateModified
0,nMx - age-specific death rate between ages x a...,numeric,EUR,Europe,Country,DNK,Denmark,Year,2019,True,...,,,,,,0,,,EN,2020-12-06T08:00:00.000Z
1,nMx - age-specific death rate between ages x a...,numeric,EUR,Europe,Country,ISL,Iceland,Year,2019,True,...,,,,,,0,,,EN,2020-12-06T08:00:00.000Z
2,nMx - age-specific death rate between ages x a...,numeric,EUR,Europe,Country,ISL,Iceland,Year,2019,True,...,,,,,,0,,,EN,2020-12-06T08:00:00.000Z
3,nMx - age-specific death rate between ages x a...,numeric,EUR,Europe,Country,ISR,Israel,Year,2019,True,...,,,,,,0,,,EN,2020-12-06T08:00:00.000Z
4,nMx - age-specific death rate between ages x a...,numeric,EUR,Europe,Country,LUX,Luxembourg,Year,2019,True,...,,,,,,0,,,EN,2020-12-06T08:00:00.000Z


In [4]:
df.columns

Index(['Indicator', 'ValueType', 'ParentLocationCode', 'ParentLocation',
       'Location type', 'SpatialDimValueCode', 'Location', 'Period type',
       'Period', 'IsLatestYear', 'Dim1 type', 'Dim1', 'Dim1ValueCode',
       'Dim2 type', 'Dim2', 'Dim2ValueCode', 'Dim3 type', 'Dim3',
       'Dim3ValueCode', 'DataSourceDimValueCode', 'DataSource',
       'FactValueNumericPrefix', 'FactValueNumeric', 'FactValueUoM',
       'FactValueNumericLowPrefix', 'FactValueNumericLow',
       'FactValueNumericHighPrefix', 'FactValueNumericHigh', 'Value',
       'FactValueTranslationID', 'FactComments', 'Language', 'DateModified'],
      dtype='object')

In [5]:
df= df[['Indicator','ParentLocationCode','SpatialDimValueCode', 'Location','Period','Dim1','Dim2','FactValueNumeric']]
df.columns

Index(['Indicator', 'ParentLocationCode', 'SpatialDimValueCode', 'Location',
       'Period', 'Dim1', 'Dim2', 'FactValueNumeric'],
      dtype='object')

In [6]:
ten_economies=['China','United States of America','India','Japan','Germany','Russian Federation','Brazil','Indonesia','United Kingdom of Great Britain and Northern Ireland','France','Mexico']
#its going to 11 countries

In [7]:
df=df[df['Location'].isin(ten_economies)]

['China','United States of America','India','Japan','Germany','Russian Federation','Brazil','Indonesia','United Kingdom of Great Britain and Northern Ireland','France','Mexico']

In [8]:
df.columns = [s.strip().replace(' ', '_') for s in df.columns]
df=df.rename(columns={'Dim1':'Sex','Dim2':'Age_Group','ParentLocationCode':'Region_Code'})
df=df.sort_values(by=['Location','Period','Age_Group','Sex'])

In [11]:
df.columns

Index(['Indicator', 'Region_Code', 'SpatialDimValueCode', 'Location', 'Period',
       'Sex', 'Age_Group', 'FactValueNumeric'],
      dtype='object')

In [12]:
df1 = pd.get_dummies(df['Indicator'], prefix_sep='_', drop_first=True)
df1

Unnamed: 0,ex - expectation of life at age x,lx - number of people left alive at age x,nLx - person-years lived between ages x and x+n,nMx - age-specific death rate between ages x and x+n,ndx - number of people dying between ages x and x+n,nqx - probability of dying between ages x and x+n
195954,0,0,0,1,0,0
202660,0,0,0,0,0,1
214506,0,1,0,0,0,0
219508,0,0,0,0,1,0
224970,0,0,1,0,0,0
...,...,...,...,...,...,...
14269,0,1,0,0,0,0
26205,0,0,0,0,1,0
34670,0,0,1,0,0,0
41010,0,0,0,0,0,0


In [13]:
df=df.reset_index(drop=True)
df

Unnamed: 0,Indicator,Region_Code,SpatialDimValueCode,Location,Period,Sex,Age_Group,FactValueNumeric
0,nMx - age-specific death rate between ages x a...,AMR,BRA,Brazil,2000,Female,1-4 years,0.001
1,nqx - probability of dying between ages x and x+n,AMR,BRA,Brazil,2000,Female,1-4 years,0.004
2,lx - number of people left alive at age x,AMR,BRA,Brazil,2000,Female,1-4 years,97402.000
3,ndx - number of people dying between ages x an...,AMR,BRA,Brazil,2000,Female,1-4 years,392.300
4,nLx - person-years lived between ages x and x+n,AMR,BRA,Brazil,2000,Female,1-4 years,388666.000
...,...,...,...,...,...,...,...,...
14625,lx - number of people left alive at age x,AMR,USA,United States of America,2019,Male,<1 year,100000.000
14626,ndx - number of people dying between ages x an...,AMR,USA,United States of America,2019,Male,<1 year,596.700
14627,nLx - person-years lived between ages x and x+n,AMR,USA,United States of America,2019,Male,<1 year,99463.000
14628,Tx - person-years lived above age x,AMR,USA,United States of America,2019,Male,<1 year,7628125.000


In [14]:
df2= df.pivot_table(index=['Region_Code', 'SpatialDimValueCode', 'Location', 'Period', 'Sex', 'Age_Group'],
                             columns='Indicator',
                             values='FactValueNumeric',
                             aggfunc='first').reset_index()
df2


Indicator,Region_Code,SpatialDimValueCode,Location,Period,Sex,Age_Group,Tx - person-years lived above age x,ex - expectation of life at age x,lx - number of people left alive at age x,nLx - person-years lived between ages x and x+n,nMx - age-specific death rate between ages x and x+n,ndx - number of people dying between ages x and x+n,nqx - probability of dying between ages x and x+n
0,AMR,BRA,Brazil,2000,Female,1-4 years,7420561.0,76.18,97402.0,388666.0,0.0010,392.3,0.004
1,AMR,BRA,Brazil,2000,Female,10-14 years,6547182.0,67.58,96875.0,484035.0,0.0003,137.0,0.001
2,AMR,BRA,Brazil,2000,Female,15-19 years,6063147.0,62.68,96739.0,483101.0,0.0005,236.7,0.002
3,AMR,BRA,Brazil,2000,Female,20-24 years,5580047.0,57.82,96502.0,481748.0,0.0006,304.5,0.003
4,AMR,BRA,Brazil,2000,Female,25-29 years,5098299.0,53.00,96197.0,480046.0,0.0008,376.4,0.004
...,...,...,...,...,...,...,...,...,...,...,...,...,...
2085,WPR,JPN,Japan,2019,Male,70-74 years,1345405.0,15.97,84223.0,402477.0,0.0190,7455.0,0.089
2086,WPR,JPN,Japan,2019,Male,75-79 years,942929.0,12.28,76768.0,354743.0,0.0330,11638.0,0.150
2087,WPR,JPN,Japan,2019,Male,80-84 years,588185.0,9.03,65130.0,282238.0,0.0620,17364.0,0.270
2088,WPR,JPN,Japan,2019,Male,85+ years,305948.0,6.41,47765.0,305948.0,0.1400,47765.0,1.000


In [15]:
# df2=df.groupby(['Region_Code','SpatialDimValueCode','Location','Period','Sex','Age_Group'])[['Death_Rate']]
# df2

In [16]:



# Assuming your DataFrame is named 'df'
# Replace 'your_dataset.csv' with the actual path or URL of your dataset file
# df = pd.read_csv('your_dataset.csv')

# List of categorical columns to denormalize
categorical_columns = ['Region_Code', 'SpatialDimValueCode', 'Location', 'Sex', 'Age_Group','Indicator']

# Iterate through each categorical column and denormalize
for col in categorical_columns:
    df3 = df.pivot_table(index=[c for c in df.columns if c not in [col, 'FactValueNumeric']],
                        columns=col,
                        values='FactValueNumeric',
                        aggfunc='first').reset_index()

# Display the denormalized DataFrame
df3


Indicator,Region_Code,SpatialDimValueCode,Location,Period,Sex,Age_Group,Tx - person-years lived above age x,ex - expectation of life at age x,lx - number of people left alive at age x,nLx - person-years lived between ages x and x+n,nMx - age-specific death rate between ages x and x+n,ndx - number of people dying between ages x and x+n,nqx - probability of dying between ages x and x+n
0,AMR,BRA,Brazil,2000,Female,1-4 years,7420561.0,76.18,97402.0,388666.0,0.0010,392.3,0.004
1,AMR,BRA,Brazil,2000,Female,10-14 years,6547182.0,67.58,96875.0,484035.0,0.0003,137.0,0.001
2,AMR,BRA,Brazil,2000,Female,15-19 years,6063147.0,62.68,96739.0,483101.0,0.0005,236.7,0.002
3,AMR,BRA,Brazil,2000,Female,20-24 years,5580047.0,57.82,96502.0,481748.0,0.0006,304.5,0.003
4,AMR,BRA,Brazil,2000,Female,25-29 years,5098299.0,53.00,96197.0,480046.0,0.0008,376.4,0.004
...,...,...,...,...,...,...,...,...,...,...,...,...,...
2085,WPR,JPN,Japan,2019,Male,70-74 years,1345405.0,15.97,84223.0,402477.0,0.0190,7455.0,0.089
2086,WPR,JPN,Japan,2019,Male,75-79 years,942929.0,12.28,76768.0,354743.0,0.0330,11638.0,0.150
2087,WPR,JPN,Japan,2019,Male,80-84 years,588185.0,9.03,65130.0,282238.0,0.0620,17364.0,0.270
2088,WPR,JPN,Japan,2019,Male,85+ years,305948.0,6.41,47765.0,305948.0,0.1400,47765.0,1.000


In [37]:
# df_merged.to_csv('final1.csv', index=True)

In [19]:
df3.Location.unique()

array(['Brazil', 'Mexico', 'United States of America', 'Germany',
       'France', 'United Kingdom of Great Britain and Northern Ireland',
       'Russian Federation', 'Indonesia', 'India', 'China', 'Japan'],
      dtype=object)

GDP = [{'Location':['Brazil', 'Mexico', 'United States of America', 'Germany',
       'France', 'United Kingdom of Great Britain and Northern Ireland',
       'Russian Federation', 'Indonesia', 'India', 'China', 'Japan'],'GDP':[]}]

2000, 2005, 2010, 2015, 2019

In [33]:
GDP_df=pd.read_csv("national-gdp-penn-world-table.csv")
GDP_df=GDP_df.drop(columns='Entity')
GDP_df= GDP_df.reset_index(drop=True)
GDP_df= GDP_df.rename(columns={"Code":"SpatialDimValueCode","Year":"Period","GDP (output, multiple price benchmarks)":"GDP"})
GDP_df

Unnamed: 0,SpatialDimValueCode,Period,GDP
0,ALB,1971,6958496300
1,ALB,1972,7246038000
2,ALB,1973,7536605700
3,ALB,1974,7855124000
4,ALB,1975,8164371000
...,...,...,...
10103,ZWE,2015,39798645000
10104,ZWE,2016,40963190000
10105,ZWE,2017,44316740000
10106,ZWE,2018,43420897000


In [35]:
df_merged = pd.merge(df3, GDP_df, on=['SpatialDimValueCode', 'Period'], how='left')
df_merged

Unnamed: 0,Region_Code,SpatialDimValueCode,Location,Period,Sex,Age_Group,Tx - person-years lived above age x,ex - expectation of life at age x,lx - number of people left alive at age x,nLx - person-years lived between ages x and x+n,nMx - age-specific death rate between ages x and x+n,ndx - number of people dying between ages x and x+n,nqx - probability of dying between ages x and x+n,GDP
0,AMR,BRA,Brazil,2000,Female,1-4 years,7420561.0,76.18,97402.0,388666.0,0.0010,392.3,0.004,1653572400000
1,AMR,BRA,Brazil,2000,Female,10-14 years,6547182.0,67.58,96875.0,484035.0,0.0003,137.0,0.001,1653572400000
2,AMR,BRA,Brazil,2000,Female,15-19 years,6063147.0,62.68,96739.0,483101.0,0.0005,236.7,0.002,1653572400000
3,AMR,BRA,Brazil,2000,Female,20-24 years,5580047.0,57.82,96502.0,481748.0,0.0006,304.5,0.003,1653572400000
4,AMR,BRA,Brazil,2000,Female,25-29 years,5098299.0,53.00,96197.0,480046.0,0.0008,376.4,0.004,1653572400000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2085,WPR,JPN,Japan,2019,Male,70-74 years,1345405.0,15.97,84223.0,402477.0,0.0190,7455.0,0.089,5036891000000
2086,WPR,JPN,Japan,2019,Male,75-79 years,942929.0,12.28,76768.0,354743.0,0.0330,11638.0,0.150,5036891000000
2087,WPR,JPN,Japan,2019,Male,80-84 years,588185.0,9.03,65130.0,282238.0,0.0620,17364.0,0.270,5036891000000
2088,WPR,JPN,Japan,2019,Male,85+ years,305948.0,6.41,47765.0,305948.0,0.1400,47765.0,1.000,5036891000000


In [33]:
nc_df = pd.read_csv("NonCommunicable_Disease_dataset.csv")
nc_df

Unnamed: 0,IndicatorCode,Indicator,ValueType,ParentLocationCode,ParentLocation,Location type,SpatialDimValueCode,Location,Period type,Period,...,FactValueUoM,FactValueNumericLowPrefix,FactValueNumericLow,FactValueNumericHighPrefix,FactValueNumericHigh,Value,FactValueTranslationID,FactComments,Language,DateModified
0,SDG_SH_DTH_RNCOM,Number of deaths attributed to non-communicabl...,numeric,EUR,Europe,Country,RUS,Russian Federation,Year,2019,...,,,797625.00,,1203905.0,1¬†003¬†695 [797¬†625 ‚Äì 1¬†203¬†905],,,EN,2021-02-09T08:00:00.000Z
1,SDG_SH_DTH_RNCOM,Number of deaths attributed to non-communicabl...,numeric,WPR,Western Pacific,Country,CHN,China,Year,2019,...,,,807008.00,,1299036.0,1¬†013¬†621 [807¬†008 ‚Äì 1¬†299¬†036],,,EN,2021-02-09T08:00:00.000Z
2,SDG_SH_DTH_RNCOM,Number of deaths attributed to non-communicabl...,numeric,SEAR,South-East Asia,Country,IND,India,Year,2019,...,,,821773.00,,1436096.0,1¬†115¬†687 [821¬†773 ‚Äì 1¬†436¬†096],,,EN,2021-02-09T08:00:00.000Z
3,SDG_SH_DTH_RNCOM,Number of deaths attributed to non-communicabl...,numeric,SEAR,South-East Asia,Country,IND,India,Year,2019,...,,,765727.00,,1535106.0,1¬†146¬†371 [765¬†727 ‚Äì 1¬†535¬†106],,,EN,2021-02-09T08:00:00.000Z
4,SDG_SH_DTH_RNCOM,Number of deaths attributed to non-communicabl...,numeric,SEAR,South-East Asia,Country,IND,India,Year,2019,...,,,1111032.00,,1849975.0,1¬†450¬†806 [1¬†111¬†032 ‚Äì 1¬†849¬†975],,,EN,2021-02-09T08:00:00.000Z
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
43915,SDG_SH_DTH_RNCOM,Number of deaths attributed to non-communicabl...,numeric,AFR,Africa,Country,SYC,Seychelles,Year,2000,...,,,72.37,,128.6,99.05 [72.37 ‚Äì 128.6],,,EN,2021-02-09T08:00:00.000Z
43916,SDG_SH_DTH_RNCOM,Number of deaths attributed to non-communicabl...,numeric,WPR,Western Pacific,Country,SLB,Solomon Islands,Year,2000,...,,,55.73,,164.5,99.3 [55.73 ‚Äì 164.5],,,EN,2021-02-09T08:00:00.000Z
43917,SDG_SH_DTH_RNCOM,Number of deaths attributed to non-communicabl...,numeric,SEAR,South-East Asia,Country,MDV,Maldives,Year,2000,...,,,60.51,,155.4,99.62 [60.51 ‚Äì 155.4],,,EN,2021-02-09T08:00:00.000Z
43918,SDG_SH_DTH_RNCOM,Number of deaths attributed to non-communicabl...,numeric,AFR,Africa,Country,BFA,Burkina Faso,Year,2000,...,,,520.40,,1683.0,990.5 [520.4 ‚Äì 1683],,,EN,2021-02-09T08:00:00.000Z


In [34]:
nc_df= nc_df[['ParentLocationCode','SpatialDimValueCode','Location','Period','Dim1','Dim2','FactValueNumeric','FactValueNumericLow','FactValueNumericHigh']]
nc_df

Unnamed: 0,ParentLocationCode,SpatialDimValueCode,Location,Period,Dim1,Dim2,FactValueNumeric,FactValueNumericLow,FactValueNumericHigh
0,EUR,RUS,Russian Federation,2019,Both sexes,Cardiovascular diseases,1003695.00,797625.00,1203905.0
1,WPR,CHN,China,2019,Both sexes,Respiratory diseases,1013621.00,807008.00,1299036.0
2,SEAR,IND,India,2019,Female,Cardiovascular diseases,1115687.00,821773.00,1436096.0
3,SEAR,IND,India,2019,Both sexes,Respiratory diseases,1146371.00,765727.00,1535106.0
4,SEAR,IND,India,2019,Male,Cardiovascular diseases,1450806.00,1111032.00,1849975.0
...,...,...,...,...,...,...,...,...,...
43915,AFR,SYC,Seychelles,2000,Female,Cardiovascular diseases,99.05,72.37,128.6
43916,WPR,SLB,Solomon Islands,2000,Male,Respiratory diseases,99.30,55.73,164.5
43917,SEAR,MDV,Maldives,2000,Male,Malignant neoplasms,99.62,60.51,155.4
43918,AFR,BFA,Burkina Faso,2000,Female,Respiratory diseases,990.50,520.40,1683.0


In [41]:
nc_df.IsLatestYear.unique()

array([ True, False])

In [39]:
nc_df.columns

Index(['IndicatorCode', 'Indicator', 'ValueType', 'ParentLocationCode',
       'ParentLocation', 'Location type', 'SpatialDimValueCode', 'Location',
       'Period type', 'Period', 'IsLatestYear', 'Dim1 type', 'Dim1',
       'Dim1ValueCode', 'Dim2 type', 'Dim2', 'Dim2ValueCode', 'Dim3 type',
       'Dim3', 'Dim3ValueCode', 'DataSourceDimValueCode', 'DataSource',
       'FactValueNumericPrefix', 'FactValueNumeric', 'FactValueUoM',
       'FactValueNumericLowPrefix', 'FactValueNumericLow',
       'FactValueNumericHighPrefix', 'FactValueNumericHigh', 'Value',
       'FactValueTranslationID', 'FactComments', 'Language', 'DateModified'],
      dtype='object')

In [35]:
both_sexes_mask = nc_df['Dim1'] == 'Both sexes'

# Create separate DataFrames for 'Male' and 'Female'
df_male = nc_df.copy()
df_female = nc_df.copy()

# Update 'Dim1' column in each DataFrame
df_male.loc[both_sexes_mask, 'Dim1'] = 'Male'
df_female.loc[both_sexes_mask, 'Dim1'] = 'Female'

# Concatenate the three DataFrames
df_result = pd.concat([nc_df[~both_sexes_mask], df_male[both_sexes_mask], df_female[both_sexes_mask]])
df_result = df_result.rename(columns={'Dim1':'Sex','Dim2':'non-communicable_diseases','ParentLocationCode':'Region_Code'})
# Reset index
df_result.reset_index(drop=True, inplace=True)
df_result

Unnamed: 0,Region_Code,SpatialDimValueCode,Location,Period,Sex,non-communicable_diseases,FactValueNumeric,FactValueNumericLow,FactValueNumericHigh
0,SEAR,IND,India,2019,Female,Cardiovascular diseases,1115687.00,821773.00,1436096.0
1,SEAR,IND,India,2019,Male,Cardiovascular diseases,1450806.00,1111032.00,1849975.0
2,WPR,CHN,China,2019,Male,Malignant neoplasms,1659677.00,1277365.00,2086563.0
3,WPR,CHN,China,2019,Female,Cardiovascular diseases,1877096.00,1463568.00,2282684.0
4,AFR,ZWE,Zimbabwe,2019,Female,Cardiovascular diseases,10035.00,6031.00,15524.0
...,...,...,...,...,...,...,...,...,...
58555,AFR,COD,Democratic Republic of the Congo,2000,Female,Diabetes mellitus,9705.00,5899.00,14976.0
58556,EUR,BIH,Bosnia and Herzegovina,2000,Female,Diabetes mellitus,974.70,657.20,1384.0
58557,EUR,MLT,Malta,2000,Female,Diabetes mellitus,98.27,72.79,130.2
58558,EUR,LUX,Luxembourg,2000,Female,Malignant neoplasms,982.60,783.50,1206.0


In [36]:
df_result = df_result.groupby(['Region_Code','SpatialDimValueCode','Location', 'Period','Sex','non-communicable_diseases'], as_index=False)[['FactValueNumeric','FactValueNumericLow','FactValueNumericHigh']].sum()
df_result=df_result[df_result['Location'].isin(ten_economies)]
df_result

Unnamed: 0,Region_Code,SpatialDimValueCode,Location,Period,Sex,non-communicable_diseases,FactValueNumeric,FactValueNumericLow,FactValueNumericHigh
8320,AMR,BRA,Brazil,2000,Female,Cardiovascular diseases,442596.0,386808.0,487343.0
8321,AMR,BRA,Brazil,2000,Female,Diabetes mellitus,65949.0,56471.0,75314.0
8322,AMR,BRA,Brazil,2000,Female,Malignant neoplasms,207022.0,184424.0,228142.0
8323,AMR,BRA,Brazil,2000,Female,Respiratory diseases,111660.0,90305.0,123246.0
8324,AMR,BRA,Brazil,2000,Male,Cardiovascular diseases,459255.0,405489.0,504280.0
...,...,...,...,...,...,...,...,...,...
26875,WPR,JPN,Japan,2019,Female,Respiratory diseases,183848.0,111046.0,268100.0
26876,WPR,JPN,Japan,2019,Male,Cardiovascular diseases,528323.0,409896.0,615647.0
26877,WPR,JPN,Japan,2019,Male,Diabetes mellitus,23543.0,16425.0,30587.0
26878,WPR,JPN,Japan,2019,Male,Malignant neoplasms,634851.0,520942.0,713168.0


Unnamed: 0,Region_Code,SpatialDimValueCode,Location,Period,Sex,FactValueNumeric,FactValueNumericLow,FactValueNumericHigh,non-communicable_diseases_Cardiovascular diseases,non-communicable_diseases_Diabetes mellitus,non-communicable_diseases_Malignant neoplasms,non-communicable_diseases_Respiratory diseases
0,AMR,BRA,Brazil,2000,Female,442596.0,386808.0,487343.0,1,0,0,0
1,AMR,BRA,Brazil,2000,Female,65949.0,56471.0,75314.0,0,1,0,0
2,AMR,BRA,Brazil,2000,Female,207022.0,184424.0,228142.0,0,0,1,0
3,AMR,BRA,Brazil,2000,Female,111660.0,90305.0,123246.0,0,0,0,1
4,AMR,BRA,Brazil,2000,Male,459255.0,405489.0,504280.0,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...
1755,WPR,JPN,Japan,2019,Female,183848.0,111046.0,268100.0,0,0,0,1
1756,WPR,JPN,Japan,2019,Male,528323.0,409896.0,615647.0,1,0,0,0
1757,WPR,JPN,Japan,2019,Male,23543.0,16425.0,30587.0,0,1,0,0
1758,WPR,JPN,Japan,2019,Male,634851.0,520942.0,713168.0,0,0,1,0


In [38]:
df_result=df_result.reset_index(drop=True)
df_result

Unnamed: 0,Region_Code,SpatialDimValueCode,Location,Period,Sex,non-communicable_diseases,FactValueNumeric,FactValueNumericLow,FactValueNumericHigh
0,AMR,BRA,Brazil,2000,Female,Cardiovascular diseases,442596.0,386808.0,487343.0
1,AMR,BRA,Brazil,2000,Female,Diabetes mellitus,65949.0,56471.0,75314.0
2,AMR,BRA,Brazil,2000,Female,Malignant neoplasms,207022.0,184424.0,228142.0
3,AMR,BRA,Brazil,2000,Female,Respiratory diseases,111660.0,90305.0,123246.0
4,AMR,BRA,Brazil,2000,Male,Cardiovascular diseases,459255.0,405489.0,504280.0
...,...,...,...,...,...,...,...,...,...
1755,WPR,JPN,Japan,2019,Female,Respiratory diseases,183848.0,111046.0,268100.0
1756,WPR,JPN,Japan,2019,Male,Cardiovascular diseases,528323.0,409896.0,615647.0
1757,WPR,JPN,Japan,2019,Male,Diabetes mellitus,23543.0,16425.0,30587.0
1758,WPR,JPN,Japan,2019,Male,Malignant neoplasms,634851.0,520942.0,713168.0


In [7]:
# df_result.to_csv('non-communicable_deseases.csv', index=True)

Unnamed: 0_level_0,Age_Child,Age_Teen,Gender_Male,Prescription_Myope,Prescription_Normal,Condition_Normal,Condition_Strong,Astigmatic_Yes,Tear Production_Reduced
Patient #,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
1,0,0,1,1,0,0,0,1,1
2,1,0,0,0,0,0,1,0,0
3,0,1,1,0,1,1,0,0,1


The Simple Matching Coefficient distance between objects 1 and 2 is: 1.00
