In [1]:
# Importing required libraries

import pandas as pd
import warnings
warnings.filterwarnings('ignore')
import ipywidgets as widgets
from IPython.display import display, clear_output
import io

In [2]:
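# One-time environment setup, kept for reference only (left commented out so
# that it is not re-run on every execution of the notebook):
#!jupyter nbextension enable --py widgetsnbextension --sys-prefix
#!jupyter serverextension enable voila --sys-prefix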

In [3]:
# File upload: a picker restricted to a single .csv file, placed in a
# full-width row so it sits horizontally centred on the page.

Input = widgets.FileUpload(
    accept=".csv",
    multiple=False,
    layout=widgets.Layout(width="75%"),
)

inputs1 = widgets.HBox(
    [Input],
    layout=widgets.Layout(width="100%", justify_content="center"),
)

In [4]:
# Calculate button: spans the full width of its container.

calculate = widgets.Button(description="Calculate", layout=widgets.Layout(width="100%"))

In [5]:
def perform_Topsis(df, weights, impacts, output_path="output.csv"):
    """Score and rank the rows of ``df`` with the TOPSIS method.

    The first column of ``df`` is not used in the calculation (it is carried
    through to the output untouched); every remaining column is a criterion.
    Each criterion column is divided by its Euclidean norm, multiplied by its
    normalised weight, and each row is then scored by its distance from the
    ideal-worst point relative to the sum of its distances from the ideal-best
    and ideal-worst points.

    Parameters
    ----------
    df : pandas.DataFrame
        Input table with at least three columns and no null values. On
        success the columns ``'Topsis Score'`` and ``'Rank'`` are added to
        this frame in place.
    weights : sequence
        One weight per criterion column. Items may be numbers or numeric
        strings. The sequence itself is not modified.
    impacts : sequence of str
        One entry per criterion column: ``'+'`` when larger values are
        better, ``'-'`` when smaller values are better.
    output_path : str or None, default "output.csv"
        Where the result is written as CSV. Pass ``None`` to skip writing.

    Returns
    -------
    pandas.DataFrame or None
        ``df`` (with the two result columns added) on success, ``None`` when
        validation fails. A validation failure prints an explanatory message
        and returns; it never calls ``exit()``, because this function runs
        inside a widget callback where terminating the interpreter/kernel
        would take the whole page down with it.

    Notes
    -----
    Rows get a NaN score when every row is identical on all criteria (both
    distances are zero, so the ratio is undefined).
    """
    # --- validation -------------------------------------------------------
    # Convert into a new list so the caller's ``weights`` is left untouched.
    try:
        numeric_weights = [float(weight) for weight in weights]
    except (TypeError, ValueError):
        print('\nWeights must be numeric and comma separated, Exiting....')
        return None

    if any(impact not in ('+', '-') for impact in impacts):
        print('\nImpacts must be either positive or negative and comma separated, Exiting....')
        return None

    if len(df.columns) < 3:
        print('\nMinimum 3 columns required, Exiting....')
        return None

    if len(df) == 0:
        print('\nNo data rows found, Exiting....')
        return None

    if df.isnull().values.any():
        print('\nNull Value Detected, Exiting....')
        return None

    # Convert each criterion column to float on its own so the message can
    # name the offending column (1-based, counting the label column).
    criteria = df.iloc[:, 1:]
    converted = {}
    for position in range(criteria.shape[1]):
        try:
            converted[position] = criteria.iloc[:, position].astype(float).to_numpy()
        except (TypeError, ValueError):
            print('\nNot numeric value detected in column number', position + 2, ', Exiting....')
            return None

    criteria_count = len(df.columns) - 1

    if len(impacts) != criteria_count:
        print('\n', criteria_count, ' impacts were required instead ', len(impacts), ' were given, Exiting....')
        return None

    if len(numeric_weights) != criteria_count:
        print('\n', criteria_count, ' weights were required instead ', len(numeric_weights), ' were given, Exiting....')
        return None

    weight_total = sum(numeric_weights)
    if weight_total == 0:
        # Normalising would divide by zero.
        print('\nWeights must not sum to zero, Exiting....')
        return None

    print('\nGenerating output file ....')

    # --- TOPSIS -----------------------------------------------------------
    normalized_weights = [weight / weight_total for weight in numeric_weights]
    matrix = pd.DataFrame(converted).to_numpy(dtype=float)

    # Vector-normalise every column, then apply its weight. A column made up
    # entirely of zeros has norm 0; dividing by 1 instead keeps it at zero
    # (it cannot distinguish the rows) rather than turning it into NaN.
    norms = (matrix ** 2).sum(axis=0) ** 0.5
    norms[norms == 0] = 1.0
    weighted = (matrix / norms) * normalized_weights

    # Ideal best / worst value per criterion, depending on its impact.
    column_max = weighted.max(axis=0)
    column_min = weighted.min(axis=0)
    ideal_best = [high if impact == '+' else low
                  for impact, high, low in zip(impacts, column_max, column_min)]
    ideal_worst = [low if impact == '+' else high
                   for impact, high, low in zip(impacts, column_max, column_min)]

    # Euclidean distance of every row from the two ideal points. Wrapping the
    # distances in Series lets pandas turn 0/0 into NaN instead of raising.
    distance_to_best = pd.Series(((weighted - ideal_best) ** 2).sum(axis=1) ** 0.5)
    distance_to_worst = pd.Series(((weighted - ideal_worst) ** 2).sum(axis=1) ** 0.5)
    scores = distance_to_worst / (distance_to_best + distance_to_worst)

    # Assign by position (plain array) so the frame's own index is irrelevant.
    df['Topsis Score'] = scores.to_numpy()
    df['Rank'] = df['Topsis Score'].rank(ascending=False, method='dense')

    print(df)

    if output_path is not None:
        df.to_csv(output_path, index=False)
        print('\nOutput file generated.')

    return df

In [6]:
# On button click function definition

def on_button_clicked(event):
    """Click handler for the Calculate button.

    Clears the ``output`` widget, reads the first file held by the ``Input``
    upload widget, parses it as CSV and passes it to ``perform_Topsis``.
    Everything printed along the way is captured by ``output``.

    Parameters
    ----------
    event
        Argument supplied by the button's ``on_click`` machinery; unused.
    """
    with output:
        clear_output()

        uploads = Input.value
        if not uploads:
            # Nothing uploaded yet: say so instead of failing on an empty value.
            print('\nPlease upload a CSV file before clicking Calculate.')
            return

        # ipywidgets 7 exposes ``value`` as {filename: {..., 'content': bytes}};
        # ipywidgets 8 exposes it as a tuple of per-file dicts. Accept both.
        if isinstance(uploads, dict):
            first_upload = next(iter(uploads.values()))
        else:
            first_upload = uploads[0]

        try:
            df = pd.read_csv(io.BytesIO(first_upload['content']))
        except (pd.errors.EmptyDataError, pd.errors.ParserError, UnicodeDecodeError) as error:
            print('\nCould not read the uploaded file as CSV:', error)
            return

        # Weights and impacts are fixed here for a file with four criteria
        # columns (plus the leading label column).
        perform_Topsis(df, [1,2,1,2], ['+','-','-','+'])

In [7]:
# Register on_button_clicked as the handler run when the Calculate button is clicked
calculate.on_click(on_button_clicked)

In [8]:
# Output area: captures whatever the click handler prints.
# OutputHbox wraps the same widget in a horizontally centred row.

output = widgets.Output()

OutputHbox = widgets.HBox(
    [output],
    layout=widgets.Layout(justify_content="center"),
)

In [9]:
# Spacer: an empty <h1> element reused wherever vertical space is needed.

text_0 = widgets.HTML(
    value="<h1></h1>",
    layout=widgets.Layout(align_items="center"),
)

In [10]:
# Page title
text_1 = widgets.HTML(
    value="<h1><b><center>Medical Cost Personal Datasets</center></b></h1>"
)

# Subtitle
text_2 = widgets.HTML(
    value="<h3><center>Insurance Forecast by using Hybrid Machine Learning.</center></h3>"
)

# Title, subtitle and one spacer stacked vertically
headings = widgets.VBox([text_1, text_2, text_0])

# About dataset
#
# Static HTML panel made of two parts:
#   1. a bordered "About the Dataset" box with a short description and a
#      bulleted list of the dataset's features;
#   2. a "Sample records" table with a header row and five example rows.
# The markup is passed verbatim to widgets.HTML, so any edit to the text
# below changes what is shown on the page.

text_4 = widgets.HTML(value = """
<table style="width: 100%; border-collapse: collapse; border-style: solid; border-color: black;" border="2" cellpadding="25">
<tbody>
<tr>
<td style="width: 100%;">
<h2 style="text-align: justify;">About the Dataset</h2>
<p style="text-align: justify;">This dataset was originally published in book <strong>"Machine Learning with R by Brett Lantz"</strong>. It consists of 1338 records and 7 features for prediction of insurance charges in United States. The 6 independent features can be used to predict the dependent feature 'charges'.</p>
<p style="text-align: justify;"><strong>The dataset contains 6 independent features as listed below:</strong></p>
<ul style="text-align: justify;">
<li>
<p><strong>age:</strong> age of primary beneficiary (10 years - 100 years)</p>
</li>
<li>
<p><strong>sex:</strong> insurance contractor gender, female, male</p>
</li>
<li>
<p><strong>bmi:</strong> Body mass index, providing an understanding of body, weights that are relatively high or low relative to height, objective index of body weight (kg / m ^ 2) using the ratio of height to weight (18 - 30)</p>
</li>
<li>
<p><strong>children:</strong> Number of children covered by health insurance / Number of dependents (0 - 6)</p>
</li>
<li>
<p><strong>smoker:</strong> smoker/non-smoker</p>
</li>
<li>
<p><strong>region:</strong> the beneficiary's residential area in the US, northeast, southeast, southwest, northwest</p>
</li>
<li>
<p><strong>charges:</strong> Individual medical costs billed by health insurance</p>
</li>
</ul>
<p style="text-align: justify;">These features are used to predict the yearly charges incurred (in $) by an individual for availing the insurance facility.</p>
</td>
</tr>
</tbody>
</table>
<h3 style="text-align: justify;">Sample records from the dataset:</h3>
<table style="border-collapse: collapse; width: 455pt; border-style: solid; border-color: black;" border="1" width="455pt" cellspacing="0" cellpadding="0">
<tbody>
<tr style="height: 16.0pt;">
<td style="height: 16pt; width: 84.4375px; text-align: center;" height="67">age</td>
<td style="width: 85.2344px; text-align: center;">sex</td>
<td style="width: 85.3125px; text-align: center;">bmi</td>
<td style="width: 85.5312px; text-align: center;">children</td>
<td style="width: 85.3906px; text-align: center;">smoker</td>
<td style="width: 86.1094px; text-align: center;">region</td>
<td style="width: 86.6406px; text-align: center;">charges</td>
</tr>
<tr style="height: 16pt; text-align: center;">
<td style="height: 16pt; width: 84.4375px; text-align: center;" align="right" height="67">19</td>
<td style="width: 85.2344px; text-align: center;">female</td>
<td style="width: 85.3125px; text-align: center;" align="right">27.9</td>
<td style="width: 85.5312px; text-align: center;" align="right">0</td>
<td style="width: 85.3906px; text-align: center;">yes</td>
<td style="width: 86.1094px; text-align: center;">southwest</td>
<td style="width: 86.6406px; text-align: center;" align="right">16884.924</td>
</tr>
<tr style="height: 16pt; text-align: center;">
<td style="height: 16pt; width: 84.4375px; text-align: center;" align="right" height="67">18</td>
<td style="width: 85.2344px; text-align: center;">male</td>
<td style="width: 85.3125px; text-align: center;" align="right">33.77</td>
<td style="width: 85.5312px; text-align: center;" align="right">1</td>
<td style="width: 85.3906px; text-align: center;">no</td>
<td style="width: 86.1094px; text-align: center;">southeast</td>
<td style="width: 86.6406px; text-align: center;" align="right">1725.5523</td>
</tr>
<tr style="height: 16pt; text-align: center;">
<td style="height: 16pt; width: 84.4375px; text-align: center;" align="right" height="67">28</td>
<td style="width: 85.2344px; text-align: center;">male</td>
<td style="width: 85.3125px; text-align: center;" align="right">33</td>
<td style="width: 85.5312px; text-align: center;" align="right">3</td>
<td style="width: 85.3906px; text-align: center;">no</td>
<td style="width: 86.1094px; text-align: center;">southeast</td>
<td style="width: 86.6406px; text-align: center;" align="right">4449.462</td>
</tr>
<tr style="height: 16pt; text-align: center;">
<td style="height: 16pt; width: 84.4375px; text-align: center;" align="right" height="67">33</td>
<td style="width: 85.2344px; text-align: center;">male</td>
<td style="width: 85.3125px; text-align: center;" align="right">22.705</td>
<td style="width: 85.5312px; text-align: center;" align="right">0</td>
<td style="width: 85.3906px; text-align: center;">no</td>
<td style="width: 86.1094px; text-align: center;">northwest</td>
<td style="width: 86.6406px; text-align: center;" align="right">21984.4706</td>
</tr>
<tr style="height: 16pt; text-align: center;">
<td style="height: 16pt; width: 84.4375px; text-align: center;" align="right" height="67">32</td>
<td style="width: 85.2344px; text-align: center;">male</td>
<td style="width: 85.3125px; text-align: center;" align="right">28.88</td>
<td style="width: 85.5312px; text-align: center;" align="right">0</td>
<td style="width: 85.3906px; text-align: center;">no</td>
<td style="width: 86.1094px; text-align: center;">northwest</td>
<td style="width: 86.6406px; text-align: center;" align="right">3866.8552</td>
</tr>
</tbody>
</table>
""")

# Section heading and introduction for the modelling approach.
# (Spelling in the displayed copy corrected: "Pursuit", "multivariate", "an R2".)
text_5 = widgets.HTML(value = """
<h2>Hybrid Approach Used: Clustering + Linear Regression</h2>
<p style="text-align: justify;">A variety of different regression models were tested for the given dataset including Linear, Huber, Orthogonal Matching Pursuit, etc. (refer table below) and linear regression (multivariate) was found to be the best model among all applied models having an R2 score of 0.7466. In statistics, linear regression is a linear approach for modelling the relationship between a scalar response and one or more explanatory variables.</p>
<p>&nbsp;</p>
""")

# Footnote and explanation of the hybrid (clustering + linear model) approach.
# (Spelling and grammar in the displayed copy corrected; figures are unchanged.)
text_7 = widgets.HTML(value = """
<h4>* LM: Linear Model</h4>
<p style="text-align: justify;">Combining with clustering, and applying a hybrid approach (refer figure above) the R2 score is further increased by 12.06% to 0.83671. The approach consists of splitting the original dataset into different clusters and then using a linear model for each cluster. The optimal number of clusters has been identified by varying cluster numbers from 2 to 30, and analysing the R2 score for predictions on test data. For new unseen data, firstly the cluster is predicted and then the linear model corresponding to that particular cluster is applied to obtain the final output.</p>
<p>&nbsp;</p>
""")

# Images

def _load_png_widget(path, width, height):
    """Read the PNG file at ``path`` and wrap its bytes in an Image widget.

    The file is opened in a ``with`` block so its handle is closed as soon as
    the bytes have been read, instead of being left open for the lifetime of
    the kernel.
    """
    with open(path, "rb") as image_file:
        return widgets.Image(
            value = image_file.read(),
            format = 'png',
            width = width,
            height = height,
        )


# Image 1

image1 = _load_png_widget("image1.png", width = 1000, height = 600)


# Image 2

image2 = _load_png_widget("image2.png", width = 800, height = 600)

# Each image gets its own row so it can be centred horizontally.
ImageHbox1 = widgets.HBox(
    [image1],
    layout=widgets.Layout(justify_content="center"),
)

ImageHbox2 = widgets.HBox(
    [image2],
    layout=widgets.Layout(justify_content="center"),
)

# Centred "Live Demo" heading
text_6 = widgets.HTML(value = "<h1><center>Live Demo</center></h1>")

In [11]:
# Assemble the page top to bottom and render it.
# text_0 is the spacer widget; repeating it adds vertical space.

page = widgets.VBox(
    [
        # Title and subtitle
        text_0, text_1, text_2, text_0, text_0,
        # Dataset description
        text_4, text_0,
        # Approach write-up with its two images
        text_5, ImageHbox1, text_0, text_0, ImageHbox2, text_7, text_0,
        # Live demo: heading, upload row, button, output area
        text_6, text_0, text_0,
        inputs1, text_0, text_0, text_0,
        calculate, output,
        text_0, text_0, text_0,
    ]
)
display(page)

VBox(children=(HTML(value='<h1></h1>', layout=Layout(align_items='center')), HTML(value='<h1><b><center>Medica…