In [3]:
import numpy as np
import pandas as pd 
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn import metrics
from sklearn.metrics import r2_score
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import svm, preprocessing

In [7]:
# Load the no-show appointments CSV into `df` and preview its first five rows

csv_path = 'dataset-noshow.csv'
df = pd.read_csv(csv_path)
df.head(n=5)

Unnamed: 0,PatientId,AppointmentID,Gender,ScheduledDay,AppointmentDay,Age,Neighbourhood,Scholarship,Hipertension,Diabetes,Alcoholism,Handcap,SMS_received,No-show
0,29872500000000.0,5642903,F,2016-04-29T18:38:08Z,2016-04-29T00:00:00Z,62,JARDIM DA PENHA,0,1,0,0,0,0,No
1,558997800000000.0,5642503,M,2016-04-29T16:08:27Z,2016-04-29T00:00:00Z,56,JARDIM DA PENHA,0,0,0,0,0,0,No
2,4262962000000.0,5642549,F,2016-04-29T16:19:04Z,2016-04-29T00:00:00Z,62,MATA DA PRAIA,0,0,0,0,0,0,No
3,867951200000.0,5642828,F,2016-04-29T17:29:31Z,2016-04-29T00:00:00Z,8,PONTAL DE CAMBURI,0,0,0,0,0,0,No
4,8841186000000.0,5642494,F,2016-04-29T16:07:23Z,2016-04-29T00:00:00Z,56,JARDIM DA PENHA,0,1,1,0,0,0,No


In [9]:
# Size of the dataset as a (rows, columns) tuple

df.shape

(110527, 14)

In [13]:
# Count the missing values in each column; every count comes back 0,
# so the dataset has no null fields

df.isna().sum()

PatientId         0
AppointmentID     0
Gender            0
ScheduledDay      0
AppointmentDay    0
Age               0
Neighbourhood     0
Scholarship       0
Hipertension      0
Diabetes          0
Alcoholism        0
Handcap           0
SMS_received      0
No-show           0
dtype: int64

In [15]:
# Explore the distinct values of every column from the third one onward
# (the two leading ID columns, PatientId and AppointmentID, are skipped).
# The data looks clean with no nulls, but a few odd Age values below 0 appear.
# TODO: the original note trailed off after "and even" - finish that observation.

for col in df.columns[2:]:
    # One call, newline-separated: column name first, then its unique values
    print(col, df[col].unique(), sep="\n")

Gender
['F' 'M']
ScheduledDay
['2016-04-29T18:38:08Z' '2016-04-29T16:08:27Z' '2016-04-29T16:19:04Z' ...
 '2016-04-27T16:03:52Z' '2016-04-27T15:09:23Z' '2016-04-27T13:30:56Z']
AppointmentDay
['2016-04-29T00:00:00Z' '2016-05-03T00:00:00Z' '2016-05-10T00:00:00Z'
 '2016-05-17T00:00:00Z' '2016-05-24T00:00:00Z' '2016-05-31T00:00:00Z'
 '2016-05-02T00:00:00Z' '2016-05-30T00:00:00Z' '2016-05-16T00:00:00Z'
 '2016-05-04T00:00:00Z' '2016-05-19T00:00:00Z' '2016-05-12T00:00:00Z'
 '2016-05-06T00:00:00Z' '2016-05-20T00:00:00Z' '2016-05-05T00:00:00Z'
 '2016-05-13T00:00:00Z' '2016-05-09T00:00:00Z' '2016-05-25T00:00:00Z'
 '2016-05-11T00:00:00Z' '2016-05-18T00:00:00Z' '2016-05-14T00:00:00Z'
 '2016-06-02T00:00:00Z' '2016-06-03T00:00:00Z' '2016-06-06T00:00:00Z'
 '2016-06-07T00:00:00Z' '2016-06-01T00:00:00Z' '2016-06-08T00:00:00Z']
Age
[ 62  56   8  76  23  39  21  19  30  29  22  28  54  15  50  40  46   4
  13  65  45  51  32  12  61  38  79  18  63  64  85  59  55  71  49  78
  31  58  27   6   2  11   7 

In [17]:
# The cited study reports that the mean age of alcohol consumption in Brazil ranged from 14-17
# (Patterns of alcohol use among Brazilian adolescents [https://www.scielo.br/j/rbp/a/rsHyR7TV7dxqpGrrmz5csNg/#]),
# so patients younger than 14 who are flagged with Alcoholism == 1 are treated as suspect.
# This cell only builds the boolean mask and displays the matching rows; it does not drop them.
# NOTE(review): `filter` shadows the Python builtin of the same name. It is kept because
# other cells may reference it - consider renaming it consistently across the notebook.
filter = df["Age"].lt(14) & df["Alcoholism"].eq(1)
df.loc[filter]

Unnamed: 0,PatientId,AppointmentID,Gender,ScheduledDay,AppointmentDay,Age,Neighbourhood,Scholarship,Hipertension,Diabetes,Alcoholism,Handcap,SMS_received,No-show
6402,499114300000.0,5683456,M,2016-05-11T07:32:26Z,2016-05-13T00:00:00Z,11,PARQUE MOSCOSO,1,0,0,1,0,0,No
17463,3668727000000.0,5693519,M,2016-05-13T07:21:38Z,2016-05-17T00:00:00Z,7,REDENÇÃO,0,0,0,1,0,0,No
17905,57176820000000.0,5647970,M,2016-05-02T14:47:48Z,2016-05-11T00:00:00Z,13,SANTA MARTHA,0,0,0,1,0,1,No
32817,2651852000000.0,5623088,M,2016-04-26T13:32:34Z,2016-05-12T00:00:00Z,5,RESISTÊNCIA,0,0,0,1,0,1,Yes
53536,34849790000000.0,5726323,M,2016-05-20T13:30:05Z,2016-05-20T00:00:00Z,12,SANTOS REIS,0,0,0,1,0,0,Yes
53549,34849790000000.0,5694436,M,2016-05-13T08:38:50Z,2016-05-13T00:00:00Z,12,SANTOS REIS,0,0,0,1,0,0,Yes
53587,34849790000000.0,5669570,M,2016-05-06T11:08:48Z,2016-05-06T00:00:00Z,12,SANTOS REIS,0,0,0,1,0,0,Yes
53733,34849790000000.0,5726706,M,2016-05-20T14:10:56Z,2016-05-20T00:00:00Z,12,SANTOS REIS,0,0,0,1,0,0,Yes
53746,34849790000000.0,5694587,M,2016-05-13T08:47:34Z,2016-05-13T00:00:00Z,12,SANTOS REIS,0,0,0,1,0,0,Yes
54757,34849790000000.0,5694703,M,2016-05-13T08:54:09Z,2016-05-13T00:00:00Z,12,SANTOS REIS,0,0,0,1,0,0,Yes
