CodigoTPI.ipynb

Automatically generated by Colaboratory.

Original file is located at
    https://colab.research.google.com/drive/1UIesVt6cV9yNsWTFtFhKQFHZyuSZmwJO

# Inteligencia Artificial - Trabajo Práctico Integrador
**Integrantes**
- GETZEL, Martín
- LUCAS, Dania
- OCAMPO, Victoria
- ZAVALA, Alejo

**Bibliografía:**

- [Liu, B., & Mining, W. D. (2011). Web Data Mining. Exploring Hyperlinks, Contents, and Usage Data. Ser. Data-Centric Systems and Applications. Springer Berlin Heidelberg.](https://sirius.cs.put.poznan.pl/~inf89721/Seminarium/Web_Data_Mining__2nd_Edition__Exploring_Hyperlinks__Contents__and_Usage_Data.pdf)
- Apunte de teoría de la materia (Unidad 4 | Páginas finales)

# Data Preparation and Cleaning

In [136]:
# load dataset
import pandas as pd
import matplotlib.pyplot as plt
df = pd.read_csv("data_cardiovascular_risk.csv", index_col="id", dtype={
    "age": "int64",
    "education": "float64",
    "sex": "string",
    "is_smoking": "string",
    "cigsPerDay": "float64",
    "BPMeds": "float64",
    "prevalentStroke": "int64",
    "prevalentHyp": "int64",
    "diabetes": "int64",
    "totChol": "float64",
    "sysBP": "float64",
    "diaBP": "float64",
    "BMI": "float64",
    "heartRate": "float64",
    "glucose": "float64",
    "TenYearCHD": "int"
})

In [137]:
# Visualize the first five rows to get an initial understanding of the data
df.head(5)

Unnamed: 0_level_0,age,education,sex,is_smoking,cigsPerDay,BPMeds,prevalentStroke,prevalentHyp,diabetes,totChol,sysBP,diaBP,BMI,heartRate,glucose,TenYearCHD
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
0,64,2.0,F,YES,3.0,0.0,0,0,0,221.0,148.0,85.0,,90.0,80.0,1
1,36,4.0,M,NO,0.0,0.0,0,1,0,212.0,168.0,98.0,29.77,72.0,75.0,0
2,46,1.0,F,YES,10.0,0.0,0,0,0,250.0,116.0,71.0,20.35,88.0,94.0,0
3,50,1.0,M,YES,20.0,0.0,0,1,0,233.0,158.0,88.0,28.26,68.0,94.0,1
4,64,1.0,F,YES,30.0,0.0,0,0,0,241.0,136.5,85.0,26.42,70.0,77.0,0


In [138]:
df.shape #raws,columns

(3390, 16)

In [139]:
# Get information global about each column data-type and check for null-values
df.info() 

<class 'pandas.core.frame.DataFrame'>
Index: 3390 entries, 0 to 3389
Data columns (total 16 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   age              3390 non-null   int64  
 1   education        3303 non-null   float64
 2   sex              3390 non-null   string 
 3   is_smoking       3390 non-null   string 
 4   cigsPerDay       3368 non-null   float64
 5   BPMeds           3346 non-null   float64
 6   prevalentStroke  3390 non-null   int64  
 7   prevalentHyp     3390 non-null   int64  
 8   diabetes         3390 non-null   int64  
 9   totChol          3352 non-null   float64
 10  sysBP            3390 non-null   float64
 11  diaBP            3390 non-null   float64
 12  BMI              3376 non-null   float64
 13  heartRate        3389 non-null   float64
 14  glucose          3086 non-null   float64
 15  TenYearCHD       3390 non-null   int64  
dtypes: float64(9), int64(5), string(2)
memory usage: 450.2 KB


In [140]:
# missing data per column
df.isnull().sum() 

age                  0
education           87
sex                  0
is_smoking           0
cigsPerDay          22
BPMeds              44
prevalentStroke      0
prevalentHyp         0
diabetes             0
totChol             38
sysBP                0
diaBP                0
BMI                 14
heartRate            1
glucose            304
TenYearCHD           0
dtype: int64

In [141]:
# Percentage of missing data per column
def perc_mv(df, col):
    '''
    Args:
        df: The DataFrame.
        column(str): Name of the column to verify.
        
    Returns:
        perc: Percentage of null values in the column.
    '''
    rows = len(df)
    missings = df[col].isnull().sum()
    perc = (missings / rows) * 100
    return perc

In [142]:
list_cols = ['heartRate', 'BMI', 'cigsPerDay', 'totChol', 'education', 'glucose']
for c in list_cols:
    perc_mv_glucose = round(perc_mv(df, f'{c}'), 2)
    print(f'The percentage of missing values for "{c}" is: {perc_mv_glucose}%') #esto se puede hacer más lindo como tabla


The percentage of missing values for "heartRate" is: 0.03%
The percentage of missing values for "BMI" is: 0.41%
The percentage of missing values for "cigsPerDay" is: 0.65%
The percentage of missing values for "totChol" is: 1.12%
The percentage of missing values for "education" is: 2.57%
The percentage of missing values for "glucose" is: 8.97%


In [143]:
df = df.drop(['education'], axis=1) # no sirve para nada
df

Unnamed: 0_level_0,age,sex,is_smoking,cigsPerDay,BPMeds,prevalentStroke,prevalentHyp,diabetes,totChol,sysBP,diaBP,BMI,heartRate,glucose,TenYearCHD
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
0,64,F,YES,3.0,0.0,0,0,0,221.0,148.0,85.0,,90.0,80.0,1
1,36,M,NO,0.0,0.0,0,1,0,212.0,168.0,98.0,29.77,72.0,75.0,0
2,46,F,YES,10.0,0.0,0,0,0,250.0,116.0,71.0,20.35,88.0,94.0,0
3,50,M,YES,20.0,0.0,0,1,0,233.0,158.0,88.0,28.26,68.0,94.0,1
4,64,F,YES,30.0,0.0,0,0,0,241.0,136.5,85.0,26.42,70.0,77.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3385,60,F,NO,0.0,0.0,0,0,0,261.0,123.5,79.0,29.28,70.0,103.0,0
3386,46,F,NO,0.0,0.0,0,0,0,199.0,102.0,56.0,21.96,80.0,84.0,0
3387,44,M,YES,3.0,0.0,0,1,0,352.0,164.0,119.0,28.92,73.0,72.0,1
3388,60,M,NO,0.0,,0,1,0,191.0,167.0,105.0,23.01,80.0,85.0,0


In [144]:
print(df.isnull().sum().sum())
df=df.dropna()
print(df.isnull().sum().sum())
df.shape #REVEERLO buscar manera de balancear los datos

423
0


(3004, 15)

In [145]:
df.isna().sum()

age                0
sex                0
is_smoking         0
cigsPerDay         0
BPMeds             0
prevalentStroke    0
prevalentHyp       0
diabetes           0
totChol            0
sysBP              0
diaBP              0
BMI                0
heartRate          0
glucose            0
TenYearCHD         0
dtype: int64

In [146]:
# Checking if columns have empty/non-valid values
for col in df.columns:
  print('-----------------------------------------') #buscar si se puede mostrar mejor esto 
  print(df[col].value_counts())

-----------------------------------------
age
40    132
42    132
46    130
41    125
39    121
48    120
44    118
45    117
43    108
38    106
52    104
50    103
55    102
51    102
53    100
54     97
49     93
47     90
57     86
63     85
60     85
56     84
59     78
58     78
61     77
62     74
36     69
64     68
37     67
65     36
67     28
66     27
35     25
34     14
68     13
33      4
69      4
70      1
32      1
Name: count, dtype: int64
-----------------------------------------
sex
F    1657
M    1347
Name: count, dtype: Int64
-----------------------------------------
is_smoking
NO     1524
YES    1480
Name: count, dtype: Int64
-----------------------------------------
cigsPerDay
0.0     1524
20.0     548
30.0     161
15.0     150
5.0       91
10.0      89
9.0       85
3.0       69
40.0      55
1.0       45
25.0      39
43.0      38
2.0       15
35.0      14
6.0       13
8.0        8
7.0        8
60.0       7
4.0        7
18.0       6
17.0       5
23.0       5
50.0

#### Tratamiento de variables categoricas

In [147]:
df['sex'] = df['sex'].map({'F': 1, 'M': 0}) 

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['sex'] = df['sex'].map({'F': 1, 'M': 0})


In [148]:
df['is_smoking'] = df['is_smoking'].map({'YES': 1, 'NO': 0})
df.head(5)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['is_smoking'] = df['is_smoking'].map({'YES': 1, 'NO': 0})


Unnamed: 0_level_0,age,sex,is_smoking,cigsPerDay,BPMeds,prevalentStroke,prevalentHyp,diabetes,totChol,sysBP,diaBP,BMI,heartRate,glucose,TenYearCHD
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
1,36,0,0,0.0,0.0,0,1,0,212.0,168.0,98.0,29.77,72.0,75.0,0
2,46,1,1,10.0,0.0,0,0,0,250.0,116.0,71.0,20.35,88.0,94.0,0
3,50,0,1,20.0,0.0,0,1,0,233.0,158.0,88.0,28.26,68.0,94.0,1
4,64,1,1,30.0,0.0,0,0,0,241.0,136.5,85.0,26.42,70.0,77.0,0
5,61,1,0,0.0,0.0,0,1,0,272.0,182.0,121.0,32.8,85.0,65.0,1


In [149]:
df['BPMeds']=df['BPMeds'].fillna(0) 
df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['BPMeds']=df['BPMeds'].fillna(0)


Unnamed: 0_level_0,age,sex,is_smoking,cigsPerDay,BPMeds,prevalentStroke,prevalentHyp,diabetes,totChol,sysBP,diaBP,BMI,heartRate,glucose,TenYearCHD
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
1,36,0,0,0.0,0.0,0,1,0,212.0,168.0,98.0,29.77,72.0,75.0,0
2,46,1,1,10.0,0.0,0,0,0,250.0,116.0,71.0,20.35,88.0,94.0,0
3,50,0,1,20.0,0.0,0,1,0,233.0,158.0,88.0,28.26,68.0,94.0,1
4,64,1,1,30.0,0.0,0,0,0,241.0,136.5,85.0,26.42,70.0,77.0,0
5,61,1,0,0.0,0.0,0,1,0,272.0,182.0,121.0,32.80,85.0,65.0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3384,61,1,0,0.0,0.0,0,1,0,217.0,182.0,86.0,26.98,105.0,113.0,0
3385,60,1,0,0.0,0.0,0,0,0,261.0,123.5,79.0,29.28,70.0,103.0,0
3386,46,1,0,0.0,0.0,0,0,0,199.0,102.0,56.0,21.96,80.0,84.0,0
3387,44,0,1,3.0,0.0,0,1,0,352.0,164.0,119.0,28.92,73.0,72.0,1


In [113]:
df['BPMeds'] = df['BPMeds'].astype(int) #ARREGLAR, PASAR A 0 Y 1
# print(dtype(df['BPMeds']))
# df.head(5)

IntCastingNaNError: Cannot convert non-finite values (NA or inf) to integer