# Imputing missing values
https://www.kaggle.com/datasets/uciml/horse-colic

https://archive.ics.uci.edu/ml/datasets/Horse+Colic

In [1]:
import pandas as pd
import numpy as np

In [2]:
# First pass: read the file as-is (no header row) so the raw '?' placeholders stay visible.
df = pd.read_csv('horse-colic.csv', header=None)
# Transposed preview: each original column is displayed as a row.
df.head().T

Unnamed: 0,0,1,2,3,4
0,2,1,2,1,2
1,1,1,1,9,1
2,530101,534817,530334,5290409,530255
3,38.50,39.2,38.30,39.10,37.30
4,66,88,40,164,104
5,28,20,24,84,35
6,3,?,1,4,?
7,3,?,1,1,?
8,?,4,3,6,6
9,2,1,1,2,2


When the data is read, missing values encoded as `?` can be converted to `NaN` by passing `na_values='?'` to `read_csv`.

In [3]:
# Second pass over the same file: every '?' cell is parsed as NaN while reading.
df = pd.read_csv(
    'horse-colic.csv',
    header=None,
    na_values='?',
)
df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,18,19,20,21,22,23,24,25,26,27
0,2.0,1,530101,38.5,66.0,28.0,3.0,3.0,,2.0,...,45.0,8.4,,,2.0,2,11300,0,0,2
1,1.0,1,534817,39.2,88.0,20.0,,,4.0,1.0,...,50.0,85.0,2.0,2.0,3.0,2,2208,0,0,2
2,2.0,1,530334,38.3,40.0,24.0,1.0,1.0,3.0,1.0,...,33.0,6.7,,,1.0,2,0,0,0,1
3,1.0,9,5290409,39.1,164.0,84.0,4.0,1.0,6.0,2.0,...,48.0,7.2,3.0,5.3,2.0,1,2208,0,0,1
4,2.0,1,530255,37.3,104.0,35.0,,,6.0,2.0,...,74.0,7.4,,,2.0,2,4300,0,0,2


In [None]:
# Number of missing (NaN) entries in each column.
df.isna().sum(axis=0)

0       1
1       0
2       0
3      60
4      24
5      58
6      56
7      69
8      47
9      32
10     55
11     44
12     56
13    104
14    106
15    247
16    102
17    118
18     29
19     33
20    165
21    198
22      1
23      0
24      0
25      0
26      0
27      0
dtype: int64

Determine the count and the percentage of missing values in each column (first 10 columns shown).

In [4]:
# Count the NaNs per column once, then derive the percentage of rows they represent.
missing_counts = df.isna().sum()
missing_pct = missing_counts / len(df) * 100
# Side by side: column 0 = count, column 1 = percentage; show the first 10 columns only.
pd.concat([missing_counts, missing_pct], axis=1).head(10)

Unnamed: 0,0,1
0,1,0.333333
1,0,0.0
2,0,0.0
3,60,20.0
4,24,8.0
5,58,19.333333
6,56,18.666667
7,69,23.0
8,47,15.666667
9,32,10.666667


In [6]:
# Column 23 is used as the prediction target; every other column is a feature.
target = 23
features = df.columns.tolist()
features.remove(target)  # raises ValueError if the target label is not a column

In [7]:
from sklearn.impute import SimpleImputer

# Split the frame into the feature matrix and the target column.
X, y = df[features], df[target]

# Fit a mean-strategy imputer on the features; the learned per-column
# statistics are what transform() later uses to fill the NaNs.
imputer = SimpleImputer(strategy='mean')
imputer.fit(X)

In [8]:
# Fill the missing entries of X using the imputer fitted in the previous cell.
Ximputed = imputer.transform(X)

In [9]:
# Sanity check: total number of NaNs left after imputation (expected: 0).
# ndarray.sum() reduces the boolean mask in vectorised code, instead of the
# builtin sum() iterating element by element over a flattened copy.
np.isnan(Ximputed).sum()

0

In [14]:
# Re-attach X's row index and column labels to the imputed values for display.
pd.DataFrame(Ximputed, index=X.index, columns=X.columns)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,17,18,19,20,21,22,24,25,26,27
0,2.0,1.0,530101.0,38.500000,66.0,28.0,3.000000,3.000000,2.853755,2.00000,...,5.000000,45.0,8.400000,2.037037,3.019608,2.0,11300.0,0.0,0.0,2.0
1,1.0,1.0,534817.0,39.200000,88.0,20.0,2.348361,2.017316,4.000000,1.00000,...,2.000000,50.0,85.000000,2.000000,2.000000,3.0,2208.0,0.0,0.0,2.0
2,2.0,1.0,530334.0,38.300000,40.0,24.0,1.000000,1.000000,3.000000,1.00000,...,1.000000,33.0,6.700000,2.037037,3.019608,1.0,0.0,0.0,0.0,1.0
3,1.0,9.0,5290409.0,39.100000,164.0,84.0,4.000000,1.000000,6.000000,2.00000,...,3.692308,48.0,7.200000,3.000000,5.300000,2.0,2208.0,0.0,0.0,1.0
4,2.0,1.0,530255.0,37.300000,104.0,35.0,2.348361,2.017316,6.000000,2.00000,...,3.692308,74.0,7.400000,2.037037,3.019608,2.0,4300.0,0.0,0.0,2.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
295,1.0,1.0,533886.0,38.167917,120.0,70.0,4.000000,2.017316,4.000000,2.00000,...,5.000000,55.0,65.000000,2.037037,3.019608,3.0,3205.0,0.0,0.0,2.0
296,2.0,1.0,527702.0,37.200000,72.0,24.0,3.000000,2.000000,4.000000,2.00000,...,4.000000,44.0,24.456929,3.000000,3.300000,3.0,2208.0,0.0,0.0,1.0
297,1.0,1.0,529386.0,37.500000,72.0,30.0,4.000000,3.000000,4.000000,1.00000,...,5.000000,60.0,6.800000,2.037037,3.019608,2.0,3205.0,0.0,0.0,2.0
298,1.0,1.0,530612.0,36.500000,100.0,24.0,3.000000,3.000000,3.000000,1.00000,...,4.000000,50.0,6.000000,3.000000,3.400000,1.0,2208.0,0.0,0.0,1.0


# What is wrong with the above?