# Practicing imputation with diabetes dataset 

In [1]:
#Importing the classics
import pandas as pd 
import numpy as np 

#Importing sklearn tools
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score

from sklearn.impute import SimpleImputer, MissingIndicator

In [2]:
df = pd.read_csv('Mock_CAR_GENDER_MAKE_PRICE.csv', header = 0, index_col = 0)
df.shape

(1000, 3)

In [3]:
df.carmake = df.carmake.replace('\s+', '',regex=True)
df.carmake = df.carmake.str.lower()
df.head(20)

Unnamed: 0_level_0,gender,carmake,price
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,Male,honda,16907.46
2,Female,chevrolet,25298.09
3,Female,mazda,21812.35
4,Male,honda,22583.6
5,Female,astonmartin,28987.44
6,Male,mercury,24213.82
7,Male,chevrolet,29778.91
8,Female,chevrolet,28993.37
9,Female,nissan,21563.51
10,Male,mercury,23162.97


Some of the columns above, contain values of NAN, specificaly the column price. We will change the NAN values to zero and perfom linear discriminany analysis anyway to see how well it does.

In [4]:
#Replace NAN with zeros 
df[['price']] = df[['price']].replace(np.NaN, 0)

#Need more preprocessing 
#Relabel sex
df[['gender']] = df[['gender']].replace({'Female':1,  'Male':0})

In [5]:
#carmake 
tfidf = TfidfVectorizer()
carmake_values = tfidf.fit_transform(df.carmake).toarray()

In [6]:
col_names = tfidf.get_feature_names()
carmake_values_df = pd.DataFrame(data = carmake_values, columns = col_names) 

In [7]:
carmake_values_df.head()

Unnamed: 0,acura,astonmartin,audi,bentley,benz,bmw,buick,cadillac,chevrolet,chrysler,...,saab,saturn,scion,smart,subaru,suzuki,tesla,toyota,volkswagen,volvo
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [8]:
result = pd.concat([df, carmake_values_df], axis=1, join='inner')
result = result.drop('carmake', axis = 1)
result.head()

Unnamed: 0,gender,price,acura,astonmartin,audi,bentley,benz,bmw,buick,cadillac,...,saab,saturn,scion,smart,subaru,suzuki,tesla,toyota,volkswagen,volvo
1,0,16907.46,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1,25298.09,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,1,21812.35,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0,22583.6,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,1,28987.44,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [9]:
result = result[[c for c in result if c not in ['price']] + ['price']]
print(result.head())
result.shape

   gender  acura  astonmartin  audi  bentley  benz  bmw  buick  cadillac  \
1       0    0.0          0.0   0.0      0.0   0.0  0.0    0.0       0.0   
2       1    0.0          0.0   0.0      0.0   0.0  0.0    0.0       0.0   
3       1    0.0          0.0   0.0      0.0   0.0  0.0    0.0       0.0   
4       0    0.0          1.0   0.0      0.0   0.0  0.0    0.0       0.0   
5       1    0.0          0.0   0.0      0.0   0.0  0.0    0.0       0.0   

   chevrolet    ...     saturn  scion  smart  subaru  suzuki  tesla  toyota  \
1        1.0    ...        0.0    0.0    0.0     0.0     0.0    0.0     0.0   
2        0.0    ...        0.0    0.0    0.0     0.0     0.0    0.0     0.0   
3        0.0    ...        0.0    0.0    0.0     0.0     0.0    0.0     0.0   
4        0.0    ...        0.0    0.0    0.0     0.0     0.0    0.0     0.0   
5        0.0    ...        0.0    0.0    0.0     0.0     0.0    0.0     0.0   

   volkswagen  volvo     price  
1         0.0    0.0  16907.46  
2 

(999, 59)

In [10]:
# split dataset into inputs and outputs
values = result.values
X = values[:,0:58]
y = values[:,58]
from sklearn.linear_model import LogisticRegression
model = LogisticRegression()
# evaluate an LDA model on the dataset using k-fold cross validation
#model = LinearDiscriminantAnalysis()
kfold = KFold(n_splits=3, random_state=7)

In [11]:
result = cross_val_score(model, X, y, cv=kfold, scoring='accuracy')
result.mean()



ValueError: Unknown label type: 'continuous'

77.3% is not bad but lets see if we can make it any better. 

## Preprocessing 

Insted of leaving the obviosly wrong zeros as they are, we will rename these as NAN.

In [None]:
result[['price']] = result[['price']].replace(0, np.NaN)
result.head(15)

## 'Imputation' method 1. Remove Rows With Missing Values

In [None]:
df1 = result.dropna(inplace=False)
df1.shape

In [None]:
# split dataset into inputs and outputs
values = df1.values
X = values[:,0:58]
y = values[:,58]
# evaluate an LDA model on the dataset using k-fold cross validation
model = LinearDiscriminantAnalysis()
kfold = KFold(n_splits=3, random_state=7)
result = cross_val_score(model, X, y, cv=kfold, scoring='accuracy')
result.mean()

78.4%... slightly better 

## Imputation method 2. Replacing cells with missing values with mean of column 

In [None]:
# split dataset into inputs and outputs
values = df.values
X = values[:,0:8]
y = values[:,8]
# fill missing values with mean column values
imputer = SimpleImputer(strategy='mean')
transformed_X = imputer.fit_transform(X)
# evaluate an LDA model on the dataset using k-fold cross validation
model = LinearDiscriminantAnalysis()
kfold = KFold(n_splits=3, random_state=7)
result = cross_val_score(model, transformed_X, y, cv=kfold, scoring='accuracy')
print(result.mean())

76.4%... slightly worse than both leaving the data with zeros and removing all rows with nans

## Imputation method 3. Replacing cells with missing values with median of column

In [None]:
# split dataset into inputs and outputs
values = df.values
X = values[:,0:8]
y = values[:,8]
# fill missing values with mean column values
imputer = SimpleImputer(strategy='median')
transformed_X = imputer.fit_transform(X)
# evaluate an LDA model on the dataset using k-fold cross validation
model = LinearDiscriminantAnalysis()
kfold = KFold(n_splits=3, random_state=7)
result = cross_val_score(model, transformed_X, y, cv=kfold, scoring='accuracy')
print(result.mean())

76.4%... The same as for the mean method 

## Imputation method 4. Replacing cells with missing values with mode of column

In [None]:
# split dataset into inputs and outputs
values = df.values
X = values[:,0:8]
y = values[:,8]
# fill missing values with mean column values
imputer = SimpleImputer(strategy='most_frequent')
transformed_X = imputer.fit_transform(X)
# evaluate an LDA model on the dataset using k-fold cross validation
model = LinearDiscriminantAnalysis()
kfold = KFold(n_splits=3, random_state=7)
result = cross_val_score(model, transformed_X, y, cv=kfold, scoring='accuracy')
print(result.mean())

76.2%... Even worse. 