In [1]:
import numpy as np
import pandas as pd

In [2]:
#Fill value
#Remove missing data

In [3]:
data = pd.read_html('https://github.com/mrdbourke/zero-to-mastery-ml/blob/master/data/car-sales-extended-missing-data.csv')

In [5]:
car_sale = data[0].iloc[:,1:]

In [6]:
car_sale.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 5 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Make           951 non-null    object 
 1   Colour         950 non-null    object 
 2   Odometer (KM)  950 non-null    float64
 3   Doors          950 non-null    float64
 4   Price          950 non-null    float64
dtypes: float64(3), object(2)
memory usage: 39.2+ KB


In [8]:
car_sale.isna().sum() #All missing values

Make             49
Colour           50
Odometer (KM)    50
Doors            50
Price            50
dtype: int64

In [19]:
#Filling missing data with pandas

In [20]:
car_data = car_sale.copy()

In [21]:
car_data['Make'].fillna('missing',inplace=True)

In [22]:
car_data['Colour'].fillna('missing',inplace=True)

In [23]:
car_data['Odometer (KM)'].fillna(car_data['Odometer (KM)'].mean(),inplace=True)

In [24]:
car_data['Doors'].fillna(4,inplace=True)

In [25]:
car_data.isna().sum()

Make              0
Colour            0
Odometer (KM)     0
Doors             0
Price            50
dtype: int64

In [26]:
## Remove Rows with car missing values

In [27]:
 car_data.shape

(1000, 5)

In [28]:
car_data.dropna(inplace=True)

In [29]:
car_data.shape

(950, 5)

In [30]:
car_data.isna().sum()

Make             0
Colour           0
Odometer (KM)    0
Doors            0
Price            0
dtype: int64

In [31]:
#Creating Feature and labels with new data

In [99]:
feature_x = car_data.drop('Price',axis=1)

In [100]:
feature_x

Unnamed: 0,Make,Colour,Odometer (KM),Doors
0,Honda,White,35431.0,4.0
1,BMW,Blue,192714.0,5.0
2,Honda,White,84714.0,4.0
3,Toyota,White,154365.0,4.0
4,Nissan,Blue,181577.0,3.0
...,...,...,...,...
995,Toyota,Black,35820.0,4.0
996,missing,White,155144.0,3.0
997,Nissan,Blue,66604.0,4.0
998,Honda,White,215883.0,4.0


In [101]:
label_y = car_data['Price']

In [102]:
label_y

0      15323.0
1      19943.0
2      28343.0
3      13434.0
4      14043.0
        ...   
995    32042.0
996     5716.0
997    31570.0
998     4001.0
999    12732.0
Name: Price, Length: 950, dtype: float64

# 

In [36]:
# Convert to numeric

In [10]:
from sklearn.preprocessing import OneHotEncoder

In [11]:
from sklearn.compose import ColumnTransformer

In [12]:
categorical_feature = ['Make','Colour','Doors']

In [103]:
one_hot = OneHotEncoder()
transformer = ColumnTransformer( [('one_hot',
                                 one_hot,categorical_feature)],
                               remainder ='passthrough')

transformed_x = transformer.fit_transform(feature_x)
transformed_x

<950x15 sparse matrix of type '<class 'numpy.float64'>'
	with 3800 stored elements in Compressed Sparse Row format>

In [43]:
################ Filling Missing values with Scikit

In [44]:
car_sale.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 5 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Make           951 non-null    object 
 1   Colour         950 non-null    object 
 2   Odometer (KM)  950 non-null    float64
 3   Doors          950 non-null    float64
 4   Price          950 non-null    float64
dtypes: float64(3), object(2)
memory usage: 39.2+ KB


In [45]:
car_sale.isna().sum()

Make             49
Colour           50
Odometer (KM)    50
Doors            50
Price            50
dtype: int64

In [48]:
car_data2 = car_sale.copy()

In [46]:
#['Price'] is the label column

In [47]:
#Dropping all the missing values that is related with the label ['Price']

In [49]:
car_data2.dropna(subset=['Price'],inplace=True)


In [50]:
car_data2.isna().sum()

Make             47
Colour           46
Odometer (KM)    48
Doors            47
Price             0
dtype: int64

In [51]:
# Creating Feature and labels

In [58]:
feature_x2 = car_data2.drop('Price',axis=1)
label_y2 = car_data2['Price']

In [53]:
#########

In [54]:
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer

In [55]:
#fill values

In [56]:
category_imputer = SimpleImputer(strategy='constant',fill_value='missing')
door_imputer = SimpleImputer(strategy='constant',fill_value=4)
numeric_imputer = SimpleImputer(strategy='mean')


In [60]:
#Define columns
category_feature = ['Make','Colour']
door_feature = ['Doors']
numeric_feature = ['Odometer (KM)']


In [61]:
imputer = ColumnTransformer([
    ('category_imputer',category_imputer,category_feature),
    ('door_imputer',door_imputer,door_feature),
    ('numeric_imputer',numeric_imputer,numeric_feature)
])

#transform data
filled_x2 = imputer.fit_transform(feature_x2)
filled_x2

array([['Honda', 'White', 4.0, 35431.0],
       ['BMW', 'Blue', 5.0, 192714.0],
       ['Honda', 'White', 4.0, 84714.0],
       ...,
       ['Nissan', 'Blue', 4.0, 66604.0],
       ['Honda', 'White', 4.0, 215883.0],
       ['Toyota', 'Blue', 4.0, 248360.0]], dtype=object)

In [62]:
#creating a dataFrame using new filled_x2 values

In [64]:
feature_x2.columns

Index(['Make', 'Colour', 'Odometer (KM)', 'Doors'], dtype='object')

In [65]:
filled_data2 = pd.DataFrame(filled_x2,columns=['Make', 'Colour', 'Doors', 'Odometer'])

In [66]:
filled_data2.head()

Unnamed: 0,Make,Colour,Doors,Odometer
0,Honda,White,4,35431
1,BMW,Blue,5,192714
2,Honda,White,4,84714
3,Toyota,White,4,154365
4,Nissan,Blue,3,181577


In [67]:
filled_data2.isna().sum()

Make        0
Colour      0
Doors       0
Odometer    0
dtype: int64

In [68]:
filled_data2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 950 entries, 0 to 949
Data columns (total 4 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   Make      950 non-null    object
 1   Colour    950 non-null    object
 2   Doors     950 non-null    object
 3   Odometer  950 non-null    object
dtypes: object(4)
memory usage: 29.8+ KB


In [89]:
filled_data2['Odometer'] = filled_data2['Odometer'].astype(float)

In [90]:
filled_data2.head()

Unnamed: 0,Make,Colour,Doors,Odometer
0,Honda,White,4,35431.0
1,BMW,Blue,5,192714.0
2,Honda,White,4,84714.0
3,Toyota,White,4,154365.0
4,Nissan,Blue,3,181577.0


In [91]:
filled_data2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 950 entries, 0 to 949
Data columns (total 4 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Make      950 non-null    object 
 1   Colour    950 non-null    object 
 2   Doors     950 non-null    object 
 3   Odometer  950 non-null    float64
dtypes: float64(1), object(3)
memory usage: 29.8+ KB


In [72]:
label_y2.shape

(950,)

In [71]:
filled_data2.shape

(950, 4)

In [70]:
#converting category to numeric

In [73]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

In [74]:
categorical_feature = ['Make','Colour','Doors']
one_hot = OneHotEncoder()
transformer = ColumnTransformer([
    ('one_hot',
    one_hot,
    categorical_feature,
    )
], remainder='passthrough')

In [92]:
transformed_x2 = transformer.fit_transform(filled_data2)

In [93]:
transformed_x2

<950x15 sparse matrix of type '<class 'numpy.float64'>'
	with 3800 stored elements in Compressed Sparse Row format>

In [77]:
## Fitting the model

In [94]:
np.random.seed(1)
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split

x_train,x_test,y_train,y_test = train_test_split(transformed_x2,label_y2,test_size= 0.2)

model2 = RandomForestRegressor()
model2.fit(x_train,y_train) #fitting the model
model2.score(x_test,y_test) #accuracy test

0.25158720734485374

In [104]:
x2_train,x2_test,y2_train,y2_test = train_test_split(transformed_x,label_y,test_size=0.2)
## pandas missing

model1 = RandomForestRegressor()
model1.fit(x2_train,y2_train)
model1.score(x2_test,y2_test)

0.2340652782245759

In [86]:
#################################################################

In [105]:
car_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 950 entries, 0 to 999
Data columns (total 5 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Make           950 non-null    object 
 1   Colour         950 non-null    object 
 2   Odometer (KM)  950 non-null    float64
 3   Doors          950 non-null    float64
 4   Price          950 non-null    float64
dtypes: float64(3), object(2)
memory usage: 44.5+ KB


In [106]:
car_data2.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 950 entries, 0 to 999
Data columns (total 5 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Make           903 non-null    object 
 1   Colour         904 non-null    object 
 2   Odometer (KM)  902 non-null    float64
 3   Doors          903 non-null    float64
 4   Price          950 non-null    float64
dtypes: float64(3), object(2)
memory usage: 44.5+ KB
