In [1]:
#Importing required libraries
import numpy as np
import pandas as pd

In [37]:
# Reading in the data using read_csv method
df = pd.read_csv('car-sales-extended.csv')
df.head(10)

Unnamed: 0,Make,Colour,Odometer (KM),Doors,Price
0,Honda,White,35431,4,15323
1,BMW,Blue,192714,5,19943
2,Honda,White,84714,4,28343
3,Toyota,White,154365,4,13434
4,Nissan,Blue,181577,3,14043
5,Honda,Red,42652,4,23883
6,Toyota,Blue,163453,4,8473
7,Honda,White,43120,4,20306
8,Nissan,White,130538,4,9374
9,Honda,Blue,51029,4,26683


In [6]:
# Check for missing values
df.isnull().sum()

Make             0
Colour           0
Odometer (KM)    0
Doors            0
Price            0
dtype: int64

In [38]:
# intiating X and Y
x = df.drop('Price',1)
y = df['Price']

In [39]:
df['Price'].dtype

dtype('int64')

In [40]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

In [41]:
x.columns

Index(['Make', 'Colour', 'Odometer (KM)', 'Doors'], dtype='object')

In [42]:
df['Doors'].value_counts()

4    856
5     79
3     65
Name: Doors, dtype: int64

In [43]:
# Onehotencoder and columntranformer to convert categorical data into encoded data
onehot = OneHotEncoder()
cats = ['Make','Colour','Doors']

transformer = ColumnTransformer([("onehot",
                               onehot,
                               cats)],
                               remainder='passthrough')

In [44]:
tranx = transformer.fit_transform(x)

In [45]:
x.head(10)

Unnamed: 0,Make,Colour,Odometer (KM),Doors
0,Honda,White,35431,4
1,BMW,Blue,192714,5
2,Honda,White,84714,4
3,Toyota,White,154365,4
4,Nissan,Blue,181577,3
5,Honda,Red,42652,4
6,Toyota,Blue,163453,4
7,Honda,White,43120,4
8,Nissan,White,130538,4
9,Honda,Blue,51029,4


In [46]:
tranx

array([[0.00000e+00, 1.00000e+00, 0.00000e+00, ..., 1.00000e+00,
        0.00000e+00, 3.54310e+04],
       [1.00000e+00, 0.00000e+00, 0.00000e+00, ..., 0.00000e+00,
        1.00000e+00, 1.92714e+05],
       [0.00000e+00, 1.00000e+00, 0.00000e+00, ..., 1.00000e+00,
        0.00000e+00, 8.47140e+04],
       ...,
       [0.00000e+00, 0.00000e+00, 1.00000e+00, ..., 1.00000e+00,
        0.00000e+00, 6.66040e+04],
       [0.00000e+00, 1.00000e+00, 0.00000e+00, ..., 1.00000e+00,
        0.00000e+00, 2.15883e+05],
       [0.00000e+00, 0.00000e+00, 0.00000e+00, ..., 1.00000e+00,
        0.00000e+00, 2.48360e+05]])

In [52]:
# splitting data into training and test set
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test = train_test_split(tranx,y,test_size=0.2,random_state=42)

In [53]:
x_train

array([[0.00000e+00, 0.00000e+00, 0.00000e+00, ..., 1.00000e+00,
        0.00000e+00, 1.12004e+05],
       [0.00000e+00, 0.00000e+00, 0.00000e+00, ..., 1.00000e+00,
        0.00000e+00, 3.56730e+04],
       [0.00000e+00, 0.00000e+00, 0.00000e+00, ..., 1.00000e+00,
        0.00000e+00, 1.46824e+05],
       ...,
       [0.00000e+00, 0.00000e+00, 1.00000e+00, ..., 1.00000e+00,
        0.00000e+00, 6.18760e+04],
       [0.00000e+00, 1.00000e+00, 0.00000e+00, ..., 1.00000e+00,
        0.00000e+00, 2.86250e+04],
       [0.00000e+00, 1.00000e+00, 0.00000e+00, ..., 1.00000e+00,
        0.00000e+00, 1.50582e+05]])

In [56]:
# regression model for this project
from sklearn.ensemble import RandomForestRegressor
model = RandomForestRegressor(100)

In [57]:
model.fit(x_train,y_train)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
                      max_features='auto', max_leaf_nodes=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=1, min_samples_split=2,
                      min_weight_fraction_leaf=0.0, n_estimators=100,
                      n_jobs=None, oob_score=False, random_state=None,
                      verbose=0, warm_start=False)

In [59]:
preds=model.predict(x_test)

In [60]:
# looks like pretty good score ......
model.score(x_test,y_test)

0.30491084565475046

## Handling missing data

In [20]:
df = pd.read_csv('car-sales-extended-missing-data.csv')

In [3]:
df.head(10)

Unnamed: 0,Make,Colour,Odometer (KM),Doors,Price
0,Honda,White,35431.0,4.0,15323.0
1,BMW,Blue,192714.0,5.0,19943.0
2,Honda,White,84714.0,4.0,28343.0
3,Toyota,White,154365.0,4.0,13434.0
4,Nissan,Blue,181577.0,3.0,14043.0
5,Honda,Red,42652.0,4.0,23883.0
6,Toyota,Blue,163453.0,4.0,8473.0
7,Honda,White,,4.0,20306.0
8,,White,130538.0,4.0,9374.0
9,Honda,Blue,51029.0,4.0,26683.0


In [7]:
df.isnull().sum()

Make             49
Colour           50
Odometer (KM)    50
Doors            50
Price            50
dtype: int64

In [10]:
X = df.drop('Price',1)
y = df['Price']

In [12]:
df.head()

Unnamed: 0,Make,Colour,Odometer (KM),Doors,Price
0,Honda,White,35431.0,4.0,15323.0
1,BMW,Blue,192714.0,5.0,19943.0
2,Honda,White,84714.0,4.0,28343.0
3,Toyota,White,154365.0,4.0,13434.0
4,Nissan,Blue,181577.0,3.0,14043.0


In [14]:
df['Make'].isnull().sum()

49

In [30]:
# filling missing data in make and colours columns with "missing" and with the mean of odometer column for Odometer column
df['Make'].fillna('missing',inplace=True)
df['Colour'].fillna('missing',inplace=True)
df['Doors'].fillna(4,inplace=True)
df['Odometer (KM)'].fillna(df['Odometer (KM)'].mean(),inplace=True)

In [38]:
df.isnull().sum()

Make             0
Colour           0
Odometer (KM)    0
Doors            0
Price            0
dtype: int64

In [37]:
df.dropna(inplace=True)

In [51]:
#converting categorical feaatures into numbers
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

#importing done now the actual work
categorical_features = ["Make", "Colour", "Doors"]
one_hot = OneHotEncoder()
transformer = ColumnTransformer([("one_hot",
                                   one_hot,
                                   categorical_features)],
                                   remainder="passthrough")
transx = transformer.fit_transform(X)
transx

<1000x15 sparse matrix of type '<class 'numpy.float64'>'
	with 4000 stored elements in Compressed Sparse Row format>

## Filling missing data using sci-kit learn

In [98]:
df = pd.read_csv('car-sales-extended-missing-data.csv')

In [56]:
df.head(10)

Unnamed: 0,Make,Colour,Odometer (KM),Doors,Price
0,Honda,White,35431.0,4.0,15323.0
1,BMW,Blue,192714.0,5.0,19943.0
2,Honda,White,84714.0,4.0,28343.0
3,Toyota,White,154365.0,4.0,13434.0
4,Nissan,Blue,181577.0,3.0,14043.0
5,Honda,Red,42652.0,4.0,23883.0
6,Toyota,Blue,163453.0,4.0,8473.0
7,Honda,White,,4.0,20306.0
8,,White,130538.0,4.0,9374.0
9,Honda,Blue,51029.0,4.0,26683.0


In [99]:
# Check for missing data

df.isnull().sum()

Make             49
Colour           50
Odometer (KM)    50
Doors            50
Price            50
dtype: int64

In [58]:
# lot many!!!!

In [101]:
#splitting data into dependent(X) and independent(y)
X = df.drop('Price',1)
y = df.Price

In [102]:
#splitting data into training and and testing data
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test = train_test_split(X,y,test_size=0.2)

In [103]:
# Filling the missing values using sci-kit learn
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer

# Initialize the imputer function
cate = SimpleImputer(strategy='constant',fill_value='missing')
doors = SimpleImputer(strategy='constant',fill_value=4)
numbs = SimpleImputer(strategy='mean')

# defining the vat
cat_f = ['Make','Colour']
door = ['Doors']
num = ['Odometer (KM)']

# This is going to fill in the missing data
imputer = ColumnTransformer([
    ("cat_imputer", cate,cat_f),
    ('doors_imputer',doors,door),
    ('num_imp',numbs,num)
])

# fitting and tranforming X
x_train = imputer.fit_transform(x_train)
x_test = imputer.transform(x_test)

In [104]:
x_train

array([['Honda', 'Red', 4.0, 101835.0],
       ['Nissan', 'White', 3.0, 149347.0],
       ['Nissan', 'Blue', 3.0, 197823.0],
       ...,
       ['BMW', 'White', 5.0, 16274.0],
       ['Nissan', 'White', 4.0, 209259.0],
       ['Toyota', 'White', 4.0, 202435.0]], dtype=object)

In [105]:
# Now the x train nd x_test are in nd.array , should convert it to DataFrame!!

training_data = pd.DataFrame(x_train,
                            columns=['Make','Colour','Doors','Odometer (KM)'])

testing_data = pd.DataFrame(x_test,
                           columns=['Make','Colour','Doors','Odometer (KM)'])

In [106]:
training_data.head(3)

Unnamed: 0,Make,Colour,Doors,Odometer (KM)
0,Honda,Red,4,101835
1,Nissan,White,3,149347
2,Nissan,Blue,3,197823


In [107]:
training_data.isnull().sum()

Make             0
Colour           0
Doors            0
Odometer (KM)    0
dtype: int64

In [108]:
#Encoding the data
from sklearn.preprocessing import OneHotEncoder
one_hot = OneHotEncoder()

cats = ['Make','Doors','Colour']
trans = ColumnTransformer([('one-hot',
                          one_hot,
                          cats)],
                         remainder='passthrough')


final_x_train = trans.fit_transform(training_data)
final_x_test = trans.fit_transform(testing_data)

In [109]:
final_x_train.toarray()

array([[0.00000e+00, 1.00000e+00, 0.00000e+00, ..., 0.00000e+00,
        0.00000e+00, 1.01835e+05],
       [0.00000e+00, 0.00000e+00, 1.00000e+00, ..., 1.00000e+00,
        0.00000e+00, 1.49347e+05],
       [0.00000e+00, 0.00000e+00, 1.00000e+00, ..., 0.00000e+00,
        0.00000e+00, 1.97823e+05],
       ...,
       [1.00000e+00, 0.00000e+00, 0.00000e+00, ..., 1.00000e+00,
        0.00000e+00, 1.62740e+04],
       [0.00000e+00, 0.00000e+00, 1.00000e+00, ..., 1.00000e+00,
        0.00000e+00, 2.09259e+05],
       [0.00000e+00, 0.00000e+00, 0.00000e+00, ..., 1.00000e+00,
        0.00000e+00, 2.02435e+05]])

In [110]:
from sklearn.ensemble import RandomForestRegressor

model = RandomForestRegressor(n_estimators=100)
model.fit(final_x_train,y_train)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
                      max_features='auto', max_leaf_nodes=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=1, min_samples_split=2,
                      min_weight_fraction_leaf=0.0, n_estimators=100,
                      n_jobs=None, oob_score=False, random_state=None,
                      verbose=0, warm_start=False)

In [100]:
#Dropping all the data points with no label
df.dropna(subset=["Price"], inplace=True)
df.isna().sum()

Make             47
Colour           46
Odometer (KM)    48
Doors            47
Price             0
dtype: int64

In [111]:
model.score(final_x_test,y_test)

0.2890294296872361

## Choosing the best ML model