### Making data numerical
____

In [1]:
# import libraries
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.metrics import r2_score

In [2]:
# Read the car-price CSV (first column becomes the row index) and preview it.
cars = pd.read_csv(
    '../Data/carprice.csv',
    index_col=0,
)
cars.head(n=5)

Unnamed: 0,Type,Min.Price,Price,Max.Price,Range.Price,RoughRange,gpm100,MPG.city,MPG.highway
6,Midsize,14.2,15.7,17.3,3.1,3.09,3.8,22,31
7,Large,19.9,20.8,21.7,1.8,1.79,4.2,19,28
8,Large,22.6,23.7,24.9,2.3,2.31,4.9,16,25
9,Midsize,26.3,26.3,26.3,0.0,-0.01,4.3,19,27
10,Large,33.0,34.7,36.3,3.3,3.3,4.9,16,25


In [3]:
# Check the data types: print each column's dtype and non-null count,
# to see which columns are non-numeric and need encoding before modelling.
cars.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 48 entries, 6 to 79
Data columns (total 9 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   Type         48 non-null     object 
 1   Min.Price    48 non-null     float64
 2   Price        48 non-null     float64
 3   Max.Price    48 non-null     float64
 4   Range.Price  48 non-null     float64
 5   RoughRange   48 non-null     float64
 6   gpm100       48 non-null     float64
 7   MPG.city     48 non-null     int64  
 8   MPG.highway  48 non-null     int64  
dtypes: float64(6), int64(2), object(1)
memory usage: 3.8+ KB


In [4]:
# Per column: does it contain at least one missing value?
cars.isna().any()

Type           False
Min.Price      False
Price          False
Max.Price      False
Range.Price    False
RoughRange     False
gpm100         False
MPG.city       False
MPG.highway    False
dtype: bool

In [5]:
# Target: the 'Price' column, kept as a one-column frame.
# Features: every remaining column.
# NOTE(review): Min.Price / Max.Price / Range.Price / RoughRange stay in X and
# look closely tied to Price — confirm that using them as features is intended.
y = cars[['Price']]
X = cars.drop(columns='Price')
(X.shape, y.shape)

((48, 8), (48, 1))

In [6]:
# Transform X into numerical data: one-hot encode the categorical 'Type'
# column and pass every other column through unchanged.
one_hot = OneHotEncoder()
transformer = ColumnTransformer(
    transformers=[('one_hot', one_hot, ['Type'])],
    remainder='passthrough',
)
X_transformed = transformer.fit_transform(X)
pd.DataFrame(data=X_transformed).head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12
0,0.0,0.0,1.0,0.0,0.0,0.0,14.2,17.3,3.1,3.09,3.8,22.0,31.0
1,0.0,1.0,0.0,0.0,0.0,0.0,19.9,21.7,1.8,1.79,4.2,19.0,28.0
2,0.0,1.0,0.0,0.0,0.0,0.0,22.6,24.9,2.3,2.31,4.9,16.0,25.0
3,0.0,0.0,1.0,0.0,0.0,0.0,26.3,26.3,0.0,-0.01,4.3,19.0,27.0
4,0.0,1.0,0.0,0.0,0.0,0.0,33.0,36.3,3.3,3.3,4.9,16.0,25.0


In [7]:
# split the data
X_train, y_train, X_test, y_test = train_test_split(X_transformed, y, test_size=0.4, random_state=42)

In [8]:
# fit and score the model
clf = RandomForestRegressor().fit(X_train.T, y_train.T)
clf.score(X_train.T, y_train.T)

0.9787383899817865

In [9]:
# score the model on the test data
r2_score(clf.predict(X_test.T).reshape(-1,1), y_test)

0.92189734404879

<br>

___
### End.