In [1]:
from IPython.display import display
import numpy as np
import pandas as pd

### 0. DATA PREPARATION

In [2]:
df = pd.read_csv("car_fuel_efficiency.txt")
df

Unnamed: 0,engine_displacement,num_cylinders,horsepower,vehicle_weight,acceleration,model_year,origin,fuel_type,drivetrain,num_doors,fuel_efficiency_mpg
0,170,3.0,159.0,3413.433759,17.7,2003,Europe,Gasoline,All-wheel drive,0.0,13.231729
1,130,5.0,97.0,3149.664934,17.8,2007,USA,Gasoline,Front-wheel drive,0.0,13.688217
2,170,,78.0,3079.038997,15.1,2018,Europe,Gasoline,Front-wheel drive,0.0,14.246341
3,220,4.0,,2542.392402,20.2,2009,USA,Diesel,All-wheel drive,2.0,16.912736
4,210,1.0,140.0,3460.870990,14.4,2009,Europe,Gasoline,All-wheel drive,2.0,12.488369
...,...,...,...,...,...,...,...,...,...,...,...
9699,140,5.0,164.0,2981.107371,17.3,2013,Europe,Diesel,Front-wheel drive,,15.101802
9700,180,,154.0,2439.525729,15.0,2004,USA,Gasoline,All-wheel drive,0.0,17.962326
9701,220,2.0,138.0,2583.471318,15.1,2008,USA,Diesel,All-wheel drive,-1.0,17.186587
9702,230,4.0,177.0,2905.527390,19.4,2011,USA,Diesel,Front-wheel drive,1.0,15.331551


In [3]:
df.isnull().sum()

engine_displacement      0
num_cylinders          482
horsepower             708
vehicle_weight           0
acceleration           930
model_year               0
origin                   0
fuel_type                0
drivetrain               0
num_doors              502
fuel_efficiency_mpg      0
dtype: int64

In [4]:
# Fill missing values with zeros.
df = df.fillna(0.0)

In [5]:
df.isnull().sum()

engine_displacement    0
num_cylinders          0
horsepower             0
vehicle_weight         0
acceleration           0
model_year             0
origin                 0
fuel_type              0
drivetrain             0
num_doors              0
fuel_efficiency_mpg    0
dtype: int64

#### 0.1 Splitting the DataFrame using the train_test_split function

In [6]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction import DictVectorizer

In [7]:
train_full_df, test_df = train_test_split(df, test_size = 0.2, random_state = 1)
train_df, validation_df = train_test_split(train_full_df, test_size = 0.25, random_state = 1)

len(train_df), len(validation_df), len(test_df)

(5822, 1941, 1941)

In [8]:
# Removing the target variable from the datasets. 
y_train = train_df.fuel_efficiency_mpg.values
y_validation = validation_df.fuel_efficiency_mpg.values
y_test = test_df.fuel_efficiency_mpg.values

del train_df['fuel_efficiency_mpg']
del validation_df['fuel_efficiency_mpg']
del test_df['fuel_efficiency_mpg']

#### 0.2 Using DictVectorizer(sparse=True) to turn the dataframes into matrices {X_train, X_validation}.

In [9]:
train_dicts = train_df.to_dict(orient = 'records')
dv = DictVectorizer(sparse=True)
X_train = dv.fit_transform(train_dicts)

validation_dicts = validation_df.to_dict(orient = 'records')
X_validation = dv.transform(validation_dicts)
feature_matrix = dv.feature_names_

### QUESTION 1: TRAINING A DECISION TREE REGRESSOR

In [10]:
from sklearn.tree import DecisionTreeRegressor, export_text

In [11]:
treeRegressor = DecisionTreeRegressor(max_depth = 1)
treeRegressor.fit(X_train, y_train)

0,1,2
,criterion,'squared_error'
,splitter,'best'
,max_depth,1
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,
,random_state,
,max_leaf_nodes,
,min_impurity_decrease,0.0


In [12]:
y_prediction = treeRegressor.predict(X_validation)

In [13]:
print(export_text(treeRegressor, feature_names = feature_matrix))

|--- vehicle_weight <= 3022.11
|   |--- value: [16.88]
|--- vehicle_weight >  3022.11
|   |--- value: [12.94]



### QUESTION 2: TRAIN A RANDOM FOREST REGRESSOR

In [14]:
from sklearn.ensemble import RandomForestRegressor

In [15]:
forestRegressor = RandomForestRegressor(n_estimators = 10, random_state = 1, n_jobs = -1)
forestRegressor.fit(X_train, y_train)

0,1,2
,n_estimators,10
,criterion,'squared_error'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,1.0
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [16]:
from sklearn.metrics import root_mean_squared_error

In [17]:
y_forestPrediction = forestRegressor.predict(X_validation)

rmse = root_mean_squared_error(y_validation, y_forestPrediction)
print(f"The RMSE on the validation set is: {rmse}")

The RMSE on the validation set is: 0.45957772230927263


### QUESTION 3: N_ESTIMATORS

In [18]:
estimators = [num for num in range(10, 201, 10)]

for estimator in estimators:
    randomForest = RandomForestRegressor(n_estimators = estimator, random_state = 1)
    randomForest.fit(X_train, y_train)

    y_randomPrediction = randomForest.predict(X_validation)
    random_rmse = root_mean_squared_error(y_validation, y_randomPrediction)
    print(f"estimator: {estimator} => random_rmse: {random_rmse}")

estimator: 10 => random_rmse: 0.4595777223092726
estimator: 20 => random_rmse: 0.45359067251247054
estimator: 30 => random_rmse: 0.45168672575457125
estimator: 40 => random_rmse: 0.4487208301736997
estimator: 50 => random_rmse: 0.4466568972416094
estimator: 60 => random_rmse: 0.44545970260811213
estimator: 70 => random_rmse: 0.4451263244986996
estimator: 80 => random_rmse: 0.4449843119777284
estimator: 90 => random_rmse: 0.4448614906399875
estimator: 100 => random_rmse: 0.4446518680868042
estimator: 110 => random_rmse: 0.44357876439860233
estimator: 120 => random_rmse: 0.4439118681233817
estimator: 130 => random_rmse: 0.443702590396687
estimator: 140 => random_rmse: 0.4433549955101688
estimator: 150 => random_rmse: 0.44289761494219454
estimator: 160 => random_rmse: 0.4427612219659299
estimator: 170 => random_rmse: 0.44280146504730905
estimator: 180 => random_rmse: 0.44236195357041347
estimator: 190 => random_rmse: 0.4424939711220692
estimator: 200 => random_rmse: 0.4424785084688597


### QUESTION 4: max_depth

In [19]:
max_depths = [10, 15, 20, 25]
estimators = [num for num in range(10, 201, 10)]
scores = []

for max_depth in max_depths:
    for estimator in estimators:
        randomForest = RandomForestRegressor(n_estimators = estimator, max_depth = max_depth, random_state = 1, n_jobs = -1)
        randomForest.fit(X_train, y_train)
    
        y_randomPrediction = randomForest.predict(X_validation)
        random_rmse = root_mean_squared_error(y_validation, y_randomPrediction)
        scores.append(random_rmse)
        
    print(f"max_depth : {max_depth} => random_rmse: {random_rmse}")

max_depth : 10 => random_rmse: 0.4398451062550146
max_depth : 15 => random_rmse: 0.4423455848136944
max_depth : 20 => random_rmse: 0.4429049871977397
max_depth : 25 => random_rmse: 0.4424794105563504


### QUESTION 5: feature importance

In [34]:
from sklearn.inspection import permutation_importance

In [35]:
randomFR = RandomForestRegressor(n_estimators = 10, max_depth = 20, random_state = 1, n_jobs = -1)
randomFR.fit(X_train, y_train)

0,1,2
,n_estimators,10
,criterion,'squared_error'
,max_depth,20
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,1.0
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [36]:
feature_importance = randomForest.feature_importances_
standard_deviation = np.std([tree.feature_importances_ for tree in randomForest.estimators_], axis = 0)

perm_importance = permutation_importance(randomForest, X_train.columns, y_train.columns, n_repeats = 10, random_state = 1, n_jobs = -1)
importance_mean = perm_importance.importances_mean
importance_std = perm_importance.importances_std

AttributeError: 'csr_matrix' object has no attribute 'columns'

In [None]:
feature_names = df.columns
pd.DataFrame(feature_names)

### QUESTION 6: XGBoost

In [37]:
import xgboost as xgb

In [43]:
dTrain = xgb.DMatrix(X_train, label = y_train, feature_names = feature_matrix)
dValidation = xgb.DMatrix(X_validation, label = y_validation, feature_names = feature_matrix)

In [44]:
xgb_params = {
    'eta': 0.3,
    'max_depth': 6,
    'min_child_weight': 1,
    
    'objective': 'reg:squarederror',
    'nthread': 8,
    'eval_metric': 'auc',
    
    'seed': 1,
    'verbosity': 1,
}

watchlist = [(dTrain, 'train'), (dValidation, 'validation')]
model = xgb.train(xgb_params, dTrain, num_boost_round = 100, evals = watchlist, verbose_eval = 10)

[0]	train-auc:nan	validation-auc:nan


  score: str = model.eval_set(evals, epoch, self.metric, self._output_margin)


[10]	train-auc:nan	validation-auc:nan
[20]	train-auc:nan	validation-auc:nan
[30]	train-auc:nan	validation-auc:nan
[40]	train-auc:nan	validation-auc:nan
[50]	train-auc:nan	validation-auc:nan
[60]	train-auc:nan	validation-auc:nan
[70]	train-auc:nan	validation-auc:nan
[80]	train-auc:nan	validation-auc:nan
[90]	train-auc:nan	validation-auc:nan
[99]	train-auc:nan	validation-auc:nan


In [45]:
xgb_params = {
    'eta': 0.1,
    'max_depth': 6,
    'min_child_weight': 1,
    
    'objective': 'reg:squarederror',
    'nthread': 8,
    'eval_metric': 'auc',
    
    'seed': 1,
    'verbosity': 1,
}

watchlist = [(dTrain, 'train'), (dValidation, 'validation')]
model = xgb.train(xgb_params, dTrain, num_boost_round = 100, evals = watchlist, verbose_eval = 10)

[0]	train-auc:nan	validation-auc:nan
[10]	train-auc:nan	validation-auc:nan
[20]	train-auc:nan	validation-auc:nan


  score: str = model.eval_set(evals, epoch, self.metric, self._output_margin)


[30]	train-auc:nan	validation-auc:nan
[40]	train-auc:nan	validation-auc:nan
[50]	train-auc:nan	validation-auc:nan
[60]	train-auc:nan	validation-auc:nan
[70]	train-auc:nan	validation-auc:nan
[80]	train-auc:nan	validation-auc:nan
[90]	train-auc:nan	validation-auc:nan
[99]	train-auc:nan	validation-auc:nan
