In [1]:
import numpy as np
import pandas as pd

from sklearn.ensemble import RandomForestClassifier
from sklearn.inspection import permutation_importance
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report

# Load the dataset from mpg_clean.csv into a dataframe and preview its first rows
df = pd.read_csv('mpg_clean.csv')
df.head()

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model_year,origin,name
0,18.0,8,307.0,130.0,3504,12.0,70,usa,chevrolet chevelle malibu
1,15.0,8,350.0,165.0,3693,11.5,70,usa,buick skylark 320
2,18.0,8,318.0,150.0,3436,11.0,70,usa,plymouth satellite
3,16.0,8,304.0,150.0,3433,12.0,70,usa,amc rebel sst
4,17.0,8,302.0,140.0,3449,10.5,70,usa,ford torino


In [2]:
# Show each column's dtype and non-null count to confirm there are no missing values
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 392 entries, 0 to 391
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   mpg           392 non-null    float64
 1   cylinders     392 non-null    int64  
 2   displacement  392 non-null    float64
 3   horsepower    392 non-null    float64
 4   weight        392 non-null    int64  
 5   acceleration  392 non-null    float64
 6   model_year    392 non-null    int64  
 7   origin        392 non-null    object 
 8   name          392 non-null    object 
dtypes: float64(4), int64(3), object(2)
memory usage: 27.7+ KB


In [3]:
# List the column labels available in the dataframe
df.columns

Index(['mpg', 'cylinders', 'displacement', 'horsepower', 'weight',
       'acceleration', 'model_year', 'origin', 'name'],
      dtype='object')

In [4]:
# Count how many rows fall under each value of the origin column
df.value_counts('origin')

origin
usa       245
japan      79
europe     68
Name: count, dtype: int64

In [5]:
# Summary statistics (count, mean, std, quartiles, min/max) for the numeric columns
df.describe()

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model_year
count,392.0,392.0,392.0,392.0,392.0,392.0,392.0
mean,23.445918,5.471939,194.41199,104.469388,2977.584184,15.541327,75.979592
std,7.805007,1.705783,104.644004,38.49116,849.40256,2.758864,3.683737
min,9.0,3.0,68.0,46.0,1613.0,8.0,70.0
25%,17.0,4.0,105.0,75.0,2225.25,13.775,73.0
50%,22.75,4.0,151.0,93.5,2803.5,15.5,76.0
75%,29.0,8.0,275.75,126.0,3614.75,17.025,79.0
max,46.6,8.0,455.0,230.0,5140.0,24.8,82.0


In [6]:
# Output feature y: the origin column, kept as a one-column dataframe
y = df[['origin']]
# Input features X: every column except the output feature and the car name
X = df.drop(columns=['origin', 'name'])

In [7]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
# NOTE(review): the split is not stratified on origin -- consider whether that matters
# given the uneven class counts.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=12,
)

In [32]:
# Forest hyperparameters (fixed here rather than read from input):
# number of decision trees and number of features considered at each split
estimators = 100
max_features = 3

# Build the random forest with out-of-bag scoring enabled and a random state of 123,
# then fit it on the training rows (the one-column target frame is flattened to 1-D)
rfModel = RandomForestClassifier(
    n_estimators=estimators,
    max_features=max_features,
    oob_score=True,
    random_state=123,
)
rfModel.fit(X_train, np.ravel(y_train))

# Prediction accuracy on the training rows and on the held-out rows
score_train = rfModel.score(X_train, y_train)
score_test = rfModel.score(X_test, y_test)
print(round(score_train, 4))
print(round(score_test, 4), '\n')

# Out-of-bag score recorded during fitting
print(rfModel.oob_score_, '\n')

# Permutation importance on the test rows, default parameters, random state of 123
result = permutation_importance(estimator=rfModel, X=X_test, y=y_test, random_state=123)

# Variable importance table, most important feature first
importance_table = pd.DataFrame(
    {
        'feature': rfModel.feature_names_in_,
        'permutation importance': result.importances_mean,
    }
)
importance_table = importance_table.sort_values(by='permutation importance', ascending=False)
print(importance_table)

1.0
0.9114 

0.8498402555910544 

        feature  permutation importance
2  displacement                0.534177
4        weight                0.146835
3    horsepower                0.083544
0           mpg                0.068354
5  acceleration                0.060759
6    model_year                0.022785
1     cylinders                0.015190


In [36]:
# Predict the origin of every held-out row, then tabulate the true labels (rows)
# against the predicted labels (columns)
y_pred = rfModel.predict(X_test)
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)
print(cm)

[[ 9  2  2]
 [ 1 21  1]
 [ 1  0 42]]


In [38]:
# Overall accuracy plus macro-averaged precision and recall
# (macro = plain mean of the per-class scores, every class counted equally)
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
precision_macro = precision_score(y_true=y_test, y_pred=y_pred, average='macro')
recall_macro = recall_score(y_true=y_test, y_pred=y_pred, average='macro')

print("Accuracy:", accuracy)
print("Macro Precision:", precision_macro)
print("Macro Recall:", recall_macro)

Accuracy: 0.9113924050632911
Macro Precision: 0.8881862099253404
Macro Recall: 0.8606984522050244


In [42]:
# Overall accuracy plus support-weighted precision and recall: each class's score
# is averaged with a weight proportional to how many test rows carry that label.
accuracy = accuracy_score(y_test, y_pred)
precision_weighted = precision_score(y_test, y_pred, average='weighted')
recall_weighted = recall_score(y_test, y_pred, average='weighted')

# Label the values with the averaging mode actually used (these are weighted
# averages, not macro averages). Weighted recall coincides with accuracy.
print("Accuracy:", accuracy)
print("Weighted Precision:", precision_weighted)
print("Weighted Recall:", recall_weighted)

Accuracy: 0.9113924050632911
Macro Precision: 0.9084771768316071
Macro Recall: 0.9113924050632911


In [9]:
# Names of the input columns for hand-built samples, listed in the same order
# as the values in each sample row
feature_names = [
    'mpg',
    'cylinders',
    'displacement',
    'horsepower',
    'weight',
    'acceleration',
    'model_year',
]

# One hypothetical car, wrapped in a single-row dataframe so it can be passed to predict
new_data = [[18, 8, 305, 132, 3504, 12, 70]]
new_data_df = pd.DataFrame(data=new_data, columns=feature_names)

In [10]:
# Display the single-row sample to check that the values line up with the column names
new_data_df.head()

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model_year
0,18,8,305,132,3504,12,70


In [11]:
# Ask the fitted forest for the predicted origin of the hand-built sample
new_prediction = rfModel.predict(new_data_df)
print(new_prediction)

['usa']


In [12]:
# Second hypothetical car, with values in feature_names order
new_data1 = [[30, 4, 100, 75, 2504, 12, 70]]
new_data1_df = pd.DataFrame(data=new_data1, columns=feature_names)
print(new_data1_df.head())

# Predicted origin for the second sample
new_prediction1 = rfModel.predict(new_data1_df)
print(new_prediction1)

   mpg  cylinders  displacement  horsepower  weight  acceleration  model_year
0   30          4           100          75    2504            12          70
['europe']
