# Ford used car analysis

Submitted by:
<br>Nabila Tajrin Bristy
<br>Dhaka, Bangladesh

#### Objective
- One-hot Encoding
- random state and stratification
- Decision Tree Classifier
- Confusion Matrix, Classification report, and ROC-AUC

#### Tasks:
1. Load the dataset
2. Explain the features and target variable
3. Deal with the missing values
4. Perform One-hot Encoding
5. Split the dataset into 80% training and 20% for testing. Add random state and stratification
6. Perform the training with Decision Tree Classifier
7. Show a tree diagram of the Decision Tree
8. Show the Confusion Matrix, Classification report, and ROC-AUC
9. Explain your outcome.


#### References:
- Dataset source: https://archive.ics.uci.edu/ml/datasets/heart+disease

### Import required libraries and packages

In [3]:
import pandas as pd #to load and manipulate data and for one-hot encoding
import numpy as np #to calculate the mean and standard deviation
from sklearn.model_selection import train_test_split

from sklearn.tree import DecisionTreeClassifier #to build a classification tree
from sklearn.tree import plot_tree # to draw a classification tree
from sklearn.metrics import plot_confusion_matrix # to create a confusion matrix
from sklearn.model_selection import GridSearchCV 

from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.metrics import roc_curve, roc_auc_score, precision_score, recall_score, f1_score

import matplotlib.pyplot as plt #to draw fraphs
import seaborn as sns

### Import the data

In [6]:
# read the ford.csv file 
df = pd.read_csv('processed.cleveland.data', header = None)

display(df.head(10))
print(df.shape)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13
0,63.0,1.0,1.0,145.0,233.0,1.0,2.0,150.0,0.0,2.3,3.0,0.0,6.0,0
1,67.0,1.0,4.0,160.0,286.0,0.0,2.0,108.0,1.0,1.5,2.0,3.0,3.0,2
2,67.0,1.0,4.0,120.0,229.0,0.0,2.0,129.0,1.0,2.6,2.0,2.0,7.0,1
3,37.0,1.0,3.0,130.0,250.0,0.0,0.0,187.0,0.0,3.5,3.0,0.0,3.0,0
4,41.0,0.0,2.0,130.0,204.0,0.0,2.0,172.0,0.0,1.4,1.0,0.0,3.0,0
5,56.0,1.0,2.0,120.0,236.0,0.0,0.0,178.0,0.0,0.8,1.0,0.0,3.0,0
6,62.0,0.0,4.0,140.0,268.0,0.0,2.0,160.0,0.0,3.6,3.0,2.0,3.0,3
7,57.0,0.0,4.0,120.0,354.0,0.0,0.0,163.0,1.0,0.6,1.0,0.0,3.0,0
8,63.0,1.0,4.0,130.0,254.0,0.0,2.0,147.0,0.0,1.4,2.0,1.0,7.0,2
9,53.0,1.0,4.0,140.0,203.0,1.0,2.0,155.0,1.0,3.1,3.0,0.0,7.0,1


(303, 14)


In [7]:
#print the first 5 rowa
df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13
0,63.0,1.0,1.0,145.0,233.0,1.0,2.0,150.0,0.0,2.3,3.0,0.0,6.0,0
1,67.0,1.0,4.0,160.0,286.0,0.0,2.0,108.0,1.0,1.5,2.0,3.0,3.0,2
2,67.0,1.0,4.0,120.0,229.0,0.0,2.0,129.0,1.0,2.6,2.0,2.0,7.0,1
3,37.0,1.0,3.0,130.0,250.0,0.0,0.0,187.0,0.0,3.5,3.0,0.0,3.0,0
4,41.0,0.0,2.0,130.0,204.0,0.0,2.0,172.0,0.0,1.4,1.0,0.0,3.0,0


In [9]:
df.columns = ['age',
              'sex',
              'cp',
              'restbp',
              'chol',
              'fbs',
              'restecg',
              'thalach',
              'exang',
              'oldpeak',
              'slope',
              'ca',
              'thal',
              'hd',
             ]

#print the first 5 rows
df.head()

Unnamed: 0,age,sex,cp,restbp,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,hd
0,63.0,1.0,1.0,145.0,233.0,1.0,2.0,150.0,0.0,2.3,3.0,0.0,6.0,0
1,67.0,1.0,4.0,160.0,286.0,0.0,2.0,108.0,1.0,1.5,2.0,3.0,3.0,2
2,67.0,1.0,4.0,120.0,229.0,0.0,2.0,129.0,1.0,2.6,2.0,2.0,7.0,1
3,37.0,1.0,3.0,130.0,250.0,0.0,0.0,187.0,0.0,3.5,3.0,0.0,3.0,0
4,41.0,0.0,2.0,130.0,204.0,0.0,2.0,172.0,0.0,1.4,1.0,0.0,3.0,0


### Identifying missing data

In [11]:
# dtypes tell us the 'data types' for each column
df.dtypes

age        float64
sex        float64
cp         float64
restbp     float64
chol       float64
fbs        float64
restecg    float64
thalach    float64
exang      float64
oldpeak    float64
slope      float64
ca          object
thal        object
hd           int64
dtype: object

In [12]:
# print out unique values in the column called 'ca'
df['ca'].unique()

array(['0.0', '3.0', '2.0', '1.0', '?'], dtype=object)

In [13]:
# print out unique values in the column called 'that'
df['thal'].unique()

array(['6.0', '3.0', '7.0', '?'], dtype=object)

# Missing Data Part 2: Dealing with missing data

In [15]:
len(df.loc[(df['ca'] == '?')
           |
           (df['thal'] == '?')])

6

In [17]:
df.loc[(df['ca'] == '?')
           |
           (df['thal'] == '?')]

Unnamed: 0,age,sex,cp,restbp,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,hd
87,53.0,0.0,3.0,128.0,216.0,0.0,2.0,115.0,0.0,0.0,1.0,0.0,?,0
166,52.0,1.0,3.0,138.0,223.0,0.0,0.0,169.0,0.0,0.0,1.0,?,3.0,0
192,43.0,1.0,4.0,132.0,247.0,1.0,2.0,143.0,1.0,0.1,2.0,?,7.0,1
266,52.0,1.0,4.0,128.0,204.0,1.0,0.0,156.0,1.0,1.0,2.0,0.0,?,2
287,58.0,1.0,2.0,125.0,220.0,0.0,0.0,144.0,0.0,0.4,2.0,?,7.0,0
302,38.0,1.0,3.0,138.0,175.0,0.0,0.0,173.0,0.0,0.0,1.0,?,3.0,0


Now, let's count the number of rows in the full datatset

In [18]:
len(df)

303

Let's remove the rows with all missing values by selecting all of the rows that do not contain question marks in either the 'ca' or 'thal' columns:

In [19]:
df_no_missing = df.loc[(df['ca'] != '?')
                      &
                      (df['thal'] != '?')]

Since, 'df_no_missing' has 6 fewer rows than the original df, it should have 297 rows.

In [20]:
len(df_no_missing)

297

Horray! The math works out. However, we can also make sure 'ca' no longer contains questions marks by printing its unique values:

In [21]:
df_no_missing['ca'].unique()

array(['0.0', '3.0', '2.0', '1.0'], dtype=object)

And, we can also do the same thing for 'thal':

In [22]:
df_no_missing['thal'].unique()

array(['6.0', '3.0', '7.0'], dtype=object)

### Separating the numeric features and target variable

In [11]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1229 entries, 0 to 1228
Data columns (total 1 columns):
3001 0 65 1 1 1 1    1229 non-null object
dtypes: object(1)
memory usage: 9.7+ KB


In [12]:
print(df.columns)

Index(['3001 0 65 1 1 1 1'], dtype='object')


In [21]:
features = ['3001', '0', '65', '1111']
target = ['price']

X = df[features]
y = df[target]

print(X.shape, y.shape)

KeyError: "None of [Index(['3001', '0', '65', '1111'], dtype='object')] are in the [columns]"

### Deal with the missing values

In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17964 entries, 0 to 17963
Data columns (total 8 columns):
model           17964 non-null object
year            17964 non-null int64
price           17964 non-null int64
transmission    17964 non-null object
mileage         17964 non-null int64
fuelType        17964 non-null object
mpg             17964 non-null float64
engineSize      17964 non-null float64
dtypes: float64(2), int64(3), object(3)
memory usage: 1.1+ MB


#### Target variable: 'hd'

In [9]:
print(df['price'].value_counts())

10000    164
11000    153
10500    148
12000    126
9000     118
        ... 
12449      1
12465      1
6324       1
16591      1
12282      1
Name: price, Length: 3511, dtype: int64


In [11]:
df[df['price'] > 1] = 1

print(df['price'].value_counts())

1    17964
Name: price, dtype: int64


In [12]:
display(df.describe())

Unnamed: 0,model,year,price,transmission,mileage,fuelType,mpg,engineSize
count,17964.0,17964.0,17964.0,17964.0,17964.0,17964.0,17964.0,17964.0
mean,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
std,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
min,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
25%,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
50%,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
75%,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


#### Create feature set and target

In [14]:
X = df.drop('price', axis=1)
y = df[['price']]

print(X.shape, y.shape)

(17964, 7) (17964, 1)


### Split the original dataset into the train set (80%) and the test set (20%)

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)

print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

### Perform Linear Regression and Predict the 'Price' from the test set

In [None]:
model = LinearRegression()
model = model.fit(X_train, y_train)

### Prediction

In [None]:
y_pred = model.predict(X_test)
print(y_pred)

In [None]:
print(y_test)

### Show the coefficients of your linear regression model for each feature and show the y-intercept value of your linear regression model. (Interpret the coefficients)

### Find the MAE, MAPE, MSE, RMSE, coefficient of determination values from the actual target variable and the predicted target variable.

In [None]:
actual_value = [1,2,3,4,5,6,7,8,9,10]
predicted_value = [1,3,4,5,6,5,6,5,8,9]

In [None]:
df = pd.DataFrame({"actual":actual_value, "predicted":predicted_value})

# find the difference between the actual value and the predicted value
df["dif"] = df["actual"] - df["predicted"]

# find the absolute difference between the actual value and the predicted value
df["abs_error"] = np.abs(df["dif"])

# find the squared difference between the actual value and the predicted value
df["squared_error"] = df["dif"]**2

# find the actual value minus the mean of the actual values
df["actual_subtract_mean"] = df["actual"] - df["actual"].mean()

# find the square of actual value minus the mean of the actual values
df["squared_actual_subtract_mean"] = df["actual_subtract_mean"]**2

display(df)

In [None]:
# mean absolute error: lower is better
MAE = df["abs_error"].mean()
print("mean absolute error = ", MAE)

# mean absolute percentage error: : lower is better
MAPE = np.round(np.mean(df["abs_error"]/df["actual"]) * 100, 2)
print("mean absolute percentage error = ", MAPE, "%")

# mean squared error: lower is better
MSE = df["squared_error"].mean()
print("mean squared error = ", MSE)

# root mean squared error: lower is better
RMSE = np.round(np.sqrt(MSE),2)
print("root mean squared error = ", RMSE)

# coefficient of determination: higher is better
r_squared = np.round(1 - df["squared_error"].sum()/df["squared_actual_subtract_mean"].sum(), 2)
print("coefficient of determination = ", r_squared)

In [None]:
# mean absolute error
MAE = mean_absolute_error(df["actual"], df["predicted"])
print("mean absolute error = ", MAE)

# mean squared error
MSE = mean_squared_error(df["actual"], df["predicted"])
print("mean squared error = ", MSE)

# root mean squared error
RMSE = sqrt(MSE)
print("root mean squared error = ", RMSE)

# coefficient of determination
r_squared = r2_score(df["actual"], df["predicted"]) 
print("coefficient of determination = ", r_squared)

### Compare the actual and predicted target variable through visualization

#### Load data

In [None]:
df = pd.read_csv("ford.csv")

display(df.head(10))
print(df.shape)
print(df.info())
print(df.columns)

#### One-hot Encoding

In [None]:
df = pd.get_dummies(df, drop_first=True)

#### Categorcial Encoding

In [None]:
# LE = LabelEncoder()

# df['transmission'] = LE.fit_transform(df['transmission'])
# df['fuelType'] = LE.fit_transform(df['fuelType'])
# df['model'] = LE.fit_transform(df['model'])
# df['year'] = LE.fit_transform(df['year'])

# df['transmission'] = df['transmission'].astype('category')
# df['fuelType'] = df['fuelType'].astype('category')
# df['model'] = df['model'].astype('category')
# df['year'] = df['year'].astype('category')

# display(df.head(10))
# print(df.info())

#### Separating the features and target variable

In [None]:
# only numeric features
# features = ['mileage']
# features = ['mileage', 'year']
# features = ['mileage', 'year', 'tax']
# features = ['mileage', 'year', 'tax', 'mpg']
# features = ['mileage', 'year', 'tax', 'mpg', 'engineSize']

target = ['price']

# X = df[features]
X = df.drop('price', axis=1)
y = df[target]

print(X.shape, y.shape)

### Perform the training with Decision Tree Classifier

#### Create train and test set

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

#### Training with Linear Regression

In [None]:
model = LinearRegression()
model = model.fit(X_train, y_train)

#### Linear Regression Coefficients and intercept

In [None]:
coefficients = pd.DataFrame({'features':X.columns, 'coefficients':np.squeeze(model.coef_)})
coefficients = coefficients.sort_values(by='coefficients')
display(coefficients)

In [None]:
model.intercept_

#### Prediction

In [None]:
y_pred = model.predict(X_test)

#### Prediction Error

In [None]:
# mean absolute error
MAE = mean_absolute_error(y_test, y_pred)
print("mean absolute error = ", MAE)

# mean squared error
MSE = mean_squared_error(y_test, y_pred)
print("mean squared error = ", MSE)

# root mean squared error
RMSE = sqrt(MSE)
print("root mean squared error = ", RMSE)

# coefficient of determination
r_squared = r2_score(y_test, y_pred) 
print("coefficient of determination = ", r_squared)

In [None]:
y_test['pred'] = y_pred
y_test['x'] = np.arange(0,y_test.shape[0],1).tolist()

plt.figure(figsize=(30,12))

sns.lineplot(x='x', y='price', data=y_test, label='actual target')
sns.lineplot(x='x', y='pred', data=y_test, label='predicted target')
plt.show()

In [None]:
plt.figure(figsize=(30,12))

sns.scatterplot(x='price', y='pred', data=y_test)

sns.scatterplot(x='price', y='price', data=y_test, color='red')
plt.show()

### Show a tree diagram of the Decision Tree

### ROC: Receiver Operating Characterisitcs and AUC: Area Under the Curve

In [None]:

fpr, tpr, thr = roc_curve(y_test['Churn'], 
                          y_test['probability'])
auc = np.round(roc_auc_score(y_test['Churn'], 
                             y_test['predicted_Churn']), 2)

plt.figure(figsize=(10, 8))
plt.plot(fpr, 
         tpr, 
         color='green', 
         lw=2, 
         label="Curve Area = " +str(auc))

plt.plot([0, 1], [0, 1], color='blue', lw=2, linestyle='--')
plt.xlabel('False Positive Rate (FPR)')
plt.ylabel('True Positive Rate (TPR)')
plt.title('ROC curve')
plt.legend(loc="lower right")
plt.show()