# Decision Tree & Random Forest - Use Case

## To visualize the decision tree and random forest estimators, install Graphviz
https://graphviz.org/download/

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.tree import DecisionTreeRegressor, DecisionTreeClassifier
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.metrics import mean_squared_error, accuracy_score, confusion_matrix, f1_score
import numpy as np
import os
from pathlib import Path
from os import system
from sklearn import tree
import seaborn as sns

# Current working directory when this cell runs; later cells build CSV and
# .dot file paths from it via Path(cwd, ...).
cwd = os.getcwd()

In [None]:
# IPython introspection: a trailing `?` shows the object's signature and docstring.
RandomForestRegressor?

In [None]:
# --- Regression on boston.csv: single decision tree vs. random forest ---
# The last column of the CSV is used as the target, all other columns as features.
df = pd.read_csv(Path(cwd, 'boston.csv'))
X = df.iloc[:, :-1]
y = df.iloc[:, -1]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=1)

dtr = DecisionTreeRegressor(max_depth=10)
rfr = RandomForestRegressor(max_depth=10, n_estimators=15, oob_score=True, bootstrap=True)

dtr.fit(X_train, y_train)
y_pred = dtr.predict(X_test)

rfr.fit(X_train, y_train)
y_predr = rfr.predict(X_test)

# ---- Decision tree metrics ----
print("RMSE for Decision Tree training is ", np.sqrt(mean_squared_error(y_train, dtr.predict(X_train))))

acc = np.sqrt(mean_squared_error(y_test, y_pred))
print("RMSE for Decision Tree Prediction is ", acc)

# Cross-validated R^2 on the training split (cross_val_score fits its own clones).
print('Train Accuracy for Decision Tree Regressor is :', cross_val_score(dtr, X_train, y_train, scoring='r2', cv=10).mean())

# R^2 of the tree fitted above, evaluated on the held-out split. Cross-validating
# on (X_test, y_test) would re-train on the test data instead of testing this model.
print('Test Accuracy for Decision Tree Regressor is :', dtr.score(X_test, y_test))

# ---- Random forest metrics ----
print("RMSE for Random Forest training is ", np.sqrt(mean_squared_error(y_train, rfr.predict(X_train))))

accr = np.sqrt(mean_squared_error(y_test, y_predr))
print("RMSE for Random Forest Prediction is ", accr)

print('Train Accuracy for Random Forest Regressor is :', cross_val_score(rfr, X_train, y_train, scoring='r2', cv=10).mean())

# Same reasoning as for the tree: score the fitted forest on the held-out split.
print('Test Accuracy for Random Forest Regressor is :', rfr.score(X_test, y_test))

In [None]:
# Cost-complexity pruning paths on the training split: one for the standalone
# tree and one for the first tree of the forest.
first_forest_tree = rfr.estimators_[0]

path = dtr.cost_complexity_pruning_path(X_train, y_train)
path_regressor = first_forest_tree.cost_complexity_pruning_path(X_train, y_train)

In [None]:
# Number of candidate alphas on the forest tree's pruning path.
forest_tree_alphas = path_regressor["ccp_alphas"]
len(forest_tree_alphas)

In [None]:
# Sweep every alpha on the decision tree's pruning path and record R^2 scores
# for the tree pruned at that alpha.
alphas = path["ccp_alphas"]
train_accuracies = []
test_accuracies = []

for alpha in alphas:
    dtr = DecisionTreeRegressor(ccp_alpha=alpha)
    dtr.fit(X_train, y_train)
    # Hold-out R^2 of the tree fitted on the training split. Cross-validating on
    # (X_test, y_test) would train on the test data and ignore the fit above.
    test_accuracies.append(dtr.score(X_test, y_test))
    # Cross-validated R^2 on the training split (cross_val_score fits its own clones).
    train_accuracies.append(cross_val_score(dtr, X_train, y_train, scoring='r2', cv=10).mean())

sns.lineplot(x=alphas, y=train_accuracies, label="Train accuracies")
sns.lineplot(x=alphas, y=test_accuracies, label="Test accuracies")

In [None]:
# Refit the decision tree with a fixed pruning strength (ccp_alpha=3).
dtr = DecisionTreeRegressor(max_depth=10, ccp_alpha=3)
dtr.fit(X_train, y_train)

y_pred_alpha = dtr.predict(X_test)
# R^2 of the fitted tree on the held-out split; cross-validating on the test
# split would train on test data rather than evaluate this model.
print("Test accuracy: ", dtr.score(X_test, y_test))
# Cross-validated R^2 on the training split.
print("Train accuracy: ", cross_val_score(dtr, X_train, y_train, scoring='r2', cv=10).mean())

In [None]:
# Write the current decision tree as a Graphviz .dot file, then render it to PNG.
# The render step shells out to the `dot` executable.
dotfile_path = Path(cwd, "dtree_regression.dot")
# Context manager guarantees the file is closed even if the export raises.
with open(dotfile_path, 'w') as dotfile:
    tree.export_graphviz(dtr, out_file=dotfile, feature_names=X.columns)
system("dot -Tpng dtree_regression.dot -o dtree_regression.png")

In [None]:
# Render every tree of the random forest to its own .dot / .png pair.
for i, estimator in enumerate(rfr.estimators_):

    dotfile_name = f"rfr_regressor_{i}.dot"
    png_file_name = f"rfr_regressor_{i}.png"

    dotfile_path = Path(cwd, dotfile_name)
    with open(dotfile_path, 'w') as dotfile:
        # Export the i-th forest member; exporting `dtr` here would produce the
        # same decision-tree image for every index.
        tree.export_graphviz(estimator, out_file=dotfile, feature_names=X.columns)
    system(f"dot -Tpng {dotfile_name} -o {png_file_name}")

In [None]:
# IPython introspection: a trailing `?` shows the object's signature and docstring.
RandomForestClassifier?

In [None]:
# --- Classification on heart.csv: decision tree vs. random forest ---
# Column 1 is used as the target and columns 2-4 as the features.
df = pd.read_csv(Path(cwd, "heart.csv"))
X = df.iloc[:, 2:5]
y = df.iloc[:, 1]

print(df.shape)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=1)

dtr = DecisionTreeClassifier(max_depth=10)
rfr = RandomForestClassifier(n_estimators=5, max_depth=5, n_jobs=-1, verbose=True, oob_score=True, bootstrap=True)

dtr.fit(X_train, y_train)
y_pred = dtr.predict(X_test)

rfr.fit(X_train, y_train)
y_predr = rfr.predict(X_test)

# Training-split predictions are computed once and reused by every metric below;
# with verbose=True each extra forest predict() also repeats its progress logging.
dtr_train_pred = dtr.predict(X_train)
rfr_train_pred = rfr.predict(X_train)

print(dtr)
print(rfr)

# ---- Accuracy ----
accr = accuracy_score(y_train, dtr_train_pred)
print("Train Accuracy for Decision Tree is ", accr)

acc = accuracy_score(y_test, y_pred)
print("Test Accuracy for Decision Tree is ", acc)


accr = accuracy_score(y_train, rfr_train_pred)
print("Train Accuracy for Random Forest is ", accr)

acc = accuracy_score(y_test, y_predr)
print("Test Accuracy for Random Forest is ", acc)


# ---- F1 and confusion matrices on the training split ----
print("Training F1 Score ", f1_score(y_train, dtr_train_pred))
print("Training F1 Score for random forest", f1_score(y_train, rfr_train_pred))
print("Training Confusion Matrix for decision tree ", confusion_matrix(y_train, dtr_train_pred))
print("Training Confusion Matrix for random forest ", confusion_matrix(y_train, rfr_train_pred))

# ---- Test split: tree F1, forest F1, tree confusion matrix ----
print(f1_score(y_test, y_pred))
print(f1_score(y_test, y_predr))
print(confusion_matrix(y_test, y_pred))

In [None]:
# Write the current decision tree as a Graphviz .dot file, then render it to PNG.
# The render step shells out to the `dot` executable.
dotfile_path = Path(cwd, "dtree_classification.dot")
# Context manager guarantees the file is closed even if the export raises.
with open(dotfile_path, 'w') as dotfile:
    tree.export_graphviz(dtr, out_file=dotfile, feature_names=X.columns)
system("dot -Tpng dtree_classification.dot -o dtree_classification.png")

In [None]:
# Render every tree of the random forest to its own .dot / .png pair.
for i, estimator in enumerate(rfr.estimators_):

    dotfile_name = f"rfr_classifiers_{i}.dot"
    png_file_name = f"rfr_classifiers_{i}.png"

    dotfile_path = Path(cwd, dotfile_name)
    with open(dotfile_path, 'w') as dotfile:
        # Export the i-th forest member; exporting `dtr` here would produce the
        # same decision-tree image for every index.
        tree.export_graphviz(estimator, out_file=dotfile, feature_names=X.columns)
    system(f"dot -Tpng {dotfile_name} -o {png_file_name}")

In [None]:
# seaborn is also imported in the first cell of this notebook.
import seaborn as sns

# Grid of pairwise plots over the columns of `df`.
sns.pairplot(df)

In [None]:
# Load the Titanic training data (path is relative to the working directory).
titanic_train = pd.read_csv("titanic_train.csv")

# Features: every column except the target and the Name / Ticket / Cabin columns.
excluded_columns = ["Survived", "Name", "Ticket", "Cabin"]
X = titanic_train.drop(columns=excluded_columns, errors="ignore")
# One-hot encode the two categorical features that are kept.
X = pd.get_dummies(X, columns=['Embarked', 'Sex'])
# Remaining missing values are replaced with 0.
X = X.fillna(0)
y = titanic_train["Survived"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=1)

# Decision tree: fit on the training split, predict the held-out split.
dtr = DecisionTreeClassifier(max_depth=10)
dtr.fit(X_train, y_train)
y_pred = dtr.predict(X_test)

# Random forest: same procedure.
rfr = RandomForestClassifier(n_estimators=20)
rfr.fit(X_train, y_train)
y_predr = rfr.predict(X_test)

In [None]:
# Preview only: one-hot encode Embarked, Sex and Cabin on the full frame.
# The result is displayed, not assigned, so titanic_train is unchanged.
pd.get_dummies(titanic_train, columns = ['Embarked', 'Sex', 'Cabin'])

In [None]:
# Histogram of the Sex column, grouped by the Survived column.
titanic_train.hist(column="Sex", by="Survived")

In [None]:
# Derive a `title` column from Name.
# NOTE(review): assumes names look like "Last, Title. First ..." - confirm against the data.

# Split once on the first comma. `n` must be passed by keyword (recent pandas
# rejects it positionally), and n=1 guarantees at most two resulting columns.
name_parts = titanic_train["Name"].str.split(",", n=1, expand=True)
titanic_train[["last_name", "first_name"]] = name_parts

# Split the remainder once on the first period: the text before it is the title.
title_parts = titanic_train["first_name"].str.split(".", n=1, expand=True)
titanic_train[["title", "first_name"]] = title_parts
# Strip the whitespace left over from the text that followed the comma.
titanic_train["title"] = titanic_train["title"].str.strip()

# Drop only the intermediate helper columns; `title` is kept because the
# following cell reads titanic_train["title"].
titanic_train.drop(columns=["last_name", "first_name"], inplace=True)

# Last expression of the cell, so the notebook displays the preview.
titanic_train["title"].head()

In [None]:
# Frequency of each value in the `title` column; requires that column to be
# present on titanic_train.
titanic_train["title"].value_counts()

In [None]:
# Predictions on the training split, computed once and reused by every metric below.
dtr_train_pred = dtr.predict(X_train)
rfr_train_pred = rfr.predict(X_train)

# ---- Accuracy: decision tree ----
accr = accuracy_score(y_train, dtr_train_pred)
print("Train Accuracy for Decision Tree is ", accr)
acc = accuracy_score(y_test, y_pred)
print("Test Accuracy for Decision Tree is ", acc)

# ---- Accuracy: random forest ----
accr = accuracy_score(y_train, rfr_train_pred)
print("Train Accuracy for Random Forest is ", accr)
acc = accuracy_score(y_test, y_predr)
print("Test Accuracy for Random Forest is ", acc)

# ---- F1 and confusion matrices on the training split ----
dtr_train_f1 = f1_score(y_train, dtr_train_pred)
print("Training F1 Score ", dtr_train_f1)
rfr_train_f1 = f1_score(y_train, rfr_train_pred)
print("Training F1 Score for random forest", rfr_train_f1)
dtr_train_cm = confusion_matrix(y_train, dtr_train_pred)
print("Training Confusion Matrix for decision tree ", dtr_train_cm)
rfr_train_cm = confusion_matrix(y_train, rfr_train_pred)
print("Training Confusion Matrix for random forest ", rfr_train_cm)

# ---- Test split: tree F1, forest F1, tree confusion matrix ----
for test_metric in (
    f1_score(y_test, y_pred),
    f1_score(y_test, y_predr),
    confusion_matrix(y_test, y_pred),
):
    print(test_metric)

In [None]:
# Display the column labels of the current feature matrix X.
X.columns

In [None]:
from sklearn.linear_model import LogisticRegression

# Fit a logistic regression with default settings on the current training split.
logmodel = LogisticRegression()
logmodel.fit(X_train, y_train)

# Predictions for both splits, computed once and reused by the metrics below.
y_pred_log = logmodel.predict(X_test)
y_train_pred_log = logmodel.predict(X_train)

# ---- Accuracy ----
acc = accuracy_score(y_test, y_pred_log)
print("Test Accuracy for Logistic Regression is ", acc)
acc = accuracy_score(y_train, y_train_pred_log)
print("Train Accuracy for Logistic Regression is ", acc)

# ---- Training split: F1 and confusion matrix ----
train_f1_log = f1_score(y_train, y_train_pred_log)
print("Training F1 Score ", train_f1_log)
train_cm_log = confusion_matrix(y_train, y_train_pred_log)
print("Training Confusion Matrix for Logistic Regression ", train_cm_log)

# ---- Test split: F1 and confusion matrix ----
print("Test F1 Score ", f1_score(y_test, y_pred_log))
print("Test Confusion Matrix for Logistic Regression ", confusion_matrix(y_test, y_pred_log))

# The same two test metrics again, printed without labels.
for unlabeled_metric in (f1_score(y_test, y_pred_log), confusion_matrix(y_test, y_pred_log)):
    print(unlabeled_metric)

In [None]:
# Grid of pairwise plots over the columns of titanic_train.
sns.pairplot(titanic_train)