# Decision Trees & Random Forests

In [1]:
###Checking which version of python is being used

from platform import python_version

print(python_version())

3.7.4


In [2]:
###Importing the necessary packages

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split

from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score

from sklearn.ensemble import RandomForestClassifier

from sklearn.pipeline import Pipeline

from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score
from sklearn.metrics import roc_curve
from sklearn.metrics import roc_auc_score

from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2

from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"
%config InlineBackend.figure_format ='retina'
%matplotlib inline

In [None]:
###Importing & checking the data

data = pd.read_csv("framingham.csv")
print(data.head())
print(data.shape)
print(data.dtypes)

In [None]:
### Representing the data

fig = plt.figure(figsize = (15,20));
ax = fig.gca();
data.hist(ax = ax);

In [None]:
###Checking correlation between data

data_corr = data.corr();
sns.heatmap(data_corr);

In [None]:
data = data.drop(['education','glucose'], axis=1);
print(data.head())
print(data.shape)

In [None]:
###Dropping missing data

print(data.isna().sum());
data = data.dropna();
print(data.isna().sum());
data.columns;


In [None]:
###Spliting features and target variable
target = data.iloc[:,-1]
features = data.iloc[:,:-1]

print(target.head())
print(target.shape)
print(features.head())
print(features.shape)

In [None]:
###Determining the top features using selectkbest, the higher the score the more likely they are linked

best = SelectKBest(score_func=chi2, k=10)
fit = best.fit(features, target)
data_scores = pd.DataFrame(fit.scores_)
data_columns = pd.DataFrame(features.columns)

feature_scores = pd.concat([data_columns,data_scores],axis=1) #Putting both into a single dataframe for clarity
feature_scores.columns = ['Feature','Score'] #Renaming columns

print(feature_scores.nlargest(10,'Score')) #Printing the top 10 features descending

In [None]:
###Plotting a histogram based on the top features

feature_scores = feature_scores.sort_values(by='Score', ascending=False);

sns.barplot(x = 'Feature', y = 'Score', data = feature_scores)

plt.box(False)
plt.title('Feature importance', fontsize=16)
plt.xlabel('Features', fontsize=14)
plt.ylabel('Importance', fontsize=14)
plt.xticks(fontsize=12, rotation=45, ha="right")
plt.yticks(fontsize=12)
plt.show();

In [None]:
###Making a list of top features

feature_list = feature_scores['Feature'].tolist()[0:10]

print(feature_list)

In [None]:
###Deleting data with less correlation with CHD from original dataframe

truncated_data = pd.concat([data[feature_list],data['TenYearCHD']],axis=1)

print(truncated_data.head())

In [None]:
###Seeing correlation between truncated data

truncated_corr = truncated_data.corr()
sns.heatmap(truncated_corr);

In [None]:
###Outlier detection

truncated_data.describe()
sns.pairplot(truncated_data);

In [None]:
###Boxplots

column_name = list(truncated_data.columns);


for i in range(len(column_name)):
    sns.boxplot(truncated_data['{}'.format(column_name[i])]);
    plt.show();


In [None]:
###Cleaning up the dataframe

def cleaning(dataframe, feature, upper):
    
    dataframe_original = dataframe
    dataframe = dataframe.drop(dataframe[dataframe['{}'.format(feature)] > upper].index);
    print('Shape: {} ---> {}'.format(dataframe_original.shape, dataframe.shape))
    return dataframe;

truncated_data = cleaning(truncated_data, 'totChol', 599);
truncated_data = cleaning(truncated_data, 'BMI', 50);
truncated_data = cleaning(truncated_data, 'cigsPerDay', 50);
truncated_data = cleaning(truncated_data, 'sysBP', 250);
print('Shape: {}' .format(truncated_data.shape));

In [None]:
###Feature scaling

scaler = MinMaxScaler(feature_range=(0,1)) 
scaled_data = pd.DataFrame(scaler.fit_transform(truncated_data), columns=truncated_data.columns)

print(scaled_data.head())

In [None]:
###Test
y = scaled_data.iloc[:,-1]
x = scaled_data.iloc[:,:-1]

print(y.head())
print(x.head())

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.2, random_state=1)

print(len(x_train), len(x_test))

In [None]:
###Resampling dataset
target_count = scaled_data.TenYearCHD.value_counts()

print(target_count[0])
print(target_count[1])
print(round(target_count[0] / target_count[1], 2), ': 1')

sns.countplot(scaled_data.TenYearCHD)
plt.box(False)
plt.xlabel('Heart Disease No/Yes',fontsize=11)
plt.ylabel('Patient Count',fontsize=11)
plt.title('Count Outcome Heart Disease\n')
plt.show();

In [None]:
shuffled_data = scaled_data.sample(frac=1,random_state=1)

CHD_data = shuffled_data.loc[shuffled_data['TenYearCHD'] == 1]
non_CHD_data = shuffled_data.loc[shuffled_data['TenYearCHD'] == 0].sample(n=target_count[1], random_state = 1)

print(CHD_data.shape)
print(non_CHD_data.shape)



In [None]:
normalised_data = pd.concat([CHD_data, non_CHD_data])
normalised_data.TenYearCHD.value_counts()

normalised_target_count = normalised_data.TenYearCHD.value_counts()

sns.countplot(normalised_data.TenYearCHD)
print(round(normalised_target_count[0] / normalised_target_count[1], 2), ': 1')
plt.box(False)
plt.xlabel('Heart Disease No/Yes',fontsize=11)
plt.ylabel('Patient Count',fontsize=11)
plt.title('Count Outcome Heart Disease\n')
plt.show();

In [None]:
y_train = normalised_data.iloc[:,-1]
x_train = normalised_data.iloc[:,:-1]

In [None]:
def scores(model, y_test, pred):
    accuracy = accuracy_score(y_test, pred)
    f1 = f1_score(y_test, pred)
    precision = precision_score(y_test, pred)
    recall = recall_score (y_test, pred)
    
    print("The accuracy score for {} is: {}%.".format(model, round(accuracy,3)*100))
    print("The f1 score for {} is: {}%.".format(model, round(f1,3)*100))
    print("The precision score for {} is: {}%.".format(model, round(precision,3)*100))
    print("The recall score for {} is: {}%.".format(model, round(recall,3)*100))
    
    return(accuracy, f1, precision, recall)
    

In [None]:
def confusion(dataframe, name):
    sns.heatmap(pd.DataFrame(dataframe), annot=True , fmt='g')
    ax.xaxis.set_label_position("top")
    plt.title('Confusion matrix {}\n'.format(name), y=1.1)

In [None]:
def model(classifier, name):
    classifier.fit(x_train, y_train);
    classifier_pred = classifier.predict(x_test);
    confusion(confusion_matrix(y_test, classifier_pred), name);
    scores(name, y_test, classifier_pred);

In [None]:
dtc = DecisionTreeClassifier();
model(dtc, 'Decision Trees')

In [None]:
rfc = RandomForestClassifier(n_estimators= 100,bootstrap = True, max_features = 'sqrt');
model(rfc, 'Random Forest')

## Looking at the number of trees

In [3]:
from preprocessing_ml import *

Importing the preprocessing module for the Exeter NatSci Machine Learning Group.....
Successfully imported the preprocessing module


In [4]:
X_train, X_test, y_train, y_test = split_data(scale_data(drop_missing(chose_features(pd.read_csv("framingham.csv")))))

In [61]:
def scores(name, y_test, pred, display):
    accuracy = accuracy_score(y_test, pred);
    f1 = f1_score(y_test, pred);
    precision = precision_score(y_test, pred);
    recall = recall_score(y_test, pred);
    
    if display == 'yes':
        print("The accuracy score for {} is: {}%.".format(name, round(accuracy, 3) * 100))
        print("The f1 score for {} is: {}%.".format(name, round(f1, 3) * 100))
        print("The precision score for {} is: {}%.".format(name, round(precision, 3) * 100))
        print("The recall score for {} is: {}%.".format(name, round(recall, 3) * 100))
    else:
        
    

    return(accuracy, f1, precision, recall);

SyntaxError: 'continue' not properly in loop (<ipython-input-61-6678925b16d7>, line 16)

In [62]:
def model(classifier, name, X_train, X_test, y_train, y_test):
    classifier.fit(X_train, y_train);
    classifier_pred = classifier.predict(X_test);
    score = scores(name, y_test, classifier_pred, no);
    
    accuracy = score[0];
    f1 = score[1];
    precision = score[2];
    recall = score[3];
    
    return(accuracy, f1, precision, recall);

In [59]:
model(DecisionTreeClassifier(), 'Decision Tree', X_train, X_test, y_train, y_test)

NameError: name 'no' is not defined

In [44]:
model(RandomForestClassifier(n_estimators=10, bootstrap=True, max_features='sqrt'), 'Random Forest', X_train, X_test, y_train, y_test)

The accuracy score for Random Forest is: 83.3%.
The f1 score for Random Forest is: 19.7%.
The precision score for Random Forest is: 51.7%.
The recall score for Random Forest is: 12.2%.


(0.8333333333333334,
 0.19736842105263158,
 0.5172413793103449,
 0.12195121951219512)

In [46]:
i = 100

model(RandomForestClassifier(n_estimators=i, bootstrap=True, max_features='sqrt'), 'Random Forest', X_train, X_test, y_train, y_test)[0]

The accuracy score for Random Forest is: 82.8%.
The f1 score for Random Forest is: 11.3%.
The precision score for Random Forest is: 42.1%.
The recall score for Random Forest is: 6.5%.


0.8278688524590164

In [56]:
y = np.linspace(1,100,100);
#y = np.int(y)
print(y)
accuracy_list = []
for i in range(len(y)):
    accuracy_list.append(model(RandomForestClassifier(n_estimators=i+1, bootstrap=True, max_features='sqrt'), 'Random Forest', X_train, X_test, y_train, y_test)[0]);
    
print(accuracy_list)

[  1.   2.   3.   4.   5.   6.   7.   8.   9.  10.  11.  12.  13.  14.
  15.  16.  17.  18.  19.  20.  21.  22.  23.  24.  25.  26.  27.  28.
  29.  30.  31.  32.  33.  34.  35.  36.  37.  38.  39.  40.  41.  42.
  43.  44.  45.  46.  47.  48.  49.  50.  51.  52.  53.  54.  55.  56.
  57.  58.  59.  60.  61.  62.  63.  64.  65.  66.  67.  68.  69.  70.
  71.  72.  73.  74.  75.  76.  77.  78.  79.  80.  81.  82.  83.  84.
  85.  86.  87.  88.  89.  90.  91.  92.  93.  94.  95.  96.  97.  98.
  99. 100.]
The accuracy score for Random Forest is: 74.5%.
The f1 score for Random Forest is: 22.400000000000002%.
The precision score for Random Forest is: 22.900000000000002%.
The recall score for Random Forest is: 22.0%.
The accuracy score for Random Forest is: 81.69999999999999%.
The f1 score for Random Forest is: 11.799999999999999%.
The precision score for Random Forest is: 31.0%.
The recall score for Random Forest is: 7.3%.
The accuracy score for Random Forest is: 80.10000000000001%.
The f1

The accuracy score for Random Forest is: 83.2%.
The f1 score for Random Forest is: 7.5%.
The precision score for Random Forest is: 50.0%.
The recall score for Random Forest is: 4.1000000000000005%.
The accuracy score for Random Forest is: 82.89999999999999%.
The f1 score for Random Forest is: 15.0%.
The precision score for Random Forest is: 45.800000000000004%.
The recall score for Random Forest is: 8.9%.
The accuracy score for Random Forest is: 83.89999999999999%.
The f1 score for Random Forest is: 16.900000000000002%.
The precision score for Random Forest is: 63.2%.
The recall score for Random Forest is: 9.8%.
The accuracy score for Random Forest is: 83.5%.
The f1 score for Random Forest is: 15.4%.
The precision score for Random Forest is: 55.00000000000001%.
The recall score for Random Forest is: 8.9%.
The accuracy score for Random Forest is: 83.6%.
The f1 score for Random Forest is: 14.299999999999999%.
The precision score for Random Forest is: 58.8%.
The recall score for Random Fo

The accuracy score for Random Forest is: 83.3%.
The f1 score for Random Forest is: 11.600000000000001%.
The precision score for Random Forest is: 53.300000000000004%.
The recall score for Random Forest is: 6.5%.
The accuracy score for Random Forest is: 83.3%.
The f1 score for Random Forest is: 14.099999999999998%.
The precision score for Random Forest is: 52.6%.
The recall score for Random Forest is: 8.1%.
The accuracy score for Random Forest is: 82.8%.
The f1 score for Random Forest is: 10.0%.
The precision score for Random Forest is: 41.199999999999996%.
The recall score for Random Forest is: 5.7%.
The accuracy score for Random Forest is: 83.3%.
The f1 score for Random Forest is: 10.299999999999999%.
The precision score for Random Forest is: 53.800000000000004%.
The recall score for Random Forest is: 5.7%.
The accuracy score for Random Forest is: 83.1%.
The f1 score for Random Forest is: 12.7%.
The precision score for Random Forest is: 47.4%.
The recall score for Random Forest is: 7.