In [4]:
import pandas as pd
from sklearn import tree
from sklearn.metrics import confusion_matrix, classification_report, plot_confusion_matrix
import pydotplus
from IPython.display import Image

diabetes_df = pd.read_csv("diabetes.csv")
diabetes_df.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [5]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

X = diabetes_df.drop('Outcome', axis=1)
y = diabetes_df['Outcome']

# Split into training and test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state=42, stratify=y)

#Standardize
sc= StandardScaler()
X_train=sc.fit_transform(X_train)
X_test=sc.fit_transform(X_test)

In [75]:
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier(n_estimators=100, max_depth=1,  random_state =42, min_samples_split=2, min_samples_leaf=5, 
                            min_weight_fraction_leaf=0.1, min_impurity_split=0.1)
#what is an estimator?  models

rf = rf.fit(X_train, y_train)
rf.score(X_test, y_test)



0.7012987012987013

In [76]:
predictions = rf.predict(X_test)
print(classification_report(y_test, predictions))

              precision    recall  f1-score   support

           0       0.69      0.99      0.81       100
           1       0.90      0.17      0.28        54

    accuracy                           0.70       154
   macro avg       0.79      0.58      0.55       154
weighted avg       0.76      0.70      0.63       154



## 1. 
Write simple (straightforward) definitions for the following parameters for RandomForestClassifier (https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClass ifier.html) and indicate how they correlate with the precision and recall for the basic diabetes model we built in class. You will need to rerun the model multiple times to do so.

**Estimators** - are the number of trees in the forest; the precision and recall improves with a bigger amount of trees and it drops with less trees.

**max_depth** - the maximum depth of the tree; increasing max depth increase my recall by a lot (to 0.99), but also decreased the precision by pretty a lot as well.

**min_samples_split**- the minimum number of samples required to split an internal node; higher min sample split increased the precision but kept the recall the same

**min_samples_leaf** - the minimum number of samples required to be at a leaf node. A split point at any depth will only be considered if it leaves at least min_samples_leaf training samples in each of the left and right branches. This may have the effect of smoothing the model, especially in regression; higher min sample leaf decreses the precision but increases the recall

**min_weight_fraction_leaf** - the minimum weighted fraction of the sum total of weights (of all the input samples) required to be at a leaf node. Samples have equal weight when sample_weight is not provided; higher min sample leaf decreses the precision but increases the recall as well.

**max_leaf_nodes** - changing the number for max_leaf_nodes didn't influence my precision and recall.


**min_impurity_decrease** - increasing min_impurity_decreases the precision but increases the recall.

**min_impurity_split** - slightly increasing min_impurity_split improves the precision but slightly descreases the recall.

## 2. 
How does setting bootstrap=False influence the model performance? Note: the default is bootstrap=True. Explain why your results might be so.

In [83]:
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier(n_estimators=100, random_state =42, bootstrap=False)

rf = rf.fit(X_train, y_train)
rf.score(X_test, y_test)

0.7467532467532467

In [84]:
predictions = rf.predict(X_test)
print(classification_report(y_test, predictions))

              precision    recall  f1-score   support

           0       0.79      0.83      0.81       100
           1       0.65      0.59      0.62        54

    accuracy                           0.75       154
   macro avg       0.72      0.71      0.72       154
weighted avg       0.74      0.75      0.74       154



The model performance score slightly goes down when we set bootstrap to false. The precision stays the same but recall decreases when we set bootstrap to False.