# Monogram2-week8-notebook1

# Decision Trees and Random Forest


## Import Libraries
Let's import some libraries to get started!

In [30]:
import pandas as pd

## The Data



In [31]:
df = pd.read_csv('preprocessed_dataset.csv')

In [32]:
df.head()

Unnamed: 0.1,Unnamed: 0,PayloadMass,Flights,GridFins,Reused,Legs,Block,ReusedCount,Class,Orbit_ES-L1,...,Serial_B1048,Serial_B1049,Serial_B1050,Serial_B1051,Serial_B1054,Serial_B1056,Serial_B1058,Serial_B1059,Serial_B1060,Serial_B1062
0,0,6104.959412,1,0,0,0,1.0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1,525.0,1,0,0,0,1.0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,2,677.0,1,0,0,0,1.0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,3,500.0,1,0,0,0,1.0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,4,3170.0,1,0,0,0,1.0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


# Exploratory Data Analysis

Let's begin some exploratory data analysis! We'll start by checking out missing data!

## Missing Data



In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 90 entries, 0 to 89
Data columns (total 89 columns):
 #   Column                               Non-Null Count  Dtype  
---  ------                               --------------  -----  
 0   Unnamed: 0                           90 non-null     int64  
 1   PayloadMass                          90 non-null     float64
 2   Flights                              90 non-null     int64  
 3   GridFins                             90 non-null     int64  
 4   Reused                               90 non-null     int64  
 5   Legs                                 90 non-null     int64  
 6   Block                                90 non-null     float64
 7   ReusedCount                          90 non-null     int64  
 8   Class                                90 non-null     int64  
 9   Orbit_ES-L1                          90 non-null     int64  
 10  Orbit_GEO                            90 non-null     int64  
 11  Orbit_GTO                         

# Define X , y

In [5]:
X=df.drop('Class',axis=1)
y=df['Class']

Great! Our data is ready for our model!

# Building a Decision tree Model

Let's start by splitting our data into a training set and test set

## Train Test Split

In [6]:
from sklearn.model_selection import train_test_split

In [7]:
X_train, X_test, y_train, y_test = train_test_split(X, 
                                                    y, test_size=0.3, 
                                                    random_state=101)

## Training and Predicting

In [8]:
# Decision Tree classification algorithm
from sklearn.tree import DecisionTreeClassifier

In [9]:
tree = DecisionTreeClassifier()

In [10]:
tree.fit(X_train,y_train)

DecisionTreeClassifier()

In [11]:
predictions = tree.predict(X_test)

In [12]:
predictions

array([0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 1,
       1, 1, 0, 1, 1], dtype=int64)

In [13]:
y_test

50    0
6     1
51    0
54    1
53    1
69    1
32    1
31    1
21    1
88    1
43    1
47    0
3     0
1     0
74    0
16    1
45    0
25    1
2     0
13    0
56    1
76    0
73    1
41    1
14    0
23    1
37    1
Name: Class, dtype: int64

Let's move on to evaluate our model!

## Evaluation

Let's bring Confusion Matrix!

In [14]:
from sklearn.metrics import confusion_matrix

In [15]:
confusion_matrix(y_test,predictions)

array([[ 9,  2],
       [ 1, 15]], dtype=int64)

We can check precision,recall,f1-score using classification report!

In [16]:
from sklearn.metrics import accuracy_score

In [17]:
accuracy_score(y_test,predictions, normalize=False)

24

In [18]:
accuracy_score(y_test,predictions, normalize=True)

0.8888888888888888

In [19]:
from sklearn.metrics import classification_report

In [20]:
print(classification_report(y_test,predictions))

              precision    recall  f1-score   support

           0       0.90      0.82      0.86        11
           1       0.88      0.94      0.91        16

    accuracy                           0.89        27
   macro avg       0.89      0.88      0.88        27
weighted avg       0.89      0.89      0.89        27




## Grid Search for Decision Tree!

In [21]:
# Allows us to test parameters of classification algorithms and find the best one
from sklearn.model_selection import GridSearchCV

In [22]:
tree_1 = DecisionTreeClassifier()

In [23]:
parameters = {'min_samples_leaf': [1, 2, 4],
     'min_samples_split': [2, 5]}

In [24]:
tree_cv = GridSearchCV(tree_1, parameters)
tree_cv.fit(X_train, y_train)

GridSearchCV(estimator=DecisionTreeClassifier(),
             param_grid={'min_samples_leaf': [1, 2, 4],
                         'min_samples_split': [2, 5]})

In [25]:
print("tuned hpyerparameters :(best parameters) ",tree_cv.best_params_)

tuned hpyerparameters :(best parameters)  {'min_samples_leaf': 1, 'min_samples_split': 5}


In [26]:
tree_1 = DecisionTreeClassifier(min_samples_leaf= 1, min_samples_split= 5)

In [27]:
tree_1.fit(X_train,y_train)

DecisionTreeClassifier(min_samples_split=5)

In [28]:
predictions_1 = tree_1.predict(X_test)

In [29]:
confusion_matrix(y_test,predictions_1)

array([[ 9,  2],
       [ 1, 15]], dtype=int64)

In [30]:
accuracy_score(y_test,predictions_1, normalize=False)

24

In [31]:
accuracy_score(y_test,predictions_1, normalize=True)

0.8888888888888888

# Building a Random Forest Model

In [32]:
from sklearn.ensemble import RandomForestClassifier
rfc = RandomForestClassifier()
rfc.fit(X_train, y_train)

RandomForestClassifier()

In [33]:
rfc_pred = rfc.predict(X_test)

In [34]:
print(confusion_matrix(y_test,rfc_pred))

[[ 8  3]
 [ 0 16]]


In [35]:
accuracy_score(y_test,predictions, normalize=False)

24

In [36]:
accuracy_score(y_test,predictions, normalize=True)

0.8888888888888888

In [37]:
print(classification_report(y_test,rfc_pred))

              precision    recall  f1-score   support

           0       1.00      0.73      0.84        11
           1       0.84      1.00      0.91        16

    accuracy                           0.89        27
   macro avg       0.92      0.86      0.88        27
weighted avg       0.91      0.89      0.88        27



## Grid Search for Random Forest!

In [38]:
rfc_1 = RandomForestClassifier()

In [39]:
parameters = {'min_samples_leaf': [1, 2, 4],
     'min_samples_split': [2, 5, 10],  'n_estimators': [10,20,30] }

In [40]:
rfc_cv = GridSearchCV(rfc_1, parameters)
rfc_cv.fit(X_train, y_train)

GridSearchCV(estimator=RandomForestClassifier(),
             param_grid={'min_samples_leaf': [1, 2, 4],
                         'min_samples_split': [2, 5, 10],
                         'n_estimators': [10, 20, 30]})

In [41]:
print("tuned hpyerparameters :(best parameters) ",rfc_cv.best_params_)

tuned hpyerparameters :(best parameters)  {'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 10}


In [42]:
rfc_1 = RandomForestClassifier( n_estimators= 10, min_samples_leaf= 1, min_samples_split= 2)

In [43]:
rfc_1.fit(X_train,y_train)

RandomForestClassifier(n_estimators=10)

In [44]:
predictions_1 = rfc_1.predict(X_test)

In [45]:
confusion_matrix(y_test,predictions_1)

array([[11,  0],
       [ 1, 15]], dtype=int64)

In [46]:
accuracy_score(y_test,predictions_1, normalize=False)

26

In [47]:
accuracy_score(y_test,predictions_1, normalize=True)

0.9629629629629629

# Good Job!