In [1]:
## Advantages of Random Forest......

# Low variance:- multiple decision trees
# Reduce overfitting:- uses bootstrap aggregation
# Normalization not needed:- works on rule based approach
# Good accuracy:- generalizes well

# It is suitable for both classification and regression problems.
# Works well with both categorical and continuous data.
# Performs well on large datasets.


## Disadvantages of Random Forest.....
# More training time is required
# Interpretation is complex
# More Memory Utilization
# Computationally expensive

In [2]:
import pandas as pd
import numpy as np
import seaborn as sns

In [3]:
df = pd.read_csv("D:\CSV Files\penguins.csv")
df.head()

Unnamed: 0,species,island,culmen_length_mm,culmen_depth_mm,flipper_length_mm,body_mass_g,sex
0,Adelie,Torgersen,39.1,18.7,181.0,3750.0,MALE
1,Adelie,Torgersen,39.5,17.4,186.0,3800.0,FEMALE
2,Adelie,Torgersen,40.3,18.0,195.0,3250.0,FEMALE
3,Adelie,Torgersen,,,,,
4,Adelie,Torgersen,36.7,19.3,193.0,3450.0,FEMALE


In [4]:
df.shape

(344, 7)

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 344 entries, 0 to 343
Data columns (total 7 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   species            344 non-null    object 
 1   island             344 non-null    object 
 2   culmen_length_mm   342 non-null    float64
 3   culmen_depth_mm    342 non-null    float64
 4   flipper_length_mm  342 non-null    float64
 5   body_mass_g        342 non-null    float64
 6   sex                334 non-null    object 
dtypes: float64(4), object(3)
memory usage: 18.9+ KB


In [6]:
df.isnull().sum()

species               0
island                0
culmen_length_mm      2
culmen_depth_mm       2
flipper_length_mm     2
body_mass_g           2
sex                  10
dtype: int64

In [7]:
df.dropna(inplace=True)

In [8]:
df.isnull().sum()

species              0
island               0
culmen_length_mm     0
culmen_depth_mm      0
flipper_length_mm    0
body_mass_g          0
sex                  0
dtype: int64

# Feature Engineering

In [9]:
df.sex.unique()

array(['MALE', 'FEMALE', '.'], dtype=object)

In [11]:
pd.get_dummies(df['sex'], drop_first=True).head()

Unnamed: 0,FEMALE,MALE
0,0,1
1,1,0
2,1,0
4,1,0
5,0,1


In [16]:
sex = pd.get_dummies(df['sex'], drop_first=True)
sex.head()

Unnamed: 0,FEMALE,MALE
0,0,1
1,1,0
2,1,0
4,1,0
5,0,1


# # Applying One Hot Encoding

In [17]:
df.island.unique()

array(['Torgersen', 'Biscoe', 'Dream'], dtype=object)

In [18]:
pd.get_dummies(df['island']).head()

Unnamed: 0,Biscoe,Dream,Torgersen
0,0,0,1
1,0,0,1
2,0,0,1
4,0,0,1
5,0,0,1


In [20]:
island = pd.get_dummies(df['island'], drop_first = True)
island.head()

Unnamed: 0,Dream,Torgersen
0,0,1
1,0,1
2,0,1
4,0,1
5,0,1


## Concatenate the above two dataframes to the original dataframe

In [21]:
new_data = pd.concat([df,island,sex], axis=1)
new_data.head()

Unnamed: 0,species,island,culmen_length_mm,culmen_depth_mm,flipper_length_mm,body_mass_g,sex,Dream,Torgersen,FEMALE,MALE
0,Adelie,Torgersen,39.1,18.7,181.0,3750.0,MALE,0,1,0,1
1,Adelie,Torgersen,39.5,17.4,186.0,3800.0,FEMALE,0,1,1,0
2,Adelie,Torgersen,40.3,18.0,195.0,3250.0,FEMALE,0,1,1,0
4,Adelie,Torgersen,36.7,19.3,193.0,3450.0,FEMALE,0,1,1,0
5,Adelie,Torgersen,39.3,20.6,190.0,3650.0,MALE,0,1,0,1


# Drop the repeated columns

In [23]:
new_data.drop(['sex','island','FEMALE'], axis=1, inplace=True)
new_data.head()

Unnamed: 0,species,culmen_length_mm,culmen_depth_mm,flipper_length_mm,body_mass_g,Dream,Torgersen,MALE
0,Adelie,39.1,18.7,181.0,3750.0,0,1,1
1,Adelie,39.5,17.4,186.0,3800.0,0,1,0
2,Adelie,40.3,18.0,195.0,3250.0,0,1,0
4,Adelie,36.7,19.3,193.0,3450.0,0,1,0
5,Adelie,39.3,20.6,190.0,3650.0,0,1,1


# Creating Separate target variable

In [24]:
Y = new_data.species
Y.head()

0    Adelie
1    Adelie
2    Adelie
4    Adelie
5    Adelie
Name: species, dtype: object

In [25]:
Y.unique()

array(['Adelie', 'Chinstrap', 'Gentoo'], dtype=object)

In [26]:
Y = Y.map({'Adelie':0, 'Chinstrap':1, 'Gentoo':2})
Y.head()

0    0
1    0
2    0
4    0
5    0
Name: species, dtype: int64

# Dropping the target variable: Species

In [27]:
new_data.drop('species', inplace=True, axis=1)
new_data.head()

Unnamed: 0,culmen_length_mm,culmen_depth_mm,flipper_length_mm,body_mass_g,Dream,Torgersen,MALE
0,39.1,18.7,181.0,3750.0,0,1,1
1,39.5,17.4,186.0,3800.0,0,1,0
2,40.3,18.0,195.0,3250.0,0,1,0
4,36.7,19.3,193.0,3450.0,0,1,0
5,39.3,20.6,190.0,3650.0,0,1,1


In [28]:
X = new_data

# Splitting the dataset into Training & Testing set

In [32]:
from sklearn.model_selection import train_test_split
X_train,X_test,Y_train,Y_test = train_test_split(X,Y, test_size=0.3, random_state=0)

In [34]:
print('X_train',X_train.shape)
print('X_test',X_test.shape)
print('Y_train',Y_train.shape)
print('Y_test',X_test.shape)

X_train (233, 7)
X_test (101, 7)
Y_train (233,)
Y_test (101, 7)


# Training Random Forest Classification on Training set

In [35]:
from sklearn.ensemble import RandomForestClassifier
classifier = RandomForestClassifier(n_estimators = 5, criterion='entropy', random_state=0)
classifier.fit(X_train,Y_train)

RandomForestClassifier(criterion='entropy', n_estimators=5, random_state=0)

# Predicting the Test Results

In [36]:
y_pred = classifier.predict(X_test)
y_pred

array([1, 0, 0, 2, 1, 2, 1, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 2, 1, 0, 0, 0,
       2, 2, 2, 2, 2, 2, 0, 0, 1, 0, 1, 0, 2, 0, 0, 0, 2, 0, 1, 0, 2, 2,
       1, 2, 0, 2, 0, 0, 0, 0, 2, 0, 2, 0, 0, 0, 0, 2, 0, 2, 0, 2, 0, 0,
       0, 1, 0, 0, 1, 1, 0, 1, 2, 2, 0, 2, 0, 1, 0, 2, 2, 1, 2, 0, 2, 2,
       2, 0, 0, 2, 0, 0, 2, 0, 1, 2, 1, 0, 0], dtype=int64)

# Confusion Matrix

In [37]:
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report, accuracy_score

In [38]:
cm = confusion_matrix(Y_test,y_pred)
cm

array([[50,  0,  0],
       [ 2, 16,  0],
       [ 0,  0, 33]], dtype=int64)

In [41]:
accuracy_score(Y_test,y_pred)

0.9801980198019802

In [40]:
print(classification_report(Y_test,y_pred))

              precision    recall  f1-score   support

           0       0.96      1.00      0.98        50
           1       1.00      0.89      0.94        18
           2       1.00      1.00      1.00        33

    accuracy                           0.98       101
   macro avg       0.99      0.96      0.97       101
weighted avg       0.98      0.98      0.98       101



# Try with Different number of trees and gini criteria 

In [43]:
from sklearn.ensemble import RandomForestClassifier
classifier = RandomForestClassifier(n_estimators=7, criterion='gini', random_state=0)
classifier.fit(X_train,Y_train)

RandomForestClassifier(n_estimators=7, random_state=0)

In [45]:
y_pred = classifier.predict(X_test)
y_pred

array([1, 0, 0, 2, 1, 2, 1, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 2, 1, 0, 0, 0,
       2, 2, 2, 2, 2, 2, 0, 0, 1, 0, 1, 0, 2, 0, 0, 0, 2, 0, 1, 0, 2, 2,
       1, 2, 0, 2, 0, 0, 0, 0, 2, 0, 2, 0, 0, 0, 0, 2, 0, 2, 0, 2, 0, 0,
       0, 1, 0, 0, 1, 1, 0, 1, 2, 2, 0, 2, 0, 1, 0, 2, 2, 1, 2, 0, 2, 2,
       2, 0, 0, 2, 0, 0, 2, 0, 1, 2, 1, 0, 0], dtype=int64)

In [46]:
accuracy_score(Y_test,y_pred)

0.9801980198019802

#1 With more trees the model gives more accuracy