# Random Forest
Random forest is a meta estimator that fits a number of classifying decision trees on various sub-samples
of the dataset and use out-of-bag samples to estimate the generalization error.

To say in simple words: random forest builds multiple decision trees and merges them together to
get more accurate results and stable predictions.

its known for its simplicity and ability to handle high dimensional data.


In [34]:
#impot libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

#for random forest
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import confusion_matrix,classification_report,accuracy_score
from sklearn.metrics import mean_absolute_error,mean_squared_error,r2_score


In [35]:
#load dataset of tips
df=sns.load_dataset('tips')
df.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.5,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4


In [36]:
#lable encoder using for loop
le=LabelEncoder()
for col in df.columns:
    if df[col].dtypes=='object' or df[col].dtypes=='category':
        df[col]=le.fit_transform(df[col])
df.head()


Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,0,0,2,0,2
1,10.34,1.66,1,0,2,0,3
2,21.01,3.5,1,0,2,0,3
3,23.68,3.31,1,0,2,0,2
4,24.59,3.61,0,0,2,0,4


In [37]:
#split dataset in features and target for classification

X=df.drop('sex',axis=1)
y=df['sex']


In [38]:
#split dataset in train and test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [39]:
#create the model
model=RandomForestClassifier(n_estimators=200, criterion='entropy', random_state=42, max_depth=5, min_samples_leaf=5)#entropy is used for information gain

#train the model
model.fit(X_train,y_train)

#test the model
y_pred=model.predict(X_test)

#evaluate the model


#confusion matrix
print("Confusion Matrix: \n", confusion_matrix(y_test, y_pred)) 

#classification report
print("Classification Report: \n", classification_report(y_test, y_pred))

Confusion Matrix: 
 [[ 5 14]
 [ 3 27]]
Classification Report: 
               precision    recall  f1-score   support

           0       0.62      0.26      0.37        19
           1       0.66      0.90      0.76        30

    accuracy                           0.65        49
   macro avg       0.64      0.58      0.57        49
weighted avg       0.65      0.65      0.61        49



# Regression and Random Forest

In [27]:
#load dataset of tips
df_t=sns.load_dataset('tips')
df_t.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.5,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4


In [32]:
df_t.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 244 entries, 0 to 243
Data columns (total 7 columns):
 #   Column      Non-Null Count  Dtype   
---  ------      --------------  -----   
 0   total_bill  244 non-null    float64 
 1   tip         244 non-null    float64 
 2   sex         244 non-null    category
 3   smoker      244 non-null    category
 4   day         244 non-null    category
 5   time        244 non-null    category
 6   size        244 non-null    int64   
dtypes: category(4), float64(2), int64(1)
memory usage: 7.4 KB


In [28]:
#split dataset in features and target of tip
X=df_t.drop('tip',axis=1)
y=df_t['tip']

In [51]:
#split dataset in train and test
from sklearn.model_selection import train_test_split

#import regressor
from sklearn.ensemble import RandomForestRegressor

# Split dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [52]:

# Create the model
rf_model=RandomForestRegressor( n_estimators=200, random_state=42)

# Train the model
rf_model.fit(X_train, y_train)

#predict the model
y_pred=rf_model.predict(X_test)


#evaluate the model
print('Mean Absolute Error: \n', mean_absolute_error(y_test, y_pred))
print('Mean Squared Error: \n', mean_squared_error(y_test, y_pred))
print('Root Mean Squared Error: \n', np.sqrt(mean_squared_error(y_test, y_pred)))
print( 'R2 Score: \n', r2_score(y_test, y_pred))

Mean Absolute Error: 
 0.4183673469387754
Mean Squared Error: 
 0.2370795918367347
Root Mean Squared Error: 
 0.48690819651833206
R2 Score: 
 0.00135421052631568
