In [1]:
# Implement Random Forest

import seaborn as sns
df = sns.load_dataset("tips")
df.head(2)

# input features and target features
X = df[['total_bill']] #,'tip','size']]  # x1, x2, x3 ...
y = df['smoker']

# encoding
y = df['smoker'].map({'No':0,'Yes':1})

# training
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [2]:
from sklearn.metrics import classification_report, accuracy_score

In [3]:
# modelling Random forest classifier
from sklearn.ensemble import BaggingClassifier, RandomForestClassifier

clf_rf = RandomForestClassifier(
    n_estimators = 100, # number of trees
    random_state = 42,
    max_depth = 3
)
clf_rf.fit(X_train,y_train)

# predictions
y_pred_rf = clf_rf.predict(X_test)

# classification metrics
cr_rf = classification_report(y_test, y_pred_rf)

accuracy_rf = accuracy_score(y_test, y_pred_rf)

In [None]:
print(cr_rf)

---

In [4]:
# decision tree classifier
from sklearn.tree import DecisionTreeClassifier

clf_dt = DecisionTreeClassifier(
    max_depth = 3,
    random_state = 42
)

clf_dt.fit(X_train,y_train)

# predictions
y_pred_dt = clf_dt.predict(X_test)

# classification metrics
from sklearn.metrics import classification_report
cr_dt = classification_report(y_test, y_pred_dt)
accuracy_dt = accuracy_score(y_test, y_pred_dt)

In [None]:
print(cr_dt)

---

## Bagging 
- Decision Trees Together

In [5]:
from sklearn.ensemble import BaggingClassifier
from sklearn.linear_model import LogisticRegression

base_classifier = DecisionTreeClassifier()
logistic_clf = LogisticRegression()

bagging_classifier = BaggingClassifier(
    clf_dt,
    # base_classifier,
    # logistic_clf, 
    n_estimators = 100, 
    random_state = 42,
    )

bagging_classifier.fit(X_train, y_train)

y_pred_bc = bagging_classifier.predict(X_test)
cr_bc = classification_report(y_test, y_pred_bc)


accuracy_bc = accuracy_score(y_test, y_pred_bc)

In [6]:
print(cr_bc)

              precision    recall  f1-score   support

           0       0.66      0.94      0.77        31
           1       0.60      0.17      0.26        18

    accuracy                           0.65        49
   macro avg       0.63      0.55      0.52        49
weighted avg       0.64      0.65      0.59        49



> ### Exercise
- accuracies of: Random Forest, Bagging with Decision Trees (keep esimator number same, and random state same)




In [7]:
print("accuracy of Random Forest", accuracy_rf)
print("accuracy of Decision Tree", accuracy_dt)
print("accuracy of Bagging Classifier", accuracy_bc)

accuracy of Random Forest 0.6530612244897959
accuracy of Decision Tree 0.5714285714285714
accuracy of Bagging Classifier 0.6530612244897959


> ---

## Boosting
- With AdaBoost and XG Boost

### **AdaBoost** 

- â†’ focuses on misclassified samples by adjusting weights

In [9]:
from sklearn.ensemble import AdaBoostClassifier

base_classifier = DecisionTreeClassifier(max_depth=1)

adaboost_classifier = AdaBoostClassifier(
    base_classifier, 
    n_estimators=50, 
    random_state=42,

    learning_rate = 1.0, # in boosting

)
adaboost_classifier.fit(X_train, y_train)


y_pred_ac = adaboost_classifier.predict(X_test)

cr_ac = classification_report(y_test, y_pred_ac)

In [10]:
print(cr_ac)

              precision    recall  f1-score   support

           0       0.65      0.90      0.76        31
           1       0.50      0.17      0.25        18

    accuracy                           0.63        49
   macro avg       0.58      0.53      0.50        49
weighted avg       0.60      0.63      0.57        49



## XG Boost
- Extreme Gradient Boost 
-  more advanced, with optimizations like regularization and parallelization for speed and accuracy

In [14]:
# pip install xgboost

In [13]:
from xgboost import XGBClassifier

In [15]:
xgb_classifier = XGBClassifier(
    n_estimators=100, 
    learning_rate=0.1, 
    max_depth=3, 
    random_state=42
)
xgb_classifier.fit(X_train, y_train)

y_pred_xgb = bagging_classifier.predict(X_test)
cr_xgb = classification_report(y_test, y_pred_xgb)


In [16]:
print(cr_xgb)

              precision    recall  f1-score   support

           0       0.66      0.94      0.77        31
           1       0.60      0.17      0.26        18

    accuracy                           0.65        49
   macro avg       0.63      0.55      0.52        49
weighted avg       0.64      0.65      0.59        49



## ---**OPTIONAL** ---

### **Regularization**
- Prevents a model from becoming too complex, and memorizing the training data
- also called OVERFITTING
- it kind of adds a penalty to the model if it uses too complex models

- 

- Two Types:
    - Lasso - L1 Regularization - Absolute values of coefficients - Some become exactly zero
    - Ridge - L 2 Regularization  - Squaring values of coefficients - Shrinks but keeps all, never makes any zero
    - Use Ridge when you want to keep all features but control their influence.
    - Use Lasso when you want the model to automatically drop irrelevant features.


> #### Lasso Regularization: 
```py
from sklearn.linear_model import Ridge
from sklearn.datasets import load_diabetes
from sklearn.model_selection import train_test_split

X, y = load_diabetes(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

ridge = Ridge(alpha=1.0)  # alpha controls penalty strength
ridge.fit(X_train, y_train)

print("Ridge coefficients:", ridge.coef_)
print("Ridge score:", ridge.score(X_test, y_test))

```


> #### Ridge Regularization: 

```py

from sklearn.linear_model import Lasso
from sklearn.datasets import load_diabetes
from sklearn.model_selection import train_test_split

X, y = load_diabetes(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

lasso = Lasso(alpha=0.1)  # smaller alpha = less penalty
lasso.fit(X_train, y_train)

print("Lasso coefficients:", lasso.coef_)
print("Lasso score:", lasso.score(X_test, y_test))




> #### **Pruning**
- Simplifying decision trees
- cut off the parts that are too detailed and only fit the noise in the data
- remove branches that do no improve predictions
```python
model = DecisionTreeClassifier(max_depth=3) # limits depth = pruning