In [None]:
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.datasets import make_moons

# Noisy two-class "moons" toy data, then a train/test partition of it.
# Both calls are seeded so every run sees the same samples and the same split.
X, y = make_moons(
    n_samples=500,
    noise=0.30,
    random_state=42,
)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
)

### Bagging decision trees (a random forest is essentially this, plus a random subset of features at each split)

In [None]:
# Bagging ensemble: 1000 decision trees, each trained on 100 training rows
# drawn with replacement, then used to label the held-out test split.
bag = BaggingClassifier(
    DecisionTreeClassifier(),  # base learner used for every ensemble member
    n_estimators=1000,
    max_samples=100,
    bootstrap=True,            # sample with replacement (bagging rather than pasting)
    random_state=55,
)
bag_predictions = bag.fit(X_train, y_train).predict(X_test)

In [None]:
from sklearn.metrics import accuracy_score
# Fraction of test labels the bagging ensemble predicted correctly.
print(accuracy_score(y_test, bag_predictions))

### Random Forest using RandomForestClassifier

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
# Random forest of 1000 gini-split trees. Each tree is limited both in depth
# and in leaf count; the forest is seeded for reproducibility.
rf = RandomForestClassifier(
    n_estimators=1000,
    criterion='gini',
    max_depth=5,
    max_leaf_nodes=16,
    oob_score=True,  # also compute an out-of-bag score during fit
    random_state=50,
)

In [None]:
# Train the forest on the training split of the moons data.
rf.fit(X_train, y_train)

In [None]:
# Predicted labels for the held-out test split.
rf_predictions = rf.predict(X_test)

In [None]:
from sklearn.metrics import accuracy_score
# Fraction of test labels the random forest predicted correctly.
print(accuracy_score(y_test, rf_predictions))

In [None]:
# Out-of-bag score; available because the forest was built with oob_score=True.
rf.oob_score_

In [None]:
import numpy as np
# Share of test samples on which the bagging ensemble and the random forest
# predict the same label (mean of the element-wise equality mask).
np.mean(bag_predictions == rf_predictions)

### Feature Importance

In [None]:
from sklearn.datasets import load_iris
# Iris dataset bunch; the cells below read its 'data', 'target' and 'feature_names' entries.
iris = load_iris()

In [None]:
# Fit a seeded 500-tree forest on the complete iris data (no hold-out split):
# the model is used here only to read feature importances.
rf_iris = RandomForestClassifier(
    n_estimators=500,
    random_state=90,
)
rf_iris.fit(iris.data, iris.target)

In [None]:
# Column names, in the same order as the columns of iris['data'].
iris["feature_names"]

In [None]:
# One importance value per input feature, in column order.
rf_iris.feature_importances_

In [None]:
# Pair each feature name with its importance value.
[
    (name, importance)
    for name, importance in zip(iris.feature_names, rf_iris.feature_importances_)
]

In [None]:
# Print every feature's importance as a percentage, one feature per line.
names_and_scores = zip(iris.feature_names, rf_iris.feature_importances_)
for feature, score in names_and_scores:
    percentage = score * 100
    print(feature, ' ------> ', percentage, '%')

### AdaBoost

In [None]:
from sklearn.datasets import load_iris
# Reload iris so the AdaBoost section can be run on its own
# (the Feature Importance section above already loaded the same data).
iris = load_iris()

In [None]:
from sklearn.ensemble import AdaBoostClassifier
from sklearn.model_selection import train_test_split
from sklearn import metrics

In [None]:
# Features and labels from the iris bunch. One third of the rows is held out
# for testing; the split is seeded so it is reproducible.
X = iris.data
y = iris.target
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

In [None]:
# AdaBoost with its default base learner: 50 boosting rounds, learning rate 1.
ab = AdaBoostClassifier(
    n_estimators=50,
    learning_rate=1,
)

In [None]:
# Train on the iris training split, then label the held-out rows.
# fit() returns the fitted estimator, so `model` refers to the same object as `ab`.
model = ab.fit(X_train, y_train)
predictions = model.predict(X_test)

In [None]:
from sklearn.metrics import accuracy_score
# Test accuracy of AdaBoost with its default base learner.
print(accuracy_score(y_test, predictions))

In [None]:
from sklearn.svm import SVC

# AdaBoost with a support-vector classifier as the base learner.
# probability=True makes the SVC expose class-probability estimates.
svc = SVC(probability=True)

# The base learner is passed positionally: its keyword was renamed from
# `base_estimator` to `estimator` in scikit-learn 1.2 and the old name was
# removed in 1.4, while the first positional slot works on every release.
ab = AdaBoostClassifier(svc, n_estimators=50, learning_rate=1)

In [None]:
# Re-train with the SVC-based AdaBoost; this rebinds `model` and `predictions`
# from the previous AdaBoost run.
model = ab.fit(X_train, y_train)
predictions = model.predict(X_test)

In [None]:
from sklearn.metrics import accuracy_score
# Test accuracy of AdaBoost with the SVC base learner.
print(accuracy_score(y_test, predictions))

### Gradient Boosting

In [None]:
from sklearn.ensemble import GradientBoostingClassifier
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn import preprocessing

In [None]:
import os
from pathlib import Path

# Load the diabetes CSV (it must contain an 'Outcome' column, used as the target below).
# The directory can be overridden with the DATA_DIR environment variable so the
# notebook runs on other machines; the default is the original author's folder.
DATA_DIR = Path(os.environ.get(
    'DATA_DIR',
    r'C:\Users\Purushotham\Desktop\deloitte\2021\machinelearning\datasets',
))
pima = pd.read_csv(DATA_DIR / 'diabetes.csv')
pima.head()

In [None]:
# Target is the 'Outcome' column; every other column is a feature.
y = pima['Outcome']
X = pima.drop(columns='Outcome')
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=42,
)

In [None]:
# Standardise the features. The scaler is fitted on the training rows only and
# then applied to both splits, so no test-set statistics reach the model.
scaler = preprocessing.StandardScaler()
scaler.fit(X_train)
X_train_transformed = scaler.transform(X_train)
X_test_transform = scaler.transform(X_test)

In [None]:
# Gradient boosting with many small steps: 1500 depth-5 trees at a low
# learning rate, considering up to 8 features per split. Seeded for reproducibility.
gbc = GradientBoostingClassifier(
    n_estimators=1500,
    learning_rate=0.008,
    max_depth=5,
    max_features=8,
    random_state=78,
)
predictions = gbc.fit(X_train_transformed, y_train).predict(X_test_transform)

In [None]:
# Confusion matrix: rows are true classes, columns are predicted classes.
print(confusion_matrix(y_test, predictions))

In [None]:
# Test accuracy of the gradient-boosting classifier.
print(accuracy_score(y_test, predictions))

### XGBoost

In [None]:
%pip install xgboost

In [None]:
import xgboost as xgb

In [None]:
import pandas as pd
import numpy as np
from sklearn.metrics import mean_squared_error

In [None]:
# `load_boston` was deprecated in scikit-learn 1.0 and removed in 1.2, so the
# import fails on current releases. Fall back to rebuilding an equivalent
# bunch from the original source, using the recipe given in scikit-learn's
# removal notice. The fallback needs network access.
try:
    from sklearn.datasets import load_boston
    boston = load_boston()
except ImportError:
    from sklearn.utils import Bunch

    # Each record spans two physical lines in the raw file: even rows hold the
    # first 11 features, odd rows hold the last 2 features and the target.
    boston_raw = pd.read_csv(
        "http://lib.stat.cmu.edu/datasets/boston",
        sep=r"\s+",
        skiprows=22,
        header=None,
    )
    boston = Bunch(
        data=np.hstack([boston_raw.values[::2, :], boston_raw.values[1::2, :2]]),
        target=boston_raw.values[1::2, 2],
        feature_names=np.array([
            'CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE',
            'DIS', 'RAD', 'TAX', 'PTRATIO', 'B', 'LSTAT',
        ]),
        DESCR="Boston house-prices dataset, rebuilt from http://lib.stat.cmu.edu/datasets/boston",
    )
print(boston.keys())

In [None]:
# Column names of the feature matrix.
print(boston.feature_names)

In [None]:
# Free-text description that ships with the dataset.
print(boston.DESCR)

In [None]:
# Wrap the raw feature matrix in a DataFrame; columns are named in the next cell.
data = pd.DataFrame(boston.data)

In [None]:
# Label the columns with the dataset's feature names (mutates `data` in place).
data.columns = boston.feature_names

In [None]:
# Preview the feature columns.
data.head()

In [None]:
# Append the regression target as a 'PRICE' column (mutates `data` in place).
data['PRICE'] = boston.target

In [None]:
# Preview again, now including the 'PRICE' target column.
data.head()

In [None]:
# 'PRICE' is the regression target; all remaining columns are features.
y = data['PRICE']
X = data.drop(columns='PRICE')
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=42,
)

In [None]:
# XGBoost regressor: 100 depth-5 trees with learning rate 1,
# fitted on the training split and used to predict the held-out prices.
xgb_model = xgb.XGBRegressor(
    n_estimators=100,
    max_depth=5,
    learning_rate=1,
)
xgb_model.fit(X_train, y_train)
predictions = xgb_model.predict(X_test)

In [None]:
# Mean squared error on the test split (in squared target units).
mean_squared_error(y_test, predictions)

In [None]:
# Root mean squared error: same metric, expressed in the target's own units.
np.sqrt(mean_squared_error(y_test, predictions))

In [None]:
# True test-set prices, shown for side-by-side comparison with the predictions.
y_test

In [None]:
# Predicted test-set prices from the XGBoost regressor.
predictions

### k-fold Cross Validation on XGBoost

In [None]:
# Booster parameters shared by the cross-validation and training cells.
# "reg:squarederror" is the current name for squared-error regression; the
# former alias "reg:linear" is deprecated in XGBoost.
params = {
    "objective": "reg:squarederror",
    'colsample_bytree': 0.3,
    'learning_rate': 0.1,
    'max_depth': 5,
    'alpha': 10,
}



In [None]:
# Pack the full feature table and target into XGBoost's DMatrix container,
# then run seeded 3-fold cross-validation: at most 50 boosting rounds,
# with early stopping after 10 rounds, scored by RMSE.
data_dmatrix = xgb.DMatrix(data=X, label=y)
cv_results = xgb.cv(
    params=params,
    dtrain=data_dmatrix,
    num_boost_round=50,
    nfold=3,
    metrics="rmse",
    early_stopping_rounds=10,
    as_pandas=True,  # return the per-round results as a DataFrame
    seed=123,
)

In [None]:
# First few boosting rounds of the cross-validation results.
cv_results.head()

### Visualizing XGBoost Trees and Feature Importance

In [None]:
# Train a 10-round booster on the full data; it is only used for the plots below.
xg_reg = xgb.train(params=params, dtrain=data_dmatrix, num_boost_round=10)

In [None]:
import matplotlib.pyplot as plt

# Draw the first tree of the booster.
# The figure size must be fixed before drawing: assigning
# rcParams['figure.figsize'] after the plot call only affects later figures.
# Creating the axes explicitly sizes this figure and leaves global settings alone.
fig, ax = plt.subplots(figsize=(80, 30))
xgb.plot_tree(xg_reg, num_trees=0, ax=ax)
plt.show()

In [None]:
# Feature-importance bar chart for the trained booster.
# The axes are created with the intended size up front; setting
# rcParams['figure.figsize'] after plotting would not resize this figure.
fig, ax = plt.subplots(figsize=(50, 5))
xgb.plot_importance(xg_reg, ax=ax)
plt.show()

### Hyperparameter Tuning using Scikit-Learn's Grid Search and Randomized Search

In [None]:
from sklearn.datasets import load_diabetes
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, train_test_split

In [None]:
from sklearn.ensemble import RandomForestRegressor
import numpy as np

In [None]:
# scikit-learn's diabetes regression data: feature matrix and target vector.
diabetes = load_diabetes()
X, y = diabetes.data, diabetes.target

In [None]:
# Hold out one third of the rows; seeded so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

In [None]:
# Exhaustive search over 19 n_estimators values x 9 max_features values
# (171 candidates), each scored by 5-fold cross-validated R^2.
# The forest is seeded so that score differences between candidates come from
# the hyperparameters, not from run-to-run randomness, and best_params_ is
# reproducible.
gs = GridSearchCV(
        estimator=RandomForestRegressor(random_state=42),
        param_grid={
            'n_estimators':np.arange(5,100,5),
            'max_features':np.arange(1, 10, 1)
        },
        n_jobs=1,
        cv=5,
        scoring='r2',
        verbose=5
)
gs.fit(X_train, y_train)

In [None]:
# Hyperparameter combination with the best mean cross-validated R^2.
gs.best_params_

In [None]:
# Randomized search over the same space as the grid search: only a random
# sample of the candidates is tried, each scored by 5-fold cross-validated R^2.
# Both the candidate sampling and the forest are seeded so that best_params_
# is reproducible.
rs = RandomizedSearchCV(
        estimator=RandomForestRegressor(random_state=42),
        param_distributions={
            'n_estimators':np.arange(5,100,5),
            'max_features':np.arange(1, 10, 1)
        },
        n_jobs=1,
        cv=5,
        scoring='r2',
        verbose=5,
        random_state=42
)
rs.fit(X_train, y_train)

In [None]:
# Best combination among the randomly sampled candidates.
rs.best_params_

### Learning Curve

In [None]:
import os
from pathlib import Path

import pandas as pd

# Load the electricity dataset used for the learning-curve experiments.
# The directory can be overridden with the DATA_DIR environment variable so the
# notebook runs on other machines; the default is the original author's folder.
DATA_DIR = Path(os.environ.get(
    'DATA_DIR',
    r'C:\Users\Purushotham\Desktop\deloitte\2021\machinelearning\datasets',
))
electricity = pd.read_csv(DATA_DIR / 'electricity.csv')

# DataFrame.info() prints its report itself and returns None, so wrapping it
# in print() would emit a stray "None" line after the report.
electricity.info()
electricity.head(3)

In [None]:
# (rows, columns) of the loaded table; the row count bounds the usable training sizes.
electricity.shape

In [None]:
# Absolute training-set sizes at which the learning curve is evaluated.
# NOTE(review): 7654 is presumably the largest training fold available under
# 5-fold CV for this dataset - confirm against electricity.shape.
train_sizes = [1, 100, 500, 2000, 5000, 7654]

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import learning_curve
# Input columns and the column to predict.
features = ['AT', 'V', 'AP', 'RH']
target = 'PE'

# 5-fold cross-validated learning curve for plain linear regression.
# Scores are negated MSE, so values closer to zero are better.
# `train_sizes` is rebound to the sizes that learning_curve actually used.
train_sizes, train_scores, validation_scores = learning_curve(
    estimator=LinearRegression(),
    X=electricity[features],
    y=electricity[target],
    train_sizes=train_sizes,
    cv=5,
    scoring='neg_mean_squared_error',
)

In [None]:
# Raw score tables: one row per training-set size, one column per CV fold.
# Values are negated MSE, as requested through the scoring argument.
print('Training scores:\n\n', train_scores)
print('\n', '-' * 70) # separator to make the output easy to read
print('\nValidation scores:\n\n', validation_scores)

In [None]:
# Average across the CV folds (axis 1) and flip the sign, turning the
# negated-MSE scores into one plain MSE value per training-set size.
train_scores_mean = -np.mean(train_scores, axis=1)
validation_scores_mean = -np.mean(validation_scores, axis=1)

In [None]:
import matplotlib.pyplot as plt

# Matplotlib 3.6 renamed its bundled seaborn styles to 'seaborn-v0_8*' and
# 3.8 removed the old names; pick whichever name this installation provides.
seaborn_style = 'seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available else 'seaborn'
plt.style.use(seaborn_style)

# Training vs. validation MSE as a function of training-set size.
plt.plot(train_sizes, train_scores_mean, label = 'Training error')
plt.plot(train_sizes, validation_scores_mean, label = 'Validation error')
plt.ylabel('MSE', fontsize = 14)
plt.xlabel('Training set size', fontsize = 14)
plt.title('Learning curves for a linear regression model', fontsize = 18, y = 1.03)
plt.legend()
# Clip the y-axis so the large errors at tiny training sizes do not flatten the rest of the curve.
plt.ylim(0,40)
plt.show()

In [None]:
### Bundling our previous work into a function ###
def learning_curves(estimator, data, features, target, train_sizes, cv, ylim=(0, 40)):
    """Plot training and validation MSE against training-set size.

    Runs scikit-learn's ``learning_curve`` with negated-MSE scoring and draws
    both curves on the current matplotlib axes. The caller creates and shows
    the figure.

    Parameters
    ----------
    estimator : scikit-learn estimator
        Unfitted model to evaluate.
    data : pandas.DataFrame
        Table holding both the feature columns and the target column.
    features : list of str
        Names of the input columns in ``data``.
    target : str
        Name of the column to predict.
    train_sizes : sequence
        Training-set sizes forwarded to ``learning_curve``.
    cv : int or cross-validation splitter
        Cross-validation strategy forwarded to ``learning_curve``.
    ylim : tuple of (float, float) or None, default (0, 40)
        Y-axis limits for the plot. Pass ``None`` to let matplotlib autoscale,
        which is needed when the MSE is not on the 0-40 scale.

    Returns
    -------
    None
    """
    sizes, train_scores, validation_scores = learning_curve(
        estimator,
        data[features],
        data[target],
        train_sizes=train_sizes,
        cv=cv,
        scoring='neg_mean_squared_error',
    )

    # Scores are negated MSE with one column per fold: average over the folds
    # and flip the sign to get a plain MSE per training-set size.
    train_scores_mean = -train_scores.mean(axis=1)
    validation_scores_mean = -validation_scores.mean(axis=1)

    plt.plot(sizes, train_scores_mean, label='Training error')
    plt.plot(sizes, validation_scores_mean, label='Validation error')

    plt.ylabel('MSE', fontsize=14)
    plt.xlabel('Training set size', fontsize=14)
    # Use the class name directly instead of parsing the estimator's repr.
    title = 'Learning curves for a ' + type(estimator).__name__ + ' model'
    plt.title(title, fontsize=18, y=1.03)
    plt.legend()
    if ylim is not None:
        plt.ylim(*ylim)

In [None]:
from sklearn.ensemble import RandomForestRegressor

# Side-by-side learning curves: random forest on the left panel,
# linear regression on the right.
plt.figure(figsize=(16, 5))

candidate_models = [RandomForestRegressor(), LinearRegression()]
for i, model in enumerate(candidate_models, start=1):
    plt.subplot(1, 2, i)
    learning_curves(model, electricity, features, target, train_sizes, 5)