In [32]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import VotingClassifier, VotingRegressor, BaggingClassifier, BaggingRegressor
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.linear_model import LogisticRegression, Ridge, ElasticNet
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, log_loss
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, StandardScaler
from sklearn.compose import ColumnTransformer, make_column_selector
from sklearn.metrics import precision_score, accuracy_score, f1_score, mean_squared_error, mean_absolute_error
from tqdm import tqdm
import os
# Make the course "Cases" folder the working directory; the HR CSV is later read by bare filename.
# NOTE(review): absolute, machine-specific Windows path - the notebook will not run on another machine as-is.
os.chdir("D:\\meridianthe4\\PML\\Cases")

In [6]:
# Load the HR data; "left" is the target column, every other column is a feature.
hr = pd.read_csv("HR_comma_sep.csv")
y = hr["left"]
X = hr.drop(columns="left")

# One-hot encode the object-dtype columns (first level of each dropped) and pass
# the remaining columns through unchanged, keeping pandas output throughout.
ohe = OneHotEncoder(drop='first', sparse_output=False)
ohe = ohe.set_output(transform="pandas")
col_transformer = ColumnTransformer(
    transformers=[("OHE", ohe, make_column_selector(dtype_include=object))],
    remainder='passthrough',
    verbose_feature_names_out=False,
).set_output(transform="pandas")

# NOTE(review): the encoder is fitted on the full frame, i.e. before the train/test split below.
X = col_transformer.fit_transform(X)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=25
)

In [25]:
# Print the column names, non-null counts and dtypes of the raw HR frame.
hr.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14995 entries, 0 to 14994
Data columns (total 10 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   satisfaction_level     14995 non-null  float64
 1   last_evaluation        14995 non-null  float64
 2   number_project         14995 non-null  int64  
 3   average_montly_hours   14995 non-null  int64  
 4   time_spend_company     14995 non-null  int64  
 5   Work_accident          14995 non-null  int64  
 6   left                   14995 non-null  int64  
 7   promotion_last_5years  14995 non-null  int64  
 8   Department             14995 non-null  object 
 9   salary                 14995 non-null  object 
dtypes: float64(2), int64(6), object(2)
memory usage: 1.1+ MB


In [7]:
# Candidate base learners; only `lr` is bagged in this cell (the trees are defined but not used here).
lr = LogisticRegression(max_iter=5000)
dtc1 = DecisionTreeClassifier(max_depth=3, random_state=25)
dtc2 = DecisionTreeClassifier(max_depth=2, random_state=25)

# Bag 15 logistic regressions and score the ensemble on the hold-out split.
bagg = BaggingClassifier(estimator=lr, n_estimators=15, random_state=25).fit(X_train, y_train)
y_pred_bagg = bagg.predict(X_test)
accuracy_bagg = accuracy_score(y_true=y_test, y_pred=y_pred_bagg)
print("Accuracy (Bagging Classifier):", accuracy_bagg)

Accuracy (Bagging Classifier): 0.7899533229606579


In [9]:
# Candidate base learners; only `dtc1` (depth-3 tree) is bagged in this cell.
dtc1 = DecisionTreeClassifier(max_depth=3, random_state=25)
dtc2 = DecisionTreeClassifier(max_depth=2, random_state=25)
lr = LogisticRegression(max_iter=5000)

# Bag 15 depth-3 trees and score the ensemble on the hold-out split.
bagg = BaggingClassifier(estimator=dtc1, n_estimators=15, random_state=25).fit(X_train, y_train)
y_pred_bagg = bagg.predict(X_test)
accuracy_bagg = accuracy_score(y_true=y_test, y_pred=y_pred_bagg)
print("Accuracy (Bagging Classifier):", accuracy_bagg)

Accuracy (Bagging Classifier): 0.9546565903534119


In [10]:
# Candidate base learners; only `dtc2` (depth-2 tree) is bagged in this cell.
dtc2 = DecisionTreeClassifier(max_depth=2, random_state=25)
dtc1 = DecisionTreeClassifier(max_depth=3, random_state=25)
lr = LogisticRegression(max_iter=5000)

# Bag 15 depth-2 trees and score the ensemble on the hold-out split.
bagg = BaggingClassifier(estimator=dtc2, n_estimators=15, random_state=25).fit(X_train, y_train)
y_pred_bagg = bagg.predict(X_test)
accuracy_bagg = accuracy_score(y_true=y_test, y_pred=y_pred_bagg)
print("Accuracy (Bagging Classifier):", accuracy_bagg)

Accuracy (Bagging Classifier): 0.8437430540120027


In [11]:
# Scale-then-SVM pipeline (RBF kernel) used as the bagging base estimator.
sc = StandardScaler()
svm = SVC(kernel='rbf')
svm_pipe = Pipeline(steps=[('scaler', sc), ('svm', svm)])

# Bag 15 copies of the pipeline and score the ensemble on the hold-out split.
bagg = BaggingClassifier(estimator=svm_pipe, n_estimators=15, random_state=25).fit(X_train, y_train)
y_pred_bagg = bagg.predict(X_test)
accuracy_bagg = accuracy_score(y_true=y_test, y_pred=y_pred_bagg)
print("Accuracy (Bagging Classifier):", accuracy_bagg)

Accuracy (Bagging Classifier): 0.953322960657924


In [12]:
# Scale-then-KNN pipeline (k=5) used as the bagging base estimator.
sc = StandardScaler()
knn = KNeighborsClassifier(n_neighbors=5)
knn_pipe = Pipeline(steps=[('scaler', sc), ('knn', knn)])

# Bag 15 copies of the pipeline and score the ensemble on the hold-out split.
bagg = BaggingClassifier(estimator=knn_pipe, n_estimators=15, random_state=25).fit(X_train, y_train)
y_pred_bagg = bagg.predict(X_test)
accuracy_bagg = accuracy_score(y_true=y_test, y_pred=y_pred_bagg)
print("Accuracy (Bagging Classifier):", accuracy_bagg)

Accuracy (Bagging Classifier): 0.9433207379417649


In [13]:
# Base learners compared in the bagging loop of the following cell.
dtc1 = DecisionTreeClassifier(random_state=25, max_depth=3)
dtc2 = DecisionTreeClassifier(random_state=25, max_depth=None)  # no depth limit
lr = LogisticRegression(max_iter=5000)
knn = KNeighborsClassifier(n_neighbors=5)
svm = SVC(kernel='rbf')
# `sc` is kept so the name stays defined, but it is no longer placed in both pipelines.
sc = StandardScaler()
# Each pipeline owns its scaler: Pipeline stores the step objects it is given, so a
# shared instance would be refitted (and silently changed for the other pipeline)
# whenever either pipeline is fitted directly.
knn_pipe = Pipeline([('scaler', StandardScaler()), ('knn', knn)])
svm_pipe = Pipeline([('scaler', StandardScaler()), ('svm', svm)])

In [14]:
# Bag each candidate base learner (15 members, fixed seed) and record its
# hold-out accuracy so the learners can be ranked against each other.
estimators = [dtc1, dtc2, lr, knn_pipe, svm_pipe]
scores = []
for est in tqdm(estimators):
    bagg = BaggingClassifier(estimator=est, n_estimators=15, random_state=25).fit(X_train, y_train)
    y_pred_bagg = bagg.predict(X_test)
    accuracy_bagg = accuracy_score(y_true=y_test, y_pred=y_pred_bagg)
    scores.append([est, accuracy_bagg])

# Best-scoring estimator first.
scores_df = pd.DataFrame(data=scores, columns=["Estimator", "score"])
scores_df.sort_values(by="score", ascending=False)

100%|██████████| 5/5 [00:33<00:00,  6.62s/it]


Unnamed: 0,Estimator,score
1,DecisionTreeClassifier(random_state=25),0.988886
0,"DecisionTreeClassifier(max_depth=3, random_sta...",0.954657
4,"(StandardScaler(), SVC())",0.953323
3,"(StandardScaler(), KNeighborsClassifier())",0.943321
2,LogisticRegression(max_iter=5000),0.789953


In [15]:
# Display the raw HR frame (the rendered table is truncated to its first and last rows).
hr

Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,left,promotion_last_5years,Department,salary
0,0.38,0.53,2,157,3,0,1,0,sales,low
1,0.80,0.86,5,262,6,0,1,0,sales,medium
2,0.10,0.77,6,247,4,0,1,0,sales,low
3,0.92,0.85,5,259,5,0,1,0,sales,low
4,0.89,1.00,5,224,5,0,1,0,sales,low
...,...,...,...,...,...,...,...,...,...,...
14990,0.40,0.57,2,151,3,0,1,0,support,low
14991,0.37,0.48,2,160,3,0,1,0,support,low
14992,0.37,0.53,2,143,3,0,1,0,support,low
14993,0.11,0.96,6,280,4,0,1,0,support,low


In [16]:
# Display the feature matrix produced by col_transformer: dummy columns first, passthrough columns after.
X

Unnamed: 0,Department_RandD,Department_accounting,Department_hr,Department_management,Department_marketing,Department_product_mng,Department_sales,Department_support,Department_technical,salary_low,salary_medium,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,promotion_last_5years
0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.38,0.53,2,157,3,0,0
1,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.80,0.86,5,262,6,0,0
2,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.10,0.77,6,247,4,0,0
3,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.92,0.85,5,259,5,0,0
4,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.89,1.00,5,224,5,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14990,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.40,0.57,2,151,3,0,0
14991,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.37,0.48,2,160,3,0,0
14992,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.37,0.53,2,143,3,0,0
14993,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.11,0.96,6,280,4,0,0


In [22]:
# Fit the standalone encoder on Department alone to inspect the dummy columns it produces.
# `ohe` was built with drop='first', so one Department level gets no column of its own.
ohe.fit_transform(hr[['Department']])

Unnamed: 0,Department_RandD,Department_accounting,Department_hr,Department_management,Department_marketing,Department_product_mng,Department_sales,Department_support,Department_technical
0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...
14990,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
14991,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
14992,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
14993,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


In [29]:
# List the distinct Department labels present in the raw data.
hr['Department'].unique()

array(['sales', 'accounting', 'hr', 'technical', 'support', 'management',
       'IT', 'product_mng', 'marketing', 'RandD'], dtype=object)

In [30]:
# Display the raw HR frame again (same content as the earlier display of `hr`).
hr

Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,left,promotion_last_5years,Department,salary
0,0.38,0.53,2,157,3,0,1,0,sales,low
1,0.80,0.86,5,262,6,0,1,0,sales,medium
2,0.10,0.77,6,247,4,0,1,0,sales,low
3,0.92,0.85,5,259,5,0,1,0,sales,low
4,0.89,1.00,5,224,5,0,1,0,sales,low
...,...,...,...,...,...,...,...,...,...,...
14990,0.40,0.57,2,151,3,0,1,0,support,low
14991,0.37,0.48,2,160,3,0,1,0,support,low
14992,0.37,0.53,2,143,3,0,1,0,support,low
14993,0.11,0.96,6,280,4,0,1,0,support,low


In [31]:
# Switch to the datasets folder and load the Boston data; "medv" is the regression target.
# Note: X, y and the train/test splits are rebound here and replace the HR versions.
os.chdir("D:\\meridianthe4\\PML\\Datasets")
boston = pd.read_csv("Boston.csv")
y = boston["medv"]
X = boston.drop(columns=["medv"])
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=25
)

In [None]:
# Sweep the ensemble size of a bagged depth-3 regression tree and record the
# hold-out error for each size.
dtr = DecisionTreeRegressor(random_state=25, max_depth=3)
scores = []
for n in [10, 15, 25, 50, 100, 150, 200]:
    bagg = BaggingRegressor(random_state=25, estimator=dtr, n_estimators=n)
    bagg.fit(X_train, y_train)
    y_pred_bagg = bagg.predict(X_test)
    # The metric is mean absolute error (MAE), not RMSE; the variable name now says so.
    mae_bagg = mean_absolute_error(y_test, y_pred_bagg)
    scores.append([n, mae_bagg])
# "score" holds MAE, so lower is better; ascending sort puts the best size first.
scores_df = pd.DataFrame(scores, columns=["n_estimators", "score"])
scores_df.sort_values("score")

Unnamed: 0,n_estimators,score
5,150,2.842829
6,200,2.850226
2,25,2.858073
4,100,2.86071
0,10,2.875943
1,15,2.881208
3,50,2.906874
