In [18]:
import pandas as pd
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from numpy import loadtxt
train_data = pd.read_csv("./titanic/train.csv")
test_data = pd.read_csv("./titanic/test.csv")

print(train_data.head())
print(test_data.head())


   PassengerId  Survived  Pclass  \
0            1         0       3   
1            2         1       1   
2            3         1       3   
3            4         1       1   
4            5         0       3   

                                                Name     Sex   Age  SibSp  \
0                            Braund, Mr. Owen Harris    male  22.0      1   
1  Cumings, Mrs. John Bradley (Florence Briggs Th...  female  38.0      1   
2                             Heikkinen, Miss. Laina  female  26.0      0   
3       Futrelle, Mrs. Jacques Heath (Lily May Peel)  female  35.0      1   
4                           Allen, Mr. William Henry    male  35.0      0   

   Parch            Ticket     Fare Cabin Embarked  
0      0         A/5 21171   7.2500   NaN        S  
1      0          PC 17599  71.2833   C85        C  
2      0  STON/O2. 3101282   7.9250   NaN        S  
3      0            113803  53.1000  C123        S  
4      0            373450   8.0500   NaN        S  
  

In [19]:
# Create copies to avoid modifying original dataframes, making the cell re-runnable
train_df = train_data.copy()
test_df = test_data.copy()

y_train = train_df['Survived']
train_df.drop(columns=['Survived'], axis=1, inplace=True)

full_data = pd.concat([train_df, test_df], axis=0, sort=False)

drop_columns = ["Name", "Age", "SibSp", "Ticket", "Cabin", "Parch", "Embarked"]
full_data.drop(columns=drop_columns, axis=1, inplace=True)
full_data = pd.get_dummies(full_data, columns=["Sex", "Pclass"])
full_data.fillna(full_data.mean(), inplace=True)

X_train = full_data[:len(train_data)]
X_test = full_data[len(train_data):]

scaler = MinMaxScaler()
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

test_size = 0.3
state = 42

X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=test_size, random_state=state)

In [20]:
lr_list = [0.01, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]

for lr in lr_list:
    gb_clf = GradientBoostingClassifier(learning_rate=lr, max_depth=2, n_estimators=20)
    gb_clf.fit(X_train, y_train)
    y_pred = gb_clf.predict(X_val)
    
    print("Learning rate: ", lr)
    print("Accuracy: ", accuracy_score(y_val, y_pred))
    print("Classification report: ", classification_report(y_val, y_pred))
    print("Confusion matrix: ", confusion_matrix(y_val, y_pred))
    print("\n")

Learning rate:  0.01
Accuracy:  0.585820895522388
Classification report:                precision    recall  f1-score   support

           0       0.59      1.00      0.74       157
           1       0.00      0.00      0.00       111

    accuracy                           0.59       268
   macro avg       0.29      0.50      0.37       268
weighted avg       0.34      0.59      0.43       268

Confusion matrix:  [[157   0]
 [111   0]]


Learning rate:  0.1
Accuracy:  0.7723880597014925
Classification report:                precision    recall  f1-score   support

           0       0.73      0.96      0.83       157
           1       0.90      0.50      0.65       111

    accuracy                           0.77       268
   macro avg       0.82      0.73      0.74       268
weighted avg       0.80      0.77      0.76       268

Confusion matrix:  [[151   6]
 [ 55  56]]


Learning rate:  0.2
Accuracy:  0.7835820895522388
Classification report:                precision    recall  f

  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


In [21]:
gb_clf2 = GradientBoostingClassifier(learning_rate=0.5, max_depth=2, n_estimators=20)
gb_clf2.fit(X_train, y_train)
y_pred = gb_clf2.predict(X_val)

print("Confusion matrix: ", confusion_matrix(y_val, y_pred))
print("Classification report: ", classification_report(y_val, y_pred))

Confusion matrix:  [[146  11]
 [ 41  70]]
Classification report:                precision    recall  f1-score   support

           0       0.78      0.93      0.85       157
           1       0.86      0.63      0.73       111

    accuracy                           0.81       268
   macro avg       0.82      0.78      0.79       268
weighted avg       0.82      0.81      0.80       268



In [22]:
from xgboost import XGBClassifier
xgb_clf = XGBClassifier()
xgb_clf.fit(X_train, y_train)
score = xgb_clf.score(X_val, y_val)
print("Score: ", score)

Score:  0.7910447761194029


In [None]:
from xgboost import XGBClassifier
from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.model_selection import cross_val_score
from numpy import loadtxt

dataset = loadtxt('./diabetes.csv', delimiter=',')

X = dataset[:, 0:8]
Y = dataset[:, 8]

model = XGBClassifier()
kfold = KFold(n_splits=10)
results = cross_val_score(model, X, Y, cv=kfold)
print("Accuracy: %.2f%%" % (results.mean() * 100.0))

Accuracy: 74.35%


In [26]:
stratified_kfold = StratifiedKFold(n_splits=10, shuffle=True)
results = cross_val_score(model, X, Y, cv=stratified_kfold)
print("Accuracy: %.2f%% (%.2f%%)" % (results.mean()*100, results.std()*100))

Accuracy: 75.12% (3.25%)
