In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import matplotlib
import seaborn as sns

from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.model_selection import train_test_split, StratifiedKFold, RepeatedStratifiedKFold
from sklearn.metrics import roc_auc_score

from xgboost import XGBClassifier, XGBRegressor

In [None]:
# Load the competition tables. `id` is read as the index and immediately discarded,
# so both frames end up with a default 0..n-1 RangeIndex and no `id` column.
df = (
    pd.read_csv("/kaggle/input/playground-series-s3e12/train.csv", index_col='id')
    .reset_index(drop=True)
)
df_test = (
    pd.read_csv("/kaggle/input/playground-series-s3e12/test.csv", index_col='id')
    .reset_index(drop=True)
)

# Sanity check: (rows, columns) of the training table, followed by a preview.
print(df.shape)
df.head()

In [None]:
# Per-column summary statistics of the training table (count, mean, std, quartiles, ...).
df.describe()

In [None]:
# Fit a depth-1 decision tree (a single split) on every non-target column and draw it.
# `drop` already returns a new frame, so no extra copy is needed.
X = df.drop('target', axis=1)
y = df['target']

# random_state pins the feature permutation scikit-learn applies while searching for
# the best split, so equally good candidate splits resolve the same way on every run.
model = DecisionTreeClassifier(max_depth=1, random_state=2023)
model.fit(X, y)

# Pass the feature names as a plain list rather than a pandas Index, and end with `;`
# so the cell shows only the figure instead of the list of annotation objects.
plot_tree(model, feature_names=X.columns.tolist());

In [None]:
# Model 1: XGBoost on ('gravity', 'cond', 'calc'), scored with ROC AUC over 10x10
# repeated stratified CV. Every fold is fitted twice:
#   1. a first pass measures the gap between train and validation AUC;
#   2. for each feature, a fraction (gap * feature importance) of the training values
#      is replaced by NaN, and the model is refitted on the noised training data.
# Only the second fit contributes to the stored predictions.
kfold = RepeatedStratifiedKFold(n_splits=10, n_repeats=10, random_state=2023)

valid_preds_1 = []    # out-of-fold probabilities, concatenated in fold order
calcs_1 = []          # per-fold training 'calc' column after NaN injection
valid_targets_1 = []  # targets aligned element-wise with valid_preds_1
test_preds_1 = []     # one test-set probability vector per fold
losses = []           # per-fold validation ROC AUC (higher is better, despite the name)

features = ['gravity', 'cond', 'calc']
params = {
    'n_estimators': 5000,
    'learning_rate': 0.005,
    'max_depth': 3,
    'random_state': 2023,
    'colsample_bytree': 0.8,
    'subsample': 0.7,
    'early_stopping_rounds': 100,
}

# The test matrix does not depend on the fold, so it is built once. The explicit
# .copy() makes it an independent frame, so the .loc write below cannot trigger
# SettingWithCopyWarning or touch df_test.
X_test = df_test[features].copy()
X_test.loc[X_test.calc >= 9.5, 'calc'] = np.nan

for train_ind, valid_ind in kfold.split(X=df, y=df.target):
    train_fold = df.iloc[train_ind]
    valid_fold = df.iloc[valid_ind]
    y_train = train_fold['target']
    y_valid = valid_fold['target']

    # Independent copies: these frames are written to below.
    X_train = train_fold[features].copy()
    X_valid = valid_fold[features].copy()

    # calc values >= 9.5 are replaced by NaN (same rule as applied to X_test above).
    X_train.loc[X_train.calc >= 9.5, 'calc'] = np.nan
    X_valid.loc[X_valid.calc >= 9.5, 'calc'] = np.nan

    # --- First pass: estimate the train/validation AUC gap. ---
    # NOTE(review): the validation fold is also the early-stopping eval_set, so AUCs
    # computed on it are likely optimistic -- confirm this is acceptable.
    model = XGBClassifier(**params)
    model.fit(X_train, y_train, eval_set=[(X_valid, y_valid)], verbose=500)

    feature_importances = model.feature_importances_
    valid_preds = model.predict_proba(X_valid)[:, 1]
    train_preds = model.predict_proba(X_train)[:, 1]
    valid_loss = roc_auc_score(y_valid, valid_preds)
    train_loss = roc_auc_score(y_train, train_preds)

    rate = abs(train_loss - valid_loss)

    # Blank out a fraction rate * importance of each feature's training values.
    # NOTE(review): the same random_state is used for every feature, so the sampled
    # rows presumably overlap heavily across features -- confirm this is intended.
    for k, feature in enumerate(features):
        indexes_to_fillna = X_train[feature].sample(
            frac=rate * feature_importances[k], random_state=2023
        ).index
        X_train.loc[indexes_to_fillna, feature] = np.nan

    print("Rate:", rate)
    for k, feature in enumerate(features):
        print(f"Previous Importance of {feature}: {feature_importances[k]}")
    print("Previous Loss:", valid_loss)

    # --- Second pass: refit on the noised training data and record predictions. ---
    model = XGBClassifier(**params)
    model.fit(X_train, y_train, eval_set=[(X_valid, y_valid)], verbose=500)

    feature_importances = model.feature_importances_
    valid_preds = model.predict_proba(X_valid)[:, 1]
    valid_preds_1 += list(valid_preds)
    valid_targets_1 += list(y_valid)

    test_preds = model.predict_proba(X_test)[:, 1]
    test_preds_1.append(test_preds)

    loss = roc_auc_score(y_valid, valid_preds)
    losses.append(loss)

    calcs_1.append(X_train.calc)
    # These are the importances of the refitted model, so all three lines are
    # labelled "Importance" (the third one used to say "Previous Importance").
    for k, feature in enumerate(features):
        print(f"Importance of {feature}: {feature_importances[k]}")
    print("Loss:", loss)

print(np.mean(losses))

# Element-wise check of the last fold's model against its own (noised) training data.
model.predict(X_train) == y_train

In [None]:
nan

In [None]:
# Model 2: XGBoost on ('osmo', 'cond', 'gravity'), scored with ROC AUC over 10x10
# repeated stratified CV. Every fold is fitted twice: a first pass measures the
# train/validation AUC gap, then a fraction (gap * feature importance) of the training
# values of the first two features is replaced by NaN and the model is refitted.
kfold = RepeatedStratifiedKFold(n_splits=10, n_repeats=10, random_state=2023)

valid_preds_2 = []    # out-of-fold probabilities, concatenated in fold order
valid_targets_2 = []  # targets aligned element-wise with valid_preds_2
test_preds_2 = []     # one test-set probability vector per fold
losses = []           # per-fold validation ROC AUC (higher is better, despite the name)

features = ['osmo', 'cond', 'gravity']
params = {
    'n_estimators': 5000,
    'learning_rate': 0.005,
    'max_depth': 3,
    'random_state': 2023,
    'early_stopping_rounds': 100,
}

# The test matrix does not depend on the fold, so it is built once. The explicit
# .copy() makes it an independent frame, so the .loc write below cannot trigger
# SettingWithCopyWarning or touch df_test.
X_test = df_test[features].copy()
X_test.loc[X_test.osmo >= 921, 'osmo'] = np.nan

for train_ind, valid_ind in kfold.split(X=df, y=df.target):
    train_fold = df.iloc[train_ind]
    valid_fold = df.iloc[valid_ind]
    y_train = train_fold['target']
    y_valid = valid_fold['target']

    # Independent copies: these frames are written to below.
    X_train = train_fold[features].copy()
    X_valid = valid_fold[features].copy()

    # osmo values >= 921 are replaced by NaN (same rule as applied to X_test above).
    X_train.loc[X_train.osmo >= 921, 'osmo'] = np.nan
    X_valid.loc[X_valid.osmo >= 921, 'osmo'] = np.nan

    # --- First pass: estimate the train/validation AUC gap. ---
    # NOTE(review): the validation fold is also the early-stopping eval_set, so AUCs
    # computed on it are likely optimistic -- confirm this is acceptable.
    model = XGBClassifier(**params)
    model.fit(X_train, y_train, eval_set=[(X_valid, y_valid)], verbose=500)

    valid_preds = model.predict_proba(X_valid)[:, 1]
    train_preds = model.predict_proba(X_train)[:, 1]
    valid_loss = roc_auc_score(y_valid, valid_preds)
    train_loss = roc_auc_score(y_train, train_preds)

    rate = abs(train_loss - valid_loss)
    feature_importances = model.feature_importances_

    # Only the first two features receive injected NaNs; the third is left as is.
    # NOTE(review): model 1 noises all three of its features -- confirm the
    # difference is deliberate.
    noised_features = features[:2]
    for k, feature in enumerate(noised_features):
        indexes_to_fillna = X_train[feature].sample(
            frac=rate * feature_importances[k], random_state=2023
        ).index
        X_train.loc[indexes_to_fillna, feature] = np.nan

    print("Rate:", rate)
    for k, feature in enumerate(noised_features):
        print(f"Previous Importance of {feature}: {feature_importances[k]}")
    print("Previous Loss:", valid_loss)

    # --- Second pass: refit on the noised training data and record predictions. ---
    model = XGBClassifier(**params)
    model.fit(X_train, y_train, eval_set=[(X_valid, y_valid)], verbose=500)

    feature_importances = model.feature_importances_
    valid_preds = model.predict_proba(X_valid)[:, 1]
    valid_preds_2 += list(valid_preds)
    valid_targets_2 += list(y_valid)

    test_preds = model.predict_proba(X_test)[:, 1]
    test_preds_2.append(test_preds)

    loss = roc_auc_score(y_valid, valid_preds)
    losses.append(loss)

    for k, feature in enumerate(noised_features):
        print(f"Importance of {feature}: {feature_importances[k]}")
    print("Loss:", loss)

print(np.mean(losses))

# Element-wise check of the last fold's model against its own (noised) training data.
model.predict(X_train) == y_train

In [None]:
# Model 3: XGBoost on ('gravity', 'osmo', 'calc'), scored with ROC AUC over 10x10
# repeated stratified CV. Every fold is fitted twice: a first pass measures the
# train/validation AUC gap, then a fraction (gap * feature importance) of the training
# values of the first two features is replaced by NaN and the model is refitted.
kfold = RepeatedStratifiedKFold(n_splits=10, n_repeats=10, random_state=2023)

valid_preds_3 = []    # out-of-fold probabilities, concatenated in fold order
valid_targets_3 = []  # targets aligned element-wise with valid_preds_3
test_preds_3 = []     # one test-set probability vector per fold
losses = []           # per-fold validation ROC AUC (higher is better, despite the name)

features = ['gravity', 'osmo', 'calc']
params = {
    'n_estimators': 5000,
    'learning_rate': 0.005,
    'max_depth': 3,
    'random_state': 2023,
    'early_stopping_rounds': 100,
}

# The test matrix does not depend on the fold, so it is built once. The explicit
# .copy() makes it an independent frame, so the .loc writes below cannot trigger
# SettingWithCopyWarning or touch df_test.
X_test = df_test[features].copy()
X_test.loc[X_test.osmo >= 921, 'osmo'] = np.nan
X_test.loc[X_test.calc >= 9, 'calc'] = np.nan

for train_ind, valid_ind in kfold.split(X=df, y=df.target):
    train_fold = df.iloc[train_ind]
    valid_fold = df.iloc[valid_ind]
    y_train = train_fold['target']
    y_valid = valid_fold['target']

    # Independent copies: these frames are written to below.
    X_train = train_fold[features].copy()
    X_valid = valid_fold[features].copy()

    # osmo >= 921 and calc >= 9 are replaced by NaN (same rules as for X_test above).
    X_train.loc[X_train.osmo >= 921, 'osmo'] = np.nan
    X_valid.loc[X_valid.osmo >= 921, 'osmo'] = np.nan

    X_train.loc[X_train.calc >= 9, 'calc'] = np.nan
    X_valid.loc[X_valid.calc >= 9, 'calc'] = np.nan

    # --- First pass: estimate the train/validation AUC gap. ---
    # NOTE(review): the validation fold is also the early-stopping eval_set, so AUCs
    # computed on it are likely optimistic -- confirm this is acceptable.
    model = XGBClassifier(**params)
    model.fit(X_train, y_train, eval_set=[(X_valid, y_valid)], verbose=500)

    valid_preds = model.predict_proba(X_valid)[:, 1]
    train_preds = model.predict_proba(X_train)[:, 1]
    valid_loss = roc_auc_score(y_valid, valid_preds)
    train_loss = roc_auc_score(y_train, train_preds)

    rate = abs(train_loss - valid_loss)
    feature_importances = model.feature_importances_

    # Only the first two features receive injected NaNs; the third is left as is.
    # NOTE(review): model 1 noises all three of its features -- confirm the
    # difference is deliberate.
    noised_features = features[:2]
    for k, feature in enumerate(noised_features):
        indexes_to_fillna = X_train[feature].sample(
            frac=rate * feature_importances[k], random_state=2023
        ).index
        X_train.loc[indexes_to_fillna, feature] = np.nan

    print("Rate:", rate)
    for k, feature in enumerate(noised_features):
        print(f"Previous Importance of {feature}: {feature_importances[k]}")
    print("Previous Loss:", valid_loss)

    # --- Second pass: refit on the noised training data and record predictions. ---
    model = XGBClassifier(**params)
    model.fit(X_train, y_train, eval_set=[(X_valid, y_valid)], verbose=500)

    feature_importances = model.feature_importances_
    valid_preds = model.predict_proba(X_valid)[:, 1]
    valid_preds_3 += list(valid_preds)
    valid_targets_3 += list(y_valid)

    test_preds = model.predict_proba(X_test)[:, 1]
    test_preds_3.append(test_preds)

    loss = roc_auc_score(y_valid, valid_preds)
    losses.append(loss)

    for k, feature in enumerate(noised_features):
        print(f"Importance of {feature}: {feature_importances[k]}")
    print("Loss:", loss)

print(np.mean(losses))

In [None]:
# Assemble the level-2 training table: one column of out-of-fold probabilities per
# base model plus the target. The target column is taken from model 3's run.
# NOTE(review): this relies on all three CV loops having produced rows in the same
# order (they use the same splitter settings and seed) -- confirm.
oof_columns = {
    'preds_1': valid_preds_1,
    'preds_2': valid_preds_2,
    'preds_3': valid_preds_3,
}
valid_preds_df = pd.DataFrame({**oof_columns, 'target': valid_targets_3})

# Matching test table: for each base model, stack its per-fold test predictions as
# columns and average across folds, giving one probability per test row.
per_fold_test_preds = {
    'preds_1': test_preds_1,
    'preds_2': test_preds_2,
    'preds_3': test_preds_3,
}
test_preds_df = pd.DataFrame({
    column: np.mean(np.column_stack(fold_preds), axis=1)
    for column, fold_preds in per_fold_test_preds.items()
})

In [None]:
# Summary statistics of the stacked out-of-fold predictions and the target column.
valid_preds_df.describe()

In [None]:
# Level-2 stacker: logistic regression on the three base-model probability columns,
# evaluated with repeated stratified CV. Each fold's model also scores the averaged
# test predictions; those per-fold vectors are stored row-wise in `test_preds`.
# NOTE(review): valid_preds_df holds every training row once per CV repeat of the base
# models, so copies of the same sample can fall on both sides of a split here and the
# printed AUCs may be optimistic -- confirm this is acceptable.
kfold = RepeatedStratifiedKFold(n_splits=10, n_repeats=10, random_state=2023)

# Separate features from the target once instead of popping columns off per-fold slices.
stack_X = valid_preds_df.drop(columns='target')
stack_y = valid_preds_df['target']

# One row per fold; sized from the splitter so it stays in sync with its configuration.
test_preds = np.zeros((kfold.get_n_splits(), len(test_preds_df)))

losses = []  # per-fold validation ROC AUC (higher is better, despite the name)
for k, (train_ind, valid_ind) in enumerate(kfold.split(X=stack_X, y=stack_y)):
    X_train = stack_X.iloc[train_ind]
    X_valid = stack_X.iloc[valid_ind]
    y_train = stack_y.iloc[train_ind]
    y_valid = stack_y.iloc[valid_ind]

    model = LogisticRegression(C=2)
    model.fit(X_train, y_train)

    test_pred = model.predict_proba(test_preds_df)[:, 1]
    test_preds[k] = test_pred

    valid_preds = model.predict_proba(X_valid)[:, 1]
    losses.append(roc_auc_score(y_valid, valid_preds))
    print(losses[-1])

print(np.mean(losses))

In [None]:
# Mean of the per-fold ROC AUC values currently held in `losses`
# (when the notebook is run top to bottom, those of the stacking cell).
np.mean(losses)

In [None]:
from eli5.sklearn import PermutationImportance
import eli5

# Permutation importance of the three stacked inputs, computed on whatever `model`,
# `X_valid` and `y_valid` currently hold (when run top to bottom: the logistic
# regression and validation split of the last stacking fold only).
# random_state fixes the shuffles so the reported weights are reproducible, in line
# with the seed used throughout the notebook.
perm = PermutationImportance(model, random_state=2023).fit(X_valid, y_valid)
eli5.show_weights(perm, feature_names=X_train.columns.tolist())

In [None]:
# Fitted coefficients of the estimator currently bound to `model`
# (when run top to bottom: the logistic regression from the last stacking fold).
model.coef_

In [None]:
ss = pd.read_csv("/kaggle/input/playground-series-s3e12/sample_submission.csv")

# `test_preds` holds one row of test probabilities per stacking fold; column_stack
# turns the rows into columns so the row-wise mean is the fold-averaged prediction.
# Bracket assignment is used on purpose: attribute-style assignment (`ss.target = ...`)
# would silently set a plain Python attribute instead of a column if 'target' were
# ever missing from the sample file.
ss['target'] = np.mean(np.column_stack(test_preds), axis=1)
ss.head()

In [None]:
# Histogram of the predicted probabilities about to be submitted (40 bins).
ss.target.hist(bins=40, rwidth=0.8)

In [None]:
# Write the submission file; index=False keeps the DataFrame index out of the CSV.
ss.to_csv("submission.csv", index=False)

In [None]:
nan