In [96]:
import numpy as np 
import pandas as pd
import os
import gc
import matplotlib.pyplot as plt
import seaborn as sns
import scipy
import optuna
from category_encoders import OneHotEncoder, MEstimateEncoder, CatBoostEncoder, OrdinalEncoder
from sklearn import set_config
import category_encoders
from sklearn.inspection import permutation_importance, PartialDependenceDisplay
from sklearn.model_selection import StratifiedKFold, RepeatedStratifiedKFold, train_test_split
from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.ensemble import RandomForestRegressor, IsolationForest
from sklearn.metrics import roc_auc_score, roc_curve, make_scorer, f1_score, accuracy_score
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import SimpleImputer, IterativeImputer, KNNImputer
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.base import BaseEstimator, TransformerMixin, clone
from sklearn.preprocessing import FunctionTransformer,StandardScaler, MinMaxScaler, LabelEncoder
from sklearn.preprocessing import PolynomialFeatures
import category_encoders as ce
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression, RidgeClassifier
from sklearn.naive_bayes import GaussianNB, BernoulliNB
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, BaggingClassifier
from sklearn.ensemble import HistGradientBoostingClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.ensemble import VotingClassifier, StackingClassifier
from sklearn.svm import SVC
from sklearn.metrics import auc, roc_auc_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis
from sklearn.gaussian_process import GaussianProcessClassifier
from scipy.cluster.hierarchy import dendrogram, linkage
from scipy.spatial.distance import squareform
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier, Pool
import warnings
from great_tables import GT, style ,exibble, from_column, loc
from colorama import Style, Fore

sns.set_theme(style = 'white', palette = 'colorblind')
pal = sns.color_palette('colorblind')

pd.set_option('display.max_rows', 100)
set_config(transform_output = 'pandas')
pd.options.mode.chained_assignment = None
warnings.simplefilter(action='ignore', category=FutureWarning)

pd.set_option('display.max_rows', 150)
import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

In [34]:
def customStatistic(df: pd.DataFrame(), categoric = False):
    num_cols = list(df._get_numeric_data())
    cat_cols = list(df.drop(num_cols,axis=1))
    if categoric:
        desc = pd.DataFrame(index = list(df[cat_cols]))
        df = df[cat_cols]
    else:
        desc = pd.DataFrame(index = list(df[num_cols]))
        df = df[num_cols]
        desc['skew'] = df[num_cols].skew()
        
    desc['type'] = df.dtypes
    desc['count'] = df.count()
    desc['nunique'] = df.nunique()
    desc['%unique'] = desc['nunique'] /len(df) * 100 
    desc['null'] = df.isnull().sum()
    desc['%null'] = desc['null'] / len(df) * 100
    desc = pd.concat([desc,df.describe().T.drop('count',axis=1)],axis=1)    

    desc = desc.round(2)
    return desc.reset_index().rename(columns={'index':'Column'}).sort_values(by=['type'])

In [35]:
train = pd.read_csv(r'data/train.csv', index_col='id')
test = pd.read_csv(r'data/test.csv', index_col='id')
sub  = pd.read_csv(r'data/sample_submission.csv')

In [36]:
target_categories = ['Pastry', 'Z_Scratch', 'K_Scatch', 'Stains', 'Dirtiness', 'Bumps', 'Other_Faults']
idx_vals = np.argmax(train[target_categories], axis = 1)

target_vals = []
for i in range(0, len(idx_vals)):
    target_vals.append(target_categories[idx_vals[i]])

train['TARGET'] = target_vals
train = train.drop(target_categories, axis = 1)

In [37]:
stat = customStatistic(train,False)
GT(stat)\
    .tab_header(title='Descriptive Statistic - Train', subtitle='Numeric Fields')\
    .data_color(columns=['min','max','mean'],palette=['lightblue','lightcoral'],alpha=0.5)\
    .fmt_percent(columns=['%unique','%null'])

Descriptive Statistic - Train,Descriptive Statistic - Train,Descriptive Statistic - Train,Descriptive Statistic - Train,Descriptive Statistic - Train,Descriptive Statistic - Train,Descriptive Statistic - Train,Descriptive Statistic - Train,Descriptive Statistic - Train,Descriptive Statistic - Train,Descriptive Statistic - Train,Descriptive Statistic - Train,Descriptive Statistic - Train,Descriptive Statistic - Train,Descriptive Statistic - Train
Numeric Fields,Numeric Fields.1,Numeric Fields.2,Numeric Fields.3,Numeric Fields.4,Numeric Fields.5,Numeric Fields.6,Numeric Fields.7,Numeric Fields.8,Numeric Fields.9,Numeric Fields.10,Numeric Fields.11,Numeric Fields.12,Numeric Fields.13,Numeric Fields.14
X_Minimum,0.02,int64,19219,1191,620.00%,0.0,0.00%,709.85,531.54,0.0,49.0,777.0,1152.0,1705.0
TypeOfSteel_A400,-0.39,int64,19219,2,1.00%,0.0,0.00%,0.6,0.49,0.0,0.0,1.0,1.0,1.0
TypeOfSteel_A300,0.4,int64,19219,2,1.00%,0.0,0.00%,0.4,0.49,0.0,0.0,0.0,1.0,1.0
Length_of_Conveyer,0.86,int64,19219,99,52.00%,0.0,0.00%,1459.35,145.57,1227.0,1358.0,1364.0,1652.0,1794.0
Maximum_of_Luminosity,1.17,int64,19219,98,51.00%,0.0,0.00%,128.65,14.2,39.0,124.0,127.0,135.0,253.0
Minimum_of_Luminosity,-0.33,int64,19219,162,84.00%,0.0,0.00%,84.81,28.8,0.0,70.0,90.0,105.0,196.0
Sum_of_Luminosity,6.69,int64,19219,2595,"1,350.00%",0.0,0.00%,191846.68,442024.69,250.0,9848.0,18238.0,67978.0,11591414.0
Steel_Plate_Thickness,2.36,int64,19219,27,14.00%,0.0,0.00%,76.21,53.93,40.0,40.0,69.0,80.0,300.0
X_Perimeter,6.32,int64,19219,460,239.00%,0.0,0.00%,95.65,177.82,2.0,15.0,25.0,64.0,7553.0
Pixels_Areas,6.98,int64,19219,1154,600.00%,0.0,0.00%,1683.99,3730.32,6.0,89.0,168.0,653.0,152655.0


In [38]:
stat = customStatistic(test,False)
GT(stat)\
    .tab_header(title='Descriptive Statistic - Test', subtitle='Numeric Fields')\
    .data_color(columns=['min','max','mean'],palette=['lightblue','lightcoral'],alpha=0.5)\
    .fmt_percent(columns=['%unique','%null'])

Descriptive Statistic - Test,Descriptive Statistic - Test,Descriptive Statistic - Test,Descriptive Statistic - Test,Descriptive Statistic - Test,Descriptive Statistic - Test,Descriptive Statistic - Test,Descriptive Statistic - Test,Descriptive Statistic - Test,Descriptive Statistic - Test,Descriptive Statistic - Test,Descriptive Statistic - Test,Descriptive Statistic - Test,Descriptive Statistic - Test,Descriptive Statistic - Test
Numeric Fields,Numeric Fields.1,Numeric Fields.2,Numeric Fields.3,Numeric Fields.4,Numeric Fields.5,Numeric Fields.6,Numeric Fields.7,Numeric Fields.8,Numeric Fields.9,Numeric Fields.10,Numeric Fields.11,Numeric Fields.12,Numeric Fields.13,Numeric Fields.14
X_Minimum,0.01,int64,12814,1131,883.00%,0.0,0.00%,709.33,531.88,0.0,46.0,776.0,1152.0,1688.0
TypeOfSteel_A400,-0.38,int64,12814,2,2.00%,0.0,0.00%,0.59,0.49,0.0,0.0,1.0,1.0,1.0
TypeOfSteel_A300,0.39,int64,12814,2,2.00%,0.0,0.00%,0.41,0.49,0.0,0.0,0.0,1.0,1.0
Length_of_Conveyer,0.85,int64,12814,92,72.00%,0.0,0.00%,1460.14,146.11,1227.0,1358.0,1364.0,1652.0,1727.0
Maximum_of_Luminosity,1.36,int64,12814,99,77.00%,0.0,0.00%,128.69,14.28,37.0,124.0,127.0,135.0,253.0
Minimum_of_Luminosity,-0.33,int64,12814,161,126.00%,0.0,0.00%,84.55,28.71,0.0,68.0,90.0,105.0,203.0
Sum_of_Luminosity,9.06,int64,12814,2300,"1,795.00%",0.0,0.00%,192689.3,481287.15,535.0,9968.0,18402.0,66960.25,11591414.0
Steel_Plate_Thickness,2.4,int64,12814,24,19.00%,0.0,0.00%,75.87,53.49,40.0,40.0,69.0,80.0,300.0
X_Perimeter,19.42,int64,12814,435,339.00%,0.0,0.00%,96.46,219.93,2.0,15.0,24.0,64.0,10449.0
Pixels_Areas,7.6,int64,12814,1070,835.00%,0.0,0.00%,1672.91,3725.52,4.0,91.0,169.0,649.0,152655.0


In [39]:
def plot_numerical():
    #num = train.select_dtypes(include=['int64','float64']).columns

    df = pd.concat([train[NUMERIC_COLS].assign(Source = 'Train'), 
                    test[NUMERIC_COLS].assign(Source = 'Test')], ignore_index = True)

    # Use of more advanced artistic matplotlib interface (see the axes)
    fig, axes = plt.subplots(len(NUMERIC_COLS), 3 ,figsize = (16, len(NUMERIC_COLS) * 4), 
                             gridspec_kw = {'hspace': 0.35, 'wspace': 0.3, 
                                            'width_ratios': [0.80, 0.20, 0.20]})

    for i,col in enumerate(NUMERIC_COLS):
        ax = axes[i,0]
        sns.kdeplot(data = df[[col, 'Source']], x = col, hue = 'Source', palette=['#456cf0', '#ed7647'], linewidth = 2.1, warn_singular=False, ax = ax) # Use of seaborn with artistic interface
        ax.set_title(f"\n{col}",fontsize = 9)
        ax.grid(visible=True, which = 'both', linestyle = '--', color='lightgrey', linewidth = 0.75)
        ax.set(xlabel = '', ylabel = '')

        ax = axes[i,1]
        sns.boxplot(data = df.loc[df.Source == 'Train', [col]], y = col, width = 0.25, linewidth = 0.90, fliersize= 2.25, color = '#456cf0', ax = ax)
        ax.set(xlabel = '', ylabel = '')
        ax.set_title("Train", fontsize = 9)

        ax = axes[i,2]
        sns.boxplot(data = df.loc[df.Source == 'Test', [col]], y = col, width = 0.25, linewidth = 0.90, fliersize= 2.25, color = '#ed7647', ax = ax)
        ax.set(xlabel = '', ylabel = '')
        ax.set_title("Test", fontsize = 9)

    plt.suptitle(f'\nDistribution analysis - numerical features',fontsize = 12, y = 0.89, x = 0.57, fontweight='bold')
    plt.show()

In [51]:
NUMERIC_COLS = train.select_dtypes(exclude='object').columns

# plot_numerical()

In [None]:
# Set number of rows and columns for subplots
num_rows = 6
num_cols = 3

# Create subplots
fig, axes = plt.subplots(num_rows, num_cols, figsize=(30, 30))

# Flatten axes array for easy iteration
axes = axes.flatten()

for i, feature in enumerate(features):
    # Create a new column to mark labels
    df_train['classes'] = train[labels].idxmax(axis=1)
    
    # Set order of hue for consistency
    hue_order = ['Pastry', 'Z_Scratch', 'K_Scatch', 'Stains', 'Dirtiness', 'Bumps', 'Other_Faults']
    
    # Plot histogram with marked labels
    sns.histplot(data=train, x=feature, hue='classes', multiple='stack', hue_order=hue_order, legend=True, ax=axes[i])
    
    # Set title and labels
    axes[i].set_title(f'Distribution of {feature} with classes')
    axes[i].set_xlabel(feature)
    axes[i].set_ylabel('Count')

# Adjust layout to prevent overlapping of subplots
plt.tight_layout()

# Show the plots
plt.show()

# Drop the temporary column
df_train.drop('classes', axis=1, inplace=True)

# Model Building

In [92]:
target_variables = ['Pastry', 'Z_Scratch', 'K_Scatch', 'Stains', 'Dirtiness', 'Bumps', 'Other_Faults']

In [99]:
train = pd.read_csv(r'data/train.csv', index_col='id')
test = pd.read_csv(r'data/test.csv', index_col='id')
df_submission = pd.read_csv('data/sample_submission.csv')


X = train.drop(target_variables, axis=1)
y = train[target_variables]

# Train Test Split
X_train , X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)

In [102]:
# Define XGBoost parameters
xgboost_params = {
    "objective": "binary:logistic",
    "eval_metric": "auc",
    #"max_depth": 12,
    "learning_rate": 0.005,
    "subsample": 0.85,
    "colsample_bytree": 0.85,
    "alpha": 0.001,
    "lambda": 0.001,
    "gamma": 0.1,
    "min_child_weight": 5,
    "n_estimators": 1500,
    "random_state": 42
}
df_submission[target_variables] = 0
# Initialize XGBoost classifier
xgboost_model = XGBClassifier(**xgboost_params)

# List to store AUC scores for each target variable
auc_scores = []

# Perform cross-validation for each target variable
for target in target_variables:
    # Split the data
    X_train, X_test, y_train, y_test = train_test_split(X, y[target], test_size=0.2, random_state=42)
    
    # Fit the model
    xgboost_model.fit(X_train, y_train)
    
    # Predict probabilities
    y_pred_proba = xgboost_model.predict_proba(X_test)[:, 1]
    
    # Calculate AUC score
    auc_score = roc_auc_score(y_test, y_pred_proba)
    auc_scores.append(auc_score)

    # Add predictions to submission DataFrame
    df_submission[target] += xgboost_model.predict_proba(test)[:, 1]

# Calculate mean AUC score
mean_auc_score = np.mean(auc_scores)

# Print mean AUC score
print("Mean AUC Score (XGBoost):", mean_auc_score)

Mean AUC Score (XGBoost): 0.8882271738632525


In [103]:
# Save submission file
df_submission.to_csv('submission.csv', index=False)