In [None]:
import csv
import cv2
import os
import math
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import SimpleImputer, IterativeImputer, KNNImputer
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_val_score, cross_validate
from sklearn import metrics
from sklearn.metrics import precision_score, recall_score, f1_score

### Preprocess the data

In [None]:
# Location of the competition CSV files.
INPUT_PATH = "/kaggle/input/tabular-playground-series-aug-2022"

# Read the train and test tables and use their 'id' column as the row index.
x_train = pd.read_csv(f'{INPUT_PATH}/train.csv').set_index('id')
x_test = pd.read_csv(f'{INPUT_PATH}/test.csv').set_index('id')

# Missing-value flags: for both frames, store as 0/1 integers whether
# measurement_3 and measurement_5 are NaN in each row.
for frame in (x_train, x_test):
    for flag_name, source_column in (('m3_miss_flag', 'measurement_3'),
                                     ('m5_miss_flag', 'measurement_5')):
        frame[flag_name] = frame[source_column].isna().astype(int)

# Fill missing values with SimpleImputer, using each column's most frequent value.
# Covers 'loading' and every 'measurement*' column except measurement_17.
# Each frame gets its own imputer, fitted on the very frame it then transforms.
# NOTE(review): the test frame is therefore filled with test-set statistics, not
# the ones learned on train -- confirm this is intended.
for frame in (x_train, x_test):
    features = [
        column for column in frame.columns
        if column == 'loading'
        or (column.startswith('measurement') and column != 'measurement_17')
    ]
    imputer = SimpleImputer(missing_values=np.nan, strategy='most_frequent')
    frame[features] = imputer.fit_transform(frame[features])

# Fill the missing values of measurement_17 with a per-product LinearRegression.
relative_columns = ['measurement_3', 'measurement_4', 'measurement_5', 'measurement_6',
                    'measurement_7', 'measurement_8', 'measurement_9']  # columns related to measurement_17


def impute_measurement_17(frame, product_codes, predictors):
    """Fill missing measurement_17 values in place, one regression per product.

    For every product code a LinearRegression is fitted on that product's rows
    whose measurement_17 is present, with `predictors` as inputs. Its predictions
    for all rows of the product are stored in a 'pred_m_17' column, and rows
    whose measurement_17 is NaN take the predicted value.

    Parameters
    ----------
    frame : pandas.DataFrame
        Must contain 'product_code', 'measurement_17' and every column named
        in `predictors`. Modified in place.
    product_codes : iterable of str
        Product codes to process; codes with no rows in `frame` are skipped.
    predictors : list of str
        Columns used as regression inputs.

    Returns
    -------
    None
    """
    # Start from a float column (NaN = "no prediction") so that writing the
    # float predictions below never mixes strings and numbers in one column.
    frame['pred_m_17'] = np.nan

    for code in product_codes:
        in_product = frame['product_code'] == code
        if not in_product.any():
            continue  # nothing to fit or fill for this product

        observed = in_product & frame['measurement_17'].notna()
        regression = LinearRegression().fit(
            frame.loc[observed, predictors],
            frame.loc[observed, 'measurement_17'],
        )
        frame.loc[in_product, 'pred_m_17'] = regression.predict(frame.loc[in_product, predictors])

    # Single vectorised fill instead of visiting every row once per product.
    frame['measurement_17'] = frame['measurement_17'].fillna(frame['pred_m_17'])


# NOTE(review): 'pred_m_17' is left in both frames and nothing visible drops it,
# so it is passed to the model as a feature -- confirm that this is intended.
impute_measurement_17(x_train, ("A", "B", "C", "D", "E"), relative_columns)
impute_measurement_17(x_test, ("F", "G", "H", "I"), relative_columns)

# Separate the target ('failure') from the training features.
y_train = x_train['failure']
x_train = x_train.drop(columns=['failure'])

In [None]:
# PCA (Principal Component Analysis): collapse measurement_3 .. measurement_16
# into a single component stored in the 'pca' column.
pca_target = [
    'measurement_3', 'measurement_4', 'measurement_5', 'measurement_6', 'measurement_7',
    'measurement_8', 'measurement_9', 'measurement_10', 'measurement_11', 'measurement_12',
    'measurement_13', 'measurement_14', 'measurement_15', 'measurement_16'
]

# Fit the projection on the training data only and reuse it for the test data,
# so that 'pca' is the same linear combination of the measurements in both
# frames. A second PCA fitted on the test set would learn its own axis and
# produce a feature the trained model has never seen.
pca = PCA(n_components=1)
x_train['pca'] = pca.fit_transform(x_train[pca_target]).ravel()
x_test['pca'] = pca.transform(x_test[pca_target]).ravel()

# Derived feature 'area': attribute_2 multiplied by attribute_3.
x_train['area'] = x_train['attribute_2'] * x_train['attribute_3']
x_test['area'] = x_test['attribute_2'] * x_test['attribute_3']

# Columns removed from both frames: the raw measurements (described by the
# author as having a high distribution mismatch), and the product code and
# attribute columns.
mismatched_measurements = [
    'measurement_3', 'measurement_4', 'measurement_5', 'measurement_6', 'measurement_7',
    'measurement_8', 'measurement_9', 'measurement_10', 'measurement_11', 'measurement_12',
    'measurement_13', 'measurement_14', 'measurement_15', 'measurement_16',
]
product_descriptors = ['product_code', 'attribute_3', 'attribute_2', 'attribute_1', 'attribute_0']
columns_to_remove = mismatched_measurements + product_descriptors

x_train = x_train.drop(columns=columns_to_remove)
x_test = x_test.drop(columns=columns_to_remove)

# Standardization: learn mean / standard deviation on the training data and
# apply that same scaling to the test data, so a given raw value maps to the
# same standardized value in both frames.
standardize_columns = ['loading', 'measurement_0', 'measurement_1', 'measurement_2', 'measurement_17',
                       'm3_miss_flag', 'm5_miss_flag', 'pca', 'area']
scaler = StandardScaler().fit(x_train[standardize_columns])
x_train[standardize_columns] = scaler.transform(x_train[standardize_columns])
x_test[standardize_columns] = scaler.transform(x_test[standardize_columns])

print(x_train.head())

### LogisticRegression, KNN, Decision Tree

In [None]:
# Optionally train on a subset to study the effect of the amount of training data:
# x_train, _, y_train, _ = train_test_split(x_train, y_train, train_size=0.5, random_state=42)

# Exactly one `model = ...` line should be active; the others are alternatives.
# LogisticRegression model
model = LogisticRegression(penalty='l1', C=0.01, solver='liblinear')

# KNN model
# model = KNeighborsClassifier(n_neighbors=5)

# DecisionTree
# model = DecisionTreeClassifier(max_depth=5, random_state=42)

# Cross validation. The fold count is defined once so the reporting loop below
# cannot drift out of sync with the `cv` argument.
N_FOLDS = 5
cv_results = cross_validate(model, x_train, y_train, cv=N_FOLDS,
                            scoring=['accuracy', 'roc_auc', 'precision', 'recall'])

# Per-fold scores followed by their averages.
for i in range(N_FOLDS):
    print(f"Fold {i+1} - Accuracy: {cv_results['test_accuracy'][i]:.3f}, AUROC: {cv_results['test_roc_auc'][i]:.3f}, Precision: {cv_results['test_precision'][i]:.3f}, Recall: {cv_results['test_recall'][i]:.3f}")
print(f"\nValidation: Average Accuracy: {cv_results['test_accuracy'].mean():.3f}, Average AUROC score: {cv_results['test_roc_auc'].mean():.3f}, Average Precision: {cv_results['test_precision'].mean():.3f}, Average Recall: {cv_results['test_recall'].mean():.3f}\n")

# cross_validate fits clones of the estimator, so fit `model` itself on all
# training data before predicting.
model.fit(x_train, y_train)

# Predicted probability of the positive class (second column) for the test data.
y_pred = model.predict_proba(x_test)[:,1]

In [None]:
# Save the result. to_csv opens the target in write mode and therefore replaces
# any existing submission.csv, so no explicit delete is needed. Deleting via a
# different (absolute) path than the one being checked could raise
# FileNotFoundError when the working directory is not /kaggle/working.
submission = pd.DataFrame({'id': x_test.index, 'failure': y_pred})
submission.to_csv('submission.csv', index=False)