In [13]:
import csv
import math
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import SimpleImputer, IterativeImputer, KNNImputer
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression, LinearRegression
from joblib import dump

In [None]:
INPUT_PATH = "tabular-playground-series-aug-2022"

# Read the training data and use the `id` column as the row index.
x_train = pd.read_csv(f'{INPUT_PATH}/train.csv').set_index('id')

# Missing-value indicator flags (1 = value was missing), computed BEFORE
# imputation so the information is not lost once the NaNs are filled.
x_train['m3_miss_flag'] = x_train['measurement_3'].isna().astype(int)
x_train['m5_miss_flag'] = x_train['measurement_5'].isna().astype(int)

# Fill missing values of `loading` and every measurement column except
# measurement_17 with the most frequent value of each column (SimpleImputer).
features = [f for f in x_train.columns if f == 'loading' or (f.startswith('measurement') and f != 'measurement_17')]
imputer = SimpleImputer(missing_values=np.nan, strategy='most_frequent')
x_train[features] = imputer.fit_transform(x_train[features])

# Fill the missing values of measurement_17 with a per-product LinearRegression
# fitted on the (already imputed) columns listed below.
relative_columns = ['measurement_3', 'measurement_4', 'measurement_5', 'measurement_6', 'measurement_7', 'measurement_8', 'measurement_9']  # columns related to measurement_17

# Float column (NaN = "no prediction yet") instead of an object column of "".
# NOTE(review): 'pred_m_17' is left in x_train by this cell; unless a later
# step drops it, it is passed to the model as a feature - confirm intended.
x_train['pred_m_17'] = np.nan

# Product groups are disjoint, so the missing mask can be computed once.
m17_missing = x_train['measurement_17'].isna()
for product in ("A", "B", "C", "D", "E"):
    is_product = x_train['product_code'] == product
    # Fit only on rows of this product where measurement_17 is observed.
    train_source = x_train.loc[is_product & ~m17_missing]
    m17_lr = LinearRegression().fit(train_source[relative_columns], train_source['measurement_17'])
    # Predict for every row of this product (observed and missing alike).
    predict_source = x_train.loc[is_product]
    x_train.loc[is_product, 'pred_m_17'] = m17_lr.predict(predict_source[relative_columns])

# Vectorised fill: only NaN slots of measurement_17 receive the prediction;
# observed values are left untouched.
x_train['measurement_17'] = x_train['measurement_17'].fillna(x_train['pred_m_17'])

# Split features (x) and target (y).
y_train = x_train['failure']
x_train = x_train.drop(columns=['failure'])

In [15]:
# PCA : Principal Component Analysis
# Compress measurement_3 .. measurement_16 into a single component.
pca_target = [
    'measurement_3', 'measurement_4', 'measurement_5', 'measurement_6', 'measurement_7',
    'measurement_8', 'measurement_9', 'measurement_10', 'measurement_11', 'measurement_12',
    'measurement_13', 'measurement_14', 'measurement_15', 'measurement_16'
]
# random_state pins the result when a randomized SVD solver is selected;
# it is ignored by the deterministic solvers.
pca = PCA(n_components=1, random_state=0)
# fit_transform returns an (n_samples, 1) array; flatten it to one column.
x_train['pca'] = pca.fit_transform(x_train[pca_target]).ravel()

# calculating area
x_train['area'] = x_train['attribute_2'] * x_train['attribute_3']

# Remove the columns summarised by the PCA component (tried because of high
# distribution mismatch) together with the product/attribute columns.
x_train = x_train.drop(
    columns=pca_target + ['product_code', 'attribute_3', 'attribute_2', 'attribute_1', 'attribute_0']
)

# standardization
# The fitted scaler is kept in `scaler` so the same transformation can be
# re-applied to other data instead of being discarded after fitting.
standardize_columns = ['loading', 'measurement_0', 'measurement_1', 'measurement_2', 'measurement_17', 'm3_miss_flag', 'm5_miss_flag', 'pca', 'area']
scaler = StandardScaler()
x_train[standardize_columns] = scaler.fit_transform(x_train[standardize_columns])

In [None]:
# LogisticRegression model
# L1-penalised logistic regression (C=0.01, liblinear solver).
# liblinear shuffles the data using random_state, so it is fixed here to make
# the dumped weights reproducible across runs.
model = LogisticRegression(penalty='l1', C=0.01, solver='liblinear', random_state=0)
model.fit(x_train, y_train)

# Persist the fitted model to disk.
dump(model, 'weights.joblib')