In [6]:
import numpy as np
import pandas as pd

from sklearn.linear_model import LinearRegression
from sklearn.impute import SimpleImputer
from sklearn.decomposition import PCA
from sklearn.preprocessing import MinMaxScaler

import joblib
import warnings

# Suppress every warning category for the rest of the kernel session.
# NOTE(review): this blanket filter also hides deprecation and data-quality
# warnings raised by pandas / scikit-learn; consider narrowing it to the
# specific category or module that is actually noisy.
warnings.filterwarnings("ignore")


In [7]:
# Read the three competition tables in one pass; every file uses the "id"
# column as its row index. The files are read in the order listed.
train_df, test_df, sub_df = (
    pd.read_csv(csv_path, index_col="id")
    for csv_path in ("data/train.csv", "data/test.csv", "sample_submission.csv")
)
# Quick sanity check on the size of the test table.
print(test_df.shape)


(20775, 24)


In [8]:
# --- Categorical encoding -------------------------------------------------
# Integer codes for the three categorical columns. The tables are defined
# once and applied to BOTH frames so train and test always share an encoding.
PRODUCT_CODE_MAP = {
    "A": 0,
    "B": 1,
    "C": 2,
    "D": 3,
    "E": 4,
    "F": 5,
    "G": 6,
    "H": 7,
    "I": 8,
}
# The original cell used {"material_5": 0, "material_8": 1} for train but
# {"material_5": 0, "material_7": 1} for test. Whichever label was absent from
# a frame's own table was turned into NaN by Series.map and then silently
# overwritten by the imputer below. Both labels are accepted here (with the
# code each one already had) so the two frames are encoded identically.
# NOTE(review): confirm which labels actually occur in attribute_0 and
# whether "material_7" and "material_8" need distinct codes.
ATTRIBUTE_0_MAP = {
    "material_5": 0,
    "material_7": 1,
    "material_8": 1,
}
ATTRIBUTE_1_MAP = {
    "material_5": 0,
    "material_6": 1,
    "material_7": 2,
    "material_8": 3,
}


def encode_categoricals(df):
    """Return a copy of ``df`` with the categorical columns integer-encoded.

    ``product_code``, ``attribute_0`` and ``attribute_1`` are replaced using
    the module-level mapping tables; all other columns and the column order
    are left untouched. Labels that are not present in a table become NaN
    (standard ``Series.map`` behaviour) and are filled by the imputer later.
    """
    return df.assign(
        product_code=df["product_code"].map(PRODUCT_CODE_MAP),
        attribute_0=df["attribute_0"].map(ATTRIBUTE_0_MAP),
        attribute_1=df["attribute_1"].map(ATTRIBUTE_1_MAP),
    )


# The target column is removed so that train and test expose the same
# feature columns before encoding / imputation.
train_df = encode_categoricals(train_df.drop("failure", axis=1))
test_df = encode_categoricals(test_df)

# --- Missing-value imputation ---------------------------------------------
# Fill gaps with each column's most frequent value. The fill values are
# learned from the training frame only and then re-used for the test frame:
# re-fitting on test (as the original cell did) would impute test rows with
# different statistics than the ones the training features were built with.
imputer = SimpleImputer(strategy="most_frequent")
train_df = pd.DataFrame(
    imputer.fit_transform(train_df),
    columns=train_df.columns,
    index=train_df.index,
)
test_df = pd.DataFrame(
    imputer.transform(test_df),
    columns=test_df.columns,
    index=test_df.index,
)


In [9]:
# Project both feature tables onto the first two principal components.
# The components are learned from the training rows only and then applied
# unchanged to the test rows, so both end up in the same 2-D space.
#
# random_state pins the result for the case where scikit-learn's automatic
# solver selection falls back to the randomized SVD; it has no effect when
# an exact solver is chosen.
# NOTE(review): the estimator loaded later was presumably trained on a PCA
# fitted in another notebook; persisting and loading that fitted PCA would
# be safer than re-fitting here -- confirm.
# After this cell ``train_df`` / ``test_df`` hold the arrays returned by the
# PCA transform rather than the original DataFrames.
pca = PCA(n_components=2, random_state=0)
train_df = pca.fit_transform(train_df)
test_df = pca.transform(test_df)


In [10]:
# Load the persisted estimator, score the projected test rows and write the
# scores out in the sample-submission layout.
# NOTE(review): joblib.load unpickles the file, so "LR_model" must come from
# a trusted source.
model = joblib.load("LR_model")
y_pred = model.predict(test_df)
# Values are assigned positionally to the submission's "failure" column;
# the submission keeps its own "id" index.
sub_df = sub_df.assign(failure=y_pred)
sub_df.to_csv("submission.csv")
