In [371]:
import numpy as np
import pandas as pd

from sklearn.linear_model import LinearRegression
from sklearn.impute import SimpleImputer
from sklearn.decomposition import PCA
from sklearn.preprocessing import MinMaxScaler

import warnings
warnings.filterwarnings('ignore')

In [372]:
# Read the three competition files, using the `id` column as the row index.
# NOTE(review): the submission template is read from the working directory
# while train/test come from `data/` -- confirm this layout is intended.
train_df = pd.read_csv('data/train.csv', index_col='id')
test_df = pd.read_csv('data/test.csv', index_col='id')
sub_df = pd.read_csv('sample_submission.csv', index_col='id')

# Quick sanity check: one (rows, columns) tuple per line.
print(train_df.shape, test_df.shape, sub_df.shape, sep='\n')

(26570, 25)
(20775, 24)
(20775, 1)


In [373]:
# Stack the training features (label column removed) on top of the test
# features so the encodings and the imputer below treat both identically.
total_df = pd.concat([train_df.drop('failure', axis=1), test_df], axis=0)

# Ordinal-encode the string columns. Series.map turns every value that is not
# a key of the mapping into NaN, and the median imputer below would then fill
# it silently -- so each category must appear exactly once in its mapping.
total_df['product_code'] = total_df['product_code'].map(
    {code: idx for idx, code in enumerate('ABCDEFGHI')}
)
total_df['attribute_0'] = total_df['attribute_0'].map({'material_5':0, 'material_7':1})
# One distinct code per material. A repeated key in a dict literal keeps only
# the last value, which would mis-encode one material and leave another
# unmapped. Assumes the fourth category is 'material_8' -- confirm against
# total_df['attribute_1'].unique() before relying on it.
total_df['attribute_1'] = total_df['attribute_1'].map(
    {'material_5':0, 'material_6':1, 'material_7':2, 'material_8':3}
)

# Fill remaining gaps with the per-column median. The imputer is fitted on
# train and test rows together; the DataFrame wrapper restores the column
# names and the id index that fit_transform drops.
imputer = SimpleImputer(strategy='median')
total_df = pd.DataFrame(imputer.fit_transform(total_df), columns=total_df.columns, index=total_df.index)

In [374]:
# Split the combined frame back into its train and test parts. The train rows
# were concatenated first, so they occupy the leading train_df.shape[0] rows.
n_train = train_df.shape[0]
train_data = total_df.iloc[:n_train, :]
test_data = total_df.iloc[n_train:, :]

# Scale first: PCA centres its input but does not rescale it, so on raw
# features the components are driven by whichever columns have the largest
# numeric range. The scaler is fitted on the training rows only and then
# applied unchanged to the test rows.
scaler = MinMaxScaler()
train_data = scaler.fit_transform(train_data)
test_data = scaler.transform(test_data)

# Reduce to two principal components to cut the number of features, in the
# hope of dropping some noise and limiting overfitting. random_state pins the
# result when the solver picked by scikit-learn is the randomized one.
pca = PCA(n_components=2, random_state=42)
train_data = pca.fit_transform(train_data)
test_data = pca.transform(test_data)

In [375]:
# Labels come from the raw training frame, where the `failure` column still
# exists (it was dropped from the feature table).
target = train_df.loc[:, 'failure']
# Train on a copy so later changes to X do not touch train_data.
X = train_data.copy()

In [376]:
# Ordinary least-squares fit with an intercept term.
# NOTE(review): the label is named `failure`; if it is binary, the predictions
# are unbounded scores rather than probabilities -- confirm this is acceptable
# for the evaluation metric.
model = LinearRegression(fit_intercept=True)
model.fit(X=X, y=target)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [377]:
# Score the transformed test rows with the fitted model.
y_pred = model.predict(X=test_data)
# The assignment is positional: y_pred is a plain array, so this presumes the
# rows of sub_df are in the same order as the test rows -- verify the ids.
sub_df['failure'] = y_pred
# Write the submission file; the `id` index is saved as the first column.
sub_df.to_csv(path_or_buf='submission.csv')