In [15]:
import csv
from joblib import load
import os
import math
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import SimpleImputer, IterativeImputer, KNNImputer
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression, LinearRegression
from joblib import dump
from sklearn.model_selection import KFold
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import make_pipeline

In [16]:
# Load the training set and use the `id` column as the row index.
xTrainData = pd.read_csv("train.csv").set_index('id')

# Record which rows were missing measurement_3 / measurement_5 *before* imputation
# overwrites the NaNs, so the missingness itself can be used as a feature.
xTrainData['missingm3'] = xTrainData['measurement_3'].isna()
xTrainData['missingm5'] = xTrainData['measurement_5'].isna()

# Columns to impute: `loading` plus every measurement_* column except measurement_17,
# which is left with its NaNs here.
trainFeatures = [
    column for column in xTrainData.columns
    if column == 'loading'
    or (column.startswith('measurement') and column != 'measurement_17')
]

# Fill the gaps in those columns from the 15 nearest neighbours; the fitted imputer
# stays available as `chosenImputer`.
chosenImputer = KNNImputer(missing_values=np.nan, n_neighbors=15)
xTrainData[trainFeatures] = chosenImputer.fit_transform(xTrainData[trainFeatures])

###
# float_features = []
# for feature in xTrainData.columns:
#     if xTrainData[feature].dtypes == 'float64':
#         float_features.append(feature)

# standardAverage = []
# for feature in float_features:
#     if feature != 'loading':
#         standardAverage.append(feature)
        
# xTrainData['measurement3to17Average'] = xTrainData[standardAverage].mean(axis=1)
# xTrainData['measurement3to17stdev'] = xTrainData[standardAverage].std(axis=1)
###


# Fill missing measurement_17 values with a per-product linear regression on
# measurement_3..measurement_9 (already imputed above, so they contain no NaNs).
comparableColumns = ['measurement_3', 'measurement_4', 'measurement_5', 'measurement_6',
                     'measurement_7', 'measurement_8', 'measurement_9']

# Float placeholder: the column only ever holds regression outputs, so it should be
# numeric from the start rather than an object column seeded with empty strings.
xTrainData['measurement17predictions'] = np.nan

training_list = ['A', 'B', 'C', 'D', 'E']
for result in training_list:
    productRows = xTrainData['product_code'] == result
    # Evaluated before this product's gaps are filled, so it marks the originally observed rows.
    knownTarget = xTrainData['measurement_17'].notna()

    # Fit only on rows of this product where measurement_17 was actually observed.
    train_source = xTrainData[productRows & knownTarget]
    LRofMeasurement17 = LinearRegression().fit(train_source[comparableColumns], train_source['measurement_17'])

    # Store a prediction for every row of the product ...
    sourceForPrediction = xTrainData[productRows]
    xTrainData.loc[productRows, 'measurement17predictions'] = LRofMeasurement17.predict(sourceForPrediction[comparableColumns])

    # ... and copy it into measurement_17 only where the original value was missing.
    missingTarget = productRows & ~knownTarget
    xTrainData.loc[missingTarget, 'measurement_17'] = xTrainData.loc[missingTarget, 'measurement17predictions']
    

In [17]:
# Separate the `failure` column (used as the label in the model cells) from the feature frame.
yTrainData = xTrainData['failure']
xTrainData = xTrainData.drop(columns='failure')

In [18]:
# measurement_3..measurement_16 are compressed into a single principal component
# and then removed from the frame.
PCAObjectives = [
    'measurement_3', 'measurement_4', 'measurement_5', 'measurement_6', 'measurement_7',
    'measurement_8', 'measurement_9', 'measurement_10', 'measurement_11', 'measurement_12',
    'measurement_13', 'measurement_14', 'measurement_15', 'measurement_16'
]

# The columns dropped after PCA are exactly the PCA inputs.
dropped = list(PCAObjectives)

dropped2 = ['product_code', 'attribute_3', 'attribute_2', 'attribute_1', 'attribute_0']

# random_state pins the projection when scikit-learn's 'auto' policy selects the
# randomized SVD solver; it has no effect on the deterministic solvers.
pcaApplied = PCA(n_components=1, random_state=1)
# fit_transform returns shape (n_samples, 1); take the single component as a 1-D column.
xTrainData['pcaApplied'] = pcaApplied.fit_transform(xTrainData[PCAObjectives])[:, 0]

# xTrainData['measurement_2'] = xTrainData['measurement_2'].clip(11, None)

# Interaction feature, computed before attribute_2 / attribute_3 are dropped below.
xTrainData['area'] = xTrainData['attribute_2'] * xTrainData['attribute_3']

xTrainData = xTrainData.drop(columns=dropped + dropped2)

# Standardise the selected features. The fitted scaler is kept in `featureScaler`
# so the same scaling can be re-applied to new data.
# NOTE(review): 'pcaApplied' and 'measurement17predictions' are not in this list and
# therefore stay unscaled - confirm that is intended.
selectionOfFeatures = ['area', 'missingm5', 'missingm3', 'measurement_17', 'measurement_2', 'measurement_1', 'measurement_0', 'loading']
featureScaler = StandardScaler()
xTrainData[selectionOfFeatures] = featureScaler.fit_transform(xTrainData[selectionOfFeatures])




In [19]:
# Nested cross-validation: the outer 5-fold split estimates ROC AUC, and inside each
# outer fold a 5-fold grid search picks the logistic-regression hyper-parameters.
# NOTE(review): the imputer, PCA and scaler in the earlier cells were fitted on the
# full training frame, so these fold scores are not a fully clean hold-out estimate.
param_grid = {'penalty': ['l1', 'l2'],
              'C': [0.001, 0.01, 0.1, 1, 10, 100],
              'solver': ['liblinear'],
              'max_iter': [1000, 10000, 100000]}

kf = KFold(n_splits=5, shuffle=True, random_state=1)

scores = []

for fold, (trainIndex, valueIndex) in enumerate(kf.split(xTrainData, yTrainData)):
    xTrainDataFold, yTrainDataFold = xTrainData.iloc[trainIndex], yTrainData.iloc[trainIndex]
    xValueFold, yValueFold = xTrainData.iloc[valueIndex], yTrainData.iloc[valueIndex]

    # random_state matches the final-fit cell so repeated runs give the same scores.
    grid_search = GridSearchCV(estimator=LogisticRegression(random_state=1), param_grid=param_grid,
                               scoring='roc_auc', cv=5, n_jobs=-1)
    grid_search.fit(xTrainDataFold, yTrainDataFold)

    # best_estimator_ has already been refit on the whole outer-training fold.
    model = grid_search.best_estimator_
    y_pred = model.predict_proba(xValueFold)[:, 1]
    score = roc_auc_score(yValueFold, y_pred)
    scores.append(score)
    print(f"Fold {fold}: {score:.5f}")
print(f"Average ROC AUC = {sum(scores) / len(scores):.5f}")

Fold 0: 0.59089
Fold 1: 0.59299
Fold 2: 0.58226
Fold 3: 0.60640
Fold 4: 0.57984
Average ROC AUC = 0.59048


In [20]:
# Final model: grid-search the same hyper-parameter grid on the full training set
# and persist the winning estimator.
param_grid = {'penalty': ['l1', 'l2'],
              'C': [0.001, 0.01, 0.1, 1, 10, 100],
              'solver': ['liblinear'],
              'max_iter': [1000, 10000, 100000]}
grid_search = GridSearchCV(estimator=LogisticRegression(random_state=1), param_grid=param_grid,
                           scoring='roc_auc', cv=5, n_jobs=-1)
grid_search.fit(xTrainData, yTrainData)

# With the default refit=True, best_estimator_ is already trained on all of
# xTrainData / yTrainData, so no additional fit call is needed before saving.
model = grid_search.best_estimator_
dump(model, 'predictions.joblib')

['predictions.joblib']