In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.multioutput import MultiOutputClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score

import optuna

In [2]:
# Read the three competition CSVs from the working directory in one pass:
# the training frame, the test frame, and the sample submission frame.
train, test, submission = (
    pd.read_csv(csv_name)
    for csv_name in ('train.csv', 'test.csv', 'sample_submission.csv')
)

In [3]:
# Show the column labels of the training frame.
train.columns

Index(['id', 'X_Minimum', 'X_Maximum', 'Y_Minimum', 'Y_Maximum',
       'Pixels_Areas', 'X_Perimeter', 'Y_Perimeter', 'Sum_of_Luminosity',
       'Minimum_of_Luminosity', 'Maximum_of_Luminosity', 'Length_of_Conveyer',
       'TypeOfSteel_A300', 'TypeOfSteel_A400', 'Steel_Plate_Thickness',
       'Edges_Index', 'Empty_Index', 'Square_Index', 'Outside_X_Index',
       'Edges_X_Index', 'Edges_Y_Index', 'Outside_Global_Index', 'LogOfAreas',
       'Log_X_Index', 'Log_Y_Index', 'Orientation_Index', 'Luminosity_Index',
       'SigmoidOfAreas', 'Pastry', 'Z_Scratch', 'K_Scatch', 'Stains',
       'Dirtiness', 'Bumps', 'Other_Faults'],
      dtype='object')

In [4]:
# X, y split
# `target_bins` holds the label columns (one column per entry of
# `target_classes`); `X` holds every remaining column except 'id'.
target_classes = ['Pastry', 'Z_Scratch', 'K_Scatch', 'Stains', 'Dirtiness', 'Bumps', 'Other_Faults']
target_bins = train[target_classes]

# Build X with a single non-mutating drop. The earlier in-place drop removed
# the label columns from `train` itself (so re-running this cell raised a
# KeyError on `train[target_classes]`) and assigned the `None` returned by an
# in-place drop to X. Leaving `train` untouched keeps the cell idempotent.
X = train.drop(columns=target_classes + ['id'])

In [5]:
# Read the sample submission frame and the test frame from disk again,
# rebinding `submission` and `test` to freshly loaded copies.
submission = pd.read_csv('sample_submission.csv')
test = pd.read_csv('test.csv')

In [6]:
# Label column names, listed in the order used throughout the notebook.
target_classes = [
    'Pastry', 'Z_Scratch', 'K_Scatch', 'Stains',
    'Dirtiness', 'Bumps', 'Other_Faults',
]
# Copy of the submission frame with the label columns removed.
# NOTE(review): `sub` is not referenced by any later cell visible here.
sub = submission.drop(columns=target_classes)

In [7]:
# Remove the 'id' column so that `test` no longer carries it.
# `errors='ignore'` makes the cell idempotent: `test` is rebound to its own
# result, so a second run would otherwise raise a KeyError because 'id' is
# already gone.
test = test.drop(columns='id', errors='ignore')

In [8]:
# Display the test frame after the 'id' column has been dropped.
test

Unnamed: 0,X_Minimum,X_Maximum,Y_Minimum,Y_Maximum,Pixels_Areas,X_Perimeter,Y_Perimeter,Sum_of_Luminosity,Minimum_of_Luminosity,Maximum_of_Luminosity,...,Outside_X_Index,Edges_X_Index,Edges_Y_Index,Outside_Global_Index,LogOfAreas,Log_X_Index,Log_Y_Index,Orientation_Index,Luminosity_Index,SigmoidOfAreas
0,1015,1033,3826564,3826588,659,23,46,62357,67,127,...,0.0095,0.5652,1.0000,1.0,2.8410,1.1139,1.6628,0.6727,-0.2261,0.9172
1,1257,1271,419960,419973,370,26,28,39293,92,132,...,0.0047,0.2414,1.0000,1.0,2.5682,0.9031,1.4472,0.9063,-0.1453,0.9104
2,1358,1372,117715,117724,289,36,32,29386,101,134,...,0.0155,0.6000,0.7500,0.0,2.4609,1.3222,1.3222,-0.5238,-0.0435,0.6514
3,158,168,232415,232440,80,10,11,8586,107,140,...,0.0037,0.8000,1.0000,1.0,1.9031,0.6990,1.0414,0.1818,-0.0738,0.2051
4,559,592,544375,544389,140,19,15,15524,103,134,...,0.0158,0.8421,0.5333,0.0,2.1461,1.3222,1.1461,-0.5714,-0.0894,0.4170
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12809,1101,1116,447943,447992,313,32,37,21603,79,103,...,0.0126,0.4063,0.9194,1.0,2.4955,1.2305,1.6335,0.7661,-0.3109,0.8894
12810,1289,1306,3149494,3149542,59,9,18,5249,113,141,...,0.0052,0.7778,1.0000,1.0,1.7708,0.8451,1.2553,0.7222,-0.0448,0.1954
12811,41,210,1587535,1587191,16584,796,522,1858162,24,143,...,0.1236,0.2199,0.4097,0.0,4.2525,2.2504,2.2672,-0.0629,-0.0801,1.0000
12812,1329,1340,702237,702267,386,43,34,36875,66,124,...,0.0095,0.2407,1.0000,1.0,2.5866,1.1139,1.5911,0.8461,-0.2629,0.7844


In [9]:
# Take the column labels of the label frame `target_bins` and display them.
target_names = target_bins.columns
target_names

Index(['Pastry', 'Z_Scratch', 'K_Scatch', 'Stains', 'Dirtiness', 'Bumps',
       'Other_Faults'],
      dtype='object')

In [10]:
# Fit one independent random forest per label column and write its predicted
# probabilities for the test rows into the matching `submission` column.
# The hyperparameters are shared by every per-label model; the fixed
# random_state keeps repeated runs reproducible.
rf_params = dict(
    n_estimators=200,
    max_depth=30,
    max_features=0.9,
    max_samples=0.9,
    n_jobs=-1,
    random_state=1403,
)

target_names = target_bins.columns
for target in target_names:
    print(f'Target fitting: {target}')
    clf = RandomForestClassifier(**rf_params)
    clf.fit(X, target_bins[target])
    test_proba = clf.predict_proba(test)
    # Column 1 is the second entry of clf.classes_ -- presumably the
    # "defect present" label 1; confirm the labels are 0/1 encoded.
    submission[target] = test_proba[:, 1]

Target fitting: Pastry
Target fitting: Z_Scratch
Target fitting: K_Scatch
Target fitting: Stains
Target fitting: Dirtiness
Target fitting: Bumps
Target fitting: Other_Faults


In [11]:
# Display the submission frame after the per-label columns were assigned.
submission

Unnamed: 0,id,Pastry,Z_Scratch,K_Scatch,Stains,Dirtiness,Bumps,Other_Faults
0,19219,0.410,0.000,0.000000,0.000,0.015,0.215,0.325000
1,19220,0.345,0.005,0.035000,0.000,0.175,0.175,0.298225
2,19221,0.000,0.185,0.025000,0.000,0.005,0.305,0.475000
3,19222,0.205,0.000,0.000000,0.000,0.015,0.430,0.475000
4,19223,0.000,0.000,0.000000,0.000,0.000,0.625,0.335000
...,...,...,...,...,...,...,...,...
12809,32028,0.070,0.080,0.000000,0.000,0.050,0.350,0.285135
12810,32029,0.160,0.005,0.075000,0.065,0.220,0.280,0.440000
12811,32030,0.000,0.000,0.873877,0.000,0.000,0.000,0.060516
12812,32031,0.415,0.240,0.035000,0.000,0.070,0.310,0.360000


In [12]:
# Write the submission frame to 'submission.csv'; index=False omits the
# DataFrame row index from the file.
submission.to_csv('submission.csv', index=False)