In [36]:
import numpy as np
import pandas as pd
from keras.models import Sequential
from keras.layers import Dense
from keras.wrappers.scikit_learn import KerasClassifier
from keras.utils import np_utils
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.preprocessing import LabelEncoder
from sklearn.pipeline import Pipeline
import os

In [37]:
# to make this notebook's output stable across runs
RANDOM_SEED = 42;
np.random.seed(RANDOM_SEED)

In [38]:
# To plot pretty figures
%matplotlib inline
import matplotlib
import matplotlib.pyplot as plt
plt.rcParams['axes.labelsize'] = 14
plt.rcParams['xtick.labelsize'] = 12
plt.rcParams['ytick.labelsize'] = 12

In [39]:
# Where to save the figures
PROJECT_ROOT_DIR = "."
DATA_PATH = os.path.join(PROJECT_ROOT_DIR, "data")
IMAGES_PATH = os.path.join(PROJECT_ROOT_DIR, "images")

In [40]:
def load_data(file,data_path=DATA_PATH, sep=','):
    csv_path = os.path.join(data_path, file)
    return pd.read_csv(csv_path, sep)

In [41]:
TX_data = load_data(file = "input_train.csv");
STA_data = load_data(file = "challenge_output_data_training_file_prediction_of_transaction_claims_status.csv", sep=';');

In [42]:
TX_data = pd.merge(TX_data, STA_data, left_index=True, right_index=True)
TX_data.drop(["ID_y","ID_x"],inplace=True,axis=1)

In [43]:
del STA_data

In [44]:
Y = TX_data["CLAIM_TYPE"]
X = TX_data.drop("CLAIM_TYPE", axis=1,inplace=False)

In [45]:
def splitter(data, list_col):
    for colname in list_col:
        toto= data[colname].str.split('<', 1, expand=True)
        data[colname+"_MIN"] = toto[0]
        data[colname+"_MAX"] = toto[1]
        data[colname+"_MIN"]=data[colname+"_MIN"].str.replace('>',"")
        data.drop(colname, axis=1, inplace=True)

In [46]:
list_col_split=["WARRANTIES_PRICE",'SELLER_SCORE_COUNT','ITEM_PRICE','PURCHASE_COUNT','SHIPPING_PRICE']

In [47]:
splitter(X,list_col_split)

In [48]:
X=X.apply(pd.to_numeric, errors='ignore')

In [49]:
X=X.select_dtypes(exclude=['object'])

In [50]:
# encode class values as integers
encoder = LabelEncoder()
encoder.fit(Y)
encoded_Y = encoder.transform(Y)
# convert integers to dummy variables (i.e. one hot encoded)
dummy_y = np_utils.to_categorical(encoded_Y)

In [55]:
# define baseline model
def baseline_model():
    # create model
    model = Sequential()
    model.add(Dense(20, input_dim=20, kernel_initializer='normal', activation='relu'))
    model.add(Dense(8, kernel_initializer='normal', activation='sigmoid'))
    # Compile model
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model

In [56]:
estimator = KerasClassifier(build_fn=baseline_model, epochs=200, batch_size=5)#, verbose=0)

In [57]:
kfold = KFold(n_splits=10, shuffle=True, random_state=RANDOM_SEED)

In [58]:
results = cross_val_score(estimator, X, dummy_y, cv=kfold)
print("Baseline: %.2f%% (%.2f%%)" % (results.mean()*100, results.std()*100))

Epoch 1/200
Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 8/200
Epoch 9/200
Epoch 10/200
Epoch 11/200
Epoch 12/200
Epoch 13/200
 3850/90000 [>.............................] - ETA: 27s - loss: nan - acc: 0.5065

KeyboardInterrupt: 