In [1]:
from sklearn.model_selection import train_test_split
import numpy as np
import pandas as pd
import seaborn as sns
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.utils import plot_model

from tqdm import tqdm_notebook
import matplotlib.pyplot as plt

In [2]:
#load & view data

train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")
# 觀看資料集的內容
#print("shape of train", train.shape)
#print("shape of test",test.shape)
#print(train.columns)

#train.head()

In [3]:
#checkout predict data column

#train['class'].describe()

In [4]:
#f, ax = plt.subplots(figsize=(10, 7.5))
#sns.distplot(train['class'], rug=True)

In [5]:
y = train.pop('class')
print(y.shape)

(7160,)


In [6]:
#concat the train and test data & change index (dropped ID & sale price)
alldata = pd.concat((train.loc[:,'appearedTimeOfDay':'cooc_151'],
                     test.loc[:,'appearedTimeOfDay':'cooc_151']))
alldata = alldata.reset_index(drop=True)
print(alldata.shape)

(8951, 182)


In [7]:
#count all missing values of each columns
missing=[]
for col in alldata.columns:
    if alldata[col].isnull().sum() > 0:
        missing.append(col)
        print(col, " : ", alldata[col].isnull().sum())

print("none is missing")

none is missing


In [8]:
#### transfer data to numerical
#print("alldata:",alldata.columns.tolist()) #see all title of coumn


In [9]:
#'appearedHour', 'appearedMinute', combine
alldata['appearedtime']=  alldata['appearedHour']+(alldata['appearedMinute']/60)  
alldata.drop(alldata.iloc[:, 1:3], inplace = True, axis = 1)
print(alldata['appearedtime'][:5])
print(alldata.shape)

0    8.433333
1    2.583333
2    0.083333
3    7.633333
4    1.450000
Name: appearedtime, dtype: float64
(8951, 181)


In [10]:
#elimited city (create too many feature if using one-hot)
#elimited appearedHour & appearedMinute for late night/early morning bias

#alldata.drop(['appearedMinute', 'pokestopIn100m', 'pokestopIn250m', 'pokestopIn500m', 'pokestopIn1000m', 'pokestopIn2500m', 'pokestopIn5000m','gymIn100m', 'gymIn250m', 'gymIn500m', 'gymIn1000m', 'gymIn2500m', 'gymIn5000m'], inplace = True, axis = 1)

#print(alldata.shape)
#print(alldata['city'][:5])

In [11]:
#convert all boolean columns

def bool_num(data):
    for col in alldata.columns:
        if alldata[col].dtype ==bool:
            alldata[col] = alldata[col].astype(int)
    return data
alldata = bool_num(alldata)
print(alldata['urban'][:5])
print(alldata.shape)

0    1
1    1
2    0
3    1
4    0
Name: urban, dtype: int64
(8951, 181)


In [12]:
  # One-hot encoding
#alldata = keras.utils.to_categorical(alldata['city'])
#alldata= keras.utils.to_categorical(alldata['continent'])
alldata = pd.get_dummies(alldata)

print(alldata.shape)

(8951, 297)


In [13]:
# feature scaling - normalization
numerical_alldata = ['appearedHour','temperature', 'windSpeed', 'pressure', 'population_density', 'gymDistanceKm','pokestopDistanceKm']
for col in alldata.columns:
    if col in numerical_alldata:
        alldata[col] =(alldata[col]-alldata[col].min(axis=0))/(alldata[col].max(axis=0)-alldata[col].min(axis=0)+1e-7) 
print(alldata['temperature'][:5])
print(alldata.shape)


0    0.329670
1    0.379121
2    0.692308
3    0.376374
4    0.423077
Name: temperature, dtype: float64
(8951, 297)


In [14]:
#get data back to train vs test
preproced_record_number = 7160
X_train = np.asarray(alldata[:preproced_record_number])
X_test = alldata[preproced_record_number:]
y_train = np.asarray(y)
y_train = np.asarray(pd.get_dummies(y_train)) #one-hot the training data
print(X_train.shape)
print(y_train.shape)

(7160, 297)
(7160, 6)


In [15]:
y_train[:20]

array([[0, 0, 1, 0, 0, 0],
       [0, 1, 0, 0, 0, 0],
       [0, 0, 0, 1, 0, 0],
       [0, 0, 0, 1, 0, 0],
       [0, 0, 1, 0, 0, 0],
       [0, 0, 0, 1, 0, 0],
       [0, 0, 1, 0, 0, 0],
       [0, 0, 0, 0, 0, 1],
       [0, 0, 1, 0, 0, 0],
       [0, 0, 0, 0, 1, 0],
       [0, 0, 1, 0, 0, 0],
       [0, 0, 1, 0, 0, 0],
       [1, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 1, 0],
       [0, 0, 0, 0, 0, 1],
       [0, 0, 1, 0, 0, 0],
       [0, 0, 0, 1, 0, 0],
       [0, 0, 1, 0, 0, 0],
       [1, 0, 0, 0, 0, 0],
       [1, 0, 0, 0, 0, 0]], dtype=uint8)

In [16]:
#build model

from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.callbacks import ModelCheckpoint
from tensorflow.keras.layers import Dense, Dropout,BatchNormalization
from tensorflow.keras import regularizers

   
 # early stop
earlys_stopping = EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience=200)
   
 # model checkpoint
checkpoint_filepath = './best_model.h5'
model_check_point = ModelCheckpoint(checkpoint_filepath, monitor='val_acc', mode='max', verbose=1, save_best_only=False, save_freq='epoch')


In [27]:
model = tf.keras.Sequential()
model.add(Dense(128, activation='elu', input_shape=(297, ), ))
model.add(Dropout(0.45))
model.add(BatchNormalization())
model.add(Dense(64, activation='relu', kernel_regularizer=regularizers.l2(0.02)))
model.add(Dropout(0.45))
model.add(BatchNormalization())
# model.add(Dense(8, activation='relu', kernel_regularizer=regularizers.l2(0.02)))
# model.add(BatchNormalization())
model.add(Dense(6, activation='softmax'))
   
 # compile
model.compile(loss='categorical_crossentropy',
              optimizer=tf.keras.optimizers.Adam(),
              metrics=['accuracy'])
   

 # fit
model_history = model.fit(x=X_train, 
                           y=y_train,
                           batch_size=80, 
                           epochs=1000,
                           validation_split= 0.2,
                           shuffle=True,
                           callbacks=[earlys_stopping, model_check_point]
                           )


Train on 5728 samples, validate on 1432 samples
Epoch 1/1000
Epoch 00001: saving model to ./best_model.h5
Epoch 2/1000
Epoch 00002: saving model to ./best_model.h5
Epoch 3/1000
Epoch 00003: saving model to ./best_model.h5
Epoch 4/1000
Epoch 00004: saving model to ./best_model.h5
Epoch 5/1000
Epoch 00005: saving model to ./best_model.h5
Epoch 6/1000
Epoch 00006: saving model to ./best_model.h5
Epoch 7/1000
Epoch 00007: saving model to ./best_model.h5
Epoch 8/1000
Epoch 00008: saving model to ./best_model.h5
Epoch 9/1000
Epoch 00009: saving model to ./best_model.h5
Epoch 10/1000
Epoch 00010: saving model to ./best_model.h5
Epoch 11/1000
Epoch 00011: saving model to ./best_model.h5
Epoch 12/1000
Epoch 00012: saving model to ./best_model.h5
Epoch 13/1000
Epoch 00013: saving model to ./best_model.h5
Epoch 14/1000
Epoch 00014: saving model to ./best_model.h5
Epoch 15/1000
Epoch 00015: saving model to ./best_model.h5
Epoch 16/1000
Epoch 00016: saving model to ./best_model.h5
Epoch 17/1000
Epo

In [28]:
 best_model = tf.keras.models.load_model(checkpoint_filepath)
 y_predict = best_model.predict_classes(X_test)



In [29]:
test_submission = pd.DataFrame({'Id':test['id'], 'class':  y_predict})
test_submission.to_csv('./test_submission.csv', index = False)