In [2]:
import numpy as np
import csv
import pandas as pd
from keras.models import Sequential
from keras.layers.core import Dense, Dropout, Activation
from keras.utils import np_utils
from sklearn import metrics
from sklearn.cross_validation import KFold
from sklearn.preprocessing import StandardScaler
from datetime import date, datetime

Using TensorFlow backend.


In [3]:
# Number of cross-validation folds and the seed handed to numpy's RNG.
fold_count = 4
seed = 1337

# Network description consumed by build_model(): the loss/optimizer names are
# passed to compile(), and each entry of 'layers' becomes one
# Dense -> Activation -> Dropout group.
# 'dimension_out' is kept as in the original configuration; build_model()
# itself looks up 'dimension_output', which is filled in after the data has
# been prepared.
model_dict = dict(
    loss='categorical_crossentropy',
    optimizer='adadelta',
    layers=[{'nodecount': 32, 'activation': 'relu', 'dropout': 0.5}
            for _ in range(2)],
    dimension_out=2,
)

In [4]:
# Trap records: the labelled training set and the unlabelled test set.
train_df = pd.read_csv('west_nile/input/train.csv')
test_df = pd.read_csv('west_nile/input/test.csv')

# Spraying records (loaded here; not referenced by the cells shown below).
spray_df = pd.read_csv('west_nile/input/spray.csv')

# Weather readings, restricted to the rows reported by station 2.
weather_df = pd.read_csv('west_nile/input/weather.csv')
weather_df = weather_df.loc[weather_df['Station'] == 2]

# Non-null counts of every remaining column per (Date, Species, Trap) group.
cnt_train_df = train_df.groupby(['Date', 'Species', 'Trap']).count()

In [5]:
# Attach the station weather readings to every trap record by calendar date.
# The inner join keeps only trap records whose Date also occurs in weather_df.
train_with_weather_df = pd.merge(left=train_df, right=weather_df, how='inner', on='Date')

# Parse the 'YYYY-MM-DD' strings once, vectorised, instead of calling
# datetime.strptime twice per row through DataFrame.apply(axis=1).
record_dates = pd.to_datetime(train_with_weather_df['Date'], format='%Y-%m-%d')
train_with_weather_df['month'] = record_dates.dt.month
# Coarse position in the year: four units per month plus day-of-month / 7.
# NOTE(review): `/` is true division under Python 3, so 'week' is fractional
# (e.g. day 10 adds ~1.43). Kept as-is so the feature values are unchanged;
# confirm whether an integer week (`//`) was intended.
train_with_weather_df['week'] = train_with_weather_df['month'] * 4 + record_dates.dt.day / 7

# Label column (kept as a one-column DataFrame) and the model's input features.
train_target_df = train_with_weather_df[['WnvPresent']]
train_init_df = train_with_weather_df[['month', 'week', 'Latitude', 'Longitude', 'Tmax', 'Tmin',
                                       'Tavg', 'DewPoint', 'WetBulb', 'StnPressure']]

In [6]:
# Draw one seeded random ordering of the row positions and apply the same
# ordering to the labels and the features so they stay aligned.
np.random.seed(seed)
shuffle = np.random.permutation(len(train_with_weather_df))
train_target_df, train_init_df = (
    frame.iloc[shuffle] for frame in (train_target_df, train_init_df)
)



In [7]:
# Fit the scaler on the training features and standardise them in one step;
# the fitted `scaler` is kept so later data can be transformed the same way.
scaler = StandardScaler()
train_init_array = scaler.fit_transform(train_init_df)

# Labels in two forms: the raw single-column array (used for ROC scoring) and
# its categorical encoding (used as the network's training target).
train_target_id = train_target_df.values
train_target_array = np_utils.to_categorical(train_target_id)

In [8]:
def build_model(model_dict):
    """Assemble and compile a feed-forward Keras classifier from a config dict.

    Keys read from ``model_dict``:
        'dimension_input'  -- ``input_dim`` passed to the first Dense layer.
        'layers'           -- iterable of dicts with 'nodecount', 'activation'
                              and 'dropout'; each one adds a
                              Dense -> Activation -> Dropout group.
        'dimension_output' -- width of the final Dense layer, which is
                              followed by a softmax activation.
        'loss', 'optimizer' -- forwarded to ``compile``.

    Returns the compiled ``Sequential`` model. A missing key raises KeyError.
    """
    net = Sequential()
    fan_in = model_dict['dimension_input']

    for spec in model_dict['layers']:
        units = spec['nodecount']
        group = (
            Dense(units, input_dim=fan_in),
            Activation(spec['activation']),
            Dropout(spec['dropout']),
        )
        for piece in group:
            net.add(piece)
        # The next Dense layer is told the width of the one just added.
        fan_in = units

    # Output head: one unit per class, normalised with softmax.
    for piece in (Dense(model_dict['dimension_output']), Activation('softmax')):
        net.add(piece)

    net.compile(loss=model_dict['loss'], optimizer=model_dict['optimizer'])
    return net

# Take the network's input/output widths from the prepared arrays (number of
# feature columns and number of label columns), then build a first model.
model_dict.update(
    dimension_input=train_init_array.shape[1],
    dimension_output=train_target_array.shape[1],
)
model = build_model(model_dict)

In [10]:
# K-fold cross-validation: train a fresh network per fold and score it with
# ROC AUC on the held-out rows.
folds = KFold(len(train_target_id), fold_count)
fold_scores = []
banner = '---'*20

for i, (train, valid) in enumerate(folds):
    print(banner)
    print('Fold', i)
    print(banner)

    # `train` / `valid` are row-index arrays into the prepared data.
    X_train, X_valid = train_init_array[train], train_init_array[valid]
    Y_train, Y_valid = train_target_array[train], train_target_array[valid]
    y_valid = train_target_id[valid]

    print("Building model...")
    # Rebuilt every fold so no weights carry over; after the loop `model`
    # refers to the network trained on the last fold's training split.
    model = build_model(model_dict)

    print("Training model...")

    model.fit(X_train, Y_train, epochs=100, batch_size=16, validation_data=(X_valid, Y_valid), verbose=0)

    # Column 1 of the prediction matrix is scored against the raw labels.
    valid_preds = model.predict_proba(X_valid, verbose=0)
    valid_preds_transform = valid_preds[:, 1]
    roc = metrics.roc_auc_score(y_valid, valid_preds_transform)
    print("ROC:", roc)
    fold_scores.append(roc)

# Running total of the per-fold scores; divided by the fold count when printed.
mean_auroc = sum(fold_scores, 0.)
print('Average ROC:', mean_auroc/fold_count)

------------------------------------------------------------
Fold 0
------------------------------------------------------------
Building model...
Training model...




ROC: 0.790618955513
------------------------------------------------------------
Fold 1
------------------------------------------------------------
Building model...
Training model...
ROC: 0.793178973717
------------------------------------------------------------
Fold 2
------------------------------------------------------------
Building model...
Training model...
ROC: 0.795925527496
------------------------------------------------------------
Fold 3
------------------------------------------------------------
Building model...
Training model...
ROC: 0.762509666057
Average ROC: 0.785558280696


In [None]:
# Score the test set and write the submission file.
#
# The previous version of this cell called process_line(), preprocess_data(),
# weather_dic and weather_indexes, none of which are defined in the visible
# cells of this notebook, so it could not run on a fresh kernel. The test
# features are instead rebuilt with the same steps used for the training
# features: join the station weather by Date, derive month / week, select the
# same columns in the same order, and reuse the already-fitted `scaler`.
feature_columns = ['month', 'week', 'Latitude', 'Longitude', 'Tmax', 'Tmin',
                   'Tavg', 'DewPoint', 'WetBulb', 'StnPressure']

# A left join keeps one output row per test record, in file order, so every
# Id receives a prediction.
test_with_weather_df = pd.merge(left=test_df, right=weather_df, how='left', on='Date')
assert len(test_with_weather_df) == len(test_df), \
    "weather_df has more than one row for some Date; predictions would not line up with Ids"

test_dates = pd.to_datetime(test_with_weather_df['Date'], format='%Y-%m-%d')
test_with_weather_df['month'] = test_dates.dt.month
# Same formula (true division included) as the training 'week' feature.
test_with_weather_df['week'] = test_with_weather_df['month'] * 4 + test_dates.dt.day / 7

# The first column of test.csv is the record id (the original read line[0]).
ids = test_df.iloc[:, 0].tolist()
X_test = scaler.transform(test_with_weather_df[feature_columns])

# `model` is whatever the cross-validation loop left bound: the network
# trained on the last fold's training split.
preds = model.predict_proba(X_test, verbose=0)

# `with` closes the file even on error; newline="" lets the csv writer control
# line endings itself.
with open("keras-nn.csv", "w", newline="") as submission_file:
    fo = csv.writer(submission_file, lineterminator="\n")
    fo.writerow(["Id","WnvPresent"])
    fo.writerows(zip(ids, preds[:, 1]))