In [None]:
import numpy as np
import csv
import pandas as pd
from xgboost import XGBClassifier
from keras.models import Sequential
from keras.layers.core import Dense, Dropout, Activation
from keras.utils import np_utils
from sklearn import metrics
from sklearn.cross_validation import KFold
from sklearn.preprocessing import StandardScaler
from datetime import date, datetime
from copy import deepcopy
from IPython.display import clear_output


from utils import read_basic_dataset

from features import get_month_virus_share, create_trap_distance_matrix, get_nearest_trap, get_nearest_trap_list, \
    add_multirows

In [None]:
# Number of cross-validation folds; one model is trained per fold.
fold_count = 5
# Seed for NumPy's global RNG (row shuffle) and for the default XGBoost parameters.
seed = 1337
# Value passed as `verbose` to the Keras `fit` call.
train_verbose = 0



# Load Basic Data

In [None]:
# Load the base data through the project helper; its result is unpacked as
# (training features, training target, test features).
training_features, training_target, test_features = read_basic_dataset()

In [None]:
# Collect the trap id and coordinates from both data sets and pass the
# combined records to the project helper that builds the distance matrix.
trap_location_columns = ['Trap', 'Latitude', 'Longitude']
trap_records = pd.concat([
    training_features[trap_location_columns],
    test_features[trap_location_columns],
])
trap_distance_matrix = create_trap_distance_matrix(trap_records)
#trap_distance_matrix.info()

# Data Preparation

In [None]:
# Run both feature frames through the project helper `add_multirows`.
# NOTE(review): each name is rebound to the helper's result, so re-running this
# cell applies the helper a second time — confirm whether that is harmless.
training_features = add_multirows(training_features)
test_features = add_multirows(test_features)

In [None]:
# Columns meant to survive as model inputs. This list is informational only:
# the selection below is performed by dropping `drop_features`.
# NOTE(review): '10dtmax_max' is assumed for the first 10-day entry (an empty
# string is not a usable column name) because it is one of the 10-day columns
# left un-dropped in `drop_features`; '10dtavg_min' is also left un-dropped but
# is not listed here — confirm the intended set.
keep_features = ['week', 'Latitude', 'Longitude',
                 'Tmax', 'Tmin', 'Tavg', 'DewPoint', 'StnPressure', 'PrecipTotal',
                 '10dtmax_max', '10dtmin_min', '10dtavg_avg', '10dpcp_tot', '10ddwp_avg', '10dprs_avg']

# Columns removed from both frames before modelling. The commented-out entries
# are the 10-day aggregates that are deliberately kept.
drop_features = ['AddressAccuracy','AddressNumberAndStreet','Address', 'Block', 'Date', 
                 'Heat', 'Cool', 'Sunrise', 'Sunset','Depth','Water1','SeaLevel', 'SnowFall', 'CodeSum', 
                 'Depart', 'WetBulb', 'ResultSpeed', 'ResultDir', 'AvgSpeed', 
                 'month', 'Species', 'station','Street', 'Trap', 'Station', 'ddate', 
                 #'10dtmax_max',
                 '10dtmax_min', 
                 '10dtmax_avg', 
                 '10dtavg_max', 
                 #'10dtavg_avg', 
                 #'10dtavg_min', 
                 '10dtmin_max', 
                 '10dtmin_avg',
                 #'10dtmin_min', 
                 #'10dpcp_tot', 
                 #'10ddwp_avg', 
                 #'10dprs_avg'
                ]

# The training frame additionally loses the two outcome columns; the test
# frame loses its row identifier.
training_features_input = training_features.drop(drop_features + ['NumMosquitos', 'WnvPresent'], axis=1)
test_features_input = test_features.drop(drop_features +['Id'], axis=1)


In [None]:
# Shuffle the training rows reproducibly: seed NumPy's global RNG, draw one
# permutation of the row positions and apply it to features and target alike
# so the two stay aligned.
# NOTE(review): the features are reordered in place of themselves while the
# target is always taken from `training_target`, so running this cell twice
# would leave features and target misaligned.
np.random.seed(seed)
shuffle = np.random.permutation(len(training_features_input))
training_features_input = training_features_input.iloc[shuffle]
training_target_input = training_target.iloc[shuffle]



In [None]:
# Standardise the inputs. The scaler is fitted on the training features only
# and the same fitted scaler is then applied to the test features.
scaler = StandardScaler()
training_feature_array = scaler.fit_transform(training_features_input)
training_target_array = np.asarray(training_target_input)
# Missing test values are replaced by 0 before the scaler is applied.
test_feature_array = scaler.transform(test_features_input.fillna(0))



# Model Definition

In [None]:
def train_model(fold_cnt, feature_array, target_array, model_generator, fitting_function):
    """Train one model per cross-validation fold and print the mean ROC AUC.

    Parameters
    ----------
    fold_cnt : int
        Number of folds to split the rows into.
    feature_array : array
        Feature rows; indexed with the per-fold index arrays.
    target_array : array
        Class labels aligned row-by-row with ``feature_array``.
    model_generator : callable
        Zero-argument factory returning a fresh, unfitted model. The model
        must expose ``predict_proba``.
    fitting_function : callable
        Called as ``fitting_function(model, data)`` where ``data`` is the tuple
        ``(X_train, Y_train, y_train, X_valid, Y_valid, y_valid)``. ``Y_*`` are
        the one-hot encoded targets, ``y_*`` the raw labels.

    Returns
    -------
    list
        The fitted models, one per fold, in fold order.
    """
    # Split according to the caller's fold_cnt argument rather than the
    # notebook-level fold_count, so the parameter actually takes effect.
    folds = KFold(len(target_array), fold_cnt)
    mean_auroc_valid = 0.
    mean_auroc_train = 0.
    target_array_categorical = np_utils.to_categorical(target_array)
    trained_models = list()
    for i, (train, valid) in enumerate(folds):
        print('Fold', i)
        X_train = feature_array[train]
        X_valid = feature_array[valid]
        Y_train = target_array_categorical[train]
        y_train = target_array[train]
        Y_valid = target_array_categorical[valid]
        y_valid = target_array[valid]
        foldmodel = model_generator()
        train_and_valid_data = (X_train, Y_train, y_train, X_valid, Y_valid, y_valid)
        fitting_function(foldmodel, train_and_valid_data)
        trained_models.append(foldmodel)
        # Column 1 of predict_proba is used as the score for the ROC AUC.
        valid_preds = foldmodel.predict_proba(X_valid)
        training_preds = foldmodel.predict_proba(X_train)
        roc_valid = metrics.roc_auc_score(y_valid, valid_preds[:, 1])
        roc_train = metrics.roc_auc_score(y_train, training_preds[:, 1])
        #print("ROC: {} training, {} validation".format(roc_train, roc_valid))
        mean_auroc_train += roc_train
        mean_auroc_valid += roc_valid

    # Average over the same fold count that was used for the split.
    print('Average ROC:', mean_auroc_train/fold_cnt, mean_auroc_valid/fold_cnt)
    return trained_models

    

## XGB

In [None]:
#xgb = XGBClassifier()
#xgb.fit(training_feature_array, training_target_array.ravel())
#xgb.predict_proba(np.array(test_feature_array))

def xgb_model_builder(xgb_model_dict=None):
    """Create an unfitted ``XGBClassifier``.

    ``xgb_model_dict`` holds keyword arguments for the classifier. When it is
    ``None`` or empty, the notebook defaults are used instead: 200 estimators,
    depth 4, ``reg_alpha`` 0.01 and the notebook-level ``seed``.
    """
    if xgb_model_dict:
        return XGBClassifier(**xgb_model_dict)
    return XGBClassifier(n_estimators=200, max_depth=4, reg_alpha=0.01, seed=seed)

def xgb_fitting_function(xgb_model, tvd):
    """Fit ``xgb_model`` on the training part of ``tvd``.

    ``tvd`` is read positionally: element 0 supplies the training features and
    element 2, flattened with ``ravel()``, supplies the training labels. The
    remaining elements are not used.
    """
    train_features = tvd[0]
    train_labels = tvd[2].ravel()
    xgb_model.fit(train_features, train_labels)
    
# Train one XGBoost model per fold; `xgbms` is the list of fitted fold models.
xgbms = train_model(fold_count, training_feature_array, training_target_array, xgb_model_builder, xgb_fitting_function)

In [None]:
# Score the test set with every fold model (column 1 of predict_proba) and
# average the per-model scores into a single bagged prediction.
xgb_test_input = np.nan_to_num(test_feature_array)
xgb_prediction_array = []
for fold_model in xgbms:
    xgb_prediction_array.append(fold_model.predict_proba(xgb_test_input)[:, 1])
bagged_xgb_prediction = np.mean(np.array(xgb_prediction_array), axis=0)

## Dense Neural Network

In [None]:
# Settings consumed by `build_model`.
#   'loss' / 'optimizer' : forwarded to the model's compile step
#   'layers'             : hidden layers in order (units, activation, dropout rate)
#   'dimension_output'   : units of the final layer
# 'dimension_input' is not part of this literal; it is filled in from the
# training array before any model is built.
model_dict = {
    'loss': 'categorical_crossentropy',
    'optimizer': 'adadelta',
    'layers': [{'nodecount': 20, 'activation': 'relu', 'dropout': 0.5},
               {'nodecount': 10, 'activation': 'relu', 'dropout': 0.25},
               {'nodecount': 5, 'activation': 'relu', 'dropout': 0.125}],
    # Key name matches the one `build_model` reads.
    'dimension_output': 2
}

In [None]:
def build_model(model_dict):
    """Assemble and compile a feed-forward Keras classifier from ``model_dict``.

    Keys read from ``model_dict``:
      * ``'dimension_input'``  - input width given to the first Dense layer
      * ``'layers'``           - sequence of dicts with ``'nodecount'``,
                                 ``'activation'`` and ``'dropout'``
      * ``'dimension_output'`` - units of the final softmax layer
      * ``'loss'``, ``'optimizer'`` - forwarded to ``compile``

    Returns the compiled ``Sequential`` model.
    """
    network = Sequential()
    previous_width = model_dict['dimension_input']
    for spec in model_dict['layers']:
        width = spec['nodecount']
        # Each hidden layer is Dense -> activation -> dropout.
        network.add(Dense(width, input_dim=previous_width))
        network.add(Activation(spec['activation']))
        network.add(Dropout(spec['dropout']))
        previous_width = width

    # Output head: one unit per class followed by softmax.
    network.add(Dense(model_dict['dimension_output']))
    network.add(Activation('softmax'))

    network.compile(optimizer=model_dict['optimizer'], loss=model_dict['loss'])
    return network

# Derive the network sizes from the prepared training arrays: one input per
# feature column and one output unit per distinct target value.
model_dict.update(
    dimension_input=training_feature_array.shape[1],
    dimension_output=len(np.unique(training_target_array))
)

def keras_model_builder():
    """Zero-argument factory: build a fresh network from the notebook-level ``model_dict``."""
    return build_model(model_dict)
def keras_fit(keras_model, tvd):
    """Fit ``keras_model`` for 50 epochs with batch size 32.

    ``tvd`` is read positionally: elements 0 and 1 are the training inputs and
    targets, elements 3 and 4 are passed together as ``validation_data``.
    Verbosity comes from the notebook-level ``train_verbose``.
    """
    train_inputs, train_targets = tvd[0], tvd[1]
    validation_pair = (tvd[3], tvd[4])
    keras_model.fit(
        train_inputs,
        train_targets,
        epochs=50,
        batch_size=32,
        validation_data=validation_pair,
        verbose=train_verbose
    )

# NOTE(review): this standalone model is not handed to train_model below, which
# builds its own model per fold through keras_model_builder — confirm whether
# `fnn` is still needed.
fnn = keras_model_builder()
# Train one dense network per fold; `fnnms` is the list of fitted fold models.
fnnms = train_model(fold_count, training_feature_array, training_target_array, keras_model_builder, keras_fit)
#keras_fit(fnn, )
#FNN = train_model(4, train_init_array, train_target_id, keras_model_builder, keras_fit)

In [None]:
# Score the test set with every fold network (column 1 of predict_proba) and
# average the per-model scores into a single bagged prediction.
fnn_test_input = np.nan_to_num(test_feature_array)
fnn_prediction_array = []
for fold_network in fnnms:
    fnn_prediction_array.append(fold_network.predict_proba(fnn_test_input)[:, 1])
bagged_fnn_prediction = np.mean(np.array(fnn_prediction_array), axis=0)

In [None]:
# Write the submission file: the row Id plus the bagged XGBoost score.
# The dense-network predictions are not part of this export.
test_features['WnvPresent'] = bagged_xgb_prediction
submission_columns = ['Id', 'WnvPresent']
export_df = test_features[submission_columns]
export_df.to_csv('multirow_only.csv', quotechar='"', index=False)