In [None]:
import numpy as np
import csv
import pandas as pd
from xgboost import XGBClassifier
from keras.models import Sequential
from keras.layers.core import Dense, Dropout, Activation
from keras.utils import np_utils
from sklearn import metrics
from sklearn.cross_validation import KFold
from sklearn.preprocessing import StandardScaler
from datetime import date, datetime
from copy import deepcopy
from IPython.display import clear_output


from utils import read_basic_dataset

from features import get_month_virus_share, create_trap_distance_matrix, get_nearest_trap, get_nearest_trap_list

In [None]:
# Number of cross-validation folds handed to train_model further down.
fold_count = 5
# Seed applied to NumPy's global RNG before the training rows are shuffled.
seed = 1337

# Load Basic Data

In [None]:
# read_basic_dataset comes from the project-local `utils` module; its result is
# unpacked into the training features, the training target and the test features.
training_features, training_target, test_features = read_basic_dataset()

In [None]:
# Collect the trap/location columns of both data sets and hand them to
# create_trap_distance_matrix (project-local `features` module).
location_columns = ['Trap', 'Latitude', 'Longitude']
trap_records = pd.concat([
    training_features[location_columns],
    test_features[location_columns],
])
trap_distance_matrix = create_trap_distance_matrix(trap_records)

# Data Preparation

In [None]:

# Attach virus-share statistics (computed from the training data only) to both
# feature frames.
# NOTE: this cell overwrites training_features / test_features, so it is meant to
# be run once per fresh kernel; re-running it would merge the share columns again.
virus_per_trap_species, virus_per_trap, virus_per_species_overall = get_month_virus_share(training_features)


def _species_virus_share(species):
    """Return the overall virus share of `species`, or -1 when the species is unknown."""
    if species in virus_per_species_overall.index:
        return virus_per_species_overall.loc[species]['virusshare_s']
    return -1


training_features = pd.merge(how='left', left=training_features, right=virus_per_trap_species,
                             left_on=['month', 'Trap', 'Species'], right_index=True)
# Plain column copy; a row-wise apply(axis=1) is not needed to duplicate a column.
training_features['virusshare'] = training_features['virusshare_ts']
training_features['species_virus'] = training_features.Species.apply(_species_virus_share)

# The test frame gets both the (month, trap, species) and the (month, trap) shares;
# the following cell decides per row which one to use.
test_features = pd.merge(how='left', left=test_features, right=virus_per_trap_species,
                         left_on=['month', 'Trap', 'Species'], right_index=True)
test_features = pd.merge(how='left', left=test_features, right=virus_per_trap,
                         left_on=['month', 'Trap'], right_index=True)
# Defaults; the next cell overwrites them wherever a share can be found.
test_features['virusshare'] = -1
test_features['location_distance'] = -1
test_features['species_virus'] = test_features.Species.apply(_species_virus_share)

In [None]:
# Resolve `virusshare` / `location_distance` for every test row, in order of preference:
#   1. the share for the row's own (month, trap, species)  -> distance 0
#   2. the share for the row's own (month, trap)           -> distance 0
#   3. the share of the closest other trap that has data for that month
# Rows for which nothing is found keep the defaults already stored in the frame.
for i, row in test_features.iterrows():
    print(i)  # progress indicator; wiped again by clear_output() below
    virusshare = row['virusshare']
    location_distance = row['location_distance']
    if not pd.isnull(row['virusshare_ts']):
        virusshare = row['virusshare_ts']
        location_distance = 0
    elif not pd.isnull(row['virusshare_t']):
        virusshare = row['virusshare_t']
        location_distance = 0
    else:
        station_list = get_nearest_trap_list(trap_distance_matrix, row['Trap'])
        # Keep the candidate with the smallest distance. Previously every later
        # match overwrote the earlier one, so the last trap in the list won
        # regardless of how far away it was. Tracking the minimum explicitly does
        # not depend on the order in which get_nearest_trap_list returns traps.
        best_distance = None
        for j, s in station_list.iterrows():
            if (row['month'], j) not in virus_per_trap.index:
                continue
            if best_distance is None or s['distance'] < best_distance:
                best_distance = s['distance']
                virusshare = virus_per_trap.loc[row['month'], j]['virusshare_t']
                location_distance = best_distance
    test_features.loc[i, 'virusshare'] = virusshare
    test_features.loc[i, 'location_distance'] = location_distance
    clear_output()


In [None]:
# Persist the enriched test frame to disk (presumably so the slow row-by-row
# cell above does not have to be repeated -- confirm).
test_features.to_pickle('test_features_virus.pickle')

In [None]:
# Reload the enriched test frame from the pickle written by the previous cell.
# NOTE: only unpickle files you created yourself; unpickling untrusted files can execute code.
test_features = pd.read_pickle('test_features_virus.pickle')

In [None]:
# Columns fed to the models; the same selection is applied to training and test data.
keep_features = ['month', 'week', 'Latitude', 'Longitude', 
                 'Tmax', 'Tmin', 'Tavg', 'DewPoint', 'WetBulb', 'StnPressure',
                 'virusshare', 'species_virus']
training_features_input = training_features[keep_features]
test_features_input = test_features[keep_features]

In [None]:
# Shuffle the training rows reproducibly, applying one permutation to features and target.
np.random.seed(seed)
shuffle = np.arange(len(training_features))
np.random.shuffle(shuffle)
training_target_input = training_target.iloc[shuffle]
# Always start from the unshuffled frame. Re-indexing training_features_input with
# itself made this cell non-idempotent: running it a second time permuted the
# features twice but the target only once, silently misaligning rows and labels.
training_features_input = training_features[keep_features].iloc[shuffle]



In [None]:
# Standardise the features: the scaler is fitted on the training rows only and
# then applied to both sets (missing test values are filled with 0 beforehand).
scaler = StandardScaler().fit(training_features_input)
training_feature_array = scaler.transform(training_features_input)
test_feature_array = scaler.transform(test_features_input.fillna(0))
training_target_array = np.asarray(training_target_input)



# Model Definition

In [None]:
def train_model(fold_cnt, feature_array, target_array, model_generator, fitting_function):
    """Cross-validate a model and return the fitted model of every fold.

    Parameters
    ----------
    fold_cnt : int
        Number of cross-validation folds.
    feature_array : array
        Feature rows; indexed with the row-index arrays of each fold.
    target_array : array
        Class labels aligned with ``feature_array``.
    model_generator : callable
        Zero-argument factory returning a fresh, unfitted model. The model must
        offer ``predict_proba``; column 1 of its output is scored with ROC AUC.
    fitting_function : callable
        Called as ``fitting_function(model, data)`` where ``data`` is the tuple
        ``(X_train, Y_train, y_train, X_valid, Y_valid, y_valid)``. The
        upper-case ``Y_*`` entries are the labels passed through
        ``np_utils.to_categorical``; the lower-case ``y_*`` entries are the
        labels as given.

    Returns
    -------
    list
        The fitted models, one per fold, in fold order.
    """
    # Honour the fold_cnt argument. The global `fold_count` used to be read here
    # and in the averages below, so a caller asking for a different number of
    # folds was silently ignored.
    # KFold is the legacy sklearn.cross_validation class imported at the top of
    # the notebook: it is built from (n_samples, n_folds) and iterated directly.
    folds = KFold(len(target_array), fold_cnt)
    auroc_train_sum = 0.
    auroc_valid_sum = 0.
    target_array_categorical = np_utils.to_categorical(target_array)
    trained_models = list()
    for i, (train, valid) in enumerate(folds):
        print('---'*20)
        print('Fold', i)
        print('---'*20)
        X_train = feature_array[train]
        X_valid = feature_array[valid]
        Y_train = target_array_categorical[train]
        y_train = target_array[train]
        Y_valid = target_array_categorical[valid]
        y_valid = target_array[valid]

        print("Building model...")
        foldmodel = model_generator()
        print("Training model...")
        train_and_valid_data = (X_train, Y_train, y_train, X_valid, Y_valid, y_valid)
        fitting_function(foldmodel, train_and_valid_data)
        trained_models.append(foldmodel)

        # Score both splits on the probability reported in column 1.
        valid_preds = foldmodel.predict_proba(X_valid)
        training_preds = foldmodel.predict_proba(X_train)
        roc_valid = metrics.roc_auc_score(y_valid, valid_preds[:, 1])
        roc_train = metrics.roc_auc_score(y_train, training_preds[:, 1])
        print("ROC: {} training, {} validation".format(roc_train, roc_valid))
        auroc_train_sum += roc_train
        auroc_valid_sum += roc_valid

    print('Average ROC:', auroc_train_sum/fold_cnt, auroc_valid_sum/fold_cnt)
    return trained_models

    

## XGB

In [None]:
#xgb = XGBClassifier()
#xgb.fit(training_feature_array, training_target_array.ravel())
#xgb.predict_proba(np.array(test_feature_array))

def xgb_model_builder(xgb_model_dict=None):
    """Create an XGBClassifier from an optional dict of keyword arguments.

    A missing or empty ``xgb_model_dict`` yields a classifier with default settings.
    """
    keyword_arguments = xgb_model_dict if xgb_model_dict else dict()
    return XGBClassifier(**keyword_arguments)

def xgb_fitting_function(xgb_model, tvd):
    """Fit ``xgb_model`` on the training part of a train/valid data tuple.

    Only position 0 (training features) and position 2 (training labels, which
    are flattened with ``ravel()``) of ``tvd`` are read.
    """
    features = tvd[0]
    flat_labels = tvd[2].ravel()
    xgb_model.fit(features, flat_labels)
    
# Cross-validate XGBoost. train_model returns the list of per-fold fitted models,
# so despite its name `tvd` holds models here, not a train/valid data tuple.
tvd = train_model(fold_count, training_feature_array, training_target_array, xgb_model_builder, xgb_fitting_function)

In [None]:
# Refit a fresh XGBoost model on the complete training set. The fitting helper
# only reads positions 0 and 2 of the tuple, hence the None in the middle slot.
xgb = xgb_model_builder()
xgb_fitting_function(xgb, (training_feature_array,None, training_target_array))
# predict_proba output for the test rows (NaNs replaced via np.nan_to_num first).
preds = xgb.predict_proba(np.nan_to_num(test_feature_array))
preds

## Dense Neural Network

In [None]:
# Settings consumed by build_model: compile options plus one entry per hidden layer.
model_dict = {
    'loss': 'categorical_crossentropy',
    'optimizer': 'adadelta',
    # Each entry: Dense width ('nodecount'), activation name and dropout rate.
    'layers': [{'nodecount': 20, 'activation': 'relu', 'dropout': 0.5},
               {'nodecount': 10, 'activation': 'relu', 'dropout': 0.25},
               {'nodecount': 5, 'activation': 'relu', 'dropout': 0.125}],
    # NOTE(review): build_model reads 'dimension_output' (filled in by a later
    # cell), not this key -- 'dimension_out' looks unused; confirm before removing.
    'dimension_out': 2
}

In [None]:
def build_model(model_dict):
    """Assemble and compile a feed-forward Keras network described by ``model_dict``.

    Keys read from ``model_dict``:
      * 'dimension_input'  -- input width of the first Dense layer
      * 'layers'           -- list of dicts with 'nodecount', 'activation', 'dropout';
                              each yields a Dense -> Activation -> Dropout stack
      * 'dimension_output' -- width of the final Dense layer (followed by softmax)
      * 'loss', 'optimizer' -- forwarded to ``compile``

    Returns the compiled ``Sequential`` model.
    """
    network = Sequential()
    previous_width = model_dict['dimension_input']
    for spec in model_dict['layers']:
        width = spec['nodecount']
        network.add(Dense(width, input_dim=previous_width))
        network.add(Activation(spec['activation']))
        network.add(Dropout(spec['dropout']))
        previous_width = width

    # Output head: one unit per class, normalised with softmax.
    network.add(Dense(model_dict['dimension_output']))
    network.add(Activation('softmax'))

    network.compile(loss=model_dict['loss'], optimizer=model_dict['optimizer'])
    return network

# Derive the network's input/output widths from the data: one input per feature
# column, one output per distinct target value.
model_dict['dimension_input'] = training_feature_array.shape[1]
model_dict['dimension_output'] = len(np.unique(training_target_array))

def keras_model_builder():
    """Zero-argument model factory: builds a new network from the module-level ``model_dict``."""
    return build_model(model_dict)
def keras_fit(keras_model, tvd):
    """Fit ``keras_model`` for 50 epochs (batch size 32) on a train/valid data tuple.

    Positions 0 and 1 of ``tvd`` are the training inputs and targets; positions
    3 and 4 are passed to ``fit`` as ``validation_data``.
    """
    train_inputs, train_targets = tvd[0], tvd[1]
    validation_pair = (tvd[3], tvd[4])
    keras_model.fit(
        train_inputs,
        train_targets,
        epochs=50,
        batch_size=32,
        validation_data=validation_pair,
        verbose=1,
    )

# NOTE(review): this standalone network is built but not trained in this cell;
# train_model creates its own model per fold through keras_model_builder.
fnn = keras_model_builder()
# Cross-validate the network. train_model returns the list of per-fold fitted
# models, so `tvd` is rebound here to the trained Keras models.
tvd = train_model(fold_count, training_feature_array, training_target_array, keras_model_builder, keras_fit)

In [None]:
# Score the test set with every per-fold model in `tvd` (column 1 of
# predict_proba) and average the results element-wise across the models.
prediction_array = []
for fold_model in tvd:
    fold_probabilities = fold_model.predict_proba(np.nan_to_num(test_feature_array))
    prediction_array.append(fold_probabilities[:, 1])
bagged_prediction = np.array(prediction_array).mean(axis=0)

In [None]:
# `preds` is the raw predict_proba output, which has one column per class.
# A single DataFrame column needs a 1-D vector, so keep only column 1 -- the same
# column the rest of the notebook scores as the positive-class probability.
test_features['WnvPresent'] = preds[:, 1]

In [None]:
# Write the submission file: one row per test record with its Id and predicted
# WnvPresent value, without the DataFrame index.
export_df = test_features[['Id', 'WnvPresent']]
export_df.to_csv('submission_08.csv', index=False , quotechar='"')