In [461]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.cross_validation import StratifiedKFold
from sklearn.cross_validation import cross_val_score
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier

In [508]:
# Load the raw CSV inputs from ./assets. The four reads are independent of
# each other; `weather` comes from the file named weather_clean.csv.
train = pd.read_csv('./assets/train.csv')
test = pd.read_csv('./assets/test.csv')
weather = pd.read_csv('./assets/weather_clean.csv')
spray = pd.read_csv('./assets/spray.csv')

In [419]:
def split_date(df):
    """Expand the ``Date`` column of ``df`` into calendar-part columns.

    The frame is modified in place and also returned: ``year``, ``month``,
    ``day`` and ``week`` columns are added, and ``Date`` itself is replaced
    by plain ``datetime.date`` objects.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``Date`` column that ``pd.to_datetime`` can parse.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in.
    """
    # Parse once and reuse, instead of re-reading the column for every part.
    stamps = pd.to_datetime(df['Date'])
    df['year'] = stamps.dt.year
    df['month'] = stamps.dt.month
    df['day'] = stamps.dt.day
    try:
        iso_week = stamps.dt.isocalendar().week
    except AttributeError:
        # Older pandas has no isocalendar(); fall back to the legacy accessor.
        df['week'] = stamps.dt.week
    else:
        # isocalendar() yields a nullable UInt32 column; convert to a plain
        # numpy dtype (float only when missing dates force NaN).
        plain_dtype = 'float64' if iso_week.isna().any() else 'int64'
        df['week'] = iso_week.astype(plain_dtype)
    df['Date'] = stamps.dt.date
    return df

def trap_bias(df):
    """Return the percentage of positive ``WnvPresent`` rows for each trap.

    For every distinct ``Trap`` the sum of ``WnvPresent`` is divided by the
    number of non-null ``WnvPresent`` values and multiplied by 100.

    Returns a frame with two columns: ``Trap`` and ``Bias``.
    """
    def percent_positive(values):
        return np.sum(values) / float(values.count()) * 100

    per_trap = df[['Trap', 'WnvPresent']].groupby('Trap').agg(percent_positive)
    per_trap = per_trap.reset_index()
    return per_trap.rename(columns={'WnvPresent': 'Bias'})

def misq_leakage(df):
    """Count rows per (``Date``, ``Trap``) pair.

    The count is taken over non-null ``Address`` values and returned in a
    column named ``MCount`` next to the ``Date`` and ``Trap`` key columns.
    """
    keys = ['Date', 'Trap']
    per_visit = df[keys + ['Address']].groupby(keys)[['Address']].count()
    per_visit = per_visit.rename(columns={'Address': 'MCount'})
    return per_visit.reset_index()

def GetDummies(df):
    """Append one indicator column per distinct ``Species`` value.

    The input frame is left untouched; a new frame holding the original
    columns followed by the indicator columns is returned.
    """
    species_flags = pd.get_dummies(df['Species'])
    return pd.concat([df, species_flags], axis=1)

def clean_weather(df):
    """Return a copy of the weather frame without the unused columns.

    The columns listed in ``filter_out`` are removed and ``Date`` is
    converted to plain ``datetime.date`` objects. The caller's frame is NOT
    modified, so the function can be called repeatedly on the same input
    (an in-place drop would raise ``KeyError`` on the second call because
    the columns would already be gone).

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``Date`` and every column named in ``filter_out``.

    Returns
    -------
    pandas.DataFrame
        A new frame; ``df`` itself is left as it was.
    """
    filter_out = ['Heat', 'CodeSum', 'Depth', 'SnowFall', 'StnPressure',
                  'SeaLevel', 'AvgSpeed', 'Sunrise', 'Sunset']
    cleaned = df.drop(filter_out, axis=1)
    cleaned['Date'] = pd.to_datetime(cleaned['Date']).dt.date
    return cleaned

def MergeWeather(df1, df2):
    """Left-join the station-1 weather rows onto ``df1`` by ``Date``.

    Only rows of ``df2`` whose ``Station`` equals 1 are used. Every row of
    ``df1`` is kept; dates without a matching weather row get NaN in the
    weather columns.

    ``left_index=True`` is intentionally not passed: it contradicts ``on=``
    (the join key cannot be both the ``Date`` column and the index) and
    newer pandas releases reject the combination with ``MergeError``.
    """
    station_one = df2[df2['Station'] == 1]
    return df1.merge(station_one, on='Date', how='left')

def MergeBias(df1, df2):
    """Left-join the per-trap frame ``df2`` onto ``df1`` by ``Trap``.

    Every row of ``df1`` is kept; traps missing from ``df2`` get NaN in the
    joined columns.

    ``left_index=True`` is intentionally not passed: it contradicts ``on=``
    and newer pandas releases reject the combination with ``MergeError``.
    """
    return df1.merge(df2, on='Trap', how='left')

def MergeML(df1, df2):
    """Left-join ``df2`` onto ``df1`` using the (``Date``, ``Trap``) pair.

    Every row of ``df1`` is kept; pairs missing from ``df2`` get NaN in the
    joined columns.

    ``left_index=True`` is intentionally not passed: it contradicts ``on=``
    and newer pandas releases reject the combination with ``MergeError``.
    """
    return df1.merge(df2, on=['Date', 'Trap'], how='left')

def drop_train_cols(df):
    """Remove the identifier/text columns from ``df`` in place.

    The frame passed in is itself modified and then returned. All listed
    columns must be present, otherwise pandas raises.
    """
    unused = [
        'Date', 'Address', 'Street', 'Trap',
        'AddressNumberAndStreet', 'AddressAccuracy', 'Species', 'Block',
    ]
    df.drop(labels=unused, axis=1, inplace=True)
    return df

def transform_data(train,test,weather):
    """Build the model-ready train and test frames.

    Both frames go through the same steps: date parts are added, rows per
    (Date, Trap) are counted, species indicator columns are appended, then
    weather, per-trap bias and the row counts are merged in, and finally the
    identifier/text columns are dropped.

    The per-trap bias is computed from ``train`` only (it needs
    ``WnvPresent``) and is merged into both frames, including back into
    ``train`` itself.

    Returns the tuple ``(train, test)`` of transformed frames.
    """
    def assemble(frame, trap_rates, weather_rows):
        # Count rows per visit first, while Date/Trap/Address are still there.
        visit_counts = misq_leakage(frame)
        frame = GetDummies(frame)
        frame = MergeWeather(frame, weather_rows)
        frame = MergeBias(frame, trap_rates)
        frame = MergeML(frame, visit_counts)
        return drop_train_cols(frame)

    train = split_date(train)
    bias = trap_bias(train)
    weather = clean_weather(weather)
    train = assemble(train, bias, weather)

    test = split_date(test)
    test = assemble(test, bias, weather)
    return train, test

In [509]:
# Pass copies: the pipeline edits its inputs in place (date parsing, column
# drops), so handing it the raw frames would make this cell fail or behave
# differently when it is run a second time.
Xtrain, Xtest = transform_data(train.copy(), test.copy(), weather.copy())

In [472]:
# Columns that must not be used as features: the label itself and
# NumMosquitos.
filt_y = ['WnvPresent','NumMosquitos']
Xtr = Xtrain.drop(filt_y, axis=1)
# Take the labels from the same frame as the features so rows stay aligned.
ytr = Xtrain['WnvPresent']
# Later cells predict on `Xtst`, which no visible cell defines. Build it with
# exactly the training feature columns, in the same order.
# NOTE(review): columns absent from the test frame are zero-filled, which is
# only appropriate for indicator columns -- confirm against the test data.
Xtst = Xtest.reindex(columns=Xtr.columns, fill_value=0)

In [424]:
# Logistic regression with default settings; the first number printed is
# accuracy on the training rows themselves.
lr = LogisticRegression()
lr.fit(Xtr, ytr)
print(lr.score(Xtr, ytr))
# Baseline for comparison: accuracy of always predicting the negative class.
# (The original referenced `y`, which no cell defines; the labels are `ytr`.)
print(1 - np.mean(ytr))

0.948220064725
0.947553778793


In [463]:
# Gradient-boosted trees with default hyper-parameters; the printed score is
# accuracy on the training rows themselves.
gb = GradientBoostingClassifier().fit(Xtr, ytr)
print(gb.score(Xtr, ytr))

0.949647820293


In [473]:
# Random forest with default hyper-parameters; the printed score is accuracy
# on the training rows themselves.
rf = RandomForestClassifier().fit(Xtr, ytr)
print(rf.score(Xtr, ytr))
# Class predictions for the test features. `Xtst` is not assigned in any cell
# visible here, so it must already exist in the kernel.
predict = rf.predict(Xtst)

0.976679992385


In [449]:
# Rebinds `predict` to the logistic-regression class predictions for `Xtst`.
# `Xtst` is not assigned in any cell visible here, so it must already exist
# in the kernel.
predict = lr.predict(Xtst)

In [474]:
# Store the predictions directly under the submission column name. Adding a
# 'prediction' column and renaming it afterwards would leave two
# 'WnvPresent' columns whenever this cell is run more than once.
Xtest['WnvPresent'] = predict
# Submission file: the Id and WnvPresent columns, without the index.
Xtest[['Id','WnvPresent']].to_csv('kaggle_answers.csv', index=False)

In [475]:
# Same rename as in the submission cell above, repeated on its own. If Xtest
# has no 'prediction' column at this point, the call leaves the frame
# unchanged.
Xtest.rename(columns={'prediction':'WnvPresent'},inplace=True)

In [476]:
# Writes the Id and WnvPresent columns of Xtest to kaggle_answers.csv without
# the index; this is the same statement as the export in the submission cell
# above and overwrites that file.
Xtest[['Id','WnvPresent']].to_csv('kaggle_answers.csv', index=False)

In [None]:
# Scratch experiment: add one zero-filled column per distinct spray date to
# the first few test rows, then list the row labels.
spray_dates = spray.Date.unique()

# Work on an explicit copy: adding columns to the bare result of head() acts
# on a slice of `test` and triggers pandas' SettingWithCopy warning.
test2 = test.head().copy()
for date in spray_dates:
    test2[date] = 0

for i in test2.index:
    print(i)


In [500]:
# Plain numpy views of the training features and labels for Keras.
xnp, ynp = Xtr.values, ytr.values

In [531]:
from keras.models import Sequential
from keras.layers import Dense
import numpy

# Take the input width from the data instead of hard-coding 29, so the
# network still builds when the number of feature columns changes.
n_features = xnp.shape[1]

# Fully connected binary classifier: three ReLU hidden layers (29, 18 and 9
# units) followed by one sigmoid output unit.
model = Sequential()
model.add(Dense(29, input_dim=n_features, init='uniform', activation='relu'))
model.add(Dense(18, init='uniform', activation='relu'))
model.add(Dense(9, init='uniform', activation='relu'))
model.add(Dense(1, init='uniform', activation='sigmoid'))

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# 100 passes over the training arrays in mini-batches of 10 rows.
model.fit(xnp, ynp, nb_epoch=100, batch_size=10)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

<keras.callbacks.History at 0x145d12950>

In [532]:
# Score the test features with the Keras model and store the sigmoid outputs
# as the submission column. `Xtst` is not assigned in any cell visible here,
# so it must already exist in the kernel.
predict = model.predict(Xtst.values)
Xtest['WnvPresent'] = predict
# Submission file: the Id and WnvPresent columns, without the index.
Xtest.loc[:, ['Id', 'WnvPresent']].to_csv('kaggle_answers.csv', index=False)

In [None]:
from geopy.distance import vincenty
import numpy as np

# For every distinct spray date, add a column to `train` holding the number
# of spray points within `radius_miles` of each training row's coordinates.

# (lat, lon) of every row of `train`; the variable keeps its original name
# even though the points come from the training frame. list() so that len()
# and repeated iteration also work where zip() is lazy.
test_points = list(zip(train.Latitude, train.Longitude))
radius_miles = 35
spray_days = spray.Date.unique()

for k, date in enumerate(spray_days):
    print('On date %d of %d' % (k, len(spray_days)))
    sprayed = spray[spray.Date == date]
    spray_points = list(zip(sprayed.Latitude, sprayed.Longitude))

    # Count per training point directly. The earlier
    # `[[0] * m] * n` matrix repeated ONE inner list n times, so every row
    # was overwritten by the next point and all rows ended up with the
    # count belonging to the last training point.
    dist = []
    for i, x in enumerate(test_points):
        if i % 5000 == 0:
            print('On test point %d' % i)
        nearby = sum(1 for y in spray_points if vincenty(x, y).miles < radius_miles)
        dist.append(nearby)
    train[date] = dist

train.head()