In [139]:
import pandas as pd
import numpy as np
from pandasql import sqldf
import boto3
import json
import os
from keras.callbacks import EarlyStopping, ModelCheckpoint, TensorBoard
from keras import models, layers
from sklearn.model_selection import train_test_split
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

# from sklearn.preprocessing import Normalizer
# %run '../extra_fns.ipynb'
# from sklearn.cluster import KMeans
# import matplotlib.pyplot as plt
# import seaborn as sns
# import xgboost
# from sklearn.metrics import mean_absolute_error as mae
# from sklearn.model_selection import GridSearchCV

In [9]:
with open('../config.json') as json_data:
    config = json.load(json_data)

In [10]:
s3 = boto3.client(
    's3',
    aws_access_key_id=config['boto']['aws_access_key_id'],
    aws_secret_access_key=config['boto']['aws_secret_access_key']
)

In [11]:
model_dir = 'trained_models'
try:
    os.makedirs(model_dir)
except Exception as e:
    print(e)

[Errno 17] File exists: 'trained_models'


In [12]:
data_dir = 'data'
if not os.path.exists(data_dir):
    os.makedirs(data_dir)
    
train_file = data_dir + '/train.csv'
test_file = data_dir + '/test.csv'

In [13]:
# s3.upload_file(train_file, config['boto']['buckets']['kaggle'], train_file)
# s3.upload_file(src_file_cleaned, boto_config['buckets']['kaggle'], src_file_cleaned)

# s3.download_file(boto_config['buckets']['kaggle'], src_file, src_file)
# s3.download_file(boto_config['buckets']['kaggle'], src_file_cleaned, src_file_cleaned)

In [14]:
train = pd.read_csv(train_file)
test = pd.read_csv(test_file)

In [15]:
print('% of positive class: {0}'.format(train.target.sum()/train.shape[0]*100))

% of positive class: 6.187017751787352


In [16]:
train['question_length'] = train.question_text.str.len()

In [17]:
train.question_length.describe()

count    1.306122e+06
mean     7.067884e+01
std      3.878428e+01
min      1.000000e+00
25%      4.500000e+01
50%      6.000000e+01
75%      8.500000e+01
max      1.017000e+03
Name: question_length, dtype: float64

In [90]:
seq_maxlen = 100
train = train[train.question_length<=seq_maxlen]

# Split data

In [142]:
# X_train, X_val, y_train, y_val = train_test_split(train.question_text, train.target, 
#                                                   test_size=0.1, 
#                                                   random_state=1, 
#                                                   stratify=train.target)

X_train = train.question_text
y_train = train.target
X_test = test.question_text

# Sequentialize words

In [143]:
tk = Tokenizer()

In [144]:
tk.fit_on_texts(X_train)

In [145]:
X_train = tk.texts_to_sequences(X_train)
# X_val = tk.texts_to_sequences(X_val)
X_test = tk.texts_to_sequences(X_test)

In [146]:
X_train = pad_sequences(X_train, maxlen=seq_maxlen)
# X_val = pad_sequences(X_val, maxlen=seq_maxlen)
X_test = pad_sequences(X_test, maxlen=seq_maxlen)

# Training

In [141]:
model = models.Sequential()

model.add(layers.Embedding())
model.add(layers.Flatten())
model.add(layers.Dense(1, activation='sigmoid'))

model.compile(optimizer='rmsprop', loss='binary_crossentropy', metrics=['acc'])

model.summmary()

In [None]:
history = model.fit(X_train, y_train, epochs=10, batch_size=512, validation_split=0.10)

In [None]:
# train['distance'] = train.apply(lambda row : int(distance(
#     (row['pickup_latitude'], row['pickup_longitude']),
#     (row['dropoff_latitude'], row['dropoff_longitude'])).m), axis=1)

# train.pickup_datetime = pd.to_datetime(train.pickup_datetime)

# train['pickup_monthday'] = train.pickup_datetime.dt.day
# train['pickup_weekday'] = train.pickup_datetime.dt.weekday
# train['pickup_hour'] = train.pickup_datetime.dt.hour

In [None]:
# test['distance'] = test.apply(lambda row : int(distance(
#     (row['pickup_latitude'], row['pickup_longitude']),
#     (row['dropoff_latitude'], row['dropoff_longitude'])).m), axis=1)

# test.pickup_datetime = pd.to_datetime(test.pickup_datetime)

# test['pickup_monthday'] = test.pickup_datetime.dt.day
# test['pickup_weekday'] = test.pickup_datetime.dt.weekday
# test['pickup_hour'] = test.pickup_datetime.dt.hour

In [None]:
# train.to_csv('./data/train-clean.csv')
# test.to_csv('./data/test-clean.csv')

train = pd.read_csv('./data/train-clean.csv')
test = pd.read_csv('./data/test-clean.csv')

# Clean data

In [None]:
train = train[(train.trip_duration<3600) & (train.distance<30000) & (train.trip_duration>180)]

In [None]:
sns.scatterplot(x='trip_duration', y='distance', data=train.sample(10000))
plt.show()

In [None]:
train['avg_speed'] = (train.distance*3600)/(train.trip_duration*1000)

In [None]:
train = train[(train.avg_speed<70) & (train.avg_speed>8)]

# Clusterize Geopoints

In [None]:
distortions = []
for i in range(1, 11):
    km = KMeans(n_clusters=i,
                init='k-means++', 
                n_init=10,
                max_iter=300,
                random_state=1)
    km.fit(train[['pickup_latitude', 'pickup_longitude']])
    distortions.append(km.inertia_)

In [None]:
plt.plot(range(1,11), distortions, marker='o')
plt.xlabel('clusters')
plt.ylabel('distortion')
plt.show()

In [None]:
n_clusters = 5
km = KMeans(n_clusters=n_clusters,
                init='k-means++', 
                n_init=10,
                max_iter=300,
                random_state=1)
km.fit(train[['pickup_latitude', 'pickup_longitude']])

In [None]:
train['pickup_geocluster'] = km.predict(train[['pickup_latitude', 'pickup_longitude']])
train['dropoff_geocluster'] = km.predict(train[['dropoff_latitude', 'dropoff_longitude']])

test['pickup_geocluster'] = km.predict(test[['pickup_latitude', 'pickup_longitude']])
test['dropoff_geocluster'] = km.predict(test[['dropoff_latitude', 'dropoff_longitude']])

In [None]:
clusters = list(range(n_clusters))

In [None]:
train = train.join(pd.get_dummies(train.pickup_geocluster, prefix='pickup_geocluster'))
train = train.join(pd.get_dummies(train.dropoff_geocluster, prefix='dropoff_geocluster'))
test = test.join(pd.get_dummies(test.pickup_geocluster, prefix='pickup_geocluster'))
test = test.join(pd.get_dummies(test.dropoff_geocluster, prefix='dropoff_geocluster'))

# Final Data Preparation

In [None]:
pickup_clusters = ['pickup_geocluster_{0}'.format(i) for i in range(n_clusters)]
dropoff_clusters = ['dropoff_geocluster_{0}'.format(i) for i in range(n_clusters)]

In [None]:
cols = ['passenger_count', 'distance', 'pickup_monthday', 'pickup_weekday', 'pickup_hour'] + pickup_clusters + dropoff_clusters

In [None]:
y_train = np.array(train.trip_duration)
X_train = train.loc[:, cols]

X_test = test.loc[:, cols]

In [None]:
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=1)

# NN

In [None]:
callbacks = [
    EarlyStopping(monitor='val_mean_absolute_error', patience=1),
    ModelCheckpoint(filepath=model_dir + '/basic_model.h5', monitor='val_loss', save_best_only=True)
]

In [None]:
nml = Normalizer()

X_train = nml.fit_transform(X_train)
X_val = nml.transform(X_val)
X_test = nml.transform(X_test)

In [None]:
nn = models.Sequential()
nn.add(layers.Dense(64, activation='relu', input_shape=(X_train.shape[1],)))
nn.add(layers.Dense(32, activation='relu'))
nn.add(layers.Dropout(0.2))
nn.add(layers.Dense(16, activation='relu'))
nn.add(layers.Dropout(0.2))
nn.add(layers.Dense(1))

nn.compile(loss='mse', optimizer='rmsprop', metrics=['mae'])

nn.summary()

history = nn.fit(X_train, 
                    y_train, 
                    epochs=20, 
                    batch_size=512,
                    callbacks=callbacks,
                    validation_data=(X_val, y_val))

In [None]:
results = history.history
epochs = history.epoch

print('Min Val Absolute Error {0} on Epoch {1}'.format(np.min(results['val_mean_absolute_error']), np.argmin(results['val_mean_absolute_error'])))

fig, (ax1, ax2) = plt.subplots(1,2, figsize=(18,6))

ax1.plot(epochs, results['mean_absolute_error'], label='train')
ax1.plot(epochs, results['val_mean_absolute_error'], label='val')
ax1.set_title('Accuracy')
# ax1.set_ylim([0, 1])
ax1.legend()

ax2.plot(epochs, results['loss'], label='train')
ax2.plot(epochs, results['val_loss'], label='val')
ax2.set_title('Loss')
ax2.legend()

plt.show()

# XGBoost

In [None]:
xg = xgboost.XGBRegressor(nthread=-1)

In [None]:
xg.fit(X_train, y_train)

In [None]:
xg_predictions = xg.predict(X_val)

In [None]:
mae(y_val, xg_predictions)

# Ensemble Predictions

In [None]:
nn_predictions = nn.predict(X_test)
xg_predictions = xg.predict(X_test)
xg_predictions = xg_predictions.reshape((nn_predictions.shape[0],1))

In [None]:
predictions = nn_predictions*0.5+xg_predictions*0.5

In [None]:
subs = pd.DataFrame()
subs['id'] = test.id
subs['trip_duration'] = predictions

subs.to_csv(data_dir + '/subs.csv', index=False)