In [1]:
import csv
import datetime
import json

import geopy.distance
import numpy as np
from sklearn import datasets, linear_model, metrics
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

def time_from_polyline(polyline, interval=15):
    """Return the elapsed time covered by a polyline.

    The duration is the number of gaps between consecutive points
    multiplied by ``interval``.

    Args:
        polyline: Sized sequence of points (only its length is used).
        interval: Time between two consecutive points. Defaults to 15, the
            value that used to be hard-coded here; presumably seconds --
            TODO confirm against the dataset description.

    Returns:
        ``(len(polyline) - 1) * interval``, or 0 for an empty polyline.
    """
    # An empty polyline has no gaps at all; clamp so it yields 0 instead of
    # a negative duration (-interval).
    return max(len(polyline) - 1, 0) * interval


def list_from_polyline_string(polyline_str):
    """Parse a polyline string such as ``"[[1.0,2.0],[3.0,4.0]]"``.

    The text is decoded with the standard JSON parser instead of a manual
    character scan, and every coordinate is coerced to ``float`` so integer
    literals produce the same values the manual scan produced.

    Args:
        polyline_str: Text holding a JSON array of two-element arrays.
            An empty or whitespace-only string is treated as "no points".

    Returns:
        A list of ``[float, float]`` lists, in the order they appear in the
        text. ``"[]"`` and ``""`` both give ``[]``.

    Raises:
        ValueError: If the text is not valid JSON (for example a truncated
            polyline) or a point does not have exactly two coordinates.
    """
    if not polyline_str.strip():
        return []

    return [[float(first), float(second)]
            for first, second in json.loads(polyline_str)]


def distance_gps_coordinates(c1, c2):
    """Return the distance in miles between two coordinate pairs.

    ``geopy.distance.vincenty`` was deprecated and later removed from geopy,
    so ``geodesic`` is used when the installed geopy provides it; older
    installs without ``geodesic`` fall back to ``vincenty``.

    Args:
        c1: First point, passed to geopy unchanged.
        c2: Second point, passed to geopy unchanged.

    Returns:
        The ``.miles`` attribute of the geopy distance between the points.

    NOTE(review): geopy reads a two-element point as (latitude, longitude).
    Confirm the polyline points handed to this function use that order and
    not (longitude, latitude).
    """
    distance_cls = getattr(geopy.distance, "geodesic", None)
    if distance_cls is None:
        # Very old geopy: geodesic is not available yet.
        distance_cls = geopy.distance.vincenty
    return distance_cls(c1, c2).miles


def distance_from_polyline(polyline):
    """Sum the distances between consecutive points of a polyline.

    Args:
        polyline: List of points; each adjacent pair is handed to
            ``distance_gps_coordinates``.

    Returns:
        The accumulated distance as a float (0.0 when the polyline has
        fewer than two points).
    """
    total = 0.00
    # Pair every point with its successor; the last point has no successor.
    for start, end in zip(polyline, polyline[1:]):
        total += distance_gps_coordinates(start, end)
    return total

def from_unix_timestamp(ts):
    """Return ``(hour, weekday)`` for a Unix timestamp.

    Args:
        ts: Seconds since the epoch, as accepted by
            ``datetime.datetime.fromtimestamp``.

    Returns:
        A tuple ``(hour, week_day)`` where ``hour`` is 0-23 and ``week_day``
        is ``datetime.weekday()`` (Monday is 0).

    NOTE(review): ``fromtimestamp`` without a ``tz`` argument converts to the
    local timezone of the machine running the notebook, so results can vary
    between machines -- confirm that is intended.
    """
    moment = datetime.datetime.fromtimestamp(ts)
    return moment.hour, moment.weekday()


In [2]:
# Build a lookup keyed by the integer in column 0 of the taxi-stand metadata
# file; each value is the pair of floats found in columns 2 and 3.
taxi_stand_id_to_lat_lon = {}
with open('data/metaData_taxistandsID_name_GPSlocation.csv') as csv_file:
    csv_reader = csv.reader(csv_file, delimiter=',')
    line_count = 0  # stays 0 if the file has no rows at all
    for line_count, row in enumerate(csv_reader, start=1):
        if line_count == 1:
            # First row is the header; it is counted but not stored.
            continue
        taxi_stand_id_to_lat_lon[int(row[0])] = [float(row[2]), float(row[3])]

    print(f'Processed {line_count} lines.')


Processed 64 lines.


In [3]:
# Read the training file and build, per kept row:
#   trip_metrics        -> [hour + 1, weekday + 1, stand value 0, stand value 1]
#   target_distance_yi  -> polyline distance
#   trip_time_yi        -> polyline duration
# Rows are skipped when column 7 is "True", column 1 is "A" or "C", or
# column 3 (the taxi stand id) is empty.
trip_metrics = []
target_distance_yi = []
trip_time_yi = []
with open('data/train.csv') as csv_file:
    csv_reader = csv.reader(csv_file, delimiter=',')
    line_count = 0
    for row in csv_reader:
        if line_count == 0:
            # Header row: counted, never parsed.
            line_count += 1
            continue
        if row[7] == "True" or row[1] in ("A", "C") or row[3] == '':
            continue

        timestamp = int(row[5])
        hour, weekday = from_unix_timestamp(timestamp)
        origin_stand_lat, origin_stand_lon = taxi_stand_id_to_lat_lon[int(row[3])]

        # Parse the polyline once and reuse it for both targets; parsing is
        # the expensive step of this loop and used to run twice per row.
        polyline = list_from_polyline_string(row[8])
        distance_yi = distance_from_polyline(polyline)
        time_yi = time_from_polyline(polyline)

        trip_metrics.append([hour+1, weekday+1, origin_stand_lat, origin_stand_lon])
        target_distance_yi.append(distance_yi)
        trip_time_yi.append(time_yi)

        # line_count counts the header plus the rows that were kept.
        line_count += 1
        if line_count % 100000 == 0:
            print(line_count)
    print(f'Processed {line_count} lines.')


KeyboardInterrupt: 

In [149]:
# Fit a random forest that predicts trip distance from the trip features,
# then append each row's predicted distance to a copy of that row.

# Fixed seed so the forest (and every number derived from it) is the same on
# each Restart & Run All.
RANDOM_SEED = 42

# Work on copies so trip_metrics / target_distance_yi stay untouched and this
# cell can be re-run safely.
trip_metrics_dash = [features[:] for features in trip_metrics]
target_distance_dash = list(target_distance_yi)

xi = np.array(trip_metrics_dash)
yi = np.array(target_distance_dash)

random_forest_reg = RandomForestRegressor(random_state=RANDOM_SEED)
random_forest_reg.fit(xi, yi)
print(random_forest_reg.feature_importances_)

# NOTE(review): these predictions are made on the same rows the forest was
# fitted on, so they are in-sample values.
predicted = random_forest_reg.predict(xi)

if len(trip_metrics_dash) > 0:
    # Sanity output for the first row, printed before the row is extended.
    print(type(predicted[0].item()))
    print(predicted[0], trip_metrics_dash[0])

for features, prediction in zip(trip_metrics_dash, predicted):
    # .item() converts the numpy scalar to a plain Python float.
    features.append(prediction.item())

print(trip_metrics_dash[0:10])


[0.2628653  0.41791606 0.09340451 0.22581413]
<class 'float'>
2.770541981877506 [21, 7, 41.1599801853, -8.64198392478]
[[21, 7, 41.1599801853, -8.64198392478, 2.770541981877506], [21, 7, 41.1570891314, -8.6284679801, 8.793808856358321], [21, 7, 41.1632224305, -8.58404677278, 6.81138604639563], [21, 7, 41.1607148883, -8.60424608207, 2.5965127593800563], [21, 7, 41.1549650972, -8.61321698848, 2.654559933886262], [21, 7, 41.1460158298, -8.61257471887, 2.564174115946839], [21, 7, 41.168317889, -8.68917996027, 3.7360540143833716], [22, 7, 41.1570891314, -8.6284679801, 1.6911279234303223], [22, 7, 41.168317889, -8.68917996027, 3.0291819979056323], [22, 7, 41.1570891314, -8.6284679801, 1.6911279234303223]]




In [150]:
# Read the public test file and build one feature row per kept trip:
# [hour + 1, weekday + 1, stand value 0, stand value 1].
# NOTE(review): rows are filtered exactly like the training rows, so the
# number of feature rows can be smaller than the number of test trips --
# confirm that is acceptable for the submission.
trip_metrics_test = []
with open('data/test_public.csv') as csv_file:
    csv_reader = csv.reader(csv_file, delimiter=',')
    line_count = 0
    for row in csv_reader:
        if line_count == 0:
            # Header row: counted, never parsed.
            line_count = 1
            continue

        # Keep the row only when column 7 is not "True", column 1 is neither
        # "A" nor "C", and column 3 (the stand id) is present.
        if row[7] != "True" and row[1] not in ("A", "C") and row[3] != '':
            timestamp = int(row[5])
            hour, weekday = from_unix_timestamp(timestamp)
            origin_stand_lat, origin_stand_lon = taxi_stand_id_to_lat_lon[int(row[3])]

            trip_metrics_test.append(
                [hour+1, weekday+1, origin_stand_lat, origin_stand_lon]
            )
            line_count += 1
    print(f'Processed {line_count} lines.')


Processed 124 lines.


In [152]:
# Predict a distance for every test row with the fitted forest and append it
# to a copy of that row.
trip_metrics_dash_test = [features[:] for features in trip_metrics_test]

xi_test = np.array(trip_metrics_dash_test)
predicted_test = random_forest_reg.predict(xi_test)

for i, (features, prediction) in enumerate(zip(trip_metrics_dash_test, predicted_test)):
    # .item() converts the numpy scalar to a plain Python float.
    features.append(prediction.item())
    if i == 0:
        # Show the first extended row next to its raw prediction.
        print(features, prediction)

[14, 4, 41.1486275073, -8.58587660305, 3.922974337933037] 3.922974337933037


In [156]:
# Fit a linear regression that predicts trip time from the extended feature
# rows (original features plus predicted distance), predict the test rows,
# and write the predictions to submission.csv.

# Copy the rows so the models' inputs are independent of the lists above.
trip_metrics_dash_2 = [features[:] for features in trip_metrics_dash]
trip_metrics_dash_test_2 = [features[:] for features in trip_metrics_dash_test]

xi = np.array(trip_metrics_dash_2)
yi = np.array(trip_time_yi)
print(yi.size, xi.size)

linear_time_reg = linear_model.LinearRegression()
linear_time_reg.fit(xi, yi)

xi_test = np.array(trip_metrics_dash_test_2)
predicted_test_2 = linear_time_reg.predict(xi_test)

print(predicted_test_2)

# One prediction per line; no identifier column is written.
np.savetxt("submission.csv", predicted_test_2, delimiter=",")

9999 49995
[ 624.92917073  622.35820118  624.92917073  707.75891436  625.72192462
  624.92917073  863.89666084  624.92917073  631.86645737  624.92917073
  624.92917073  624.92917073  664.10456594  526.47094839  624.92917073
  624.92917073  580.01805258  624.92917073  624.92917073  622.35820118
  673.49508094  624.92917073  557.88029237  626.2578528   863.89666084
  713.41377099  769.94110133  602.14534317  631.86645737  613.78738268
  679.37174325  673.49508094  624.92917073  624.92917073  597.06953711
  863.89666084  617.96578696  624.92917073  863.89666084  863.89666084
  624.92917073  613.78738268  624.92917073  760.99979014  569.85313246
  605.05716567  598.13319552  610.6791842   622.63080066  598.13319552
  717.93667455  595.69620382  595.69620382  658.41028756  574.48509962
  717.93667455  639.10624199  740.45282933  653.48535209  687.83734445
  595.69620382  595.69620382  595.69620382  760.99979014  791.40927101
  740.45282933  595.69620382  678.70273144  610.6791842   717.9366