In [1]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings("ignore")
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

In [4]:
# Load the raw Silvercar tables. Every file except the reservations and users
# exports is read with an explicit Latin-1 encoding.
LATIN1 = "ISO-8859-1"

df = pd.read_csv('data/silvercar_reservations.csv')
df_users = pd.read_csv('data/silvercar_users.csv')
df_promos, df_promo_codes, df_locs = (
    pd.read_csv(csv_path, encoding=LATIN1)
    for csv_path in (
        'data/silvercar_promotions.csv',
        'data/silvercar_promo_codes.csv',
        'data/silvercar_locations.csv',
    )
)

In [13]:
# First five reservation rows as loaded from the CSV.
df.head(n=5)

Unnamed: 0,id,pickup,dropoff,user_id,status,confirmation_token,created_at,updated_at,booked_price,current_state,...,publisher_id,cancelled_at,local_rental,booked_by_id,awards_referral_bonus,extole_conversion_id,potential_local_rental,claimed_at,customer_modifiable,credit_card_id
0,89,41288.29167,41289.39583,12.0,,5B23231DC0,41269.81361,41417.79529,,cancelled,...,,,0,,0,,,,1,
1,91,41307.79167,41308.79167,49.0,,9A10C66F0C,41269.84853,41417.7953,,cancelled,...,,,0,,0,,,,1,
2,93,41297.41667,41310.83333,51.0,,157C8653CA,41269.86818,41417.7953,,cancelled,...,,,0,,0,,,,1,
3,94,41291.54167,41293.83333,45.0,,C48194AA4F,41269.91132,41417.7953,,cancelled,...,,,0,,0,,,,1,
4,95,41290.97917,41291.79167,55.0,,AABEB48B94,41269.93505,41702.07976,,finished,...,,,0,,0,,,,1,


In [214]:
# Missing-value count per column.
# Use the DataFrame method rather than np.sum(df): NumPy forwards axis=None to
# DataFrame.sum, and the "axis=None means per-column" behaviour is deprecated in
# recent pandas (it is slated to reduce over both axes and return one scalar).
# The method call with its default axis always yields one count per column.
df.isnull().sum()

id                                                 0
pickup                                             0
dropoff                                            0
user_id                                            1
status                                        462564
confirmation_token                                 0
created_at                                         0
updated_at                                         0
booked_price                                  439186
current_state                                      0
booked_daily_rate                             439186
confirmed                                          0
reservation_type_id                                0
pickup_location_id                                 0
dropoff_location_id                                0
rental_agreement_id                           193945
promo_code_id                                 265766
actual_pickup                                 193823
actual_dropoff                                

In [215]:
# Distinct booking channels recorded on reservations (NaN included if present).
df.loc[:, "booking_application"].unique()

array([nan, 'web', 'iphone-appstore', 'android', 'kiosk',
       'iphone-concierge', 'gds', 'iphone-debug', 'web-desktop',
       'web-mobile', 'web-tablet'], dtype=object)

In [216]:
# The timestamp columns arrive as fractional day counts (e.g. 41288.29167).
# Convert them to real timestamps by adding that many days to the 1899-12-30
# origin used throughout this cell.
SERIAL_DATE_ORIGIN = pd.to_datetime('1899-12-30')
SERIAL_DATE_COLUMNS = ["pickup", "dropoff", "created_at", "updated_at"]

for date_col in SERIAL_DATE_COLUMNS:
    # Columns that are already datetimes were converted by an earlier run of
    # this cell; skipping them makes the cell safe to re-run instead of failing
    # when pd.to_timedelta is handed datetime values.
    if pd.api.types.is_datetime64_any_dtype(df[date_col]):
        continue
    df[date_col] = SERIAL_DATE_ORIGIN + pd.to_timedelta(df[date_col], 'D')

In [217]:
# Earliest and latest reservation creation timestamps, one per line.
created_at = df["created_at"]
print(created_at.min(), created_at.max(), sep="\n")

2012-12-26 19:31:35.904000
2017-12-14 19:02:15.935999999


In [218]:
# Same preview as before, now that the four date columns hold timestamps.
df.head(n=5)

Unnamed: 0,id,pickup,dropoff,user_id,status,confirmation_token,created_at,updated_at,booked_price,current_state,...,publisher_id,cancelled_at,local_rental,booked_by_id,awards_referral_bonus,extole_conversion_id,potential_local_rental,claimed_at,customer_modifiable,credit_card_id
0,89,2013-01-14 07:00:00.288,2013-01-15 09:29:59.712,12.0,,5B23231DC0,2012-12-26 19:31:35.904,2013-05-23 19:05:13.056,,cancelled,...,,,0,,0,,,,1,
1,91,2013-02-02 19:00:00.288,2013-02-03 19:00:00.288,49.0,,9A10C66F0C,2012-12-26 20:21:52.992,2013-05-23 19:05:13.920,,cancelled,...,,,0,,0,,,,1,
2,93,2013-01-23 10:00:00.288,2013-02-05 19:59:59.712,51.0,,157C8653CA,2012-12-26 20:50:10.752,2013-05-23 19:05:13.920,,cancelled,...,,,0,,0,,,,1,
3,94,2013-01-17 13:00:00.288,2013-01-19 19:59:59.712,45.0,,C48194AA4F,2012-12-26 21:52:18.048,2013-05-23 19:05:13.920,,cancelled,...,,,0,,0,,,,1,
4,95,2013-01-16 23:30:00.288,2013-01-17 19:00:00.288,55.0,,AABEB48B94,2012-12-26 22:26:28.320,2014-03-04 01:54:51.264,,finished,...,,,0,,0,,,,1,


In [219]:
# Reservation count per state, most frequent first.
df["current_state"].value_counts(sort=True, ascending=False)

finished                      268088
cancelled                     162600
no_showed                      23013
booked                          6172
payment_declined_cancelled      2065
started                          621
pending_agreement                  5
Name: current_state, dtype: int64

In [220]:
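# Disabled sanity check, kept for reference: print every promo_code_id that
# appears on a reservation but has no matching id in df_promo_codes.
# df["promo_code_id"].dropna(inplace=True)
# for promo in df["promo_code_id"].unique():
#     if promo not in set(df_promo_codes["id"]):
#         print(promo)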

In [221]:
# Keep only reservations whose current_state is "cancelled" or "finished".
# .copy() makes df_test an independent frame: the following cells assign new
# columns into it, and doing that on a bare boolean-mask slice of df is the
# pattern pandas flags with SettingWithCopyWarning.
df_test = df[df["current_state"].isin(["cancelled", "finished"])].copy()

In [222]:
# Binary target: 1 = cancelled, 0 = finished.
STATE_TO_LABEL = {"cancelled": 1, "finished": 0}
df_test["current_state"] = df_test["current_state"].map(STATE_TO_LABEL)

In [223]:
# Lead time between booking and scheduled pickup, expressed in fractional days.
SECONDS_PER_DAY = 86400
booking_lead = df_test["pickup"] - df_test["created_at"]
df_test["time_to_pickup"] = booking_lead.dt.total_seconds() / SECONDS_PER_DAY

In [224]:
# 0/1 flags: was a promo code attached, and is the car returned where it was picked up.
has_promo = df_test["promo_code_id"].notna()
returns_to_pickup = df_test["pickup_location_id"].eq(df_test["dropoff_location_id"])

df_test["used_promo"] = has_promo.astype(int)
df_test["same_location"] = returns_to_pickup.astype(int)

In [233]:
# Target plus the three model features.
# NOTE(review): neither `df_short` nor the "reservation_frequency" column is
# created in any cell shown in this notebook, and the execution counter jumps
# from 224 to 233 here -- presumably they came from cells that were later
# removed. Confirm and restore them so Restart & Run All works.
cols = [
    "current_state",
    "time_to_pickup",
    "reservation_frequency",
    "used_promo",
]
df_test2 = df_short.loc[:, cols]

In [234]:
# Discard rows with any missing value, then one-hot encode whatever
# non-numeric columns remain.
df_test2 = pd.get_dummies(df_test2.dropna())

In [235]:
# Separate the target from the features and hold out a test set.
# pop() removes "current_state" from df_test2 in place, so this cell can only
# run once per rebuild of df_test2.
y = df_test2.pop("current_state").values
X = df_test2.values

# Fix the shuffle seed so the split, and every score computed from it, is
# reproducible across kernel restarts.
RANDOM_STATE = 42
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=RANDOM_STATE)

In [236]:
# Baseline: default logistic regression, reported as accuracy on the held-out split.
lr = LogisticRegression().fit(X_train, y_train)
lr.score(X_test, y_test)

0.74545029812227459

In [207]:
# Predicted probability for the second class column of the fitted model on the
# held-out rows (presumably label 1 = cancelled -- confirm against lr.classes_).
positive_class_proba = lr.predict_proba(X_test)[:, 1]
positive_class_proba

array([ 0.16161631,  0.16542135,  0.23284419, ...,  0.27680468,
        0.24271922,  0.16871245])

In [208]:
# Shallow random forest for comparison with the logistic baseline.
# random_state pins the bootstrap/feature sampling so the reported accuracy is
# repeatable; without it each run fits a different forest.
# NOTE(review): this cell's recorded execution count (208) is older than the
# split cell's (235), so the stored score may come from a different split --
# re-run top to bottom before quoting it.
rf = RandomForestClassifier(max_depth=5, n_estimators=20, random_state=42)
rf.fit(X_train, y_train)
rf.score(X_test, y_test)

0.74945492569191063

In [238]:
# First five promo-code rows.
# NOTE(review): created_at / updated_at display as values like "03:07.0" in the
# preview, which looks like a spreadsheet mm:ss export artifact rather than a
# full timestamp -- confirm before relying on these columns.
df_promo_codes.head(n=5)

Unnamed: 0,id,code,created_at,updated_at,promotion_id,promo_code_batch_id,active
0,1,78S7S0DJS9,03:07.0,21:16.0,18,,0
1,2,HHD8S797SS,03:08.0,21:40.0,19,,0
2,3,XX7648743A,03:08.0,22:03.0,20,,0
3,4,KPDFWMUC05,19:25.0,10:42.0,21,,0
4,5,KPDFWMUC10,23:27.0,11:10.0,22,,0
