In [1]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings("ignore")
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

In [2]:
df = pd.read_csv('data/silvercar_reservations.csv')
df_users = pd.read_csv('data/silvercar_users.csv')
df_promos = pd.read_csv('data/silvercar_promotions.csv', encoding="ISO-8859-1")
df_promo_codes = pd.read_csv('data/silvercar_promo_codes.csv', encoding="ISO-8859-1")
df_locs = pd.read_csv('data/silvercar_locations.csv', encoding="ISO-8859-1")

In [3]:
df.head()

Unnamed: 0,id,pickup,dropoff,user_id,status,confirmation_token,created_at,updated_at,booked_price,current_state,...,publisher_id,cancelled_at,local_rental,booked_by_id,awards_referral_bonus,extole_conversion_id,potential_local_rental,claimed_at,customer_modifiable,credit_card_id
0,89,41288.29167,41289.39583,12.0,,5B23231DC0,41269.81361,41417.79529,,cancelled,...,,,0,,0,,,,1,
1,91,41307.79167,41308.79167,49.0,,9A10C66F0C,41269.84853,41417.7953,,cancelled,...,,,0,,0,,,,1,
2,93,41297.41667,41310.83333,51.0,,157C8653CA,41269.86818,41417.7953,,cancelled,...,,,0,,0,,,,1,
3,94,41291.54167,41293.83333,45.0,,C48194AA4F,41269.91132,41417.7953,,cancelled,...,,,0,,0,,,,1,
4,95,41290.97917,41291.79167,55.0,,AABEB48B94,41269.93505,41702.07976,,finished,...,,,0,,0,,,,1,


In [4]:
np.sum(df.isnull())

id                                                 0
pickup                                             0
dropoff                                            0
user_id                                            1
status                                        462564
confirmation_token                                 0
created_at                                         0
updated_at                                         0
booked_price                                  439186
current_state                                      0
booked_daily_rate                             439186
confirmed                                          0
reservation_type_id                                0
pickup_location_id                                 0
dropoff_location_id                                0
rental_agreement_id                           193945
promo_code_id                                 265766
actual_pickup                                 193823
actual_dropoff                                

In [5]:
df["booking_application"].unique()

array([nan, 'web', 'iphone-appstore', 'android', 'kiosk',
       'iphone-concierge', 'gds', 'iphone-debug', 'web-desktop',
       'web-mobile', 'web-tablet'], dtype=object)

In [6]:
df["pickup"] = pd.to_datetime('1899-12-30') + pd.to_timedelta(df["pickup"], 'D')
df["dropoff"] = pd.to_datetime('1899-12-30') + pd.to_timedelta(df["dropoff"], 'D')
df["created_at"] = pd.to_datetime('1899-12-30') + pd.to_timedelta(df["created_at"], 'D')
df["updated_at"] = pd.to_datetime('1899-12-30') + pd.to_timedelta(df["updated_at"], 'D')

In [7]:
print(df["created_at"].min())
print(df["created_at"].max())

2012-12-26 19:31:35.904000
2017-12-14 19:02:15.935999999


In [9]:
df["current_state"].value_counts()

finished                      268088
cancelled                     162600
no_showed                      23013
booked                          6172
payment_declined_cancelled      2065
started                          621
pending_agreement                  5
Name: current_state, dtype: int64

In [33]:
df = df[(df["current_state"] == "cancelled") | (df["current_state"] == "finished")]

In [34]:
df["current_state"] = df["current_state"].map({"cancelled": 1, "finished": 0})

In [35]:
df["time_to_pickup"] = df["pickup"] - df["created_at"]
df["time_to_pickup"] = df["time_to_pickup"].dt.total_seconds() / 86400

In [36]:
df["used_promo"] = (df["promo_code_id"].notnull()).astype(int)
df["same_location"] = (df["pickup_location_id"] == df["dropoff_location_id"]).astype(int)

In [77]:
cols = ["user_id", "current_state", "time_to_pickup", "reservation_frequency", "used_promo"]
df_model = df[cols]
df_model.dropna(inplace=True)
df_model = pd.get_dummies(df_model)

In [78]:
df_users_model = df_users[["id", "sign_in_count"]].set_index("id")

In [79]:
df_model = df_model.join(df_users_model, how="left", on="user_id")
df_model.drop("user_id", axis=1, inplace=True)

In [81]:
y = df_test2.pop("current_state").values
X = df_test2.values
X_train, X_test, y_train, y_test = train_test_split(X, y)

In [82]:
lr = LogisticRegression()
lr.fit(X_train, y_train)
lr.score(X_test, y_test)

0.74543917415680339

In [83]:
lr.predict_proba(X_test)[:, 1]

array([ 0.19461654,  0.19541119,  0.07161649, ...,  0.08094669,
        0.15945837,  0.4679473 ])

In [84]:
rf = RandomForestClassifier(max_depth=5, n_estimators=20)
rf.fit(X_train, y_train)
rf.score(X_test, y_test)

0.75067856189374393