In [2]:
import numpy as np
import random
import pandas as pd
import json
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score

In [3]:
# Load the raw rider records from disk. The context manager closes the file
# handle as soon as the contents are read instead of leaving it open for the
# lifetime of the kernel.
with open('uber.json', 'r') as text:
    x = text.read()
y = json.loads(x)
data = pd.DataFrame(y)

This is the raw data

In [4]:
# Preview the first rows of the raw frame.
data.head()

Unnamed: 0,avg_dist,avg_rating_by_driver,avg_rating_of_driver,avg_surge,city,last_trip_date,phone,signup_date,surge_pct,trips_in_first_30_days,uber_black_user,weekday_pct
0,3.67,5.0,4.7,1.1,King's Landing,2014-06-17,iPhone,2014-01-25,15.4,4,True,46.2
1,8.26,5.0,5.0,1.0,Astapor,2014-05-05,Android,2014-01-29,0.0,0,False,50.0
2,0.77,5.0,4.3,1.0,Astapor,2014-01-07,iPhone,2014-01-06,0.0,3,False,100.0
3,2.36,4.9,4.6,1.14,King's Landing,2014-06-29,iPhone,2014-01-10,20.0,9,True,80.0
4,3.13,4.9,4.4,1.19,Winterfell,2014-03-15,Android,2014-01-27,11.8,14,False,82.4


Change the timestamps to date types for ease of use

In [5]:
# Convert both date columns to pandas datetimes so they can be compared and sorted.
data['last_trip_date'] = pd.to_datetime(data['last_trip_date'])
data['signup_date'] = pd.to_datetime(data['signup_date'])

Drop NaN for ease of use. This may not be a best practice but for an exercise let us assume this is okay.

In [6]:
# Keep only the rows that have no missing value in any column.
dat2 = data.dropna(axis=0, how='any')

Summary Statistics

In [7]:
# Summary statistics for the rows that survived dropna().
dat2.describe()

Unnamed: 0,avg_dist,avg_rating_by_driver,avg_rating_of_driver,avg_surge,surge_pct,trips_in_first_30_days,uber_black_user,weekday_pct
count,41445.0,41445.0,41445.0,41445.0,41445.0,41445.0,41445,41445.0
mean,5.460046,4.776446,4.60172,1.073822,8.898709,2.609675,0.3840994,61.254897
std,5.145174,0.404606,0.616104,0.198747,18.322061,4.059336,0.4863875,34.813848
min,0.0,1.0,1.0,1.0,0.0,0.0,False,0.0
25%,2.42,4.7,4.3,1.0,0.0,0.0,0,37.5
50%,3.78,5.0,4.9,1.0,0.0,1.0,0,66.7
75%,6.49,5.0,5.0,1.07,11.1,3.0,1,100.0
max,79.69,5.0,5.0,8.0,100.0,125.0,True,100.0


Sorting by last trip date shows that the most recent trip in the data is on 2014-07-01.

In [8]:
# Show the most recent last-trip dates. `DataFrame.sort` has been removed from
# pandas; `sort_values` is its replacement.
data.sort_values('last_trip_date', ascending=False).head()

Unnamed: 0,avg_dist,avg_rating_by_driver,avg_rating_of_driver,avg_surge,city,last_trip_date,phone,signup_date,surge_pct,trips_in_first_30_days,uber_black_user,weekday_pct
4486,11.94,5.0,5.0,1.0,Winterfell,2014-07-01,Android,2014-01-25,0.0,4,True,25.0
43047,5.49,4.7,5.0,1.01,King's Landing,2014-07-01,iPhone,2014-01-15,0.8,1,True,89.3
24059,8.62,4.9,4.3,1.11,Astapor,2014-07-01,iPhone,2014-01-24,14.3,1,False,71.4
40290,7.51,4.8,4.9,1.02,King's Landing,2014-07-01,iPhone,2014-01-07,1.6,2,True,95.2
28689,16.32,5.0,5.0,1.0,King's Landing,2014-07-01,iPhone,2014-01-16,0.0,5,False,75.0


A quick and dirty way to flag active users: a user is marked active (True) if their last trip was after 2014-06-01, i.e. within the final month of the data, which ends on 2014-07-01.

In [9]:
# Flag riders whose last trip falls after 2014-06-01. The comparison is
# vectorised over the whole column rather than evaluated row by row with apply().
# NOTE: `active` is not an existing column at this point, so this attribute
# assignment only attaches the Series to the DataFrame object; it does not add
# a column. The real `active` column is created by the merge in a later cell.
dat2.active = dat2.last_trip_date > pd.to_datetime('2014-06-01')

Frequency of Active users

In [10]:
# Count how many riders are flagged active (True) versus inactive (False).
dat2.active.value_counts()

False    24837
True     16608
dtype: int64

In [11]:
# Add the boolean target column `active` (last trip after 2014-06-01).
# assign() returns a new frame and overwrites any existing `active` column, so
# re-running this cell is safe; the previous index merge with a one-column frame
# would have produced duplicate `active_x` / `active_y` columns on a second run.
dat2 = dat2.assign(active=dat2.last_trip_date > pd.to_datetime('2014-06-01'))

In [12]:
# Check that the new `active` column was added.
dat2.head()

Unnamed: 0,avg_dist,avg_rating_by_driver,avg_rating_of_driver,avg_surge,city,last_trip_date,phone,signup_date,surge_pct,trips_in_first_30_days,uber_black_user,weekday_pct,active
0,3.67,5.0,4.7,1.1,King's Landing,2014-06-17,iPhone,2014-01-25,15.4,4,True,46.2,True
1,8.26,5.0,5.0,1.0,Astapor,2014-05-05,Android,2014-01-29,0.0,0,False,50.0,False
2,0.77,5.0,4.3,1.0,Astapor,2014-01-07,iPhone,2014-01-06,0.0,3,False,100.0,False
3,2.36,4.9,4.6,1.14,King's Landing,2014-06-29,iPhone,2014-01-10,20.0,9,True,80.0,True
4,3.13,4.9,4.4,1.19,Winterfell,2014-03-15,Android,2014-01-27,11.8,14,False,82.4,False


The categorical variables that need to be turned into dummy variables are city and phone. We do not need to encode active or uber_black_user because True/False already behave as 1/0.

In [13]:
# Append one indicator column per phone type, aligned on the row index.
dat2 = dat2.merge(pd.get_dummies(dat2['phone']), left_index=True, right_index=True)

In [14]:
# Append one indicator column per city, aligned on the row index.
dat2 = dat2.merge(pd.get_dummies(dat2['city']), left_index=True, right_index=True)

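Now let us drop the original city and phone columns, the two date columns (last_trip_date and signup_date), and one dummy from each group (Android and Astapor), which then serve as the reference categories.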

In [15]:
# Remove the source categorical columns, the reference dummies and the raw dates.
# `axis` is passed by keyword: the positional form was removed in pandas 2.0,
# while the keyword form is accepted by old and new versions alike.
dat2 = dat2.drop(['city', 'phone', 'Android', 'Astapor', 'signup_date', 'last_trip_date'], axis=1)

In [16]:
# Inspect the final modelling frame: numeric features, dummies and the `active` target.
dat2.head()

Unnamed: 0,avg_dist,avg_rating_by_driver,avg_rating_of_driver,avg_surge,surge_pct,trips_in_first_30_days,uber_black_user,weekday_pct,active,iPhone,King's Landing,Winterfell
0,3.67,5.0,4.7,1.1,15.4,4,True,46.2,True,1,1,0
1,8.26,5.0,5.0,1.0,0.0,0,False,50.0,False,0,0,0
2,0.77,5.0,4.3,1.0,0.0,3,False,100.0,False,1,0,0
3,2.36,4.9,4.6,1.14,20.0,9,True,80.0,True,1,1,0
4,3.13,4.9,4.4,1.19,11.8,14,False,82.4,False,0,0,1


Now we fit a Random Forest Classifier. First split training and testing

In [17]:
# Split into 30000 randomly chosen training rows and a test set of the remainder.
# A dedicated, seeded generator makes the split reproducible across kernel
# restarts without touching the global `random` state.
rows = random.Random(42).sample(dat2.index.tolist(), 30000)
not_rows = dat2.index.drop(rows)

# `.loc` replaces the removed `.ix` indexer; the lookups here are label-based.
features = dat2.drop('active', axis=1)
x_train = features.loc[rows]
x_test = features.loc[not_rows]
y_train = dat2['active'].loc[rows]
y_test = dat2['active'].loc[not_rows]

In [18]:
# 100-tree random forest using all available cores. A fixed random_state makes
# the fitted trees, scores and feature importances reproducible between runs.
model = RandomForestClassifier(n_estimators=100, n_jobs=-1, random_state=42)

In [19]:
# Fit the forest on the training split.
clf = model.fit(X=x_train, y=y_train)

The score (mean accuracy) of the model on the held-out test set: it predicts correctly about 76% of the time.

In [20]:
# Evaluate the fitted model on the held-out test rows.
clf.score(x_test,y_test)

0.75832241153342073

f_1 score

In [21]:
# F1 score of the test-set predictions against the true `active` labels.
f1_score(y_true=y_test, y_pred=model.predict(x_test))

0.69273494778938016

In [22]:
# Per-feature importance scores from the fitted forest.
importances = model.feature_importances_

In [23]:
# Display the raw importance array (one value per feature).
importances

array([ 0.26940352,  0.11785011,  0.08098199,  0.07015321,  0.08519582,
        0.07895681,  0.03307973,  0.1446923 ,  0.04083101,  0.06244463,
        0.01641086])

In [None]:
# Bar chart of feature importances, most important first, with the standard
# deviation across the individual trees as error bars.
#
# Tick labels must come from the columns the model was fitted on (x_train).
# dat2.columns still contains the target `active`, so indexing it with feature
# positions shifts every label after that column by one.
feature_names = x_train.columns
n_features = len(importances)  # derived, not hard-coded

indices = np.argsort(importances)[::-1]
std = np.std([tree.feature_importances_ for tree in model.estimators_],
             axis=0)
plt.figure()
plt.title("Feature importances")
plt.bar(range(n_features), importances[indices],
        color="r", yerr=std[indices], align="center")
plt.xticks(range(n_features), feature_names[indices], rotation='vertical')
plt.xlim([-1, n_features])
plt.margins(0.1)
plt.subplots_adjust(bottom=0.25)
plt.show()

Here we build a hypothetical rider from the average of each variable and toggle the most influential features to see how the prediction changes. We could do this more systematically and plot the results.

In [128]:
# Build one hypothetical rider as a Series indexed by the model's feature names.
# `z` was never defined in the notebook (it only existed as leftover kernel
# state), so it is created explicitly here.
z = pd.Series(index=x_train.columns, dtype=object)
z.at['avg_dist']=20  # toggled well above the column mean of ~5.46
z.at['avg_rating_by_driver']=4.7
z.at['avg_rating_of_driver']=4.6
# Column mean is ~1.07 and the observed minimum is 1.0; the previous value of
# .107 was outside the data range (misplaced decimal point).
z.at['avg_surge']=1.07
z.at['surge_pct']=8.9
z.at['trips_in_first_30_days']=2.6
z.at['uber_black_user']=False
# NOTE(review): weekday_pct ranges 0-100 in the data (mean ~61), so .9 means
# 0.9% of trips on weekdays; confirm this was not meant to be 90.
z.at['weekday_pct']=.9
z.at['iPhone']=True
z.at["King's Landing"]=False
z.at["Winterfell"]=True

In [129]:
# Predict for the single hypothetical rider. The estimator expects a 2-D input
# of shape (n_samples, n_features), so the Series is turned into a one-row frame
# (its index becomes the columns); booleans are cast to 1.0 / 0.0.
model.predict(z.astype(float).to_frame().T)

array([False], dtype=bool)