In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import datetime as dt


# Part 1: EDA

In [None]:
# Read the raw login timestamps and preview the first rows.
logins = pd.read_json(path_or_buf='logins.json')
logins.head(5)

In [None]:
# Check which dtypes read_json inferred for each column.
logins.dtypes

login_time is already of type datetime

In [None]:
# Summary statistics of the logins frame.
logins.describe()

login time spans around 4 months

In [None]:
# Number of missing values per column (isna is the canonical spelling of isnull).
logins.isna().sum()

In [None]:
# Earliest logins first, to inspect the start of the time range.
logins.sort_values(by='login_time').head(5)

### Aggregation into 15 minute Intervals

In [None]:
# Number of logins falling in each 15-minute bin, as a one-column frame named 'count'.
logins_per_bin = logins.resample('15Min', on='login_time').size()
interval = logins_per_bin.to_frame('count')
interval.head()

### Visualizing logins over different time frames

In [None]:
# Histogram of logins per 15-minute bin, in bins 5 logins wide; axis labels are cleared.
ax = sns.histplot(interval['count'], binwidth=5)
ax.set_xlabel(None)
ax.set_ylabel(None)
ax.set_title('Distribution of 15-Minute Time Interval Log-in Counts')

Most of our 15 minute intervals had login counts between 0 - 20 

In [None]:
# Plot only the 'count' column: plotting the whole frame would also draw the
# calendar columns (week, day, ...) if this cell is re-run after they are added.
fig, ax = plt.subplots(figsize=(15, 5))
ax.plot(interval.index, interval['count'])
ax.set_title('Login Counts Over Entire Period')
ax.set_xlabel('login time')
ax.set_ylabel('logins per 15 minutes')
plt.show()

We see that there are big spikes in the number of logins.

Let's take a closer look at daily time period

**Mean Login counts for each day of the week**

In [None]:
# Calendar features derived from the timestamp of each 15-minute bin.
# DatetimeIndex.week was deprecated in pandas 1.1 and removed in 2.0;
# isocalendar().week is the supported replacement (cast from UInt32 to plain int).
interval['week'] = interval.index.isocalendar().week.astype(int)
interval['day'] = interval.index.day
interval['day_of_week'] = interval.index.dayofweek  # Monday starts at index 0
interval['hour'] = interval.index.hour

In [None]:
# Mean logins per 15-minute bin for each weekday (0 = Monday).
interval.groupby('day_of_week')['count'].mean()

In [None]:
# Mean count per week; seaborn aggregates the bins that share a week value.
fig, ax = plt.subplots(figsize=(8, 5))
sns.lineplot(data=interval, x='week', y='count', ax=ax)
ax.set_title('Mean Login Count Each Week')
plt.show()

**Are there certain days each month that we see a boost in logins?**

In [None]:
# One frame per calendar month, selected by label slices on the datetime index.
January = interval.loc['1970-01-01':'1970-01-31']
February = interval.loc['1970-02-01':'1970-02-28']
March = interval.loc['1970-03-01':'1970-03-31']
April = interval.loc['1970-04-01':]  # open-ended: runs to the last timestamp

In [None]:
# 2x2 grid, one panel per month, sharing both axes so the months are comparable.
fig, axes = plt.subplots(2, 2, figsize=(15, 8), sharey=True, sharex=True)

monthly_frames = [
    ('January', January),
    ('February', February),
    ('March', March),
    ('April', April),
]

# axes.flat walks the grid row by row, matching the calendar order above.
for panel, (month_name, month_df) in zip(axes.flat, monthly_frames):
    sns.lineplot(x='day', y='count', data=month_df, ax=panel)
    panel.set_title(month_name)

fig.suptitle('Daily Average Login Count for Each Month', fontsize=15)
plt.show()

In [None]:
# Left: mean count per weekday. Right: distribution of counts per weekday.
weekday_names = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 5))

sns.lineplot(data=interval, x='day_of_week', y='count', ci=None, ax=ax1)
ax1.set_xticks(range(7))
ax1.set_xticklabels(weekday_names)
ax1.set_ylabel('mean count')

sns.boxplot(data=interval, x='day_of_week', y='count', ax=ax2)
ax2.set_xticklabels(weekday_names)

fig.suptitle('Login Counts Per Day of Week', fontsize=15)
plt.show()

There is an increase in Login Counts as we approach the weekend with a spike when the weekend hits.

In [None]:
# Mean count per hour of day, with one tick for each of the 24 hours.
fig, ax = plt.subplots(figsize=(10, 5))
sns.lineplot(data=interval, x='hour', y='count', ci=None, ax=ax)
ax.set_xticks(range(24))
plt.show()

There is a steep drop in login counts from 6am to 9am and minor drop again from 1 pm to 7pm.

# Part 2 - Experiment and Metrics Design
The neighboring cities of Gotham and Metropolis have complementary circadian rhythms: on
weekdays, Ultimate Gotham is most active at night, and Ultimate Metropolis is most active
during the day. On weekends, there is reasonable activity in both cities.

However, a toll bridge, with a two-way toll, between the two cities causes driver partners to tend
to be exclusive to each city. The Ultimate managers of city operations for the two cities have
proposed an experiment to encourage driver partners to be available in both cities, by
reimbursing all toll costs.

**1.  What would you choose as the key measure of success of this experiment in
encouraging driver partners to serve both cities, and why would you choose this metric?**

Assuming that Ultimate drivers utilize an app of some sort, we can use the geolocation data from the app to count the total number of times a driver crosses the bridge into the other city. Since the goal is to get our driver partners available in both cities, tallying the number of times they cross into the other city each day will reveal whether drivers are taking rides in the other city. And for those drivers that do, the metric will tell us how many trips they are making between the two cities.

**2. Describe a practical experiment you would design to compare the effectiveness of the proposed change in relation to the key measure of success. Please provide details on:**

**a.) how you will implement the experiment**

First, create a group of the drivers who currently drive exclusively in one city.
From this group, create two randomly selected sub-groups of roughly equal size:

* Control group: drivers who continue to not receive refunds for tolls
* Refund group: drivers who have been notified that they will receive a refund for toll crossings

**b.) what statistical test(s) you will conduct to verify the significance of the observation?**

Hypothesis testing (A/B testing): a randomized controlled experiment comparing the mean of the key metric (daily bridge crossings per driver) between our two groups, e.g. with a two-sample t-test.

* Null hypothesis: there is no difference between the means of the two groups
* Alternative hypothesis: there is a difference between the means of the control group and the refund group

**c.) how you would interpret the results and provide recommendations to the city operations team along with any caveats**

If the p-value is less than a significance level threshold of 0.05, we can reject the null hypothesis. This would be evidence that issuing refunds does encourage drivers to take riders in the other city. We could recommend adopting the toll reimbursement after a few more experiments confirm the result. We would also recommend that the city operations team review the following caveats before making any decisions.

Caveats:

* Just because a driver crosses the bridge frequently doesn’t mean they are assigned many trips. It could be that drivers are just making many inter-city trips, or that a driver is simply crossing the bridge without taking a trip. 
* Another metric such as total time in other city = `trips in other city x avg(duration of a trip)` should also be considered
* Need to ensure that the costs of refunding toll bridges does not outweigh the additional generated revenue of drivers taking more trips in the other city. Ultimate would want drivers to stay in highly active areas once there.


# Part 3: Predictive Modeling

## Data Wrangling

In [None]:
from datetime import timedelta

In [None]:
# Read the dataset for the modelling part and preview the first rows.
df = pd.read_json(path_or_buf='ultimate_data_challenge.json')
df.head(5)

In [None]:
# Column dtypes and non-null counts.
df.info()

In [None]:
# Missing values per column, largest first.
df.isna().sum().sort_values(ascending=False)

There are missing values for 3 columns

In [None]:
# Summary statistics of the frame.
df.describe()

In [None]:
# Distribution of avg_rating_of_driver, to decide how to impute its missing values.
df['avg_rating_of_driver'].hist()

In [None]:
# Distribution of avg_rating_by_driver, to decide how to impute its missing values.
df['avg_rating_by_driver'].hist()

The values for these two columns are concentrated, so let's impute using the median

In [None]:
# Impute missing ratings with the column median.
# Assign the result back rather than calling fillna(inplace=True) on a column
# selection: under pandas Copy-on-Write that form only updates a temporary
# object and leaves df unchanged.
for rating_col in ['avg_rating_of_driver', 'avg_rating_by_driver']:
    df[rating_col] = df[rating_col].fillna(df[rating_col].median())

In [None]:
# 'phone' is a categorical variable; label its missing values with an explicit 'No Info' category for now
df['phone'] = df['phone'].fillna('No Info')

### Creating Retention Variable

Change date columns to datetime

In [None]:
# Parse both date columns into datetime64 in one pass.
date_cols = ['signup_date', 'last_trip_date']
df[date_cols] = df[date_cols].apply(pd.to_datetime)

In [None]:
# Re-check the dtypes after the datetime conversion.
df.info()

In [None]:
# Most recent trip date in the data; used as the reference point for retention.
df['last_trip_date'].max()

Last recorded date is July 1. Let's take all observations with last trip date between July 1 and 30 days before July 1 and label them as active users

In [None]:
# A user is "retained" when their last trip falls within the 30 days
# preceding the latest trip date in the data.
cutoff = df['last_trip_date'].max() - pd.Timedelta(days=30)  # cut-off date for an active user

# Vectorised comparison: yields the same boolean column as a row-wise apply,
# without a Python-level call per row.
df['retained'] = df['last_trip_date'] >= cutoff

In [None]:
# Class balance of the target variable.
df['retained'].value_counts()

In [None]:
# Share of retained users: the mean of a boolean column is the fraction of True values.
df['retained'].mean()

More users are inactive than active as defined by Ultimate's criteria. Only 37.6% of the total users are retained

## EDA

**Heatmap of Correlations**

In [None]:
plt.figure(figsize=(10, 8))
# Correlate only the numeric and boolean columns. pandas >= 2.0 no longer
# drops non-numeric columns silently, so a bare df.corr() raises on the
# string columns (city, phone) that are still present at this point.
numeric_df = df.select_dtypes(include=['number', 'bool'])
sns.heatmap(numeric_df.corr(), annot=True)

Some correlation exists between `ultimate_black_user`, `trips_in_first_30_days`, and our target variable `retained`

There is also high pairwise correlation between `avg_surge` and `surge_pct`

Let's explore some of these variables more by looking at their distributions while considering retained

**Trips in the first 30 days**

In [None]:
# Distribution of first-30-day trips, split by the retention flag.
sns.boxplot(data=df, x='retained', y='trips_in_first_30_days')

Retained users are slightly more likely to take more trips in their first 30 days. There are outliers for both groups

**Ultimate Black User**

In [None]:
# Retained vs. not-retained counts for each ultimate_black_user value.
sns.countplot(data=df, x='ultimate_black_user', hue='retained')
plt.show()

Non-Ultimate Black users are less likely to be retained

**City user signed up in**

In [None]:
# Retained vs. not-retained counts for each city.
sns.countplot(data=df, x='city', hue='retained')
plt.show()

Users who signed up in King's Landing were more likely to be retained than not retained. Astapor and Winterfell have many users who do not continue with the app

**Primary phone device for user**

In [None]:
# Retained vs. not-retained counts for each phone category.
sns.countplot(data=df, x='phone', hue='retained')
plt.show()

Percentage-wise, Android users have a lower retention rate. Could there be something wrong with our Android app interface?

# Predictive Modeling

Let's try two classification algorithms: linear and tree based

**Import Modules**

In [None]:
#Preprocessing
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Modeling
from sklearn.pipeline import Pipeline

from sklearn.dummy import DummyClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV

#Evaluation
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score, roc_auc_score
from sklearn.metrics import plot_confusion_matrix



## Preprocessing

In [None]:
# change categorical variables into numeric through dummy variable encoding
# Dummy-encode the categorical columns; drop_first removes one level per
# variable so the remaining indicators are not perfectly collinear.
df = pd.get_dummies(data=df, drop_first=True)

In [None]:
# drop unnecessary columns for X column
X = df.drop(['last_trip_date', 'signup_date', 'retained'], axis=1)
y = df['retained']

In [None]:
# split data into train and test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

**Scoring Report Function**

In [None]:
# One row per model, filled in by score_model().
score_df = pd.DataFrame(columns=['accuracy','precision','recall','f1_score','auc_score'])

def score_model(model_name, y_test, y_pred, y_score=None):
    """Compute classification metrics and store them as a row of ``score_df``.

    Parameters
    ----------
    model_name : str
        Row label in ``score_df``; an existing row with the same label is
        overwritten.
    y_test : array-like
        True labels.
    y_pred : array-like
        Predicted hard labels.
    y_score : array-like, optional
        Scores or probabilities for the positive class. When given, they are
        used for the AUC. When omitted, the AUC is computed from ``y_pred``,
        which evaluates a single operating point only.

    Returns
    -------
    None
        The module-level ``score_df`` is updated in place.
    """
    auc_input = y_pred if y_score is None else y_score

    # zero_division=0 keeps the value scikit-learn already returns for an
    # undefined metric (e.g. a baseline that never predicts the positive
    # class) but without raising UndefinedMetricWarning.
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, zero_division=0)
    recall = recall_score(y_test, y_pred, zero_division=0)
    f1 = f1_score(y_test, y_pred, zero_division=0)
    auc = roc_auc_score(y_test, auc_input)

    score_df.loc[model_name, :] = accuracy, precision, recall, f1, auc

## Dummy Model

In [None]:
# Baseline that always predicts the most frequent class in the training labels.
dummy = DummyClassifier(strategy='most_frequent').fit(X_train, y_train)
y_pred = dummy.predict(X_test)

In [None]:
# Record the baseline metrics and show the row that was just added.
score_model(model_name='dummy', y_test=y_test, y_pred=y_pred)
score_df.tail(1)

## Logistic Regression

In [None]:
# Standardise the features, then fit a logistic regression, as a single pipeline
# so the scaler is fitted on the training data only.
lr_steps = [('scaler', StandardScaler()), ('clf', LogisticRegression())]
lr = Pipeline(steps=lr_steps)

lr.fit(X_train, y_train)
y_pred = lr.predict(X_test)

In [None]:
# Record the logistic-regression metrics and show the row that was just added.
score_model(model_name='lr', y_test=y_test, y_pred=y_pred)
score_df.tail(1)

In [None]:
# plot_confusion_matrix was deprecated in scikit-learn 1.0 and removed in 1.2;
# ConfusionMatrixDisplay draws the same plot and is available in all versions
# that had plot_confusion_matrix.
from sklearn.metrics import ConfusionMatrixDisplay, confusion_matrix

lr_cm = confusion_matrix(y_test, lr.predict(X_test), labels=lr.classes_)
ConfusionMatrixDisplay(lr_cm, display_labels=lr.classes_).plot(cmap='Blues')

#### Hyperparameter Tuning

In [None]:
# Inverse regularisation strengths on a log scale from 1e-3 to 1e2
# (the original list repeated 0.01 where 0.1 was intended), with and
# without class re-weighting.
param_grid = {'clf__C': [0.001, 0.01, 0.1, 1, 10, 100],
              'clf__class_weight': ['balanced', None]}

# The scaler sits inside the pipeline so it is re-fitted on each CV training fold.
pipe = Pipeline(steps=[('scaler', StandardScaler()), ('clf', LogisticRegression())])
lr_tuned = GridSearchCV(pipe, param_grid=param_grid, cv=5)

lr_tuned.fit(X_train, y_train)
y_pred = lr_tuned.predict(X_test)

print('best hyperparameters:', lr_tuned.best_params_)
score_model('lr_tuned', y_test, y_pred)
score_df.iloc[-1:]

## Random Forest

In [None]:
# Random forest with default hyperparameters and a fixed seed.
rf = RandomForestClassifier(random_state=42).fit(X_train, y_train)
y_pred = rf.predict(X_test)

score_model(model_name='rf', y_test=y_test, y_pred=y_pred)
score_df.tail(1)

#### Hyperparameter Tuning

In [None]:
# Search space for the random forest.
param_grid = {'n_estimators': [int(x) for x in np.linspace(start=100, stop=1000, num=10)],
              'max_depth': [int(x) for x in np.linspace(5, 25, num=5)],
              'min_samples_split': np.arange(2, 12, 2),  # minimum number of samples required to split a node
              'min_samples_leaf': np.arange(2, 12, 2),   # minimum number of samples required for a leaf node
              'max_features': np.arange(2, 12, 2)}

# random_state fixes which 250 candidates are sampled, so the selected model
# and its scores are the same on every run of the notebook.
rf_tuned = RandomizedSearchCV(estimator=rf, param_distributions=param_grid, cv=5,
                              n_jobs=-1, n_iter=250, random_state=42)
rf_tuned.fit(X_train, y_train)
y_pred = rf_tuned.predict(X_test)

score_model('rf_tuned', y_test, y_pred)
score_df.iloc[-1:]

### Recommendations

In [None]:
# Metrics for every model scored so far, one row per model.
score_df

Ultimate can use the model to predict which users are at risk of becoming inactive and then use that information to target them with advertisements and promotions to encourage them to use the app again. Ultimate could also send promotions or surveys to the active users to understand why they use the service.