In [None]:
import pandas as pd
from lib.preprocessing import *
from lib.cleaning import *
import seaborn as sns

from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import LinearRegression, Ridge, Lasso, SGDRegressor   
from sklearn.neighbors import KNeighborsRegressor
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.neural_network import MLPRegressor

from sklearn.preprocessing import StandardScaler

from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_validate
# from mpl_toolkits.basemap import Basemap
# import matplotlib.pyplot as plt
# %matplotlib inline
# import missingno as msno

### Exploration

In [None]:
# Load the working frame used by the exploration cells below.
# NOTE(review): `load_sample` is not defined in this notebook, so it must come from
# one of the `lib.*` star-imports; `percentage=1` presumably means "all rows" — confirm.
df = load_sample(percentage=1)

In [None]:
# Column names as a string Series so the `.str` accessor can be used for pattern
# selection. `Index.format()` is deprecated (pandas >= 2.2); `astype(str)` is the
# documented replacement.
cols = pd.Series(df.columns.astype(str))

# Numeric measurement columns.
metrics = ["Temperature", "Precipitation in millimeters", "Distance (KM)"]  # 3 of these

# Identifier columns: any column whose name contains "Id" or "No".
ids = pd.concat([cols.loc[cols.str.match(".*Id.*")], cols.loc[cols.str.match(".*No.*")]]) # 3 of these

# Pickup / destination coordinates.
location = ["Pickup Lat", "Pickup Long", "Destination Lat", "Destination Long"] # 4 of these

# Any column whose name contains "Time".
times = cols.loc[cols.str.match(".*Time.*")] # 6(actually 4) of these

# Categorical columns.
types = ["Vehicle Type", "Platform Type", "Personal or Business", ] # 3 of these


In [None]:
# Display the column names that matched ".*Time.*".
times

In [None]:
# Display the column names that matched ".*Id.*" or ".*No.*".
ids

In [None]:
# Read the variable-definitions CSV and display it.
# The path is relative, so it is resolved against the kernel's working directory.
desc = pd.read_csv("variable_definitions.csv")
desc

In [None]:
# Histogram of the Temperature column (35 bins) on the freshly loaded frame.
sns.histplot(df["Temperature"], bins=35)

In [None]:
# Rows whose placement day-of-month is strictly smaller than the pickup day-of-month.
df.loc[df["Placement - Day of Month"].lt(df["Pickup - Day of Month"])]

### Cleaning Data

#### Nelson cleaned data

In [None]:
# Rebuild `df` from the full training CSV and run the cleaning helpers in order:
# drops -> impute_temperature -> combine_weekdays.
df = (
    load_sample(percentage=1, csv_file="data/train_full.csv")
    .pipe(drops)
    .pipe(impute_temperature)
    .pipe(combine_weekdays)
)
df.head(3)

In [None]:
# The original cell was an unfinished attribute access (`df.`), which is a SyntaxError
# and aborts "Restart & Run All". Replaced with a concrete inspection of the cleaned
# frame. NOTE(review): the intended call is unknown — adjust if something else was meant.
df.info()

In [None]:
# Histogram of Temperature (35 bins) after the cleaning helpers have been applied.
sns.histplot(df["Temperature"], bins=35)

#### Emily Cleaning

In [None]:
data = load_sample(percentage = 1, sql_db='data/cleaned_nelson.db')

# Map every spaced id column name to its underscored form and rename in one pass.
id_renames = {name: name.replace(' ', '_') for name in ['Order No', 'User Id', "Rider Id"]}
data.rename(columns=id_renames, inplace=True)

# Strip the "<Column_Name>_" prefix from each id value and store the remainder as int.
for under in id_renames.values():
    numeric_part = data[under].str.replace(f'{under}_', '')
    data[under] = numeric_part.astype(int)

save_data(data, sql_db='data/cleaned_emily.db')

#### Merging Emily w/ Nelson

In [None]:
em = load_sample(percentage=1, sql_db="data/cleaned_emily.db")

# Lower-case the three id column names.
snake_case_ids = {
    "Order_No": "order_no",
    "User_Id": "user_id",
    "Rider_Id": "rider_id",
}
em.rename(columns=snake_case_ids, inplace=True)
em.head(3)

In [None]:
# Persist the renamed frame `em` through the project helper `save_data`.
# NOTE(review): presumably writes a SQLite database at this path — confirm in lib.
save_data(em, sql_db="data/cleaned_emily_nelson.db")

#### Alex Cleaning

In [None]:
riders = pd.read_csv("data/riders.csv")

# Rename columns to short snake_case names that are easier to work with.
riders = riders.rename(columns={
    "Rider Id": "id",
    "No_Of_Orders": "orders",
    "Age": "age",
    "Average_Rating": "average_rating",
    "No_of_Ratings": "number_rating",
})

# Strip the "Rider_Id_" prefix from the "id" column.
# Two fixes compared with the previous version:
#   * the result is assigned back to the column instead of calling
#     `riders.id.replace(..., inplace=True)`; that chained in-place call operates on a
#     temporary under pandas Copy-on-Write and would leave `riders` unchanged;
#   * the prefix is replaced with an empty string, not a single space, so no
#     stray leading whitespace is stored in the ids.
riders["id"] = riders["id"].replace("Rider_Id_", "", regex=True)

riders.head(3)

In [None]:
# Persist the cleaned riders frame through the project helper `save_data`.
# NOTE(review): presumably writes a SQLite database at this path — confirm in lib.
save_data(riders, sql_db="data/cleaned_Alex.db")

#### Merging Alex w/ Emily+Nelson

In [None]:
# Reload the cleaned riders table, cast "id" to int and use it as the index.
riders_clean = (
    load_sample(percentage=1, sql_db="data/cleaned_Alex.db")
    .astype({"id": int})
    .set_index("id")
)
riders_clean.head(3)
# riders_clean.dtypes

In [None]:
# Reload the merged Emily+Nelson order data and preview the first three rows.
other_clean = load_sample(percentage=1, sql_db="data/cleaned_emily_nelson.db")
other_clean.head(3)

In [None]:
# Attach rider attributes to every order: the "rider_id" column of `other_clean` is
# matched against the index of `riders_clean`. `DataFrame.join` defaults to a left
# join, so all orders are kept; overlapping rider column names get the "_rider" suffix.
# NOTE(review): assumes rider ids are unique in `riders_clean` — duplicates would
# multiply order rows; verify.
cleaned_all = other_clean.join(riders_clean, on="rider_id", rsuffix="_rider")
cleaned_all.head(3)

In [None]:
# Persist the fully merged frame through the project helper `save_data`.
# NOTE(review): presumably writes a SQLite database at this path — confirm in lib.
save_data(cleaned_all, sql_db="data/cleaned_alex_emily_nelson.db")

### Model testing

In [None]:
df = load_sample(percentage=0.5, sql_db="data/cleaned_alex_emily_nelson.db")

# Convert each timestamp column in place using the project helper.
# NOTE(review): the helper name suggests it returns seconds — confirm in lib.
for col in ["Placement - Time", "Confirmation - Time", "Arrival at Pickup - Time", "Pickup - Time"]:
    df[col] = get_seconds_from_dt_series(df[col])

# Binary indicators (stored as 0.0 / 1.0) for business orders and platform type 4.
df["Business"] = (df['Personal or Business'] == "Business").astype(float)
df['platform_4'] = (df['Platform Type'] == 4).astype(float)

# Durations between consecutive order stages.
df["place_to_confirm"] = df["Confirmation - Time"] - df["Placement - Time"]
df["confirm_to_pick_arr"] = df["Arrival at Pickup - Time"] - df["Confirmation - Time"]
df["pick_arr_to_pick"] = df['Pickup - Time'] - df["Arrival at Pickup - Time"]

# The raw categorical columns are dropped once the indicators above exist.
df.drop(columns=["Personal or Business", "Vehicle Type", "Platform Type"], inplace=True)

# Identifiers, binary flags and the target are left unscaled.
dont_scale = ["order_no", "user_id", "rider_id", "Business", "platform_4", 'Time from Pickup to Arrival']

is_unscaled = df.columns.isin(dont_scale)
scale_cols = df.columns[~is_unscaled]

# NOTE(review): the scaler is fitted on all rows before the train/test split, so test
# rows influence the scaling statistics — consider fitting on the training rows only.
scaler = StandardScaler()
# `index=df.index` keeps the scaled rows aligned with the unscaled columns. Without it
# the new frame gets a fresh 0..n-1 index and the axis=1 concat below would misalign
# rows (introducing NaNs) whenever the loaded sample does not carry a 0..n-1 index.
X = pd.DataFrame(scaler.fit_transform(df[scale_cols]), columns=scale_cols, index=df.index)
df = pd.concat([X, df[df.columns[is_unscaled]]], axis=1)

df.head(3)

In [None]:
# List the column names of the prepared frame.
# `Index.format()` is deprecated (pandas >= 2.2); `tolist()` gives the same list of names.
df.columns.tolist()

In [None]:
# Hold out 20% of the rows for testing. A fixed `random_state` makes the split — and
# every score computed from `train` — reproducible across kernel restarts.
train, test = train_test_split(df, test_size=0.2, random_state=42)
train.info()

In [None]:
# Pairwise relationship grid over the engineered stage durations, trip distance,
# temperature, rider statistics and the Business flag (training rows only).
# Not plotted (toggled off): "Placement - Day of Month", "Placement - Weekday (Mo = 1)",
# "Placement - Time", "Confirmation - Time", "Arrival at Pickup - Time", "Pickup - Time",
# "Pickup Lat", "Pickup Long", "Destination Lat", "Destination Long",
# "Fulfillment - Weekday (Su = 0)", "Fulfillment - Day of Month".
pairplot_columns = [
    "place_to_confirm",
    "confirm_to_pick_arr",
    "pick_arr_to_pick",
    "Distance (KM)",
    "Temperature",
    "orders",
    "age",
    "average_rating",
    "number_rating",
    "Business",
]
sns.pairplot(train[pairplot_columns])

In [None]:
def assess_model(model) -> pd.DataFrame:
    """Cross-validate ``model`` on the module-level ``train`` frame.

    Every column of ``train`` except 'Time from Pickup to Arrival' is used as a
    feature; that column is the regression target.

    Parameters
    ----------
    model
        An unfitted scikit-learn style estimator accepted by ``cross_validate``.

    Returns
    -------
    pd.DataFrame
        The ``cross_validate`` result dict as a frame, one row per CV fold. It
        includes ``test_neg_mse`` and ``test_neg_mae`` (negated errors, so values
        closer to zero are better).
    """
    target = 'Time from Pickup to Arrival'
    return pd.DataFrame(cross_validate(
        model,
        train.drop(columns=target),
        # Pass the target as a 1-D Series. A one-column DataFrame makes several
        # estimators emit "a column-vector y was passed" DataConversionWarnings.
        train[target],
        scoring={"neg_mse": "neg_mean_squared_error", "neg_mae": "neg_mean_absolute_error"}
    ))

# Candidate regressors as (estimator, display name) pairs.
# Estimators with internal randomness get a fixed `random_state` so repeated runs of
# the comparison produce the same scores.
models = [
    (DecisionTreeRegressor(random_state=42), "DTree"),
    (LinearRegression(), "OLS"),
    (Lasso(), "Lasso"),
    (Ridge(), "Ridge"),
    (KNeighborsRegressor(n_neighbors=20), "20-KNN"),
    (SGDRegressor(random_state=42), "SGD"),
    (GaussianProcessRegressor(), "Gaussian"),
    (MLPRegressor(random_state=42), "Dense NN")
]

In [None]:
import warnings

# Evaluate every candidate model and print its per-fold cross-validation table.
# Warnings are silenced only inside the `with` block: `catch_warnings` restores the
# previous filter state on exit, even if a model raises. The earlier
# 'ignore' / 'default' pair overwrote that state instead of restoring it.
with warnings.catch_warnings():
    warnings.simplefilter('ignore')
    for model, name in models:
        print(name, ':')
        print(assess_model(model))
        print()