## DATA IMPORT

In [1]:
import pandas as pd
import numpy as np
from sklearn import preprocessing
from sklearn import linear_model
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsRegressor


# Display the value of every top-level expression in a cell, not only the last one.
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"
# Render floats in pandas output with thousands separators and two decimals.
pd.options.display.float_format = '{:,.2f}'.format

In [2]:
# Load the modelling dataset; the relative path is resolved against the working directory.
final_data = pd.read_csv('taxi_model.csv')

## DATA TRANSFORMATION

In [3]:
# Features are every column except 'tip_amount'; 'tip_amount' itself is the target.
X = final_data.drop(columns=['tip_amount'])
y = final_data['tip_amount']
# Hold out 25% of the rows for evaluation; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=100,
)

In [4]:
# TRAIN DATASET
# Numeric features, re-indexed 0..n-1 so they line up with the frames built below.
numerical_train = X_train.select_dtypes(include=np.number).reset_index(drop=True)
# StandardScaler centres every column on 0 and rescales it to unit variance
# (it does not change the shape of the distribution). It is fitted on the
# training rows only and reused for the test set and for new inputs.
transformer = StandardScaler().fit(numerical_train)
num_standardized = transformer.transform(numerical_train)
train_num_stand = pd.DataFrame(num_standardized, columns=numerical_train.columns)
# Categorical features. `np.object` was deprecated in NumPy 1.20 and removed in
# 1.24; the "object" dtype string selects the same columns on every version.
categorical_train = X_train.select_dtypes(include="object")
# One-hot encode the categorical columns with an encoder fitted on the training rows.
encoder = OneHotEncoder().fit(categorical_train)
train_onehot = encoder.transform(categorical_train).toarray()
# One output column per category, named after the raw category label, in the
# same order the encoder emits them.
# NOTE(review): labels are not prefixed with their source column, so two columns
# sharing a label would yield duplicate column names - confirm this cannot happen.
enc = [columname for sublist in encoder.categories_ for columname in sublist]
train_onehot = pd.DataFrame(train_onehot, columns=enc)
# Final train dataset: both frames carry a fresh 0..n-1 index, so the
# column-wise concat aligns them row by row.
X_train_preproc = pd.concat([train_num_stand, train_onehot], axis=1)

In [5]:
# TEST DATASET
numerical_test = X_test.select_dtypes(include=np.number)
# Reuse the scaler fitted on the training rows (no refit on test data), so the
# test features are centred and rescaled with the training statistics.
test_num_standardized = transformer.transform(numerical_test)
test_num_stand = pd.DataFrame(test_num_standardized, columns=numerical_test.columns)
# Categorical features. `np.object` was deprecated in NumPy 1.20 and removed in
# 1.24; the "object" dtype string selects the same columns on every version.
categorical_test = X_test.select_dtypes(include="object")
# Encode with the encoder fitted on the training rows.
test_onehot = encoder.transform(categorical_test).toarray()
# One output column per category, named after the raw category label, in the
# same order the encoder emits them.
enc = [columname for sublist in encoder.categories_ for columname in sublist]
test_onehot = pd.DataFrame(test_onehot, columns=enc)
# Final test dataset: both frames were built from arrays, so they share a
# default 0..n-1 index and the column-wise concat aligns them row by row.
X_test_preproc = pd.concat([test_num_stand, test_onehot], axis=1)

In [6]:
# Inspect the numeric test features: the column names, then the first rows.
numerical_test.columns
numerical_test.head()

Index(['passenger_count', 'trip_distance', 'extra', 'tolls_amount',
       'trip_duration', 'trip_average_speed'],
      dtype='object')

Unnamed: 0,passenger_count,trip_distance,extra,tolls_amount,trip_duration,trip_average_speed
371207,2.0,1.61,2.5,0.0,4.0,24.14
2132346,1.0,3.03,0.0,0.0,18.0,10.09
3920829,1.0,3.46,0.0,0.0,17.0,12.21
1162309,1.0,1.29,2.5,0.0,7.0,11.04
807887,1.0,2.38,0.5,0.0,9.0,15.88


In [7]:
# Inspect the categorical test features: the column names, then the first rows.
categorical_test.columns
categorical_test.head()

Index(['ratecodeID', 'day_part', 'weekday', 'passenger_load'], dtype='object')

Unnamed: 0,ratecodeID,day_part,weekday,passenger_load
371207,Standard rate,Afternoon,Friday,Small group
2132346,Standard rate,Midday,Thursday,Individual passenger
3920829,Standard rate,Midday,Thursday,Individual passenger
1162309,Standard rate,Morning,Thursday,Individual passenger
807887,Standard rate,Night,Monday,Individual passenger


## MODELING AND VALIDATION

In [8]:
# Fit a linear regression on the preprocessed training features.
LR = linear_model.LinearRegression()
LR.fit(X_train_preproc, y_train)
# Predict on the held-out features and score the predictions with R^2.
y_pred = LR.predict(X_test_preproc)
r2 = r2_score(y_true=y_test, y_pred=y_pred)
# Print the first prediction and the R^2 score, one per line.
print(y_pred[0], r2, sep="\n")

LinearRegression()

1.753662109375
0.6577712208355002


In [9]:
# First test-set prediction, rounded to two decimals.
y_pred[0].round(2)

1.75

In [10]:
# X_test_preproc.loc[0]

In [11]:
def _ask_category(prompt, allowed):
    """Prompt until the reply matches one of the labels in ``allowed``.

    Parameters
    ----------
    prompt : str
        Text shown to the user.
    allowed : iterable
        Category labels the fitted encoder knows for the column being asked.

    Returns
    -------
    The matching element of ``allowed``.
    """
    by_text = {str(label): label for label in allowed}
    while True:
        reply = input(prompt).strip()
        if reply in by_text:
            return by_text[reply]
        # An unknown label would make the encoder raise during transform, so
        # list the accepted answers and ask again instead.
        print("Please answer with one of: " + ", ".join(by_text))


def questions():
    """Ask for trip details and print the tip predicted by the fitted model.

    Reads the passenger count, trip distance, trip duration, weekday and part
    of the day from the user, builds a one-row feature frame, applies the
    notebook-level ``transformer`` (scaler) and ``encoder`` (one-hot), and
    prints the prediction of the notebook-level ``LR`` model rounded to two
    decimals. Returns ``None``.

    Raises
    ------
    ValueError
        If a numeric answer cannot be parsed as a number.
    """
    passenger = int(input("How many are you? "))
    # Distance and duration are parsed as floats so fractional answers such as
    # "2.5" are accepted; whole-number answers produce the same prediction.
    distance = float(input("Where are you going? "))
    duration = float(input("How long do you want to take? "))
    # NOTE(review): extra, tolls_amount and trip_average_speed are fixed
    # placeholder values, not derived from the answers - confirm they are intended.
    numerical_client = pd.DataFrame({"passenger_count": passenger,
                 "trip_distance": distance,
                 "extra": 0.30,
                 "tolls_amount": 0.50,
                 "trip_duration": duration,
                 "trip_average_speed": 35}, index=[0])

    # Labels the encoder learned, keyed by column. This assumes the encoder was
    # fitted on these columns in this order, which the transform below needs too.
    categorical_columns = ["ratecodeID", "day_part", "weekday", "passenger_load"]
    known_labels = dict(zip(categorical_columns, encoder.categories_))

    weekday = _ask_category("When are you going to travel? ", known_labels["weekday"])
    day_part = _ask_category("What time? ", known_labels["day_part"])
    # NOTE(review): ratecodeID and passenger_load are fixed regardless of the
    # answers (passenger_load does not follow the passenger count) - confirm.
    categorical_client = pd.DataFrame({"ratecodeID": "Standard rate",
                 "day_part": day_part,
                 "weekday": weekday,
                 "passenger_load": "Small group"}, index=[0])

    # Scale the numeric answers with the scaler fitted on the training data.
    client_num_scaled = pd.DataFrame(
        transformer.transform(numerical_client),
        columns=numerical_client.columns,
    )
    # One-hot encode with the fitted encoder; columns are named after the raw
    # category labels, in encoder order.
    onehot_names = [columname for sublist in encoder.categories_ for columname in sublist]
    client_onehot = pd.DataFrame(
        encoder.transform(categorical_client).toarray(),
        columns=onehot_names,
    )
    # Single-row feature frame: scaled numeric columns followed by one-hot columns.
    client_features = pd.concat([client_num_scaled, client_onehot], axis=1)

    predicted_tip = LR.predict(client_features)[0].round(2)
    print('Passengers have tipped an average amount $'+ str(predicted_tip)+' for this trip. Don\'t forget to tip if you are happy with the service!')

In [23]:
# Ask the trip questions interactively and print the predicted tip.
questions()

How many are you? 2
Where are you going? 5
How long do you want to take? 25
When are you going to travel? Friday
What time? Evening
Passengers have tipped an average amount $3.8 for this trip. Don't forget to tip if you are happy with the service!
