In [None]:
# Mount Google Drive at /content/drive so files stored there can be read by path.
# This only works inside Google Colab (the google.colab package is Colab-specific).
from google.colab import drive
drive.mount('/content/drive')

## Import Libraries

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.preprocessing import StandardScaler
import warnings
warnings.filterwarnings("ignore")

%matplotlib inline

## Loading Dataset

In [None]:
# Load the ride-share dataset from the mounted Drive folder.
# NOTE(review): the path is relative, so it presumably relies on the working
# directory being /content (where Drive is mounted) — confirm.
df = pd.read_csv("drive/MyDrive/Dataset/rideshare_kaggle.csv")
# Preview only the first rows instead of rendering the whole frame.
df.head()

In [None]:
df.shape

In [None]:
df.info()

## Data Cleaning

In [None]:
df.isnull().sum()

In [None]:
# Drop rows that have no price (price is the prediction target).
# reset_index() is called without drop=True, so the previous index is kept
# as a new column named 'index'; that column is removed in a later cell.
df = df.dropna(subset=['price']).reset_index()

In [None]:
df.isnull().sum()

In [None]:
# Remove features that have no dependency on the predictand (price): the
# distance attribute and the time parameters (hour, day, month) are already
# present, so latitude, longitude, datetime and the raw timestamp-style
# columns are dropped from the dataframe.
unused_cols = [
    'id', 'timestamp', 'datetime', 'long_summary',
    'apparentTemperatureHighTime', 'apparentTemperatureLowTime',
    'windGustTime', 'sunriseTime', 'sunsetTime', 'uvIndexTime',
    'temperatureMinTime', 'temperatureMaxTime',
    'apparentTemperatureMinTime', 'temperatureLowTime',
    'apparentTemperatureMaxTime', 'latitude', 'longitude',
]
df = df.drop(columns=unused_cols)
print(df.shape)
df.head()

In [None]:
# Collect price together with every temperature-related feature so their
# correlation with price can be inspected.
temperature_view_cols = [
    'price',
    'temperature', 'apparentTemperature',
    'temperatureHigh', 'temperatureLow',
    'apparentTemperatureHigh', 'apparentTemperatureLow',
    'temperatureMin', 'temperatureHighTime', 'temperatureMax',
    'apparentTemperatureMin', 'apparentTemperatureMax',
]
new_df = df.loc[:, temperature_view_cols]
new_df.head()

In [None]:
# Lower-triangle correlation heatmap for the temperature-related dataframe.
corr = new_df.corr()  # computed once instead of twice
# Boolean upper-triangle mask (diagonal included). Passing the correlation
# values themselves as the mask leaves exactly-zero correlations unmasked.
upper_mask = np.triu(np.ones_like(corr, dtype=bool))
plt.figure(figsize=(10,8))
sns.heatmap(corr, annot=True, mask=upper_mask)

Hapus semua fitur yang berhubungan dengan temperatur dari dataframe karena mereka memiliki korelasi yang lemah terhadap predictand (price)

In [None]:
# Drop every temperature-related feature from df; new_df keeps a copy of
# just those columns.
temperature_cols = [
    'temperature', 'apparentTemperature',
    'temperatureHigh', 'temperatureLow',
    'apparentTemperatureHigh', 'apparentTemperatureLow',
    'temperatureMin', 'temperatureHighTime', 'temperatureMax',
    'apparentTemperatureMin', 'apparentTemperatureMax',
]
new_df = df.loc[:, temperature_cols]
df = df.drop(columns=new_df.columns)
print(df.shape)
df.head()

In [None]:
# eksplorasi dan analisis data pada fitur dengan tipe categorical
# Explore the features stored with a categorical (object/category) dtype.
categorical_cols = list(df.select_dtypes(include=['object','category']).columns)
new_data = df.loc[:, categorical_cols]
new_data.head()

In [None]:
# Print the distinct values found in each categorical column.
for col, values in new_data.items():
    print(f"{col} :{values.unique()}")
    print()

In [None]:
new_data['product_id'].value_counts()

In [None]:
# Drop timezone because most of its data (>50%) contains only junk values.
# NOTE(review): product_id is dropped here as well, although the note above
# only justifies timezone — confirm which column the rationale refers to.
df = df.drop(['timezone','product_id'], axis=1)
df.head()

In [None]:
# Inspect the columns/features that have a numerical dtype.
num_cols = list(df.select_dtypes(include=['int64','float64']).columns)
new_data = df.loc[:, num_cols]
new_data.columns

In [None]:
# Analyse the correlation between price and the weather-related features.
climate_cols = [
    'price',
    'precipIntensity', 'precipProbability', 'humidity',
    'windSpeed', 'windGust', 'visibility', 'dewPoint', 'pressure',
    'windBearing', 'cloudCover', 'uvIndex', 'visibility.1',
    'ozone', 'moonPhase', 'precipIntensityMax',
]
new_data = df.loc[:, climate_cols]
new_data.head()

In [None]:
# Lower-triangle correlation heatmap for price vs. the weather features.
corr = new_data.corr()  # computed once instead of twice
# Boolean upper-triangle mask (diagonal included). Passing the correlation
# values themselves as the mask leaves exactly-zero correlations unmasked.
upper_mask = np.triu(np.ones_like(corr, dtype=bool))
plt.figure(figsize=(12,10))
sns.heatmap(corr, annot=True, mask=upper_mask)

semua fitur yang berhubungan dengan cuaca memiliki korelasi yang rendah hampir 0 terhadap kolom price. drop kolom kolom tersebut

In [None]:
# Drop every weather-related feature from the dataframe.
climate_cols = [
    'precipIntensity', 'precipProbability', 'humidity',
    'windSpeed', 'windGust', 'visibility', 'dewPoint', 'pressure',
    'windBearing', 'cloudCover', 'uvIndex', 'visibility.1',
    'ozone', 'moonPhase', 'precipIntensityMax',
]
df = df.drop(columns=climate_cols)
print(df.shape)
df.head()

In [None]:
df.shape

In [None]:
df.head()

## Exploratory Data Analysis

## Data Preprocessing

In [None]:
# Binary-encode the cab_type column: Lyft -> 0, Uber -> 1.
cab_type_codes = {'Lyft': 0, 'Uber': 1}
df['cab_type'] = df['cab_type'].replace(cab_type_codes)

In [None]:
#Encoding semua kolom bertipe cateogory dengan onehot encoder
from sklearn.preprocessing import OneHotEncoder
categorical_cols = df.select_dtypes(include=['object','category']).columns.tolist()
print(categorical_cols)

In [None]:
# One-hot encode each remaining categorical column and append the resulting
# indicator columns to the dataframe in place of the original column.
for col in categorical_cols:
    encoder = OneHotEncoder(handle_unknown='ignore')
    encoded = encoder.fit_transform(df[[col]]).toarray()
    # Reuse df's index so the column-wise concat lines up row-for-row even
    # if df does not carry a default 0..n-1 index.
    encoder_df = pd.DataFrame(encoded, index=df.index,
                              columns=encoder.get_feature_names_out([col]))
    df = pd.concat([df.drop(columns=col), encoder_df], axis=1)

In [None]:
df.columns

In [None]:
# Analyse the correlation between price and the one-hot source columns.
source_places = ['Back Bay', 'Beacon Hill', 'Boston University', 'Fenway',
                 'Financial District', 'Haymarket Square', 'North End',
                 'North Station', 'Northeastern University', 'South Station',
                 'Theatre District', 'West End']
source_cols = ['price'] + ['source_' + place for place in source_places]
new_data = df.loc[:, source_cols]
new_data.head()

In [None]:
# Lower-triangle correlation heatmap for price vs. the source columns.
corr = new_data.corr()  # computed once instead of twice
# Boolean upper-triangle mask (diagonal included). Passing the correlation
# values themselves as the mask leaves exactly-zero correlations unmasked.
upper_mask = np.triu(np.ones_like(corr, dtype=bool))
plt.figure(figsize=(12,12))
sns.heatmap(corr, annot=True, mask=upper_mask)

kolom yang berhubungan dengan source memiliki pengaruh yang cukup signifikan terhadap price. kemudian cek kolom destinasi dengan cara yang sama untuk melihat rate korelasi terhadap price

In [None]:
# Same check for the one-hot destination columns against price.
destination_places = ['Back Bay', 'Beacon Hill', 'Boston University', 'Fenway',
                      'Financial District', 'Haymarket Square', 'North End',
                      'North Station', 'Northeastern University', 'South Station',
                      'Theatre District', 'West End']
destination_cols = ['price'] + ['destination_' + place for place in destination_places]
new_data = df.loc[:, destination_cols]
new_data.head()

In [None]:
# Lower-triangle correlation heatmap for price vs. the destination columns.
corr = new_data.corr()  # computed once instead of twice
# Boolean upper-triangle mask (diagonal included). Passing the correlation
# values themselves as the mask leaves exactly-zero correlations unmasked.
upper_mask = np.triu(np.ones_like(corr, dtype=bool))
plt.figure(figsize=(12,12))
sns.heatmap(corr, annot=True, mask=upper_mask)

dari plotting heatmap diatas menunjukkan bahwa nilai korelasi dari source dan destination terhadap harga sangat rendah, jadi hapus mereka dan merestrukturisasi dataframe

In [None]:

# Drop all one-hot source and destination columns (sources first, then
# destinations, each in the same place order).
places = ['Back Bay', 'Beacon Hill', 'Boston University', 'Fenway',
          'Financial District', 'Haymarket Square', 'North End',
          'North Station', 'Northeastern University', 'South Station',
          'Theatre District', 'West End']
drop_cols = (['source_' + place for place in places]
             + ['destination_' + place for place in places])
df = df.drop(columns=drop_cols)
print(df.shape)
df.head()

In [None]:
# Check the correlation between the short_summary indicator columns and price.
# The category labels carry leading/trailing spaces, which are part of the
# column names.
summary_labels = [' Clear ', ' Drizzle ', ' Foggy ', ' Light Rain ',
                  ' Mostly Cloudy ', ' Overcast ', ' Partly Cloudy ',
                  ' Possible Drizzle ', ' Rain ']
summary_cols = ['price'] + ['short_summary_' + label for label in summary_labels]
new_data = df.loc[:, summary_cols]
new_data.head()

In [None]:
# Lower-triangle correlation heatmap for price vs. the short_summary columns.
corr = new_data.corr()  # computed once instead of twice
# Boolean upper-triangle mask (diagonal included). Passing the correlation
# values themselves as the mask leaves exactly-zero correlations unmasked.
upper_mask = np.triu(np.ones_like(corr, dtype=bool))
plt.figure(figsize=(10,10))
sns.heatmap(corr, annot=True, mask=upper_mask)

In [None]:
# Check the correlation between the icon indicator columns and price.
icon_cols= ['price','icon_ clear-day ', 'icon_ clear-night ', 'icon_ cloudy ', 'icon_ fog ',
       'icon_ partly-cloudy-day ', 'icon_ partly-cloudy-night ','icon_ rain ']
new_data = df[icon_cols]
corr = new_data.corr()  # computed once instead of twice
# Boolean upper-triangle mask (diagonal included). Passing the correlation
# values themselves as the mask leaves exactly-zero correlations unmasked.
upper_mask = np.triu(np.ones_like(corr, dtype=bool))
plt.figure(figsize=(8,8))
sns.heatmap(corr, annot=True, mask=upper_mask)

dapat dilihat bahwa kolom summary dan kolom icon tidak ada pengaruh ke price karena nilai korelasi mereka terlalu rendah (hampir 0) jadi drop kolom kolom tersebut

In [None]:
# Drop the short_summary and icon indicator columns. The labels carry
# leading/trailing spaces, which are part of the column names.
summary_labels = [' Clear ', ' Drizzle ', ' Foggy ', ' Light Rain ',
                  ' Mostly Cloudy ', ' Overcast ', ' Partly Cloudy ',
                  ' Possible Drizzle ', ' Rain ']
icon_labels = [' clear-day ', ' clear-night ', ' cloudy ', ' fog ',
               ' partly-cloudy-day ', ' partly-cloudy-night ', ' rain ']
drop_cols = (['short_summary_' + label for label in summary_labels]
             + ['icon_' + label for label in icon_labels])
df = df.drop(columns=drop_cols)
print(df.shape)
df.head()

In [None]:
df.columns

In [None]:
# Analyse the cab-name indicator columns against price.
name_cols = ['price','name_Black', 'name_Black SUV','name_Lux', 'name_Lux Black', 'name_Lux Black XL', 'name_Lyft',
       'name_Lyft XL', 'name_Shared', 'name_UberPool','name_UberX', 'name_UberXL', 'name_WAV']
new_data = df[name_cols]
corr = new_data.corr()  # computed once instead of twice
# Boolean upper-triangle mask (diagonal included). Passing the correlation
# values themselves as the mask leaves exactly-zero correlations unmasked.
upper_mask = np.triu(np.ones_like(corr, dtype=bool))
plt.figure(figsize=(10,8))
sns.heatmap(corr, annot=True, mask=upper_mask)

beberapa nilai dari kolom name memiliki pengaruh terhadap nilai price

In [None]:
df.columns

In [None]:
# Analyse the remaining columns against price.
remaining_cols = ['price','hour', 'day', 'month', 'distance', 'surge_multiplier','cab_type']
new_data = df[remaining_cols]
corr = new_data.corr()  # computed once instead of twice
# Boolean upper-triangle mask (diagonal included). Passing the correlation
# values themselves as the mask leaves exactly-zero correlations unmasked.
upper_mask = np.triu(np.ones_like(corr, dtype=bool))
plt.figure(figsize=(8,8))
sns.heatmap(corr, annot=True, mask=upper_mask)

dari hasil analisis terhadap kolom kolom tersebut bisa dilihat bahwa fitur hour, day, month memiliki korelasi yang rendah. tetapi, kolom distance dan surge_multiplier memiliki korelasi yang bagus dengan price. jadi drop kolom kolom yang memiliki korelasi yang rendah

In [None]:
# Drop the time-of-ride columns.
time_cols = ['hour', 'day', 'month']
df = df.drop(columns=time_cols)
print(df.shape)
df.head()

In [None]:
# cek nilai null pada semua fitur
df.isnull().sum()

In [None]:
# Remove the 'index' column left behind by the earlier reset_index().
# errors='ignore' keeps this cell safe to re-run once the column is gone, and
# reassigning (instead of inplace=True) avoids mutating the frame in place.
df = df.drop(columns=['index'], errors='ignore')

In [None]:
df.columns

In [None]:

# Give the one-hot cab-name columns readable product names.
cab_display_names = {
    'name_Black': 'Uber Black',
    'name_Black SUV': 'Uber Black SUV',
    'name_Lux': 'Lyft Lux',
    'name_Lux Black': 'Lyft Lux Black',
    'name_Lux Black XL': 'Lyft Lux Black XL',
    'name_Lyft': 'Lyft',
    'name_Lyft XL': 'Lyft XL',
    'name_Shared': 'Lyft Shared',
    'name_UberPool': 'Uber Pool',
    'name_UberX': 'Uber X',
    'name_UberXL': 'Uber XL',
    'name_WAV': 'Uber WAV',
}
df = df.rename(columns=cab_display_names)

In [None]:
df.columns

In [None]:
# Remove the binary cab_type column from the dataframe.
df = df.drop(columns=['cab_type'])
df.head()

In [None]:
# Target vector: the ride price.
y = df.loc[:, 'price']
y.head(3)

In [None]:
# Feature matrix: every remaining column except the target.
target_col = ['price']
X = df.drop(columns=target_col)
X.head()

## Modeling - splitting data train and data testing


In [None]:
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeClassifier

90:10

In [None]:
# 90:10 train/test split. Both models are fitted in the evaluation cells that
# follow, so the duplicate fits that used to live here (including a second
# 500-tree random forest) have been removed.
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=5)



In [None]:
print(':: LINEAR REGRESSION DATA ASLI 90:10 ::')
# Fit on the current training split and predict the held-out rows.
model_LR = LinearRegression()
model1 = model_LR.fit(x_train, y_train)
ypred1 = model1.predict(x_test)
lr_xtrain_r2 = model_LR.score(x_train, y_train)
lr_xtest_r2 = model_LR.score(x_test, y_test)
print('Linear Regression train R squared: %.4f' % lr_xtrain_r2)
print('Linear Regression test R squared: %.4f' % lr_xtest_r2)

lr_mse = mean_squared_error(y_test, ypred1)
lr_rmse = np.sqrt(lr_mse)
print('Linear Regression test RMSE: %.4f' % lr_rmse)

lr_mae = metrics.mean_absolute_error(y_test, ypred1)
# Kept for any later cell that still reads this name; note that it is
# sqrt(MAE), which is not the MAE.
lr_rootmae = np.sqrt(lr_mae)
# Report the MAE itself (previously its square root was printed under this label).
print('Linear Regression test MAE: %.4f' % lr_mae)

# "Accuracy" here means 100 minus the mean absolute percentage error.
errors = abs(ypred1 - y_test)
mape = 100 * (errors / y_test)
lr_accuracy = 100 - np.mean(mape)
print('Linear Regression Accuracy:', round(lr_accuracy, 2), '%.')

# Define x axis
x_axis = x_test.distance

# Build scatterplot
plt.scatter(x_axis, y_test, c = 'b', alpha = 0.5, marker = '.', label = 'Real')
plt.scatter(x_axis, ypred1, c = 'r', alpha = 0.5, marker = '.', label = 'Predicted')
plt.xlabel('distance')  # the x axis is the distance feature
plt.ylabel('price')
plt.title("Difference in Predicted and Real Price")
plt.grid(color = '#D3D3D3', linestyle = 'solid')
plt.legend(loc = 'lower right')
plt.show()

In [None]:
print(':: RANDOM FOREST DATA ASLI 90:10 ::')
# Fit on the current training split and predict the held-out rows.
model_RF = RandomForestRegressor(n_estimators = 500, random_state = 42)
model_RF.fit(x_train, y_train)
ypred2 = model_RF.predict(x_test)
rf_train_r2 = model_RF.score(x_train,y_train)
rf_test_r2 = model_RF.score(x_test, y_test)
print('Random Forest train R squared is: %.4f' % rf_train_r2)
print('Random Forest test R squared: %.4f' % rf_test_r2)

# Reuse the predictions computed above instead of predicting a second time.
predictions = ypred2
RF_mse = mean_squared_error(y_test, predictions)
RF_rmse = np.sqrt(RF_mse)
print('Random Forest RMSE: %.4f' % RF_rmse)

RF_mae = metrics.mean_absolute_error(y_test, ypred2)
# Kept for any later cell that still reads this name; note that it is
# sqrt(MAE), which is not the MAE.
RF_rootmae = np.sqrt(RF_mae)
# Report the MAE itself (previously its square root was printed under this label).
print('Random Forest test MAE: %.4f' % RF_mae)

# "Accuracy" here means 100 minus the mean absolute percentage error.
errors = abs(predictions - y_test)
mape = 100 * (errors / y_test)
RF_accuracy = 100 - np.mean(mape)
print('Random Forest Accuracy:', round(RF_accuracy, 2), '%.')


# Define x axis
x_axis = x_test.distance

# Build scatterplot
plt.scatter(x_axis, y_test, c = 'b', alpha = 0.5, marker = '.', label = 'Real')
plt.scatter(x_axis, predictions, c = 'r', alpha = 0.5, marker = '.', label = 'Predicted')
plt.xlabel('distance')
plt.ylabel('price')
plt.title("Difference in Predicted and Real Price")
plt.grid(color = '#D3D3D3', linestyle = 'solid')
plt.legend(loc = 'lower right')
plt.show()

80:20

In [None]:

# 80:20 train/test split. Both models are fitted in the evaluation cells that
# follow, so the duplicate fits that used to live here (including a second
# 500-tree random forest) have been removed.
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=5)



In [None]:
print(':: LINEAR REGRESSION DATA ASLI 80:20 ::')
# Fit on the current training split and predict the held-out rows.
model_LR = LinearRegression()
model1 = model_LR.fit(x_train, y_train)
ypred1 = model1.predict(x_test)
lr_xtrain_r2 = model_LR.score(x_train, y_train)
lr_xtest_r2 = model_LR.score(x_test, y_test)
print('Linear Regression train R squared: %.4f' % lr_xtrain_r2)
print('Linear Regression test R squared: %.4f' % lr_xtest_r2)

lr_mse = mean_squared_error(y_test, ypred1)
lr_rmse = np.sqrt(lr_mse)
print('Linear Regression test RMSE: %.4f' % lr_rmse)

lr_mae = metrics.mean_absolute_error(y_test, ypred1)
# Kept for any later cell that still reads this name; note that it is
# sqrt(MAE), which is not the MAE.
lr_rootmae = np.sqrt(lr_mae)
# Report the MAE itself (previously its square root was printed under this label).
print('Linear Regression test MAE: %.4f' % lr_mae)

# "Accuracy" here means 100 minus the mean absolute percentage error.
errors = abs(ypred1 - y_test)
mape = 100 * (errors / y_test)
lr_accuracy = 100 - np.mean(mape)
print('Linear Regression Accuracy:', round(lr_accuracy, 2), '%.')

# Define x axis
x_axis = x_test.distance

# Build scatterplot
plt.scatter(x_axis, y_test, c = 'b', alpha = 0.5, marker = '.', label = 'Real')
plt.scatter(x_axis, ypred1, c = 'r', alpha = 0.5, marker = '.', label = 'Predicted')
plt.xlabel('distance')  # the x axis is the distance feature
plt.ylabel('price')
plt.title("Difference in Predicted and Real Price")
plt.grid(color = '#D3D3D3', linestyle = 'solid')
plt.legend(loc = 'lower right')
plt.show()

In [None]:
print(':: RANDOM FOREST DATA ASLI 80:20 ::')
# Fit on the current training split and predict the held-out rows.
model_RF = RandomForestRegressor(n_estimators = 500, random_state = 42)
model_RF.fit(x_train, y_train)
ypred2 = model_RF.predict(x_test)
rf_train_r2 = model_RF.score(x_train,y_train)
rf_test_r2 = model_RF.score(x_test, y_test)
print('Random Forest train R squared is: %.4f' % rf_train_r2)
print('Random Forest test R squared: %.4f' % rf_test_r2)

# Reuse the predictions computed above instead of predicting a second time.
predictions = ypred2
RF_mse = mean_squared_error(y_test, predictions)
RF_rmse = np.sqrt(RF_mse)
print('Random Forest RMSE: %.4f' % RF_rmse)

RF_mae = metrics.mean_absolute_error(y_test, ypred2)
# Kept for any later cell that still reads this name; note that it is
# sqrt(MAE), which is not the MAE.
RF_rootmae = np.sqrt(RF_mae)
# Report the MAE itself (previously its square root was printed under this label).
print('Random Forest test MAE: %.4f' % RF_mae)

# "Accuracy" here means 100 minus the mean absolute percentage error.
errors = abs(predictions - y_test)
mape = 100 * (errors / y_test)
RF_accuracy = 100 - np.mean(mape)
print('Random Forest Accuracy:', round(RF_accuracy, 2), '%.')

# (A disabled feature-importance snippet that referenced the undefined names
# lyft_X and regr was removed from this cell.)

# Define x axis
x_axis = x_test.distance

# Build scatterplot
plt.scatter(x_axis, y_test, c = 'b', alpha = 0.5, marker = '.', label = 'Real')
plt.scatter(x_axis, predictions, c = 'r', alpha = 0.5, marker = '.', label = 'Predicted')
plt.xlabel('distance')
plt.ylabel('price')
plt.title("Difference in Predicted and Real Price")
plt.grid(color = '#D3D3D3', linestyle = 'solid')
plt.legend(loc = 'lower right')
plt.show()

60:40

In [None]:
# 60:40 train/test split. Both models are fitted in the evaluation cells that
# follow, so the duplicate fits that used to live here (including a second
# 500-tree random forest) have been removed.
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=5)

In [None]:
print(':: LINEAR REGRESSION DATA ASLI 60:40 ::')
# Fit on the current training split and predict the held-out rows.
model_LR = LinearRegression()
model1 = model_LR.fit(x_train, y_train)
ypred1 = model1.predict(x_test)
lr_xtrain_r2 = model_LR.score(x_train, y_train)
lr_xtest_r2 = model_LR.score(x_test, y_test)
print('Linear Regression train R squared: %.4f' % lr_xtrain_r2)
print('Linear Regression test R squared: %.4f' % lr_xtest_r2)

lr_mse = mean_squared_error(y_test, ypred1)
lr_rmse = np.sqrt(lr_mse)
print('Linear Regression test RMSE: %.4f' % lr_rmse)

lr_mae = metrics.mean_absolute_error(y_test, ypred1)
# Kept for any later cell that still reads this name; note that it is
# sqrt(MAE), which is not the MAE.
lr_rootmae = np.sqrt(lr_mae)
# Report the MAE itself (previously its square root was printed under this label).
print('Linear Regression test MAE: %.4f' % lr_mae)

# "Accuracy" here means 100 minus the mean absolute percentage error.
errors = abs(ypred1 - y_test)
mape = 100 * (errors / y_test)
lr_accuracy = 100 - np.mean(mape)
print('Linear Regression Accuracy:', round(lr_accuracy, 2), '%.')

# Define x axis
x_axis = x_test.distance

# Build scatterplot
plt.scatter(x_axis, y_test, c = 'b', alpha = 0.5, marker = '.', label = 'Real')
plt.scatter(x_axis, ypred1, c = 'r', alpha = 0.5, marker = '.', label = 'Predicted')
plt.xlabel('distance')  # the x axis is the distance feature
plt.ylabel('price')
plt.title("Difference in Predicted and Real Price")
plt.grid(color = '#D3D3D3', linestyle = 'solid')
plt.legend(loc = 'lower right')
plt.show()

In [None]:
print(':: RANDOM FOREST DATA ASLI 60:40 ::')
# Fit on the current training split and predict the held-out rows.
model_RF = RandomForestRegressor(n_estimators = 500, random_state = 42)
model_RF.fit(x_train, y_train)
ypred2 = model_RF.predict(x_test)
rf_train_r2 = model_RF.score(x_train,y_train)
rf_test_r2 = model_RF.score(x_test, y_test)
print('Random Forest train R squared is: %.4f' % rf_train_r2)
print('Random Forest test R squared: %.4f' % rf_test_r2)

# Reuse the predictions computed above instead of predicting a second time.
predictions = ypred2
RF_mse = mean_squared_error(y_test, predictions)
RF_rmse = np.sqrt(RF_mse)
print('Random Forest RMSE: %.4f' % RF_rmse)

RF_mae = metrics.mean_absolute_error(y_test, ypred2)
# Kept for any later cell that still reads this name; note that it is
# sqrt(MAE), which is not the MAE.
RF_rootmae = np.sqrt(RF_mae)
# Report the MAE itself (previously its square root was printed under this label).
print('Random Forest test MAE: %.4f' % RF_mae)

# "Accuracy" here means 100 minus the mean absolute percentage error.
errors = abs(predictions - y_test)
mape = 100 * (errors / y_test)
RF_accuracy = 100 - np.mean(mape)
print('Random Forest Accuracy:', round(RF_accuracy, 2), '%.')

# (A disabled feature-importance snippet that referenced the undefined names
# lyft_X and regr was removed from this cell.)

# Define x axis
x_axis = x_test.distance

# Build scatterplot
plt.scatter(x_axis, y_test, c = 'b', alpha = 0.5, marker = '.', label = 'Real')
plt.scatter(x_axis, predictions, c = 'r', alpha = 0.5, marker = '.', label = 'Predicted')
plt.xlabel('distance')
plt.ylabel('price')
plt.title("Difference in Predicted and Real Price")
plt.grid(color = '#D3D3D3', linestyle = 'solid')
plt.legend(loc = 'lower right')
plt.show()

In [None]:
# Side-by-side metrics for both models. These variables are overwritten by
# every split's evaluation cells, so the table reflects the split evaluated
# most recently.
model_data = {"Model": ["Linear Regression", "Random Forest",],
"Train R2": [lr_xtrain_r2, rf_train_r2],
"Test R2": [lr_xtest_r2, rf_test_r2],
"Test RMSE": [lr_rmse, RF_rmse],
# Use the MAE itself; lr_rootmae / RF_rootmae hold sqrt(MAE), not the MAE.
"Test MAE" : [lr_mae, RF_mae],
# "Accuracy" is 100 minus the mean absolute percentage error.
"Accuracy": [lr_accuracy, RF_accuracy]}

summary_table = pd.DataFrame(model_data)
display(summary_table)

## Model Evaluation

In [None]:
from sklearn.model_selection import ShuffleSplit, cross_val_score

# Five random 80/20 shuffles with a fixed seed, shared by the cross-validation
# cells below.
cross_val = ShuffleSplit(n_splits=5, test_size=0.2, random_state=5)

# Per-split scores for a freshly constructed linear regression.
cross_val_score(estimator=LinearRegression(), X=X, y=y, cv=cross_val)

In [None]:
# Seed the forest so the cross-validation scores are reproducible between runs
# (an unseeded RandomForestRegressor gives different scores on every execution).
cross_val_score(RandomForestRegressor(random_state=42), X, y, cv=cross_val)

In [None]:
def predict_price(name_cab, distance, surge_multiplier, model=None, columns=None):
    """Predict the price of a single ride for one cab product.

    Builds one feature row in which ``distance`` and ``surge_multiplier`` are
    filled in, the indicator column named ``name_cab`` is set to 1 and every
    other column is left at 0, then returns the model's prediction for it.

    Args:
        name_cab: Name of the cab indicator column, e.g. ``'Uber X'``.
        distance: Value for the ``distance`` feature.
        surge_multiplier: Value for the ``surge_multiplier`` feature.
            NOTE(review): the example cells in this notebook pass 0.0 —
            confirm that value lies inside the range seen during training.
        model: Fitted estimator exposing ``predict``. Defaults to the
            notebook-level ``model_RF``.
        columns: Feature names in model input order. Defaults to ``X.columns``.

    Returns:
        The first element of ``model.predict`` for the constructed row.

    Raises:
        ValueError: If the ``distance`` / ``surge_multiplier`` columns are
            missing, or if ``name_cab`` is not one of the other feature columns.
    """
    estimator = model_RF if model is None else model
    feature_names = pd.Index(X.columns if columns is None else columns)

    numeric_inputs = {'distance': distance, 'surge_multiplier': surge_multiplier}
    missing = [name for name in numeric_inputs if name not in feature_names]
    if missing:
        raise ValueError(f"Feature columns are missing {missing}")

    # Fail with a readable message instead of the IndexError that an empty
    # np.where(...) lookup produced for unknown names.
    if name_cab in numeric_inputs or name_cab not in feature_names:
        known = [name for name in feature_names if name not in numeric_inputs]
        raise ValueError(f"Unknown cab name {name_cab!r}; expected one of {known}")

    # A one-row DataFrame keeps the feature names, so values are placed by
    # column name rather than by assumed position, and an estimator fitted on
    # named columns receives matching names at predict time.
    row = pd.DataFrame(np.zeros((1, len(feature_names))), columns=feature_names)
    for feature, value in numeric_inputs.items():
        row.loc[0, feature] = value
    row.loc[0, name_cab] = 1

    return estimator.predict(row)[0]

In [None]:
predict_price('Lyft Lux',0.44,0.0)

In [None]:
predict_price('Lyft',0.44,0.0)

In [None]:
predict_price('Uber WAV',0.44,0.0)

In [None]:
predict_price('Lyft Shared',0.44, 0.0)

In [None]:
predict_price('Uber X',0.44, 0.0)

In [None]:
predict_price('Lyft Lux Black',0.44, 0.0)

In [None]:
predict_price('Lyft Lux Black XL',1.0, 0.0)

In [None]:
predict_price('Uber Pool',1.0, 0.0)

In [None]:
predict_price('Uber Black',1.0, 0.0)

In [None]:

predict_price('Uber Black SUV',1.5, 0.0)

In [None]:
# Save the model to a file with pickle.
# The original dumped `model`, a name that is never defined in this notebook
# (NameError). model_RF is the estimator predict_price uses.
# NOTE(review): confirm model_RF is the model intended for export.
import pickle
with open('./predict_price_model.pkl', 'wb') as model_file:
    # The context manager closes the file handle once the dump finishes.
    pickle.dump(model_RF, model_file)

In [None]:
X.columns

In [None]:
# Export the lower-cased feature names, in X's column order, to columns.json.
import json
columns = {'data_columns': [name.lower() for name in X.columns]}
with open("columns.json","w") as f:
    json.dump(columns, f)

## Kesimpulan

Dataset ini memiliki dimensi yang tinggi yaitu 693071 × 57. Dengan begitu banyaknya fitur perlu diketahui fitur mana saja yang memiliki korelasi yang cukup, hal itu sangat berguna ketika dalam proses prediksi. Dikarenakan tujuan utama dalam projek ini adalah untuk memprediksi harga maka fitur price merupakan variabel dependent yang akan menjadi predictand. Dalam proses pemilihan fitur dapat dilakukan dengan menggunakan fungsi correlation dan juga bantuan visualisasi dari heatmap plot. Setelah dilakukan analisis dengan menggunakan fungsi korelasi dan heatmap plot, dari 57 fitur kami mengambil fitur distance, surge_multiplier dan name_cab karena fitur-fitur tersebut memiliki korelasi yang cukup berpengaruh ke variabel dependent (price). Pada section EDA bisa dilihat bahwa Top 5 source-destination pada cab jenis uber dan lyft adalah sama, yaitu: Financial District-South Station (dan sebaliknya), Back Bay-North End (dan sebaliknya), West End-Fenway. Transaksi berdasarkan nama cab pada cab jenis uber dan lyft memiliki hasil yang sama tetapi beda jumlah nilai. Berdasarkan fitur short_summary, jumlah transaksi tertinggi terjadi pada hari ketika mendung dan transaksi terendah pada hari ketika mengalami gerimis. Harga tertinggi pada cab jenis uber yaitu Black SUV dan level harga terendah adalah UberPool, sedangkan pada cab jenis Lyft dengan nama Lux Black XL memiliki level harga tertinggi dengan nilai di atas 30 sedangkan level harga terendah yaitu cab jenis Lyft Shared.
Untuk membuat model prediksi, pada projek ini menggunakan algoritma Linear Regression. Proses prediksi menggunakan R2 score dengan memanfaatkan library scikit-learn untuk mempermudah proses. R2 score merupakan salah satu metode yang digunakan untuk mengukur performa evaluasi pada regression. Dari hasil prediksi di atas dapat dilihat bahwa model prediksi menghasilkan nilai sebesar 0.93 atau 93%, yang mana hasil tersebut menunjukkan nilai prediksi yang baik.