In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import time

# Single seed value: it seeds NumPy's global RNG here and is passed as random_state
# to the XGBoost and RandomForest estimators further down.
Seed = 42
np.random.seed(Seed)

# import sklearn libraries
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error
from sklearn.svm import SVR
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
import xgboost

#### Build Functions

##### Calculate Time

In [None]:
def calculate_time(s_time):
    """Print the wall-clock time elapsed since ``s_time`` as hours, minutes and seconds.

    Parameters
    ----------
    s_time : float
        Start timestamp as returned by ``time.time()``.

    Returns
    -------
    None
        The result is printed as ``Cell Executed in {h}h {m}m {s}s``.
    """
    # Work in whole seconds; clamp at zero so a clock adjustment cannot yield negative parts.
    total_seconds = max(int(round(time.time() - s_time)), 0)

    # divmod carries exactly-60 values over to the next unit (60s -> 1m 0s, 60m -> 1h 0m),
    # which the previous `> 60` subtraction loops failed to do.
    minutes, seconds = divmod(total_seconds, 60)
    hours, minutes = divmod(minutes, 60)

    print(f'Cell Executed in {hours}h {minutes}m {seconds}s')

##### Save scaled dataset 
We save the scaled data so it can be loaded directly in the future instead of re-cleaning and re-scaling the raw data.

In [None]:
def save_data(data, labels, file_name, path='./'):
    """Write ``data`` to ``{path}/{file_name}.csv`` with ``labels`` as the header row.

    Parameters
    ----------
    data : array-like
        2-D values, one row per sample.
    labels : list of str
        Column names, one per column of ``data``.
    file_name : str
        Output file name without the ``.csv`` extension.
    path : str, default './'
        Directory the file is written into.
    """
    frame = pd.DataFrame(data, columns=labels)
    destination = f'{path}/{file_name}.csv'
    # index=False: the row index is not written as an extra column.
    frame.to_csv(destination, index=False)

##### Remove Nulls

In [None]:
def remove_nulls(df, cols):
    """Fill missing values in each listed column with that column's mean.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to update; the filled columns are assigned back onto it.
    cols : iterable of str
        Names of the columns to impute.
    """
    for column in cols:
        series = df[column]
        # The mean is computed per column and skips the missing entries.
        df[column] = series.fillna(series.mean())

##### Drop Features

In [None]:
# Drop columns from a frame in place; by default the target and the key of the train data.
def drop_features(df, features=['fare_amount', 'key']):
    """Remove the columns named in ``features`` from ``df`` in place.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to modify.
    features : list of str, default ['fare_amount', 'key']
        Column names to drop; every name must exist in ``df``.
    """
    df.drop(columns=features, inplace=True)

#####  split datetime feature

In [None]:
# Expand a datetime column into separate year/month/day/hour/minute/second columns.
def split_date_time(df, feature_name = 'pickup_datetime'):
    """Add ``year``, ``month``, ``day``, ``hour``, ``minute`` and ``second`` columns to ``df``.

    The column ``feature_name`` is parsed with ``pd.to_datetime`` and each
    calendar/clock component is stored as a new column on ``df``. The source
    column itself is left untouched. Progress messages are printed.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to extend with the new columns.
    feature_name : str, default 'pickup_datetime'
        Name of the column holding the timestamps.
    """
    print('convert feature into datetime')
    timestamps = pd.to_datetime(df[feature_name])

    # (word shown in the progress message, name of the .dt accessor and of the new column)
    components = (
        ('year', 'year'),
        ('Month', 'month'),
        ('day', 'day'),
        ('hour', 'hour'),
        ('minute', 'minute'),
        ('second', 'second'),
    )

    extracted = {}
    for shown_as, attribute in components:
        print(f'get {shown_as} from feature:')
        extracted[attribute] = getattr(timestamps.dt, attribute)

    print('add these cols as new features:')
    for column, values in extracted.items():
        df[column] = values


#### Load the dataset

In [None]:
# Directory that holds the competition CSV files (path has a trailing slash so
# file names can be appended directly).
dir_path = '/kaggle/input/new-york-city-taxi-fare-prediction/'

# Load the dataset: only the first 30,000,000 rows of train.csv are read,
# test.csv is read in full.
train = pd.read_csv(f'{dir_path}train.csv', nrows=30_000_000)
test = pd.read_csv(f'{dir_path}test.csv')

In [None]:
# Names of the input columns: the raw timestamp, the four coordinates and the passenger count.
features = [
    'pickup_datetime',
    'pickup_longitude',
    'pickup_latitude',
    'dropoff_longitude',
    'dropoff_latitude',
    'passenger_count',
]

In [None]:
# Preview the first rows of the training frame as loaded.
train.head()

In [None]:
# Preview the first rows of the test frame as loaded.
test.head()

In [None]:
# Split the target from train: fare_amount is the value the models predict.
# The Series carries train's row index as it is at this point.
target = train['fare_amount']
target.head()

In [None]:
# remove key, fare_amount features (the default column list of drop_features).
# The drop happens in place, so this cell cannot be re-run on the same frame:
# the columns are already gone the second time.
drop_features(train)
train.head()

### Exploratory Data Analysis

#### Cleaning the Data

In [None]:
# detect Null values: number of missing entries in each column
train.isna().sum()

In [None]:
# Keep only rows whose pickup and dropoff coordinates pass the longitude/latitude bounds.
# Rows with a missing coordinate fail the comparison and are removed as well.
in_bounds = (
    (train['pickup_longitude'] > -75)
    & (train['pickup_latitude'] < 42)
    & (train['dropoff_longitude'] > -75)
    & (train['dropoff_latitude'] < 42)
)
train = train[in_bounds]

# The labels were extracted before this filtering; keep only the labels of the surviving
# rows so that features and target stay the same length and in the same order.
target = target.loc[train.index]

In [None]:
# Re-count missing entries per column after the coordinate filtering.
train.isna().sum()

In [None]:
# fill null values using the mean of the rest of the feature samples
# (remove_nulls assigns the filled columns back onto train)
null_features = ['dropoff_longitude', 'dropoff_latitude']
remove_nulls(train, null_features)
# Per-column count of remaining missing values (isnull is an alias of isna).
train.isnull().sum()

In [None]:
# Start the timer that calculate_time reads at the end of this cell.
s_time = time.time()
# split datetime feature into year/month/day/hour/minute/second columns on train
split_date_time(train, feature_name='pickup_datetime')

# The raw timestamp column is dropped once its parts exist as separate columns.
drop_features(train, ['pickup_datetime'])

calculate_time(s_time)

train.head()

In [None]:
# Start the timer that calculate_time reads at the end of this cell.
s_time = time.time()

# Split datetime feature in test set (uses the default column name 'pickup_datetime')
split_date_time(test)

# remove key and pickup_datetime features so test ends up with the same columns as train
drop_features(test, ['key', 'pickup_datetime'])

calculate_time(s_time)

test.head()

In [None]:
# Idempotent cleanup: drop the raw timestamp only if it is still present. The cell that splits
# the datetime already removes it, and an unconditional second drop raises KeyError on Run All.
drop_features(train, [col for col in ['pickup_datetime'] if col in train.columns])

In [None]:
# Idempotent cleanup: drop key / raw timestamp only if they are still present. The cell that
# splits the datetime already removes them, and an unconditional second drop raises KeyError.
drop_features(test, [col for col in ['key', 'pickup_datetime'] if col in test.columns])

##### Scale the features

In [None]:
s_time = time.time()

# One scaler for both frames: its statistics are learned from the training frame
# and then reused unchanged for the test frame.
# NOTE(review): the scaler is fitted on all of train before the validation split is made,
# so validation rows contribute to the scaling statistics.
sc = StandardScaler()

print('Scale Training set')
train_scaled = sc.fit_transform(train)

print('Scale Testing set')
test_scaled = sc.transform(test)

calculate_time(s_time)

# Peek at the first five scaled rows of each array.
train_scaled[:5], test_scaled[:5]


##### Save Train and Test dataset to import directly in future without recleaning

In [None]:
# Column names written as the CSV header.
# NOTE(review): this order must match the column order of train/test when they were scaled — confirm.
labels = ['pickup_longitude', 'pickup_latitude', 'dropoff_longitude', 'dropoff_latitude', 'passenger_count', 'year', 'month', 'day', 'hour', 'minute', 'second']

# Save Train data (written to ./scaled_train.csv via save_data's default path)
print('Saving Train File....')
save_data(train_scaled, labels, 'scaled_train')

print('Saving Test File....')
# save Test data (written to ./scaled_test.csv)
save_data(test_scaled, labels, 'scaled_test')

##### Hold out 20% of the training data as a validation set

In [None]:

# Select the labels by train's current index so features and labels are guaranteed to describe
# the same rows, and seed the split so the validation set is identical on every run.
x_train, x_valid, y_train, y_valid = train_test_split(
    train_scaled, target.loc[train.index], test_size=.2, random_state=Seed
)

x_train.shape, y_train.shape, x_valid.shape, y_valid.shape


#### Build the Models

##### SVM Regressor

In [None]:
s_time = time.time()

# Kernel SVR fit time grows faster than quadratically with the number of samples, so fitting
# on the full training split is impractical. Cap the rows used for this model; the split is
# already shuffled, so the leading rows are a random sample. Set a cap to None to use all rows.
SVR_MAX_TRAIN = 50_000
SVR_MAX_VALID = 50_000
x_train_svr, y_train_svr = x_train[:SVR_MAX_TRAIN], y_train.iloc[:SVR_MAX_TRAIN]
x_valid_svr, y_valid_svr = x_valid[:SVR_MAX_VALID], y_valid.iloc[:SVR_MAX_VALID]

svr_reg = SVR()
print('Fitting the model...')
svr_reg.fit(x_train_svr, y_train_svr)

print('Predicting the model...')
y_pred = svr_reg.predict(x_valid_svr)

print('Calculating RMSE....')
# Store the score under its own name so the fitted model in svr_reg is not overwritten.
# np.sqrt(MSE) is used instead of squared=False, which newer scikit-learn releases removed.
svr_rmse = np.sqrt(mean_squared_error(y_valid_svr, y_pred))

print(f'RMSE = {svr_rmse}')

calculate_time(s_time)

##### xgboost Regressor

In [None]:
s_time = time.time()

# Gradient-boosted trees with 100 boosting rounds; seeded for reproducible results.
xg_reg = xgboost.XGBRegressor(n_estimators = 100, random_state = Seed)

print('Fitting the model...')
xg_reg.fit(x_train, y_train)

print('Predicting the model...')
y_pred = xg_reg.predict(x_valid)

print('Calculating RMSE....')
# np.sqrt(MSE) is used instead of squared=False, which newer scikit-learn releases removed.
# The variable keeps its original name but holds the root mean squared error.
xg_mse = np.sqrt(mean_squared_error(y_valid, y_pred))

print(f'RMSE = {xg_mse}')

calculate_time(s_time)

In [None]:
# save predicted test data for submission
y_test_pred = xg_reg.predict(test_scaled)

# Read the submission template from the same input directory as train.csv / test.csv;
# a bare relative path only works if the file was copied into the working directory.
submit = pd.read_csv(f'{dir_path}sample_submission.csv')

# Replace the template's placeholder fares with the model's predictions.
submit['fare_amount'] = y_test_pred

submit.to_csv('submission_xgboost.csv', index=False)

##### RandomForest Regressor

In [None]:
# Restart the timer for this cell; otherwise the reported time is measured from an earlier cell.
s_time = time.time()

# 200 shallow trees (max depth 4) trained on all available cores; seeded for reproducibility.
rnd_reg = RandomForestRegressor(n_estimators=200, max_depth=4, n_jobs=-1, random_state=Seed)

print('Fitting the model...')
rnd_reg.fit(x_train, y_train)

print('Predicting the model...')
y_pred_rnd = rnd_reg.predict(x_valid)

print('Calculating RMSE....')
# np.sqrt(MSE) is used instead of squared=False, which newer scikit-learn releases removed.
# The variable keeps its original name but holds the root mean squared error.
mse = np.sqrt(mean_squared_error(y_valid, y_pred_rnd))

# Report this model's own score rather than the XGBoost score.
print(f'RMSE = {mse}')

calculate_time(s_time)

In [None]:
# save predicted test data for submission
y_test_pred = rnd_reg.predict(test_scaled)

# Read the submission template from the same input directory as train.csv / test.csv;
# a bare relative path only works if the file was copied into the working directory.
submit = pd.read_csv(f'{dir_path}sample_submission.csv')

# Replace the template's placeholder fares with the model's predictions.
submit['fare_amount'] = y_test_pred

submit.to_csv('submission_randomForest.csv', index=False)

In [None]:
# Display the test-set predictions most recently stored in y_test_pred.
y_test_pred

In [None]:
# Compare the first five random-forest validation predictions with the first five true fares.
y_pred_rnd[:5], y_valid[:5]