In [11]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

In [12]:
df = pd.read_csv('./datasets/Crucial_Elements.csv')
df = df.drop(df.index[0], axis=0)
cols = df.columns
df = df.rename(columns={'Unnamed: 0': 'Date_Time'})
df.head()

Unnamed: 0,Date_Time,avg_VendorID_hour,avg_passenger_count_hour,avg_trip_distance_hour,avg_RatecodeID_hour,avg_fare_amount_hour,avg_extra_hour,avg_mta_tax_hour,avg_tip_amount_hour,avg_tolls_amount_hour,...,avg_Queens_PU_isna_hour,avg_Bronx_PU_isna_hour,avg_Brooklyn_PU_isna_hour,avg_Staten_Island_PU_isna_hour,avg_Manhattan_DO_isna_hour,avg_Queens_DO_isna_hour,avg_Bronx_DO_isna_hour,avg_Brooklyn_DO_isna_hour,avg_Staten_Island_DO_isna_hour,avg_Mins_In_Ride_hour
1,2020-01-01 01:00:00,1.697895,1.683354,2.564234,0.99877,11.58183,1.182341,0.5,1.958635,0.000405,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,13.356097
2,2020-01-01 02:00:00,1.688215,1.647092,2.690488,0.99854,11.325833,1.195151,0.5,1.94059,0.001363,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,12.094755
3,2020-01-01 03:00:00,1.698586,1.630557,2.791689,0.999326,11.313673,1.182027,0.5,1.848462,0.0012,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,11.545077
4,2020-01-01 04:00:00,1.712146,1.637443,2.921626,0.999817,11.491224,1.137763,0.5,1.657045,0.001534,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,11.273529
5,2020-01-01 05:00:00,1.6781,1.583993,3.021095,1.0,11.491293,1.180739,0.5,1.352062,0.001161,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,10.578918


In [13]:
df.shape

(4367, 38)

In [14]:
date_vals = df['Date_Time']
test = np.array(date_vals.str.split('-| |:'))
year = list(map(lambda x: x[0], test))
month = list(map(lambda x: x[1], test))
day = list(map(lambda x: x[2], test))
hour = list(map(lambda x: x[3], test))
minute = list(map(lambda x: x[4], test))
second = list(map(lambda x: x[5], test))
time = pd.DataFrame({'Year': year, 'Month': month, 'Day': day,
                    'Hour': hour, 'Minute': minute, 'Second':second})
df = df.join(time)
df = df.drop('Date_Time', axis=1)
df.head()

Unnamed: 0,avg_VendorID_hour,avg_passenger_count_hour,avg_trip_distance_hour,avg_RatecodeID_hour,avg_fare_amount_hour,avg_extra_hour,avg_mta_tax_hour,avg_tip_amount_hour,avg_tolls_amount_hour,avg_improvement_surcharge_hour,...,avg_Bronx_DO_isna_hour,avg_Brooklyn_DO_isna_hour,avg_Staten_Island_DO_isna_hour,avg_Mins_In_Ride_hour,Year,Month,Day,Hour,Minute,Second
1,1.697895,1.683354,2.564234,0.99877,11.58183,1.182341,0.5,1.958635,0.000405,0.3,...,0.0,0.0,0.0,13.356097,2020,1,1,2,0,0
2,1.688215,1.647092,2.690488,0.99854,11.325833,1.195151,0.5,1.94059,0.001363,0.3,...,0.0,0.0,0.0,12.094755,2020,1,1,3,0,0
3,1.698586,1.630557,2.791689,0.999326,11.313673,1.182027,0.5,1.848462,0.0012,0.3,...,0.0,0.0,0.0,11.545077,2020,1,1,4,0,0
4,1.712146,1.637443,2.921626,0.999817,11.491224,1.137763,0.5,1.657045,0.001534,0.3,...,0.0,0.0,0.0,11.273529,2020,1,1,5,0,0
5,1.6781,1.583993,3.021095,1.0,11.491293,1.180739,0.5,1.352062,0.001161,0.3,...,0.0,0.0,0.0,10.578918,2020,1,1,6,0,0


# Standardizing the magnitude

In [15]:
scalar = StandardScaler()
indecies = df.index
X = df.drop('avg_total_amount_hour', axis=1)
y = df['avg_total_amount_hour']

X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.7, random_state=None)
# Makes a copy of the original Train and Test data
X_train.join(y_train).to_csv('./datasets/Train_data.csv')
X_test.join(y_test).to_csv('./datasets/Test_data.csv')

# Join X_train and y_train into one dataframe
train_index = X_train.index
test_index = X_test.index
training_data = X_train.join(y_train)
# Same with the testing data
testing_data = X_test.join(y_test)

# Fit on the training data 
scalar.fit(training_data)

# Transform on the training and testing data
training_data = scalar.transform(training_data)
testing_data = scalar.transform(testing_data)

# Turning it back into a dataframe to be able to write it into a file
training_data = pd.DataFrame(training_data, columns=df.columns, index=train_index)
testing_data = pd.DataFrame(testing_data, columns=df.columns, index=test_index)

In [16]:
training_data.to_csv('./datasets/Scaled_Train_Data.csv')
testing_data.to_csv('./datasets/Scaled_Test_Data.csv')