In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

In [2]:
df = pd.read_csv('./datasets/Crucial_Elements.csv')
df = df.drop(df.index[0], axis=0)
cols = df.columns
df = df.rename(columns={'Unnamed: 0': 'Date_Time'})
df.head()

Unnamed: 0,Date_Time,total_amount,trip_distance,fare_amount,tolls_amount,tip_amount,Brooklyn_PU,Manhattan_PU,Queens_PU,Staten_Island_PU,Bronx_DO,Brooklyn_DO,Manhattan_DO,Queens_DO,Staten_Island_DO,Mins_In_Ride
1,2020-01-01 01:00:00,16.936198,2.564234,11.58183,0.000405,1.958635,0.043406,0.935325,0.01823,0.0,0.010851,0.064964,0.881357,0.042827,0.0,13.356097
2,2020-01-01 02:00:00,17.14878,2.690488,11.325833,0.001363,1.94059,0.053938,0.916782,0.025468,0.0,0.012166,0.083786,0.847757,0.05629,0.0,12.094755
3,2020-01-01 03:00:00,16.83025,2.791689,11.313673,0.0012,1.848462,0.043444,0.925123,0.027952,0.0,0.014369,0.096767,0.820947,0.067916,0.0,11.545077
4,2020-01-01 04:00:00,16.732711,2.921626,11.491224,0.001534,1.657045,0.045114,0.916712,0.036164,0.0,0.014429,0.109954,0.783196,0.09242,0.0,11.273529
5,2020-01-01 05:00:00,16.694981,3.021095,11.491293,0.001161,1.352062,0.044855,0.873351,0.077836,0.0,0.014952,0.112577,0.737027,0.135444,0.0,10.578918


In [3]:
date_vals = df['Date_Time']
test = np.array(date_vals.str.split('-| |:'))
year = list(map(lambda x: x[0], test))
month = list(map(lambda x: x[1], test))
day = list(map(lambda x: x[2], test))
hour = list(map(lambda x: x[3], test))
minute = list(map(lambda x: x[4], test))
second = list(map(lambda x: x[5], test))
time = pd.DataFrame({'Year': year, 'Month': month, 'Day': day,
                    'Hour': hour, 'Minute': minute, 'Second':second})
df = df.join(time)
df = df.drop('Date_Time', axis=1)
df.head()

Unnamed: 0,total_amount,trip_distance,fare_amount,tolls_amount,tip_amount,Brooklyn_PU,Manhattan_PU,Queens_PU,Staten_Island_PU,Bronx_DO,...,Manhattan_DO,Queens_DO,Staten_Island_DO,Mins_In_Ride,Year,Month,Day,Hour,Minute,Second
1,16.936198,2.564234,11.58183,0.000405,1.958635,0.043406,0.935325,0.01823,0.0,0.010851,...,0.881357,0.042827,0.0,13.356097,2020,1,1,2,0,0
2,17.14878,2.690488,11.325833,0.001363,1.94059,0.053938,0.916782,0.025468,0.0,0.012166,...,0.847757,0.05629,0.0,12.094755,2020,1,1,3,0,0
3,16.83025,2.791689,11.313673,0.0012,1.848462,0.043444,0.925123,0.027952,0.0,0.014369,...,0.820947,0.067916,0.0,11.545077,2020,1,1,4,0,0
4,16.732711,2.921626,11.491224,0.001534,1.657045,0.045114,0.916712,0.036164,0.0,0.014429,...,0.783196,0.09242,0.0,11.273529,2020,1,1,5,0,0
5,16.694981,3.021095,11.491293,0.001161,1.352062,0.044855,0.873351,0.077836,0.0,0.014952,...,0.737027,0.135444,0.0,10.578918,2020,1,1,6,0,0


# Standardizing the magnitude

In [4]:
scalar = StandardScaler()
indecies = df.index
X = df.drop('total_amount', axis=1)
y = df['total_amount']

X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.7, random_state=None)
# Makes a copy of the original Train and Test data
X_train.join(y_train).to_csv('./datasets/Train_data.csv')
X_test.join(y_test).to_csv('./datasets/Test_data.csv')

# Join X_train and y_train into one dataframe
train_index = X_train.index
test_index = X_test.index
training_data = X_train.join(y_train)
# Same with the testing data
testing_data = X_test.join(y_test)

# Fit on the training data 
scalar.fit(training_data)

# Transform on the training and testing data
training_data = scalar.transform(training_data)
testing_data = scalar.transform(testing_data)

# Turning it back into a dataframe to be able to write it into a file
training_data = pd.DataFrame(training_data, columns=df.columns, index=train_index)
testing_data = pd.DataFrame(testing_data, columns=df.columns, index=test_index)

In [5]:
training_data.to_csv('./datasets/Scaled_Train_Data.csv')
testing_data.to_csv('./datasets/Scaled_Test_Data.csv')