# Importing the required libraries

In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import Normalizer
from sklearn.linear_model import LinearRegression
import time

# Reading the data

In [2]:
# Load the training and test splits from CSV files in the working directory.
train_path, test_path = 'train.csv', 'test.csv'
train_df = pd.read_csv(train_path)
test_df = pd.read_csv(test_path)

In [3]:
# Show the training frame's dimensions as (rows, columns).
train_df.shape

(1458644, 11)

In [4]:
# Preview the first five rows of the training data.
train_df.head()

Unnamed: 0,id,vendor_id,pickup_datetime,dropoff_datetime,passenger_count,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,store_and_fwd_flag,trip_duration
0,id2875421,2,2016-03-14 17:24:55,2016-03-14 17:32:30,1,-73.982155,40.767937,-73.96463,40.765602,N,455
1,id2377394,1,2016-06-12 00:43:35,2016-06-12 00:54:38,1,-73.980415,40.738564,-73.999481,40.731152,N,663
2,id3858529,2,2016-01-19 11:35:24,2016-01-19 12:10:48,1,-73.979027,40.763939,-74.005333,40.710087,N,2124
3,id3504673,2,2016-04-06 19:32:31,2016-04-06 19:39:40,1,-74.01004,40.719971,-74.012268,40.706718,N,429
4,id2181028,2,2016-03-26 13:30:55,2016-03-26 13:38:10,1,-73.973053,40.793209,-73.972923,40.78252,N,435


# Description of the data fields in the dataset
* id - a unique identifier for each trip
* vendor_id - a code indicating the provider associated with the trip record
* pickup_datetime - date and time when the meter was engaged
* dropoff_datetime - date and time when the meter was disengaged
* passenger_count - the number of passengers in the vehicle (driver entered value)
* pickup_longitude - the longitude where the meter was engaged
* pickup_latitude - the latitude where the meter was engaged
* dropoff_longitude - the longitude where the meter was disengaged
* dropoff_latitude - the latitude where the meter was disengaged
* store_and_fwd_flag - This flag indicates whether the trip record was held in vehicle memory before sending to the vendor   
  because the vehicle did not have a connection to the server - Y=store and forward; N=not a store and forward trip
* trip_duration - duration of the trip in seconds

In [5]:
# List the column labels of the training frame.
train_df.columns

Index(['id', 'vendor_id', 'pickup_datetime', 'dropoff_datetime',
       'passenger_count', 'pickup_longitude', 'pickup_latitude',
       'dropoff_longitude', 'dropoff_latitude', 'store_and_fwd_flag',
       'trip_duration'],
      dtype='object')

In [6]:
# Show the dtype pandas inferred for each column.
train_df.dtypes

id                     object
vendor_id               int64
pickup_datetime        object
dropoff_datetime       object
passenger_count         int64
pickup_longitude      float64
pickup_latitude       float64
dropoff_longitude     float64
dropoff_latitude      float64
store_and_fwd_flag     object
trip_duration           int64
dtype: object

In [7]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
train_df.describe()

Unnamed: 0,vendor_id,passenger_count,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,trip_duration
count,1458644.0,1458644.0,1458644.0,1458644.0,1458644.0,1458644.0,1458644.0
mean,1.53495,1.66453,-73.97349,40.75092,-73.97342,40.7518,959.4923
std,0.4987772,1.314242,0.07090186,0.03288119,0.07064327,0.03589056,5237.432
min,1.0,0.0,-121.9333,34.3597,-121.9333,32.18114,1.0
25%,1.0,1.0,-73.99187,40.73735,-73.99133,40.73588,397.0
50%,2.0,1.0,-73.98174,40.7541,-73.97975,40.75452,662.0
75%,2.0,2.0,-73.96733,40.76836,-73.96301,40.76981,1075.0
max,2.0,9.0,-61.33553,51.88108,-61.33553,43.92103,3526282.0


# Removing unwanted columns from the dataframe

In [8]:
# Identifier, vendor and timestamp columns are not used as model inputs.
fields_to_drop = ['id', 'vendor_id', 'pickup_datetime', 'dropoff_datetime']
# Rebind to a new frame instead of mutating in place, and ignore labels that are
# already gone, so re-running this cell does not raise a KeyError.
train_df = train_df.drop(columns=fields_to_drop, errors='ignore')

In [9]:
# Preview the frame after the unwanted columns were removed.
train_df.head()

Unnamed: 0,passenger_count,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,store_and_fwd_flag,trip_duration
0,1,-73.982155,40.767937,-73.96463,40.765602,N,455
1,1,-73.980415,40.738564,-73.999481,40.731152,N,663
2,1,-73.979027,40.763939,-74.005333,40.710087,N,2124
3,1,-74.01004,40.719971,-74.012268,40.706718,N,429
4,1,-73.973053,40.793209,-73.972923,40.78252,N,435


In [10]:
# Column labels remaining after the drop.
train_df.columns

Index(['passenger_count', 'pickup_longitude', 'pickup_latitude',
       'dropoff_longitude', 'dropoff_latitude', 'store_and_fwd_flag',
       'trip_duration'],
      dtype='object')

In [11]:
# Encode store_and_fwd_flag numerically: 'Y' becomes 1 and 'N' becomes 0.
flag_encoding = {'Y': 1, 'N': 0}
encoded_flag = train_df['store_and_fwd_flag'].map(flag_encoding)
train_df['store_and_fwd_flag'] = encoded_flag

In [12]:
# Preview the frame with store_and_fwd_flag encoded as 0/1.
train_df.head()

Unnamed: 0,passenger_count,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,store_and_fwd_flag,trip_duration
0,1,-73.982155,40.767937,-73.96463,40.765602,0,455
1,1,-73.980415,40.738564,-73.999481,40.731152,0,663
2,1,-73.979027,40.763939,-74.005333,40.710087,0,2124
3,1,-74.01004,40.719971,-74.012268,40.706718,0,429
4,1,-73.973053,40.793209,-73.972923,40.78252,0,435


# Converting the DataFrame values into a NumPy array for use with the model

In [13]:
# Materialize the DataFrame's contents as a single NumPy array.
array = train_df.to_numpy()

In [14]:
# Dimensions of the array as (rows, columns).
array.shape

(1458644, 7)

# Separating data into features and target components

In [15]:
# The first six columns are the input features; column index 6 (trip_duration) is the target.
x_train, y_train = array[:, :6], array[:, 6]

In [16]:
# Report the shape and container type of the feature matrix and the target vector.
print(x_train.shape, y_train.shape, type(x_train), type(y_train), sep='\n')

(1458644, 6)
(1458644,)
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>


# Rescaling Data using MinMaxScaler between 0 and 1

In [17]:
# Fit a MinMaxScaler on the training features, then map every column into [0, 1].
scaler = MinMaxScaler(feature_range=(0, 1))
scaler.fit(x_train)
rescaledX = scaler.transform(x_train)

In [18]:
# Show the first nine rescaled rows.
print(rescaledX[:9])

[[0.11111111 0.79130228 0.3657382  0.79159134 0.73122178 0.        ]
 [0.11111111 0.79133098 0.36406178 0.79101622 0.72828729 0.        ]
 [0.11111111 0.7913539  0.36551003 0.79091965 0.726493   0.        ]
 [0.11111111 0.7908421  0.36300063 0.79080521 0.72620608 0.        ]
 [0.11111111 0.79145248 0.36718057 0.79145449 0.73266286 0.        ]
 [0.66666667 0.79129069 0.36426905 0.79113835 0.72982326 0.        ]
 [0.44444444 0.79151908 0.3651619  0.79171057 0.7312468  0.        ]
 [0.11111111 0.7915148  0.3674414  0.79228708 0.73079221 0.        ]
 [0.11111111 0.79101635 0.36405242 0.79124222 0.72842896 0.        ]]


# Standardizing the data using StandardScaler (0 mean, 1 stdev)

In [19]:
# Learn each column's mean and standard deviation from the rescaled training
# features and apply the standardization in a single step.
std_scaler = StandardScaler()
standardizedX = std_scaler.fit_transform(rescaledX)

# Normalizing the data using Normalizer
#### Normalizing in scikit-learn refers to rescaling each observation (row) to have a length of 1 (called a unit norm or a vector with the length of 1 in linear algebra).

In [20]:
# Scale every row of the standardized features to unit norm.
norm_scaler = Normalizer()
normalizedX = norm_scaler.fit_transform(standardizedX)

In [21]:
# Show the first five rows after row-wise normalization.
print(normalizedX[0:5])

[[-0.60121343 -0.14537114  0.61531095  0.14787723  0.45726759 -0.08854805]
 [-0.53928141 -0.10422982 -0.40082534 -0.39352085 -0.61358253 -0.07942656]
 [-0.35943615 -0.05554864  0.28143591 -0.32116939 -0.82617284 -0.05293855]
 [-0.27863134 -0.28409795 -0.51868911 -0.30306446 -0.69215779 -0.04103744]
 [-0.31072816  0.00375577  0.79033847  0.00428574  0.52600973 -0.04576473]]


# Creating linear regression model object 

In [22]:
# Linear regression estimator created with scikit-learn's default settings.
linreg = LinearRegression()

In [23]:
# Fit the linear regression on the normalized training features and report how
# long the fit took. perf_counter() is a monotonic, high-resolution clock meant
# for measuring intervals; time.time() can jump if the system clock is adjusted.
start_time = time.perf_counter()
linreg.fit(normalizedX, y_train)
print("Time taken to train the model: {0} seconds".format(time.perf_counter() - start_time))

Time taken to train the model: 3.646364212036133 seconds


# Function to perform the above tasks to prepare data for testing the model

In [24]:
def prepare_test_data(data, scaler=None, std_scaler=None, norm_scaler=None):
    """Build the normalized feature matrix for a raw test DataFrame.

    The steps mirror the training preprocessing: drop the unused columns,
    encode ``store_and_fwd_flag`` ('Y' -> 1, 'N' -> 0), take the first six
    columns as features, then apply min-max rescaling, standardization and
    row-wise normalization.

    Parameters
    ----------
    data : pandas.DataFrame
        Raw test data containing 'id', 'vendor_id', 'pickup_datetime' and
        'store_and_fwd_flag' plus the remaining feature columns. The frame is
        NOT modified, so the function can be called repeatedly on the same
        input.
    scaler : object with ``transform``, optional
        Min-max scaler already fitted on the training features. Pass the
        training-fitted instance so test features are rescaled with the same
        per-column min/max the model was trained with. If omitted, a new
        ``MinMaxScaler`` is fitted on ``data`` itself (the previous behavior),
        which scales the test set differently from the training set.
    std_scaler : object with ``transform``, optional
        Standardizer already fitted on the rescaled training features. If
        omitted, a new ``StandardScaler`` is fitted on ``data`` itself.
    norm_scaler : object with ``transform``, optional
        Row normalizer to apply. If omitted, a new ``Normalizer`` is used.

    Returns
    -------
    numpy.ndarray
        Array of shape (n_rows, 6) holding the transformed features.
    """
    fields_to_drop = ['id', 'vendor_id', 'pickup_datetime']
    # drop() and assign() both return new frames, leaving the caller's
    # DataFrame untouched (the in-place version raised KeyError on a second call).
    features = (
        data
        .drop(columns=fields_to_drop)
        .assign(store_and_fwd_flag=lambda df: df['store_and_fwd_flag'].map({'Y': 1, 'N': 0}))
    )
    X = features.values[:, 0:6]

    # Prefer the transformers fitted on the training data; fitting new ones on
    # the test data is kept only as a backward-compatible fallback.
    if scaler is None:
        rescaledX = MinMaxScaler(feature_range=(0, 1)).fit_transform(X)
    else:
        rescaledX = scaler.transform(X)

    if std_scaler is None:
        standardizedX = StandardScaler().fit_transform(rescaledX)
    else:
        standardizedX = std_scaler.transform(rescaledX)

    if norm_scaler is None:
        normalizedX = Normalizer().fit_transform(standardizedX)
    else:
        normalizedX = norm_scaler.transform(standardizedX)

    return normalizedX

In [25]:
# NOTE(review): prepare_test_data fits new MinMaxScaler and StandardScaler
# instances on test_df instead of reusing `scaler` and `std_scaler` fitted on
# x_train, so the test features are scaled with different statistics than the
# features the model was trained on.
x_test = prepare_test_data(test_df)

# Making predictions using the testing set

In [26]:
# Predict trip durations for the prepared test features.
# NOTE: despite its name, y_test holds model predictions, not ground-truth targets.
y_test = linreg.predict(x_test)

In [27]:
# Show the coefficients learned by the fitted model.
print('Coefficients: \n', linreg.coef_)

Coefficients: 
 [  79.23224744  803.86679351 -320.8432865   391.29797885 -193.50972533
  400.72556269]


In [28]:
# Number of predictions, followed by the first ten predicted values.
print(len(y_test), y_test[:10], sep='\n')

625134
[ 934.86327628 1479.10705546  960.04265103 1043.83981884  938.21584609
  673.07850427 1336.16516267 1196.11627718 1061.09054555  562.03547077]


# Preparing the submission file

In [29]:
# Load the sample submission file from the working directory.
submission_file = pd.read_csv('sample_submission.csv')

In [30]:
# Preview the sample submission.
submission_file.head()

Unnamed: 0,id,trip_duration
0,id3004672,959
1,id3505355,959
2,id1217141,959
3,id2150126,959
4,id1598245,959


In [31]:
# Overwrite the trip_duration column with the model's predictions.
# NOTE(review): assumes sample_submission.csv lists ids in the same row order as
# test.csv; the assignment is positional and does not match on id — TODO confirm.
submission_file['trip_duration'] = y_test

In [32]:
# Preview the first ten rows with the predicted durations filled in.
submission_file.head(10)

Unnamed: 0,id,trip_duration
0,id3004672,934.863276
1,id3505355,1479.107055
2,id1217141,960.042651
3,id2150126,1043.839819
4,id1598245,938.215846
5,id0668992,673.078504
6,id1765014,1336.165163
7,id0898117,1196.116277
8,id3905224,1061.090546
9,id1543102,562.035471


In [33]:
# Check the dtypes of the submission columns.
submission_file.dtypes

id                object
trip_duration    float64
dtype: object

# Saving the solution file

In [34]:
# Write the submission to CSV; index=False keeps the row index out of the file.
submission_file.to_csv('MKY_NYCTT_Solution.csv', index=False)