In [0]:
#Import libraries
import pandas as pd                 # package for data frame analysis
import matplotlib.pyplot as plt     # package for graphical plot
import seaborn as sns               # package for graphical plot
import numpy as np                  # package for mathematical calculation
from scipy import stats             # package for statistical calculation
import io                           # package for input/output
from google.colab import files      # package for Google drive manipulation
import geopy.distance

import warnings                     # disable warning notification (optional)
warnings.filterwarnings('ignore')
%matplotlib inline

In [0]:
# Attach Google Drive to the Colab runtime so files under /content/drive are reachable
from google.colab import drive

drive.mount(mountpoint='/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [0]:
# Load the raw training data from Drive into the DataFrame `df`
path = "/content/drive/My Drive/Data/train.csv"
df = pd.read_csv(filepath_or_buffer=path)

In [3]:
# Sanity check: preview the first ten rows to confirm the file was parsed as expected
df.head(n=10)

Unnamed: 0,key,fare_amount,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count
0,2009-06-15 17:26:21.0000001,4.5,2009-06-15 17:26:21 UTC,-73.844311,40.721319,-73.84161,40.712278,1
1,2010-01-05 16:52:16.0000002,16.9,2010-01-05 16:52:16 UTC,-74.016048,40.711303,-73.979268,40.782004,1
2,2011-08-18 00:35:00.00000049,5.7,2011-08-18 00:35:00 UTC,-73.982738,40.76127,-73.991242,40.750562,2
3,2012-04-21 04:30:42.0000001,7.7,2012-04-21 04:30:42 UTC,-73.98713,40.733143,-73.991567,40.758092,1
4,2010-03-09 07:51:00.000000135,5.3,2010-03-09 07:51:00 UTC,-73.968095,40.768008,-73.956655,40.783762,1
5,2011-01-06 09:50:45.0000002,12.1,2011-01-06 09:50:45 UTC,-74.000964,40.73163,-73.972892,40.758233,1
6,2012-11-20 20:35:00.0000001,7.5,2012-11-20 20:35:00 UTC,-73.980002,40.751662,-73.973802,40.764842,1
7,2012-01-04 17:22:00.00000081,16.5,2012-01-04 17:22:00 UTC,-73.9513,40.774138,-73.990095,40.751048,1
8,2012-12-03 13:10:00.000000125,9.0,2012-12-03 13:10:00 UTC,-74.006462,40.726713,-73.993078,40.731628,1
9,2009-09-02 01:11:00.00000083,8.9,2009-09-02 01:11:00 UTC,-73.980658,40.733873,-73.99154,40.758138,2


In [4]:
# Dataset dimensions as (rows, columns)
df.shape

(55423856, 8)

In [6]:
# Total DataFrame footprint in KiB (bytes / 1024), index included.
# deep=False: the contents of object-dtype columns are not introspected.
df.memory_usage(index=True, deep=False).sum() / 1024

3463991.125

In [0]:
#Check the data type pandas inferred for each column
df.dtypes

key                   object
fare_amount          float64
pickup_datetime       object
pickup_longitude     float64
pickup_latitude      float64
dropoff_longitude    float64
dropoff_latitude     float64
passenger_count        int64
dtype: object

In [0]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
# Watch the min/max rows: negative fares, coordinates in the thousands and a
# passenger count of 208 point to invalid records.
df.describe()

Unnamed: 0,fare_amount,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count
count,55423860.0,55423860.0,55423860.0,55423480.0,55423480.0,55423860.0
mean,11.34505,-72.50968,39.91979,-72.51121,39.92068,1.68538
std,20.71083,12.84888,9.642353,12.7822,9.633346,1.327664
min,-300.0,-3442.06,-3492.264,-3442.025,-3547.887,0.0
25%,6.0,-73.99207,40.73493,-73.9914,40.73403,1.0
50%,8.5,-73.9818,40.75265,-73.98015,40.75316,1.0
75%,12.5,-73.96708,40.76713,-73.96367,40.7681,2.0
max,93963.36,3457.626,3408.79,3457.622,3537.133,208.0


In [0]:
# Count the missing values in each column
df.isna().sum()

key                    0
fare_amount            0
pickup_datetime        0
pickup_longitude       0
pickup_latitude        0
dropoff_longitude    376
dropoff_latitude     376
passenger_count        0
dtype: int64

In [0]:
#Remove records with missing drop-off coordinates and the 'pickup_datetime' column
# (in the previewed rows 'key' carries the same timestamp).
# - dropna() checks BOTH drop-off columns, so a row missing only its latitude
#   cannot slip through.
# - errors='ignore' keeps the cell safe to re-run once the column is already gone.
df = (
    df.dropna(subset=['dropoff_longitude', 'dropoff_latitude'])
      .drop(columns='pickup_datetime', errors='ignore')
)

In [0]:
# Re-check: per-column count of missing values after the clean-up
df.isna().sum()

key                  0
fare_amount          0
pickup_datetime      0
pickup_longitude     0
pickup_latitude      0
dropoff_longitude    0
dropoff_latitude     0
passenger_count      0
dtype: int64

In [0]:
# Function for calculating the distance
def Haversine(lat1,lon1,lat2,lon2, **kwarg):
  """Great-circle distance between two points using the haversine formula.

  Parameters
  ----------
  lat1, lon1 : latitude and longitude of the first point, in decimal degrees.
  lat2, lon2 : latitude and longitude of the second point, in decimal degrees.
      Each argument may be a scalar or an array-like (list, ndarray, Series);
      array-likes are combined element-wise.
  **kwarg : accepted for call compatibility and ignored.

  Returns
  -------
  The distance in kilometres, rounded to 6 decimal places. Scalar inputs give
  a scalar; array-like inputs give an array-like of the same length.
  """
  R = 6371.0088  # mean Earth radius in km
  lat1,lon1,lat2,lon2 = map(np.radians, [lat1,lon1,lat2,lon2])

  dlat = lat2 - lat1
  dlon = lon2 - lon1
  a = np.sin(dlat/2)**2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlon/2) **2
  # Rounding error can push `a` marginally above 1 for (near-)antipodal points,
  # which would make sqrt(1 - a) NaN; cap it at 1. NaN inputs still propagate.
  a = np.minimum(a, 1.0)
  c = 2 * np.arctan2(np.sqrt(a), np.sqrt(1 - a))
  d = R * c
  # np.round behaves the same for scalars, ndarrays and Series; the builtin
  # round() depends on the operand implementing __round__.
  return np.round(d, 6)

In [0]:
# Add the 'distance' feature: haversine distance from pickup to drop-off for every trip
df['distance'] = Haversine(
    lat1=df['pickup_latitude'],
    lon1=df['pickup_longitude'],
    lat2=df['dropoff_latitude'],
    lon2=df['dropoff_longitude'],
)

In [0]:
# Preview the first five rows
df.head(n=5)

Unnamed: 0,key,fare_amount,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count,hour,year,distance
0,2009-06-15 17:26:21.000000100,4.5,-73.844311,40.721319,-73.84161,40.712278,1,17,2009,1.030765
1,2010-01-05 16:52:16.000000200,16.9,-74.016048,40.711303,-73.979268,40.782004,1,16,2010,8.450145
2,2011-08-18 00:35:00.000000490,5.7,-73.982738,40.76127,-73.991242,40.750562,2,0,2011,1.389527
3,2012-04-21 04:30:42.000000100,7.7,-73.98713,40.733143,-73.991567,40.758092,1,4,2012,2.799274
4,2010-03-09 07:51:00.000000135,5.3,-73.968095,40.768008,-73.956655,40.783762,1,7,2010,1.99916


In [0]:
# Parse the 'key' strings into pandas timestamps so date parts can be extracted from them
# NOTE(review): `datetime` is not used in this cell - confirm whether later cells need it
from datetime import datetime

df['key'] = pd.to_datetime(arg=df['key'])
df['key'].head(n=5)

0   2009-06-15 17:26:21.000000100
1   2010-01-05 16:52:16.000000200
2   2011-08-18 00:35:00.000000490
3   2012-04-21 04:30:42.000000100
4   2010-03-09 07:51:00.000000135
Name: key, dtype: datetime64[ns]

In [0]:
# Extract the hour and the year into new features.
# Build the DatetimeIndex once and reuse it for both features instead of
# constructing it twice over the full column.
key_index = pd.DatetimeIndex(df['key'])
df['hour'] = key_index.hour
df['year'] = key_index.year
df.head()

Unnamed: 0,key,fare_amount,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count,hour,year
0,2009-06-15 17:26:21.000000100,4.5,-73.844311,40.721319,-73.84161,40.712278,1,17,2009
1,2010-01-05 16:52:16.000000200,16.9,-74.016048,40.711303,-73.979268,40.782004,1,16,2010
2,2011-08-18 00:35:00.000000490,5.7,-73.982738,40.76127,-73.991242,40.750562,2,0,2011
3,2012-04-21 04:30:42.000000100,7.7,-73.98713,40.733143,-73.991567,40.758092,1,4,2012
4,2010-03-09 07:51:00.000000135,5.3,-73.968095,40.768008,-73.956655,40.783762,1,7,2010


In [0]:
# Persist the processed DataFrame to Drive as a new CSV file
df.to_csv(path_or_buf="/content/drive/My Drive/Data/trainmod2.csv")

# New Section