# Uber Data Analysis

## Loading the Data

In [1]:
#importing relevant libraries
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt 

In [2]:
# Load the raw trip log from the CSV file.
# The previous `parse_dates=True` only asks pandas to parse the *index*; with no
# `index_col` it had no effect (START_DATE / END_DATE still load as strings), so
# the misleading argument is dropped. The date columns are converted explicitly
# in the "Converting data types" cell further down.
uber_data_df = pd.read_csv("UberDataset.csv")
# Work on a copy so the raw frame stays available, unmodified, for reference
uber_df = uber_data_df.copy()
uber_df.head()

Unnamed: 0,START_DATE,END_DATE,CATEGORY,START,STOP,MILES,PURPOSE
0,01-01-2016 21:11,01-01-2016 21:17,Business,Fort Pierce,Fort Pierce,5.1,Meal/Entertain
1,01-02-2016 01:25,01-02-2016 01:37,Business,Fort Pierce,Fort Pierce,5.0,
2,01-02-2016 20:25,01-02-2016 20:38,Business,Fort Pierce,Fort Pierce,4.8,Errand/Supplies
3,01-05-2016 17:31,01-05-2016 17:45,Business,Fort Pierce,Fort Pierce,4.7,Meeting
4,01-06-2016 14:42,01-06-2016 15:49,Business,Fort Pierce,West Palm Beach,63.7,Customer Visit


## Understanding the Data

In [3]:
# Determining the size of our dataset as a (rows, columns) tuple
uber_df.shape

(1155, 7)

In [4]:
# Previewing the bottom of our dataset (tail() shows the last five rows by default)
uber_df.tail()

Unnamed: 0,START_DATE,END_DATE,CATEGORY,START,STOP,MILES,PURPOSE
1150,12/31/2016 1:07,12/31/2016 1:14,Business,Kar?chi,Kar?chi,0.7,Meeting
1151,12/31/2016 13:24,12/31/2016 13:42,Business,Kar?chi,Unknown Location,3.9,Temporary Site
1152,12/31/2016 15:03,12/31/2016 15:38,Business,Unknown Location,Unknown Location,16.2,Meeting
1153,12/31/2016 21:32,12/31/2016 21:50,Business,Katunayake,Gampaha,6.4,Temporary Site
1154,12/31/2016 22:08,12/31/2016 23:51,Business,Gampaha,Ilukwatta,48.2,Temporary Site


In [5]:
# Exploring the descriptive statistics of the variables.
# include='all' adds the non-numeric columns (count / unique / top / freq)
# alongside the numeric summary (mean, std, quartiles) for MILES.
uber_df.describe(include='all')

Unnamed: 0,START_DATE,END_DATE,CATEGORY,START,STOP,MILES,PURPOSE
count,1155,1155,1155,1155,1155,1155.0,653
unique,1154,1154,2,177,188,,10
top,6/28/2016 23:34,6/28/2016 23:59,Business,Cary,Cary,,Meeting
freq,2,2,1078,201,203,,187
mean,,,,,,10.56684,
std,,,,,,21.579106,
min,,,,,,0.5,
25%,,,,,,2.9,
50%,,,,,,6.0,
75%,,,,,,10.4,


In [6]:
# Checking column dtypes, non-null counts and memory usage of the dataset
uber_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1155 entries, 0 to 1154
Data columns (total 7 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   START_DATE  1155 non-null   object 
 1   END_DATE    1155 non-null   object 
 2   CATEGORY    1155 non-null   object 
 3   START       1155 non-null   object 
 4   STOP        1155 non-null   object 
 5   MILES       1155 non-null   float64
 6   PURPOSE     653 non-null    object 
dtypes: float64(1), object(6)
memory usage: 63.3+ KB


In [7]:
# Converting data types
# The file mixes two timestamp layouts ("01-01-2016 21:11" and "12/31/2016 1:07"),
# both month-first. Parsing without an explicit format relies on per-row guessing
# on older pandas, while pandas 2.x infers one format from the first value and
# rejects rows that do not match it. Normalising the separator lets one explicit
# format cover both layouts, and any unexpected layout now fails loudly instead
# of being guessed.
# Parsing from the raw frame (uber_data_df) keeps this cell safe to re-run.
TRIP_TIMESTAMP_FORMAT = "%m/%d/%Y %H:%M"
for date_col in ("START_DATE", "END_DATE"):
    normalised = uber_data_df[date_col].str.replace("-", "/", regex=False)
    uber_df[date_col] = pd.to_datetime(normalised, format=TRIP_TIMESTAMP_FORMAT)

In [8]:
# Re-checking dtypes to confirm START_DATE and END_DATE are now datetime columns
uber_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1155 entries, 0 to 1154
Data columns (total 7 columns):
 #   Column      Non-Null Count  Dtype         
---  ------      --------------  -----         
 0   START_DATE  1155 non-null   datetime64[ns]
 1   END_DATE    1155 non-null   datetime64[ns]
 2   CATEGORY    1155 non-null   object        
 3   START       1155 non-null   object        
 4   STOP        1155 non-null   object        
 5   MILES       1155 non-null   float64       
 6   PURPOSE     653 non-null    object        
dtypes: datetime64[ns](2), float64(1), object(4)
memory usage: 63.3+ KB


In [9]:
# List every row that is an exact repeat of an earlier row
# (the first occurrence of each repeated row is not flagged)
uber_df.loc[uber_df.duplicated(keep="first")]

Unnamed: 0,START_DATE,END_DATE,CATEGORY,START,STOP,MILES,PURPOSE
492,2016-06-28 23:34:00,2016-06-28 23:59:00,Business,Durham,Cary,9.9,Meeting


In [10]:
# Remove fully duplicated trips, keeping the first occurrence of each
uber_df = uber_df.drop_duplicates()

In [11]:
# Count the missing values in each column
uber_df.isna().sum()

START_DATE      0
END_DATE        0
CATEGORY        0
START           0
STOP            0
MILES           0
PURPOSE       502
dtype: int64

In [12]:
# Label trips that have no recorded purpose as "Missing" rather than losing them,
# then drop any row that still contains a null in another column.
uber_df = (
    uber_df
    .assign(PURPOSE=uber_df["PURPOSE"].fillna("Missing"))
    .dropna()
)

In [13]:
# Re-checking null counts per column after handling the missing PURPOSE values
uber_df.isnull().sum()

START_DATE    0
END_DATE      0
CATEGORY      0
START         0
STOP          0
MILES         0
PURPOSE       0
dtype: int64

## Feature Engineering

In [14]:
# creating new features
# Calendar parts of the trip start time
uber_df["YEAR"] = uber_df['START_DATE'].dt.year
uber_df["MONTH"] = uber_df['START_DATE'].dt.month_name()
uber_df["DAY"] = uber_df['START_DATE'].dt.day_name()
# Trip length in minutes, as a float.
# `.astype('timedelta64[m]')` is only accepted by pandas < 2.0 (newer versions
# support second-or-finer resolutions only), so the minutes are derived from the
# total seconds instead. The timestamps have minute resolution, so the values
# are the same whole-minute floats as before.
uber_df["DURATION"] = (uber_df['END_DATE'] - uber_df['START_DATE']).dt.total_seconds() / 60

# Bucket the start hour into four 6-hour periods. right=False makes each bin
# closed on the left and open on the right, i.e.
# 0-5 -> Night, 6-11 -> Morning, 12-17 -> Afternoon, 18-23 -> Evening.
time_periods = [0,6,12,18,24]
labels = ['Night', 'Morning', 'Afternoon', 'Evening']
uber_df['TimeOfDay'] = pd.cut(uber_df['START_DATE'].dt.hour, bins = time_periods, labels = labels, right = False)


In [15]:
# Previewing the first rows to check the newly added feature columns
uber_df.head()

Unnamed: 0,START_DATE,END_DATE,CATEGORY,START,STOP,MILES,PURPOSE,YEAR,MONTH,DAY,DURATION,TimeOfDay
0,2016-01-01 21:11:00,2016-01-01 21:17:00,Business,Fort Pierce,Fort Pierce,5.1,Meal/Entertain,2016,January,Friday,6.0,Evening
1,2016-01-02 01:25:00,2016-01-02 01:37:00,Business,Fort Pierce,Fort Pierce,5.0,Missing,2016,January,Saturday,12.0,Night
2,2016-01-02 20:25:00,2016-01-02 20:38:00,Business,Fort Pierce,Fort Pierce,4.8,Errand/Supplies,2016,January,Saturday,13.0,Evening
3,2016-01-05 17:31:00,2016-01-05 17:45:00,Business,Fort Pierce,Fort Pierce,4.7,Meeting,2016,January,Tuesday,14.0,Afternoon
4,2016-01-06 14:42:00,2016-01-06 15:49:00,Business,Fort Pierce,West Palm Beach,63.7,Customer Visit,2016,January,Wednesday,67.0,Afternoon
