# An introduction to using NumPy for fast data manipulation

This dataset contains around 90,000 yellow taxi trips taken to and from New York City airports between January and June 2016. (The `nyc_taxis.csv` file loaded in this notebook has 2,013 rows, as the shape check below shows.)
 


In [1]:
# importing needed libraries 
import csv
import numpy as np
#import pandas as pd

In [2]:
# Read the whole CSV into memory. The context manager closes the file handle
# as soon as the rows have been materialised, so no descriptor is leaked.
# newline="" is what the csv module recommends for files passed to csv.reader.
with open("nyc_taxis.csv", "r", newline="") as o_file:
    r_file = csv.reader(o_file)
    dataset = list(r_file)  # every row is a list of strings
dataset_f = dataset # full table, header row included
dataset_h = dataset[0] # grab header
dataset = dataset[1:] # remove header

In [3]:
# How is the data labeled? Show the header row (the first CSV row, held in dataset_h).
print(dataset_h)

['pickup_year', 'pickup_month', 'pickup_day', 'pickup_dayofweek', 'pickup_time', 'pickup_location_code', 'dropoff_location_code', 'trip_distance', 'trip_length', 'fare_amount', 'fees_amount', 'tolls_amount', 'tip_amount', 'total_amount', 'payment_type']


## Column Meanings

pickup_year: the year of the trip

pickup_month: the month of the trip (January is 1, December is 12)

pickup_day: the day of the month of the trip

pickup_location_code: the airport or borough where the trip started

dropoff_location_code: the airport or borough where the trip ended

trip_distance: the distance of the trip in miles

trip_length: the length of the trip in seconds

fare_amount: the base fare of the trip, in dollars

total_amount: the total amount charged to the passenger, including all 
fees, tolls and tips


## Changing all values to floats

`csv.reader` returns every value as a string. Converting them all to floats makes the data easier to clean and manipulate.

In [4]:
# Build a parallel table in which every CSV string has been parsed as a float:
# the outer comprehension walks the rows, the inner one converts each cell.
dataset_floats = [
    [float(value) for value in row]
    for row in dataset
]

## Using NumPy to convert lists to ndarrays

In [5]:
# Wrap the float rows in an ndarray. dtype=object was chosen by the author
# for nicer printed output (each cell stays a Python float).
# NOTE(review): object arrays hold Python objects rather than native float64
# values, so NumPy's fast numeric paths are not used — confirm this is intended.
taxi_array = np.array(dataset_floats, dtype=object)

# peek at the first record
print("First Row:\n", taxi_array[0], "\n")

# report the dimensions (2013 rows, 15 columns for this file)
taxi_shape = taxi_array.shape
print("This Array has {} rows and {} columns".format(*taxi_shape[:2]))


First Row:
 [2016.0 1.0 1.0 5.0 0.0 2.0 4.0 21.0 2037.0 52.0 0.8 5.54 11.65 69.99 1.0] 

This Array has 2013 rows and 15 columns


## What kind of information can we gather from this dataset?


1. We can find the average travel speed from the trip distance and the trip length using vectorized math

In [6]:
# speed = distance / time

distance_col = 7  # header position of 'trip_distance' (miles)
length_col = 8    # header position of 'trip_length' (seconds)
seconds_per_hour = 3600

# show which columns are involved, plus the first trip as a worked example
first_trip = taxi_array[0]
print("Cols:", dataset_h[distance_col], dataset_h[length_col], "\n")
print("Trip Distance in miles: ", first_trip[distance_col], "\n")
print("Trip Length in seconds: ", first_trip[length_col], "\n")

# element-wise conversion of every trip length from seconds to hours
trip_length_hours = taxi_array[:, length_col] / seconds_per_hour

# element-wise miles / hours gives one mph value per trip
taxi_mph = taxi_array[:, distance_col] / trip_length_hours

# check output
print(taxi_mph)

Cols: trip_distance trip_length 

Trip Distance in miles:  21.0 

Trip Length in seconds:  2037.0 

[37.11340206185567 38.58157894736842 31.27222982216142 ...
 22.299078667611624 42.41551246537396 36.904734073641144]


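## Using ndarray.min, .max and .mean
Now that we have each trip's speed in mph, we can gather summary statistics such as the maximum, minimum, and mean speed. Note that the fastest value reported below is far beyond any plausible taxi speed, which points to erroneous records in the raw data; treat these statistics as a data-quality check rather than as real speeds.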

In [24]:
# Summary statistics over all per-trip speeds, computed by the ndarray methods.
taxi_min, taxi_max, taxi_mean = taxi_mph.min(), taxi_mph.max(), taxi_mph.mean()

# The bare string expression is the cell's displayed result.
"The slowest taxi ride was {} mph, the fastest was {} mph, and the average mph of a trip was {}".format(
    taxi_min,
    taxi_max,
    taxi_mean,
)

'The slowest taxi ride was 0.0 mph, the fastest was 82800.0 mph, and the average mph of a trip was 169.98315083655177'