## Reading CSV files with NumPy

In [1]:
import numpy as np

In [5]:
# Load the comma-separated file into a 2-D float ndarray.
# No rows are skipped here, so the first row of the result is all nan
# (presumably the CSV's header line, which cannot be parsed as numbers).
taxi = np.genfromtxt('nyc_taxis.csv', delimiter=',')
# Display the loaded array.
taxi

array([[      nan,       nan,       nan, ...,       nan,       nan,
              nan],
       [2.016e+03, 1.000e+00, 1.000e+00, ..., 1.165e+01, 6.999e+01,
        1.000e+00],
       [2.016e+03, 1.000e+00, 1.000e+00, ..., 8.000e+00, 5.430e+01,
        1.000e+00],
       ...,
       [2.016e+03, 6.000e+00, 3.000e+01, ..., 5.000e+00, 6.334e+01,
        1.000e+00],
       [2.016e+03, 6.000e+00, 3.000e+01, ..., 8.950e+00, 4.475e+01,
        1.000e+00],
       [2.016e+03, 6.000e+00, 3.000e+01, ..., 0.000e+00, 5.484e+01,
        2.000e+00]])

In [4]:
# (rows, columns) of the loaded array, including the leading all-nan row.
taxi_shape = taxi.shape
taxi_shape

(89561, 15)

## Reading CSV files with NumPy Continued

In [6]:
# Reload the file, skipping its first line so the all-nan row no longer appears.
taxi = np.genfromtxt('nyc_taxis.csv', delimiter=',', skip_header=1)
# The row count is one lower than in the load without skip_header.
taxi_shape = taxi.shape
taxi_shape

(89560, 15)

## Boolean Arrays

In [7]:
# Each sample array is followed by the element-wise comparison that turns it
# into a boolean array of the same length.

# Integers: True where the value is below 3.
a = np.array([1, 2, 3, 4, 5])
a_bool = a < 3

# Strings: True where the value is exactly 'blue'.
b = np.array(["blue", "blue", "red", "blue"])
b_bool = b == 'blue'

# Floats: True where the value exceeds 100.
c = np.array([80.0, 103.4, 96.9, 200.3])
c_bool = c > 100

## Boolean Indexing with 1D ndarrays

In [13]:
# Column 1 of every row (named pickup_month here).
pickup_month = taxi[:, 1]
# True for the rows whose month value equals 2.
february_bool = (pickup_month == 2)
# Keep only the flagged entries, then count them.
february = pickup_month[february_bool]
february_rides = len(february)

In [14]:
# Display the number of rows whose month value equals 2.
february_rides

13333

## Boolean Indexing with 2D ndarrays

![Jupyter](./bool_dims_updated.svg)

In [22]:
# Column 12 of every row (named tip_amount here).
tip_amount = taxi[:, 12]
# True for rows whose tip value is above 50.
tip_bool = tip_amount > 50
# Pick the flagged rows and columns 5 through 13 in a single indexing step.
top_tips = taxi[tip_bool, 5:14]

In [23]:
# (number of rows with a tip above 50, number of selected columns)
top_tips.shape

(16, 9)

## Assigning Values in ndarrays

In [24]:
# Work on an independent copy so the assignments in the following cells
# leave `taxi` untouched.
taxi_modified = taxi.copy()

In [26]:
# Overwrite a single element: row index 28214, column index 5.
taxi_modified[28214, 5] = 1

In [28]:
# Overwrite column 0 of every row with the scalar 16 (the value is broadcast).
taxi_modified[:,0] = 16

In [29]:
# Replace column 7 in rows 1800 and 1801 with the mean of the whole column.
# The mean is evaluated before the assignment takes place.
taxi_modified[1800:1802, 7] = taxi_modified[:, 7].mean()

## Assignment Using Boolean Arrays

In [32]:
# Independent copy, so `taxi` itself is not modified below.
taxi_copy = taxi.copy()
# Basic slicing returns a view: `total_amount` shares memory with column 13
# of `taxi_copy`.
total_amount = taxi_copy[:,13]
# Set every negative entry to 0; because of the view this also updates
# column 13 of `taxi_copy`.
total_amount[total_amount < 0] = 0

## Assignment Using Boolean Arrays Continued

In [46]:
# A single column of zeros with one entry per row of `taxi`.
zeros = np.zeros((len(taxi), 1))
# Append that column on the right: the result has one more column than `taxi`.
taxi_modified = np.concatenate((taxi, zeros), axis=1)
print(taxi_modified)

[[2.016e+03 1.000e+00 1.000e+00 ... 6.999e+01 1.000e+00 0.000e+00]
 [2.016e+03 1.000e+00 1.000e+00 ... 5.430e+01 1.000e+00 0.000e+00]
 [2.016e+03 1.000e+00 1.000e+00 ... 3.780e+01 2.000e+00 0.000e+00]
 ...
 [2.016e+03 6.000e+00 3.000e+01 ... 6.334e+01 1.000e+00 0.000e+00]
 [2.016e+03 6.000e+00 3.000e+01 ... 4.475e+01 1.000e+00 0.000e+00]
 [2.016e+03 6.000e+00 3.000e+01 ... 5.484e+01 2.000e+00 0.000e+00]]


In [59]:
# Boolean mask over the rows of `taxi_modified`: True where column 5 holds one
# of the codes 2, 3 or 5 (presumably the airport location codes - the challenge
# cells further down name the same codes jfk, laguardia and newark; confirm
# against the data dictionary).
# The misspelled name is kept on purpose: a later cell reads `ariport_bool`,
# and without this assignment that cell fails on a fresh kernel.
ariport_bool = np.isin(taxi_modified[:, 5], [2, 3, 5])

# Chained indexing such as `taxi_modified[ariport_bool][:, 15] = 1` would have
# no lasting effect: boolean indexing returns a copy, so the write never
# reaches `taxi_modified`.

# Number of flagged rows (each True counts as 1).
ariport_bool.sum()

63552

In [56]:
# Set column 15 to 1 for every row whose column 5 holds the code 2, 3 or 5.
# Row mask and column index are given in one indexing step, so the write
# lands in `taxi_modified` itself.
taxi_modified[np.isin(taxi_modified[:, 5], [2, 3, 5]), 15] = 1

In [57]:
# Display the shape of the 1-D boolean mask `ariport_bool`.
ariport_bool.shape

(89560,)

In [58]:
# Total of column 15; the column only holds 0s and 1s, so this is the number
# of rows that were flagged.
taxi_modified[:, 15].sum()

63552.0

## Challenge: Which is the most popular airport?

In [62]:
# True for rows whose column 6 holds the code 2.
jfk = (taxi[:, 6] == 2)
# Summing a boolean array counts its True entries.
jfk_count = jfk.sum()
jfk_count

11832

In [63]:
# True for rows whose column 6 holds the code 3.
laguardia = (taxi[:, 6] == 3)
# Summing a boolean array counts its True entries.
laguardia_count = laguardia.sum()
laguardia_count

16602

In [64]:
# True for rows whose column 6 holds the code 5.
newark = (taxi[:, 6] == 5)
# Summing a boolean array counts its True entries.
newark_count = newark.sum()
newark_count

63

## Challenge: Calculating Statistics for Trips on Clean Data

In [65]:
# Column 7 divided by (column 8 / 3600), element by element.
# Assumes column 7 is a distance in miles and column 8 a duration in seconds,
# which would make this miles per hour - TODO confirm against the data dictionary.
trip_mph = np.divide(taxi[:, 7], taxi[:, 8] / 3600)

# Keep only the rows whose computed speed is below 100. Rows where the
# division produced inf or nan fail the comparison and are dropped as well.
cleaned_taxi = taxi[np.less(trip_mph, 100)]

In [66]:
# Average of column 7 over the filtered rows.
mean_distance = cleaned_taxi[:, 7].mean()
mean_distance

12.666396599932893

In [67]:
# Average of column 8 over the filtered rows.
mean_length = cleaned_taxi[:, 8].mean()
mean_length

2239.503657309026

In [68]:
# Average of column 13 over the filtered rows.
mean_total_amount = cleaned_taxi[:, 13].mean()
mean_total_amount

48.98131853260262