Boolean Indexing with NumPy

In [1]:
import numpy as np

In [4]:
#read the csv file into a NumPy array; no rows are skipped here, so the first line of the file is parsed as data too
taxi = np.genfromtxt("nyc_taxis.csv", delimiter=",")
#remember the dimensions of the array that was just loaded
taxi_shape = taxi.shape

In [5]:
#read the csv file into NumPy again, this time skipping the 1st row
taxi = np.genfromtxt("nyc_taxis.csv", skip_header=1, delimiter=",")

In [6]:
#small sample arrays that the comparisons below turn into boolean arrays
a = np.arange(1, 6)
b = np.array(["blue", "blue", "red", "blue"])
c = np.array([80.0, 103.4, 96.9, 200.3], dtype=float)

In [7]:
a

array([1, 2, 3, 4, 5])

In [8]:
b

array(['blue', 'blue', 'red', 'blue'],
      dtype='<U4')

In [9]:
c

array([  80. ,  103.4,   96.9,  200.3])

In [10]:
#the comparison itself already returns a boolean ndarray: True where a is below 3
a_bool = a < 3

In [11]:
a_bool

array([ True,  True, False, False, False], dtype=bool)

In [12]:
#the comparison itself already returns a boolean ndarray: True where b is exactly "blue"
b_bool = b == "blue"

In [13]:
b_bool

array([ True,  True, False,  True], dtype=bool)

In [14]:
#the comparison itself already returns a boolean ndarray: True where c is above 100
c_bool = c > 100

In [15]:
c_bool

array([False,  True, False,  True], dtype=bool)

In [17]:
#boolean indexing by using [] brackets
#start from column 1 of every row in taxi, kept under the name pickup_month
pickup_month = taxi[:, 1]


In [18]:
#True for each entry of pickup_month that equals 1
january_bool = (pickup_month == 1)

In [19]:
january_bool

array([ True,  True,  True, ..., False, False, False], dtype=bool)

In [20]:
#keep only the entries of pickup_month where january_bool is True
january = pickup_month[january_bool]

In [21]:
january

array([ 1.,  1.,  1., ...,  1.,  1.,  1.])

In [22]:
#how many entries passed the january filter
january_rides = len(january)

In [23]:
january_rides

13481

In [24]:
#True for each entry of pickup_month that equals 2
february_bool = (pickup_month == 2)

In [25]:
february_bool

array([False, False, False, ..., False, False, False], dtype=bool)

In [26]:
#keep only the entries of pickup_month where february_bool is True
february = pickup_month[february_bool]

In [27]:
february

array([ 2.,  2.,  2., ...,  2.,  2.,  2.])

In [28]:
#how many entries passed the february filter
february_rides = len(february)

In [29]:
february_rides

13333

In [30]:
#calculate the average speed: column 7 divided by (column 8 / 3600)
#presumably distance in miles over a duration in seconds converted to hours -- confirm the units against the dataset
trip_mph = taxi[:, 7] / (taxi[:, 8] / 3600)

In [31]:
trip_mph

array([ 37.11340206,  38.58157895,  31.27222982, ...,  22.29907867,
        42.41551247,  36.90473407])

In [32]:
#True for each trip whose computed speed is above 20000
trip_mph_bool = (trip_mph > 20000)

In [33]:
trip_mph_bool

array([False, False, False, ..., False, False, False], dtype=bool)

In [36]:
#for the rows flagged in trip_mph_bool, keep columns 5 through 8 (the 5:9 slice stops before column 9)
trips_over_20000_mph = taxi[trip_mph_bool, 5:9]

In [37]:
print(trips_over_20000_mph)

[[  2.    2.   23.    1. ]
 [  2.    2.   19.6   1. ]
 [  2.    2.   16.7   2. ]
 [  3.    3.   17.8   2. ]
 [  2.    2.   17.2   2. ]
 [  3.    3.   16.9   3. ]
 [  2.    2.   27.1   4. ]]


In [38]:
#tip amount: column 12 of every row in taxi
tip_amount = taxi[:, 12]

In [39]:
#True for each entry of tip_amount that is above 50
tip_bool = (tip_amount > 50)

In [40]:
tip_bool

array([False, False, False, ..., False, False, False], dtype=bool)

In [41]:
#for the rows flagged in tip_bool, keep columns 5 through 13 (the 5:14 slice stops before column 14)
top_tips = taxi[tip_bool, 5:14]

In [42]:
top_tips

array([[  4.00000000e+00,   2.00000000e+00,   2.14500000e+01,
          2.00400000e+03,   5.20000000e+01,   8.00000000e-01,
          0.00000000e+00,   5.28000000e+01,   1.05600000e+02],
       [  3.00000000e+00,   4.00000000e+00,   9.20000000e+00,
          1.04100000e+03,   2.70000000e+01,   1.30000000e+00,
          5.54000000e+00,   6.00000000e+01,   9.38400000e+01],
       [  2.00000000e+00,   0.00000000e+00,   1.98000000e+01,
          1.67100000e+03,   5.25000000e+01,   1.30000000e+00,
          5.54000000e+00,   5.93400000e+01,   1.18680000e+02],
       [  4.00000000e+00,   2.00000000e+00,   1.84200000e+01,
          2.96800000e+03,   5.20000000e+01,   8.00000000e-01,
          5.54000000e+00,   8.00000000e+01,   1.38340000e+02],
       [  3.00000000e+00,   6.00000000e+00,   4.90000000e-01,
          1.58000000e+02,   3.50000000e+00,   1.80000000e+00,
          0.00000000e+00,   7.00000000e+01,   7.53000000e+01],
       [  2.00000000e+00,   2.00000000e+00,   2.70000000e+00,
   

In [43]:
#pseudocode: ndarray[location_of_values] = new_values
a = np.array(["red", "blue", "black", "blue", "purple"])

In [44]:
#replace the first element
a[0] = 'orange'

In [45]:
print(a)

['orange' 'blue' 'black' 'blue' 'purple']


In [46]:
#replace every element from index 3 to the end with the same value
a[3:] = 'pink'

In [47]:
print(a)

['orange' 'blue' 'black' 'pink' 'pink']


In [48]:
#2d ndarray: 3 rows by 5 columns, every entry the integer 1
ones = np.ones((3, 5), dtype=int)

In [49]:
#single element: row 1, column 2
ones[1, 2] = 99

In [50]:
print(ones)

[[ 1  1  1  1  1]
 [ 1  1 99  1  1]
 [ 1  1  1  1  1]]


In [51]:
#a scalar assigned to a whole row fills every column of row 0
ones[0] = 42

In [52]:
print(ones)

[[42 42 42 42 42]
 [ 1  1 99  1  1]
 [ 1  1  1  1  1]]


In [54]:
#a scalar assigned to a whole row fills every column of row 2
ones[2] = 88

In [55]:
print(ones)

[[42 42 42 42 42]
 [ 1  1 99  1  1]
 [88 88 88 88 88]]


In [56]:
#whole column: every row of column 2
ones[:, 2] = 0

In [57]:
print(ones)

[[42 42  0 42 42]
 [ 1  1  0  1  1]
 [88 88  0 88 88]]


In [58]:
#this creates a copy of our taxi ndarray, so the assignments made to taxi_modified below do not change taxi
taxi_modified = taxi.copy()

In [59]:
taxi_modified

array([[  2.01600000e+03,   1.00000000e+00,   1.00000000e+00, ...,
          1.16500000e+01,   6.99900000e+01,   1.00000000e+00],
       [  2.01600000e+03,   1.00000000e+00,   1.00000000e+00, ...,
          8.00000000e+00,   5.43000000e+01,   1.00000000e+00],
       [  2.01600000e+03,   1.00000000e+00,   1.00000000e+00, ...,
          0.00000000e+00,   3.78000000e+01,   2.00000000e+00],
       ..., 
       [  2.01600000e+03,   6.00000000e+00,   3.00000000e+01, ...,
          5.00000000e+00,   6.33400000e+01,   1.00000000e+00],
       [  2.01600000e+03,   6.00000000e+00,   3.00000000e+01, ...,
          8.95000000e+00,   4.47500000e+01,   1.00000000e+00],
       [  2.01600000e+03,   6.00000000e+00,   3.00000000e+01, ...,
          0.00000000e+00,   5.48400000e+01,   2.00000000e+00]])

In [60]:
#single element: row 28214, column 5
taxi_modified[28214, 5] = 1

In [61]:
taxi_modified

array([[  2.01600000e+03,   1.00000000e+00,   1.00000000e+00, ...,
          1.16500000e+01,   6.99900000e+01,   1.00000000e+00],
       [  2.01600000e+03,   1.00000000e+00,   1.00000000e+00, ...,
          8.00000000e+00,   5.43000000e+01,   1.00000000e+00],
       [  2.01600000e+03,   1.00000000e+00,   1.00000000e+00, ...,
          0.00000000e+00,   3.78000000e+01,   2.00000000e+00],
       ..., 
       [  2.01600000e+03,   6.00000000e+00,   3.00000000e+01, ...,
          5.00000000e+00,   6.33400000e+01,   1.00000000e+00],
       [  2.01600000e+03,   6.00000000e+00,   3.00000000e+01, ...,
          8.95000000e+00,   4.47500000e+01,   1.00000000e+00],
       [  2.01600000e+03,   6.00000000e+00,   3.00000000e+01, ...,
          0.00000000e+00,   5.48400000e+01,   2.00000000e+00]])

In [62]:
#set column 0 to 16 in every row
taxi_modified[:, 0] = 16

In [63]:
taxi_modified

array([[ 16.  ,   1.  ,   1.  , ...,  11.65,  69.99,   1.  ],
       [ 16.  ,   1.  ,   1.  , ...,   8.  ,  54.3 ,   1.  ],
       [ 16.  ,   1.  ,   1.  , ...,   0.  ,  37.8 ,   2.  ],
       ..., 
       [ 16.  ,   6.  ,  30.  , ...,   5.  ,  63.34,   1.  ],
       [ 16.  ,   6.  ,  30.  , ...,   8.95,  44.75,   1.  ],
       [ 16.  ,   6.  ,  30.  , ...,   0.  ,  54.84,   2.  ]])

In [64]:
#rows 1800 and 1801 of column 7 receive the mean of the whole column 7 (the mean is computed before the assignment)
taxi_modified[1800:1802, 7] = taxi_modified[:, 7].mean()

In [65]:
taxi_modified

array([[ 16.  ,   1.  ,   1.  , ...,  11.65,  69.99,   1.  ],
       [ 16.  ,   1.  ,   1.  , ...,   8.  ,  54.3 ,   1.  ],
       [ 16.  ,   1.  ,   1.  , ...,   0.  ,  37.8 ,   2.  ],
       ..., 
       [ 16.  ,   6.  ,  30.  , ...,   5.  ,  63.34,   1.  ],
       [ 16.  ,   6.  ,  30.  , ...,   8.95,  44.75,   1.  ],
       [ 16.  ,   6.  ,  30.  , ...,   0.  ,  54.84,   2.  ]])

In [66]:
#boolean arrays can also pick the elements to overwrite; start from the integers 1 to 5
a2 = np.arange(1, 6)

In [67]:
#True for each element of a2 that is above 2
a2_bool = (a2 > 2)

In [68]:
a2_bool

array([False, False,  True,  True,  True], dtype=bool)

In [69]:
#every element where a2_bool is True becomes 99; the other elements are left alone
a2[a2_bool] = 99

In [70]:
print(a2)

[ 1  2 99 99 99]


In [72]:
#this creates a copy of our taxi ndarray
taxi_copy = taxi.copy()

In [73]:
#column 13 of taxi_copy; a basic slice is a view, so writing through total_amount also changes taxi_copy
total_amount = taxi_copy[:, 13]

In [74]:
total_amount

array([ 69.99,  54.3 ,  37.8 , ...,  63.34,  44.75,  54.84])

In [75]:
#replace every negative entry of total_amount with 0, in place
total_amount[total_amount < 0] = 0

In [76]:
total_amount

array([ 69.99,  54.3 ,  37.8 , ...,  63.34,  44.75,  54.84])

#pseudocode for changing the values in one column based on a comparison

bool_mask = array[:, column_for_comparison] == value_for_comparison
array[bool_mask, column_for_assignment] = new_value

#pseudocode for all in one line
array[array[:, column_for_comparison] == value_for_comparison, column_for_assignment] = new_value


In [77]:
#creates a new column filled with '0': one row per row of taxi, a single column wide
zeros = np.zeros((taxi.shape[0], 1))
#glue the new column onto the right-hand side of taxi (axis=1 joins along columns)
taxi_modified = np.concatenate((taxi, zeros), axis=1)

In [78]:
print(taxi_modified)

[[  2.01600000e+03   1.00000000e+00   1.00000000e+00 ...,   6.99900000e+01
    1.00000000e+00   0.00000000e+00]
 [  2.01600000e+03   1.00000000e+00   1.00000000e+00 ...,   5.43000000e+01
    1.00000000e+00   0.00000000e+00]
 [  2.01600000e+03   1.00000000e+00   1.00000000e+00 ...,   3.78000000e+01
    2.00000000e+00   0.00000000e+00]
 ..., 
 [  2.01600000e+03   6.00000000e+00   3.00000000e+01 ...,   6.33400000e+01
    1.00000000e+00   0.00000000e+00]
 [  2.01600000e+03   6.00000000e+00   3.00000000e+01 ...,   4.47500000e+01
    1.00000000e+00   0.00000000e+00]
 [  2.01600000e+03   6.00000000e+00   3.00000000e+01 ...,   5.48400000e+01
    2.00000000e+00   0.00000000e+00]]


In [79]:
#array[array[:, column_for_comparison] == value_for_comparison, column_for_assignment] = new_value
#here: wherever column 5 equals 2, write 1 into column 15
taxi_modified[taxi_modified[:, 5] == 2, 15] = 1

In [81]:
print(taxi_modified)

[[  2.01600000e+03   1.00000000e+00   1.00000000e+00 ...,   6.99900000e+01
    1.00000000e+00   1.00000000e+00]
 [  2.01600000e+03   1.00000000e+00   1.00000000e+00 ...,   5.43000000e+01
    1.00000000e+00   1.00000000e+00]
 [  2.01600000e+03   1.00000000e+00   1.00000000e+00 ...,   3.78000000e+01
    2.00000000e+00   1.00000000e+00]
 ..., 
 [  2.01600000e+03   6.00000000e+00   3.00000000e+01 ...,   6.33400000e+01
    1.00000000e+00   1.00000000e+00]
 [  2.01600000e+03   6.00000000e+00   3.00000000e+01 ...,   4.47500000e+01
    1.00000000e+00   1.00000000e+00]
 [  2.01600000e+03   6.00000000e+00   3.00000000e+01 ...,   5.48400000e+01
    2.00000000e+00   1.00000000e+00]]


In [82]:
#wherever column 5 equals 3, write 1 into column 15
taxi_modified[taxi_modified[:, 5] == 3, 15] = 1

In [83]:
print(taxi_modified)

[[  2.01600000e+03   1.00000000e+00   1.00000000e+00 ...,   6.99900000e+01
    1.00000000e+00   1.00000000e+00]
 [  2.01600000e+03   1.00000000e+00   1.00000000e+00 ...,   5.43000000e+01
    1.00000000e+00   1.00000000e+00]
 [  2.01600000e+03   1.00000000e+00   1.00000000e+00 ...,   3.78000000e+01
    2.00000000e+00   1.00000000e+00]
 ..., 
 [  2.01600000e+03   6.00000000e+00   3.00000000e+01 ...,   6.33400000e+01
    1.00000000e+00   1.00000000e+00]
 [  2.01600000e+03   6.00000000e+00   3.00000000e+01 ...,   4.47500000e+01
    1.00000000e+00   1.00000000e+00]
 [  2.01600000e+03   6.00000000e+00   3.00000000e+01 ...,   5.48400000e+01
    2.00000000e+00   1.00000000e+00]]


In [84]:
#wherever column 5 equals 5, write 1 into column 15
taxi_modified[taxi_modified[:, 5] == 5, 15] = 1

In [85]:
print(taxi_modified)

[[  2.01600000e+03   1.00000000e+00   1.00000000e+00 ...,   6.99900000e+01
    1.00000000e+00   1.00000000e+00]
 [  2.01600000e+03   1.00000000e+00   1.00000000e+00 ...,   5.43000000e+01
    1.00000000e+00   1.00000000e+00]
 [  2.01600000e+03   1.00000000e+00   1.00000000e+00 ...,   3.78000000e+01
    2.00000000e+00   1.00000000e+00]
 ..., 
 [  2.01600000e+03   6.00000000e+00   3.00000000e+01 ...,   6.33400000e+01
    1.00000000e+00   1.00000000e+00]
 [  2.01600000e+03   6.00000000e+00   3.00000000e+01 ...,   4.47500000e+01
    1.00000000e+00   1.00000000e+00]
 [  2.01600000e+03   6.00000000e+00   3.00000000e+01 ...,   5.48400000e+01
    2.00000000e+00   1.00000000e+00]]


In [86]:
#which airport is the most popular destination (JFK, LaGuardia, Newark)
#jfk: the rows of taxi whose column 6 equals 2
jfk = taxi[taxi[:, 6] == 2]


In [87]:
jfk

array([[  2.01600000e+03,   1.00000000e+00,   1.00000000e+00, ...,
          5.28000000e+01,   1.05600000e+02,   1.00000000e+00],
       [  2.01600000e+03,   1.00000000e+00,   1.00000000e+00, ...,
          0.00000000e+00,   3.73000000e+01,   2.00000000e+00],
       [  2.01600000e+03,   1.00000000e+00,   1.00000000e+00, ...,
          0.00000000e+00,   5.83400000e+01,   2.00000000e+00],
       ..., 
       [  2.01600000e+03,   6.00000000e+00,   3.00000000e+01, ...,
          0.00000000e+00,   3.80000000e+00,   2.00000000e+00],
       [  2.01600000e+03,   6.00000000e+00,   3.00000000e+01, ...,
          0.00000000e+00,   5.83400000e+01,   2.00000000e+00],
       [  2.01600000e+03,   6.00000000e+00,   3.00000000e+01, ...,
          5.66000000e+00,   6.40000000e+01,   1.00000000e+00]])

In [88]:
#same selection as the previous jfk cell: the rows of taxi whose column 6 equals 2
jfk = taxi[taxi[:, 6] == 2]

In [90]:
jfk

array([[  2.01600000e+03,   1.00000000e+00,   1.00000000e+00, ...,
          5.28000000e+01,   1.05600000e+02,   1.00000000e+00],
       [  2.01600000e+03,   1.00000000e+00,   1.00000000e+00, ...,
          0.00000000e+00,   3.73000000e+01,   2.00000000e+00],
       [  2.01600000e+03,   1.00000000e+00,   1.00000000e+00, ...,
          0.00000000e+00,   5.83400000e+01,   2.00000000e+00],
       ..., 
       [  2.01600000e+03,   6.00000000e+00,   3.00000000e+01, ...,
          0.00000000e+00,   3.80000000e+00,   2.00000000e+00],
       [  2.01600000e+03,   6.00000000e+00,   3.00000000e+01, ...,
          0.00000000e+00,   5.83400000e+01,   2.00000000e+00],
       [  2.01600000e+03,   6.00000000e+00,   3.00000000e+01, ...,
          5.66000000e+00,   6.40000000e+01,   1.00000000e+00]])

In [91]:
#number of rows selected into jfk
jfk_count = len(jfk)

In [92]:
jfk_count

11832

In [93]:
#the rows of taxi whose column 6 equals 3
lagurdia = taxi[taxi[:, 6] == 3]

In [94]:
lagurdia

array([[  2.01600000e+03,   1.00000000e+00,   1.00000000e+00, ...,
          0.00000000e+00,   1.83000000e+01,   2.00000000e+00],
       [  2.01600000e+03,   1.00000000e+00,   1.00000000e+00, ...,
          6.07000000e+00,   3.64100000e+01,   1.00000000e+00],
       [  2.01600000e+03,   1.00000000e+00,   1.00000000e+00, ...,
          1.00800000e+01,   5.04200000e+01,   1.00000000e+00],
       ..., 
       [  2.01600000e+03,   6.00000000e+00,   3.00000000e+01, ...,
          0.00000000e+00,   3.73000000e+01,   2.00000000e+00],
       [  2.01600000e+03,   6.00000000e+00,   3.00000000e+01, ...,
          3.70000000e+00,   3.50000000e+01,   1.00000000e+00],
       [  2.01600000e+03,   6.00000000e+00,   3.00000000e+01, ...,
          7.00000000e+00,   5.83400000e+01,   1.00000000e+00]])

In [99]:
#number of rows selected into lagurdia
lagurdia_count = len(lagurdia)

In [100]:
lagurdia_count

16602

In [101]:
#the rows of taxi whose column 6 equals 5
newark = taxi[taxi[:, 6] == 5]

In [102]:
newark

array([[  2.01600000e+03,   1.00000000e+00,   1.00000000e+00,
          5.00000000e+00,   0.00000000e+00,   2.00000000e+00,
          5.00000000e+00,   3.63000000e+01,   2.56200000e+03,
          1.09500000e+02,   8.00000000e-01,   1.10800000e+01,
          1.00000000e+01,   1.31380000e+02,   1.00000000e+00],
       [  2.01600000e+03,   1.00000000e+00,   2.00000000e+00,
          6.00000000e+00,   3.00000000e+00,   2.00000000e+00,
          5.00000000e+00,   3.44000000e+01,   6.11400000e+03,
          1.22000000e+02,   3.00000000e-01,   1.25000000e+01,
          1.70000000e-01,   1.34970000e+02,   1.00000000e+00],
       [  2.01600000e+03,   1.00000000e+00,   4.00000000e+00,
          1.00000000e+00,   4.00000000e+00,   2.00000000e+00,
          5.00000000e+00,   3.73000000e+01,   4.76000000e+03,
          1.17000000e+02,   1.30000000e+00,   2.35800000e+01,
          0.00000000e+00,   1.41880000e+02,   2.00000000e+00],
       [  2.01600000e+03,   1.00000000e+00,   5.00000000e+00,
     

In [103]:
#number of rows selected into newark
newark_count = len(newark)

In [104]:
newark_count

63

#cleaning and removing bad data
trip_distance is at column index 7
trip_length is at column index 8
total_amount is at column index 13

In [105]:
#speed per trip: trip_distance (column 7) divided by trip_length (column 8) scaled by 1/3600
trip_mph = taxi[:, 7] / (taxi[:, 8] / 3600)

In [106]:
#keep only the rows whose computed speed is below 100; rows where the comparison is False
#(which includes any NaN or infinite speed) are left out
cleaned_taxi = taxi[trip_mph < 100]

In [107]:
cleaned_taxi

array([[  2.01600000e+03,   1.00000000e+00,   1.00000000e+00, ...,
          1.16500000e+01,   6.99900000e+01,   1.00000000e+00],
       [  2.01600000e+03,   1.00000000e+00,   1.00000000e+00, ...,
          8.00000000e+00,   5.43000000e+01,   1.00000000e+00],
       [  2.01600000e+03,   1.00000000e+00,   1.00000000e+00, ...,
          0.00000000e+00,   3.78000000e+01,   2.00000000e+00],
       ..., 
       [  2.01600000e+03,   6.00000000e+00,   3.00000000e+01, ...,
          5.00000000e+00,   6.33400000e+01,   1.00000000e+00],
       [  2.01600000e+03,   6.00000000e+00,   3.00000000e+01, ...,
          8.95000000e+00,   4.47500000e+01,   1.00000000e+00],
       [  2.01600000e+03,   6.00000000e+00,   3.00000000e+01, ...,
          0.00000000e+00,   5.48400000e+01,   2.00000000e+00]])

In [110]:
#average of column 7 (trip_distance) over the cleaned rows
mean_distance = cleaned_taxi[:, 7].mean()

In [111]:
mean_distance

12.666396599932893

In [114]:
#average of column 8 (trip_length) over the cleaned rows
mean_length = cleaned_taxi[:, 8].mean()

In [115]:
mean_length

2239.503657309026

In [116]:
#average of column 13 (total_amount) over the cleaned rows
mean_total_amount = cleaned_taxi[:, 13].mean()

In [117]:
mean_total_amount

48.981318532602621