In [1]:
# Dependencies and setup
import pandas as pd
from numpy import cos, sin, arcsin, sqrt
from math import radians

In [2]:
# Load NYC January 2019 file.
jan_to_load = "201901-citibike-tripdata.csv.zip"

In [3]:
# Read data file and store into Pandas dataframe.
jan_df = pd.read_csv(jan_to_load, compression="zip")
jan_df

Unnamed: 0,tripduration,starttime,stoptime,start station id,start station name,start station latitude,start station longitude,end station id,end station name,end station latitude,end station longitude,bikeid,usertype,birth year,gender
0,320,2019-01-01 00:01:47.4010,2019-01-01 00:07:07.5810,3160.0,Central Park West & W 76 St,40.778968,-73.973747,3283.0,W 89 St & Columbus Ave,40.788221,-73.970416,15839,Subscriber,1971,1
1,316,2019-01-01 00:04:43.7360,2019-01-01 00:10:00.6080,519.0,Pershing Square North,40.751873,-73.977706,518.0,E 39 St & 2 Ave,40.747804,-73.973442,32723,Subscriber,1964,1
2,591,2019-01-01 00:06:03.9970,2019-01-01 00:15:55.4380,3171.0,Amsterdam Ave & W 82 St,40.785247,-73.976673,3154.0,E 77 St & 3 Ave,40.773142,-73.958562,27451,Subscriber,1987,1
3,2719,2019-01-01 00:07:03.5450,2019-01-01 00:52:22.6500,504.0,1 Ave & E 16 St,40.732219,-73.981656,3709.0,W 15 St & 6 Ave,40.738046,-73.996430,21579,Subscriber,1990,1
4,303,2019-01-01 00:07:35.9450,2019-01-01 00:12:39.5020,229.0,Great Jones St,40.727434,-73.993790,503.0,E 20 St & Park Ave,40.738274,-73.987520,35379,Subscriber,1979,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
967282,291,2019-01-31 17:33:26.4490,2019-01-31 17:38:17.8940,3354.0,3 St & Prospect Park West,40.668132,-73.973638,3423.0,West Drive & Prospect Park West,40.661063,-73.979453,19780,Subscriber,1978,2
967283,437,2019-01-31 18:57:45.3380,2019-01-31 19:05:02.4970,3354.0,3 St & Prospect Park West,40.668132,-73.973638,3313.0,6 Ave & 12 St,40.666318,-73.985462,34365,Subscriber,1976,2
967284,173,2019-01-31 19:11:41.1930,2019-01-31 19:14:34.3350,3354.0,3 St & Prospect Park West,40.668132,-73.973638,3418.0,Plaza St West & Flatbush Ave,40.675021,-73.971115,25889,Subscriber,1977,1
967285,244,2019-01-31 20:54:51.1440,2019-01-31 20:58:55.3100,3354.0,3 St & Prospect Park West,40.668132,-73.973638,3346.0,Berkeley Pl & 7 Ave,40.675147,-73.975232,34649,Subscriber,1994,1


In [4]:
# Drop null values in start and end station ID columns. 
jan_df_drop_null = jan_df.dropna()
jan_df_drop_null

Unnamed: 0,tripduration,starttime,stoptime,start station id,start station name,start station latitude,start station longitude,end station id,end station name,end station latitude,end station longitude,bikeid,usertype,birth year,gender
0,320,2019-01-01 00:01:47.4010,2019-01-01 00:07:07.5810,3160.0,Central Park West & W 76 St,40.778968,-73.973747,3283.0,W 89 St & Columbus Ave,40.788221,-73.970416,15839,Subscriber,1971,1
1,316,2019-01-01 00:04:43.7360,2019-01-01 00:10:00.6080,519.0,Pershing Square North,40.751873,-73.977706,518.0,E 39 St & 2 Ave,40.747804,-73.973442,32723,Subscriber,1964,1
2,591,2019-01-01 00:06:03.9970,2019-01-01 00:15:55.4380,3171.0,Amsterdam Ave & W 82 St,40.785247,-73.976673,3154.0,E 77 St & 3 Ave,40.773142,-73.958562,27451,Subscriber,1987,1
3,2719,2019-01-01 00:07:03.5450,2019-01-01 00:52:22.6500,504.0,1 Ave & E 16 St,40.732219,-73.981656,3709.0,W 15 St & 6 Ave,40.738046,-73.996430,21579,Subscriber,1990,1
4,303,2019-01-01 00:07:35.9450,2019-01-01 00:12:39.5020,229.0,Great Jones St,40.727434,-73.993790,503.0,E 20 St & Park Ave,40.738274,-73.987520,35379,Subscriber,1979,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
967282,291,2019-01-31 17:33:26.4490,2019-01-31 17:38:17.8940,3354.0,3 St & Prospect Park West,40.668132,-73.973638,3423.0,West Drive & Prospect Park West,40.661063,-73.979453,19780,Subscriber,1978,2
967283,437,2019-01-31 18:57:45.3380,2019-01-31 19:05:02.4970,3354.0,3 St & Prospect Park West,40.668132,-73.973638,3313.0,6 Ave & 12 St,40.666318,-73.985462,34365,Subscriber,1976,2
967284,173,2019-01-31 19:11:41.1930,2019-01-31 19:14:34.3350,3354.0,3 St & Prospect Park West,40.668132,-73.973638,3418.0,Plaza St West & Flatbush Ave,40.675021,-73.971115,25889,Subscriber,1977,1
967285,244,2019-01-31 20:54:51.1440,2019-01-31 20:58:55.3100,3354.0,3 St & Prospect Park West,40.668132,-73.973638,3346.0,Berkeley Pl & 7 Ave,40.675147,-73.975232,34649,Subscriber,1994,1


In [5]:
# Drop birth years 1886-1901.
# In 2019, the oldest person living in the U.S. was 114 years old.
jan_df_drop_null_and_birth_yrs = jan_df_drop_null[jan_df_drop_null["birth year"]>1901]
jan_df_drop_null_and_birth_yrs

Unnamed: 0,tripduration,starttime,stoptime,start station id,start station name,start station latitude,start station longitude,end station id,end station name,end station latitude,end station longitude,bikeid,usertype,birth year,gender
0,320,2019-01-01 00:01:47.4010,2019-01-01 00:07:07.5810,3160.0,Central Park West & W 76 St,40.778968,-73.973747,3283.0,W 89 St & Columbus Ave,40.788221,-73.970416,15839,Subscriber,1971,1
1,316,2019-01-01 00:04:43.7360,2019-01-01 00:10:00.6080,519.0,Pershing Square North,40.751873,-73.977706,518.0,E 39 St & 2 Ave,40.747804,-73.973442,32723,Subscriber,1964,1
2,591,2019-01-01 00:06:03.9970,2019-01-01 00:15:55.4380,3171.0,Amsterdam Ave & W 82 St,40.785247,-73.976673,3154.0,E 77 St & 3 Ave,40.773142,-73.958562,27451,Subscriber,1987,1
3,2719,2019-01-01 00:07:03.5450,2019-01-01 00:52:22.6500,504.0,1 Ave & E 16 St,40.732219,-73.981656,3709.0,W 15 St & 6 Ave,40.738046,-73.996430,21579,Subscriber,1990,1
4,303,2019-01-01 00:07:35.9450,2019-01-01 00:12:39.5020,229.0,Great Jones St,40.727434,-73.993790,503.0,E 20 St & Park Ave,40.738274,-73.987520,35379,Subscriber,1979,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
967282,291,2019-01-31 17:33:26.4490,2019-01-31 17:38:17.8940,3354.0,3 St & Prospect Park West,40.668132,-73.973638,3423.0,West Drive & Prospect Park West,40.661063,-73.979453,19780,Subscriber,1978,2
967283,437,2019-01-31 18:57:45.3380,2019-01-31 19:05:02.4970,3354.0,3 St & Prospect Park West,40.668132,-73.973638,3313.0,6 Ave & 12 St,40.666318,-73.985462,34365,Subscriber,1976,2
967284,173,2019-01-31 19:11:41.1930,2019-01-31 19:14:34.3350,3354.0,3 St & Prospect Park West,40.668132,-73.973638,3418.0,Plaza St West & Flatbush Ave,40.675021,-73.971115,25889,Subscriber,1977,1
967285,244,2019-01-31 20:54:51.1440,2019-01-31 20:58:55.3100,3354.0,3 St & Prospect Park West,40.668132,-73.973638,3346.0,Berkeley Pl & 7 Ave,40.675147,-73.975232,34649,Subscriber,1994,1


In [6]:
# Calculate age and place in a new column in the jan_df_drop_null_and_birth_yrs dataframe.
jan_df_drop_null_and_birth_yrs["age"] = 2018 - jan_df_drop_null_and_birth_yrs["birth year"]
jan_df_drop_null_and_birth_yrs

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  jan_df_drop_null_and_birth_yrs["age"] = 2018 - jan_df_drop_null_and_birth_yrs["birth year"]


Unnamed: 0,tripduration,starttime,stoptime,start station id,start station name,start station latitude,start station longitude,end station id,end station name,end station latitude,end station longitude,bikeid,usertype,birth year,gender,age
0,320,2019-01-01 00:01:47.4010,2019-01-01 00:07:07.5810,3160.0,Central Park West & W 76 St,40.778968,-73.973747,3283.0,W 89 St & Columbus Ave,40.788221,-73.970416,15839,Subscriber,1971,1,47
1,316,2019-01-01 00:04:43.7360,2019-01-01 00:10:00.6080,519.0,Pershing Square North,40.751873,-73.977706,518.0,E 39 St & 2 Ave,40.747804,-73.973442,32723,Subscriber,1964,1,54
2,591,2019-01-01 00:06:03.9970,2019-01-01 00:15:55.4380,3171.0,Amsterdam Ave & W 82 St,40.785247,-73.976673,3154.0,E 77 St & 3 Ave,40.773142,-73.958562,27451,Subscriber,1987,1,31
3,2719,2019-01-01 00:07:03.5450,2019-01-01 00:52:22.6500,504.0,1 Ave & E 16 St,40.732219,-73.981656,3709.0,W 15 St & 6 Ave,40.738046,-73.996430,21579,Subscriber,1990,1,28
4,303,2019-01-01 00:07:35.9450,2019-01-01 00:12:39.5020,229.0,Great Jones St,40.727434,-73.993790,503.0,E 20 St & Park Ave,40.738274,-73.987520,35379,Subscriber,1979,1,39
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
967282,291,2019-01-31 17:33:26.4490,2019-01-31 17:38:17.8940,3354.0,3 St & Prospect Park West,40.668132,-73.973638,3423.0,West Drive & Prospect Park West,40.661063,-73.979453,19780,Subscriber,1978,2,40
967283,437,2019-01-31 18:57:45.3380,2019-01-31 19:05:02.4970,3354.0,3 St & Prospect Park West,40.668132,-73.973638,3313.0,6 Ave & 12 St,40.666318,-73.985462,34365,Subscriber,1976,2,42
967284,173,2019-01-31 19:11:41.1930,2019-01-31 19:14:34.3350,3354.0,3 St & Prospect Park West,40.668132,-73.973638,3418.0,Plaza St West & Flatbush Ave,40.675021,-73.971115,25889,Subscriber,1977,1,41
967285,244,2019-01-31 20:54:51.1440,2019-01-31 20:58:55.3100,3354.0,3 St & Prospect Park West,40.668132,-73.973638,3346.0,Berkeley Pl & 7 Ave,40.675147,-73.975232,34649,Subscriber,1994,1,24


In [7]:
# Calculate the average distance in miles for a bike trip.
def haversine(row):
    lon1 = row["start station longitude"]
    lat1 = row["start station latitude"]
    lon2 = row["end station longitude"]
    lat2 = row["end station latitude"]
    lon1, lat1, lon2, lat2 = map(radians, [lon1, lat1, lon2, lat2])
    dlon = lon2 - lon1 
    dlat = lat2 - lat1 
    a = sin(dlat/2)**2 + cos(lat1) * cos(lat2) * sin(dlon/2)**2
    c = 2 * arcsin(sqrt(a)) 
    km = 6367 * c
    return km

In [8]:
jan_df_drop_null_and_birth_yrs["trip distance km"] = jan_df_drop_null_and_birth_yrs.apply(lambda row: haversine(row), axis=1)
jan_df_drop_null_and_birth_yrs

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  jan_df_drop_null_and_birth_yrs["trip distance km"] = jan_df_drop_null_and_birth_yrs.apply(lambda row: haversine(row), axis=1)


Unnamed: 0,tripduration,starttime,stoptime,start station id,start station name,start station latitude,start station longitude,end station id,end station name,end station latitude,end station longitude,bikeid,usertype,birth year,gender,age,trip distance km
0,320,2019-01-01 00:01:47.4010,2019-01-01 00:07:07.5810,3160.0,Central Park West & W 76 St,40.778968,-73.973747,3283.0,W 89 St & Columbus Ave,40.788221,-73.970416,15839,Subscriber,1971,1,47,1.065821
1,316,2019-01-01 00:04:43.7360,2019-01-01 00:10:00.6080,519.0,Pershing Square North,40.751873,-73.977706,518.0,E 39 St & 2 Ave,40.747804,-73.973442,32723,Subscriber,1964,1,54,0.577360
2,591,2019-01-01 00:06:03.9970,2019-01-01 00:15:55.4380,3171.0,Amsterdam Ave & W 82 St,40.785247,-73.976673,3154.0,E 77 St & 3 Ave,40.773142,-73.958562,27451,Subscriber,1987,1,31,2.032736
3,2719,2019-01-01 00:07:03.5450,2019-01-01 00:52:22.6500,504.0,1 Ave & E 16 St,40.732219,-73.981656,3709.0,W 15 St & 6 Ave,40.738046,-73.996430,21579,Subscriber,1990,1,28,1.402486
4,303,2019-01-01 00:07:35.9450,2019-01-01 00:12:39.5020,229.0,Great Jones St,40.727434,-73.993790,503.0,E 20 St & Park Ave,40.738274,-73.987520,35379,Subscriber,1979,1,39,1.315246
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
967282,291,2019-01-31 17:33:26.4490,2019-01-31 17:38:17.8940,3354.0,3 St & Prospect Park West,40.668132,-73.973638,3423.0,West Drive & Prospect Park West,40.661063,-73.979453,19780,Subscriber,1978,2,40,0.925856
967283,437,2019-01-31 18:57:45.3380,2019-01-31 19:05:02.4970,3354.0,3 St & Prospect Park West,40.668132,-73.973638,3313.0,6 Ave & 12 St,40.666318,-73.985462,34365,Subscriber,1976,2,42,1.016765
967284,173,2019-01-31 19:11:41.1930,2019-01-31 19:14:34.3350,3354.0,3 St & Prospect Park West,40.668132,-73.973638,3418.0,Plaza St West & Flatbush Ave,40.675021,-73.971115,25889,Subscriber,1977,1,41,0.794507
967285,244,2019-01-31 20:54:51.1440,2019-01-31 20:58:55.3100,3354.0,3 St & Prospect Park West,40.668132,-73.973638,3346.0,Berkeley Pl & 7 Ave,40.675147,-73.975232,34649,Subscriber,1994,1,24,0.791014


In [9]:
# Convert the "trip distance" column from km to mi. 
jan_df_drop_null_and_birth_yrs["trip distance mi"] = jan_df_drop_null_and_birth_yrs["trip distance km"] * 0.621371
jan_df_drop_null_and_birth_yrs

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  jan_df_drop_null_and_birth_yrs["trip distance mi"] = jan_df_drop_null_and_birth_yrs["trip distance km"] * 0.621371


Unnamed: 0,tripduration,starttime,stoptime,start station id,start station name,start station latitude,start station longitude,end station id,end station name,end station latitude,end station longitude,bikeid,usertype,birth year,gender,age,trip distance km,trip distance mi
0,320,2019-01-01 00:01:47.4010,2019-01-01 00:07:07.5810,3160.0,Central Park West & W 76 St,40.778968,-73.973747,3283.0,W 89 St & Columbus Ave,40.788221,-73.970416,15839,Subscriber,1971,1,47,1.065821,0.662270
1,316,2019-01-01 00:04:43.7360,2019-01-01 00:10:00.6080,519.0,Pershing Square North,40.751873,-73.977706,518.0,E 39 St & 2 Ave,40.747804,-73.973442,32723,Subscriber,1964,1,54,0.577360,0.358754
2,591,2019-01-01 00:06:03.9970,2019-01-01 00:15:55.4380,3171.0,Amsterdam Ave & W 82 St,40.785247,-73.976673,3154.0,E 77 St & 3 Ave,40.773142,-73.958562,27451,Subscriber,1987,1,31,2.032736,1.263083
3,2719,2019-01-01 00:07:03.5450,2019-01-01 00:52:22.6500,504.0,1 Ave & E 16 St,40.732219,-73.981656,3709.0,W 15 St & 6 Ave,40.738046,-73.996430,21579,Subscriber,1990,1,28,1.402486,0.871464
4,303,2019-01-01 00:07:35.9450,2019-01-01 00:12:39.5020,229.0,Great Jones St,40.727434,-73.993790,503.0,E 20 St & Park Ave,40.738274,-73.987520,35379,Subscriber,1979,1,39,1.315246,0.817256
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
967282,291,2019-01-31 17:33:26.4490,2019-01-31 17:38:17.8940,3354.0,3 St & Prospect Park West,40.668132,-73.973638,3423.0,West Drive & Prospect Park West,40.661063,-73.979453,19780,Subscriber,1978,2,40,0.925856,0.575300
967283,437,2019-01-31 18:57:45.3380,2019-01-31 19:05:02.4970,3354.0,3 St & Prospect Park West,40.668132,-73.973638,3313.0,6 Ave & 12 St,40.666318,-73.985462,34365,Subscriber,1976,2,42,1.016765,0.631788
967284,173,2019-01-31 19:11:41.1930,2019-01-31 19:14:34.3350,3354.0,3 St & Prospect Park West,40.668132,-73.973638,3418.0,Plaza St West & Flatbush Ave,40.675021,-73.971115,25889,Subscriber,1977,1,41,0.794507,0.493684
967285,244,2019-01-31 20:54:51.1440,2019-01-31 20:58:55.3100,3354.0,3 St & Prospect Park West,40.668132,-73.973638,3346.0,Berkeley Pl & 7 Ave,40.675147,-73.975232,34649,Subscriber,1994,1,24,0.791014,0.491513


In [10]:
# Drop trip distances equivalent to zero. 
jan_df_drop_null_and_birth_yrs = jan_df_drop_null_and_birth_yrs.loc[jan_df_drop_null_and_birth_yrs["trip distance mi"] !=0]
jan_df_drop_null_and_birth_yrs

Unnamed: 0,tripduration,starttime,stoptime,start station id,start station name,start station latitude,start station longitude,end station id,end station name,end station latitude,end station longitude,bikeid,usertype,birth year,gender,age,trip distance km,trip distance mi
0,320,2019-01-01 00:01:47.4010,2019-01-01 00:07:07.5810,3160.0,Central Park West & W 76 St,40.778968,-73.973747,3283.0,W 89 St & Columbus Ave,40.788221,-73.970416,15839,Subscriber,1971,1,47,1.065821,0.662270
1,316,2019-01-01 00:04:43.7360,2019-01-01 00:10:00.6080,519.0,Pershing Square North,40.751873,-73.977706,518.0,E 39 St & 2 Ave,40.747804,-73.973442,32723,Subscriber,1964,1,54,0.577360,0.358754
2,591,2019-01-01 00:06:03.9970,2019-01-01 00:15:55.4380,3171.0,Amsterdam Ave & W 82 St,40.785247,-73.976673,3154.0,E 77 St & 3 Ave,40.773142,-73.958562,27451,Subscriber,1987,1,31,2.032736,1.263083
3,2719,2019-01-01 00:07:03.5450,2019-01-01 00:52:22.6500,504.0,1 Ave & E 16 St,40.732219,-73.981656,3709.0,W 15 St & 6 Ave,40.738046,-73.996430,21579,Subscriber,1990,1,28,1.402486,0.871464
4,303,2019-01-01 00:07:35.9450,2019-01-01 00:12:39.5020,229.0,Great Jones St,40.727434,-73.993790,503.0,E 20 St & Park Ave,40.738274,-73.987520,35379,Subscriber,1979,1,39,1.315246,0.817256
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
967282,291,2019-01-31 17:33:26.4490,2019-01-31 17:38:17.8940,3354.0,3 St & Prospect Park West,40.668132,-73.973638,3423.0,West Drive & Prospect Park West,40.661063,-73.979453,19780,Subscriber,1978,2,40,0.925856,0.575300
967283,437,2019-01-31 18:57:45.3380,2019-01-31 19:05:02.4970,3354.0,3 St & Prospect Park West,40.668132,-73.973638,3313.0,6 Ave & 12 St,40.666318,-73.985462,34365,Subscriber,1976,2,42,1.016765,0.631788
967284,173,2019-01-31 19:11:41.1930,2019-01-31 19:14:34.3350,3354.0,3 St & Prospect Park West,40.668132,-73.973638,3418.0,Plaza St West & Flatbush Ave,40.675021,-73.971115,25889,Subscriber,1977,1,41,0.794507,0.493684
967285,244,2019-01-31 20:54:51.1440,2019-01-31 20:58:55.3100,3354.0,3 St & Prospect Park West,40.668132,-73.973638,3346.0,Berkeley Pl & 7 Ave,40.675147,-73.975232,34649,Subscriber,1994,1,24,0.791014,0.491513


In [11]:
# Add a column that converts trip duration from seconds in the first column to minutes. 
jan_df_drop_null_and_birth_yrs["tripduration minutes"] = jan_df_drop_null_and_birth_yrs["tripduration"]/60
jan_df_drop_null_and_birth_yrs

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  jan_df_drop_null_and_birth_yrs["tripduration minutes"] = jan_df_drop_null_and_birth_yrs["tripduration"]/60


Unnamed: 0,tripduration,starttime,stoptime,start station id,start station name,start station latitude,start station longitude,end station id,end station name,end station latitude,end station longitude,bikeid,usertype,birth year,gender,age,trip distance km,trip distance mi,tripduration minutes
0,320,2019-01-01 00:01:47.4010,2019-01-01 00:07:07.5810,3160.0,Central Park West & W 76 St,40.778968,-73.973747,3283.0,W 89 St & Columbus Ave,40.788221,-73.970416,15839,Subscriber,1971,1,47,1.065821,0.662270,5.333333
1,316,2019-01-01 00:04:43.7360,2019-01-01 00:10:00.6080,519.0,Pershing Square North,40.751873,-73.977706,518.0,E 39 St & 2 Ave,40.747804,-73.973442,32723,Subscriber,1964,1,54,0.577360,0.358754,5.266667
2,591,2019-01-01 00:06:03.9970,2019-01-01 00:15:55.4380,3171.0,Amsterdam Ave & W 82 St,40.785247,-73.976673,3154.0,E 77 St & 3 Ave,40.773142,-73.958562,27451,Subscriber,1987,1,31,2.032736,1.263083,9.850000
3,2719,2019-01-01 00:07:03.5450,2019-01-01 00:52:22.6500,504.0,1 Ave & E 16 St,40.732219,-73.981656,3709.0,W 15 St & 6 Ave,40.738046,-73.996430,21579,Subscriber,1990,1,28,1.402486,0.871464,45.316667
4,303,2019-01-01 00:07:35.9450,2019-01-01 00:12:39.5020,229.0,Great Jones St,40.727434,-73.993790,503.0,E 20 St & Park Ave,40.738274,-73.987520,35379,Subscriber,1979,1,39,1.315246,0.817256,5.050000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
967282,291,2019-01-31 17:33:26.4490,2019-01-31 17:38:17.8940,3354.0,3 St & Prospect Park West,40.668132,-73.973638,3423.0,West Drive & Prospect Park West,40.661063,-73.979453,19780,Subscriber,1978,2,40,0.925856,0.575300,4.850000
967283,437,2019-01-31 18:57:45.3380,2019-01-31 19:05:02.4970,3354.0,3 St & Prospect Park West,40.668132,-73.973638,3313.0,6 Ave & 12 St,40.666318,-73.985462,34365,Subscriber,1976,2,42,1.016765,0.631788,7.283333
967284,173,2019-01-31 19:11:41.1930,2019-01-31 19:14:34.3350,3354.0,3 St & Prospect Park West,40.668132,-73.973638,3418.0,Plaza St West & Flatbush Ave,40.675021,-73.971115,25889,Subscriber,1977,1,41,0.794507,0.493684,2.883333
967285,244,2019-01-31 20:54:51.1440,2019-01-31 20:58:55.3100,3354.0,3 St & Prospect Park West,40.668132,-73.973638,3346.0,Berkeley Pl & 7 Ave,40.675147,-73.975232,34649,Subscriber,1994,1,24,0.791014,0.491513,4.066667


In [12]:
# Calculate the average trip duration in minutes.
jan_df_drop_null_and_birth_yrs["tripduration minutes"].mean()

12.685556880472234

In [13]:
# Calculate how many trips were taken in January.
jan_df_drop_null_and_birth_yrs["tripduration"].count()

951003

In [14]:
# Calculate the average age of the consumer. 
jan_df_drop_null_and_birth_yrs["age"].mean()

39.08131414937703

In [15]:
# Calculate average trip distance
jan_df_drop_null_and_birth_yrs["trip distance mi"].mean()

0.9941833838060975

In [16]:
# Load new dataframe as a .csv
compression_opts = dict(method="zip", archive_name="clean_201901-citibike-tripdata.csv")
jan_df_drop_null_and_birth_yrs.to_csv("clean_201901-citibike-tripdata.zip", index=False, compression=compression_opts)