In [17]:
# Dependencies

import pandas as pd
import numpy as np
import glob
import os
import timeit                                # To check performance

# Instructions: 

The unified CSV file is nearly 5GBs. Github can't store it. If you try to push it, it will produce an error. 

If you want to work with the data, you can run this script, and it will save the unified CSV file in the sub folder "Gitignore-data-output-files."

BEFORE doing that, open your .gitignore file in the repository, and make sure the following is one of the line items to ignore:

"# Huge data files to ignore:

Resources/Data/Gitignore-data-output-files/ecobici_ride_data-unified.csv"

Save your .gitignore file, push that change. Then you should be able to produce the unified CSV file without worry of your machine trying to push it to the repository. -Ken

In [18]:
# Concatenate csv files

tic = timeit.default_timer()                            # Monitor performance

path = r'/Users/kennethandersen/Documents/GitHub/P2-Ecobici_insights_and_recommendations/Resources/Data'
all_files = glob.glob(path + "/*.csv")

csv_file_list = []
i = 1

for filename in all_files:
    try:
        df = pd.read_csv(filename, index_col=None, header=0)
        csv_file_list.append(df)
#         print (f'File {i} added')
    except:
        print(f'Concatenate rror {filename}, could not be added')
        pass
    i+= 1

ecobici_rides_df = pd.concat(csv_file_list, axis=0, ignore_index=True)

toc = timeit.default_timer()                            # Monitor performance
print(f'Time (in seconds) to extract and concatenate excel files: {round(toc - tic, 2)}')
print(f'Number of rows: {len(ecobici_rides_df)}')

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,
  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


Time (in seconds) to extract and concatenate excel files: 117.79
Number of rows: 71327653


In [29]:
# Display dataframe
ecobici_rides_df

Unnamed: 0,Genero_Usuario,Edad_Usuario,Bici,Ciclo_Estacion_Retiro,Fecha_Retiro,Hora_Retiro,Ciclo_Estacion_Arribo,Fecha_Arribo,Hora_Arribo
0,M,44.0,4357,442.0,01/02/2020,0:00:38,116,01/02/2020,0:35:17
1,M,22.0,12083,66.0,01/02/2020,0:00:53,37,01/02/2020,0:06:23
2,M,29.0,11562,331.0,01/02/2020,0:00:55,341,01/02/2020,0:26:47
3,M,27.0,10206,164.0,01/02/2020,0:01:18,35,01/02/2020,0:16:51
4,M,27.0,10101,120.0,01/02/2020,0:01:18,47,01/02/2020,0:12:39
...,...,...,...,...,...,...,...,...,...
71327647,M,25.0,474,12.0,2011-07-30,23:44:04.383000,20,2011-07-31,01:42:19.887000
71327648,F,37.0,476,76.0,2011-07-30,23:55:36.703000,64,2011-07-31,00:01:11.147000
71327649,M,25.0,491,48.0,2011-07-30,23:56:50.237000,26,2011-07-31,00:01:42.333000
71327650,M,31.0,868,76.0,2011-07-30,23:56:51.347000,64,2011-07-31,00:01:19.100000


## Notes

- One file (2020-12.csv) failed entirely. 
- There are some inconsistences in columns that lead to 6 extra columns on the right.
- Data is captured inconsistently (not that date stamps have different formatting, for example. 

We'll deal with those below in the Data Cleanup section. 

In [4]:
# Export unified CSV file to master CSV file for backup

# tic = timeit.default_timer()                            # Monitor performance

# ecobici_rides_df.to_csv("Gitignore-data-output-files/ecobici_ride_data-unified.csv", index=False) 

# toc = timeit.default_timer()                            # Monitor performance
# print(f'Time (in seconds) to export unified CSV file: {round(toc - tic, 2)}')


Time (in seconds) to export unified CSV file: 316.09


# Data Cleanup

The intention of following fields is to understand where the extra columns come from, and how to clean up data. 

In [19]:
# Create a copy to experiment with

tic = timeit.default_timer()                            # Monitor performance

ecobici_rides_df_test = ecobici_rides_df.copy()

toc = timeit.default_timer() 
print(f'Processing time (in seconds): {round(toc - tic, 2)}')

ecobici_rides_df_test

Processing time (in seconds): 27.12


Unnamed: 0,Genero_Usuario,Edad_Usuario,Bici,Ciclo_Estacion_Retiro,Fecha_Retiro,Hora_Retiro,Ciclo_Estacion_Arribo,Fecha_Arribo,Hora_Arribo,Ciclo_EstacionArribo,Fecha Arribo,Hora Arribo,Hora_Retiro.1,Unnamed: 9,BikeID
0,M,44.0,4357,442.0,01/02/2020,0:00:38,116,01/02/2020,0:35:17,,,,,,
1,M,22.0,12083,66.0,01/02/2020,0:00:53,37,01/02/2020,0:06:23,,,,,,
2,M,29.0,11562,331.0,01/02/2020,0:00:55,341,01/02/2020,0:26:47,,,,,,
3,M,27.0,10206,164.0,01/02/2020,0:01:18,35,01/02/2020,0:16:51,,,,,,
4,M,27.0,10101,120.0,01/02/2020,0:01:18,47,01/02/2020,0:12:39,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
71327648,M,25.0,474,12.0,2011-07-30,23:44:04.383000,20,2011-07-31,01:42:19.887000,,,,,,
71327649,F,37.0,476,76.0,2011-07-30,23:55:36.703000,64,2011-07-31,00:01:11.147000,,,,,,
71327650,M,25.0,491,48.0,2011-07-30,23:56:50.237000,26,2011-07-31,00:01:42.333000,,,,,,
71327651,M,31.0,868,76.0,2011-07-30,23:56:51.347000,64,2011-07-31,00:01:19.100000,,,,,,


In [5]:
# Explore what column "Ciclo_EstacionArribo" comes from

# CONCLUSION February, March and April 2021 had an error and the column Ciclo_Estacion_Arribo was registered
# as Ciclo_EstacionArribo (missing underscore)

tic = timeit.default_timer()                            # Monitor performance

filtered_df = ecobici_rides_df_test.dropna(axis=0, subset=["Ciclo_EstacionArribo"])

toc = timeit.default_timer() 
print(f'{len(filtered_df)} rows')# Monitor performance
print(f'Processing time (in seconds): {round(toc - tic, 2)}')

filtered_df

1445219 rows
Processing time (in seconds): 97.28


Unnamed: 0,Genero_Usuario,Edad_Usuario,Bici,Ciclo_Estacion_Retiro,Fecha_Retiro,Hora_Retiro,Ciclo_Estacion_Arribo,Fecha_Arribo,Hora_Arribo,Ciclo_EstacionArribo,Fecha Arribo,Hora Arribo,Hora_Retiro.1,Unnamed: 9,BikeID
41819667,M,28.0,7170,217.0,28/02/2021,11:51:53,,,,6.0,01/03/2021,12:00:2,,,
41819668,M,40.0,6980,62.0,28/02/2021,11:00:41,,,,62.0,01/03/2021,12:03:33,,,
41819669,M,26.0,15303,257.0,28/02/2021,11:52:09,,,,257.0,01/03/2021,12:04:16,,,
41819670,F,73.0,11960,36.0,28/02/2021,11:59:53,,,,57.0,01/03/2021,12:04:39,,,
41819671,M,48.0,12401,53.0,01/03/2021,12:01:50,,,,38.0,01/03/2021,12:05:37,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
58572207,F,26.0,12338,53.0,30/04/2021,11:54:07,,,,68.0,30/04/2021,11:58:22,,,
58572208,M,31.0,11398,159.0,30/04/2021,11:55:10,,,,143.0,30/04/2021,11:58:57,,,
58572209,M,64.0,11129,233.0,30/04/2021,11:28:47,,,,27.0,30/04/2021,11:59:43,,,
58572210,M,29.0,8661,297.0,30/04/2021,11:48:21,,,,405.0,30/04/2021,11:59:48,,,


In [6]:
# Explore what column "Fecha Arribo" comes from

# CONCLUSION February, March and April 2021 had an error and the column Fecha_Arribo was registered
# as Fecha Arribo (missing underscore)

tic = timeit.default_timer()                            # Monitor performance

filtered_df = ecobici_rides_df_test.dropna(axis=0, subset=["Fecha Arribo"])

toc = timeit.default_timer() 
print(f'Processing time (in seconds): {round(toc - tic, 2)}')

filtered_df

Processing time (in seconds): 3.87


Unnamed: 0,Genero_Usuario,Edad_Usuario,Bici,Ciclo_Estacion_Retiro,Fecha_Retiro,Hora_Retiro,Ciclo_Estacion_Arribo,Fecha_Arribo,Hora_Arribo,Ciclo_EstacionArribo,Fecha Arribo,Hora Arribo,Hora_Retiro.1,Unnamed: 9,BikeID
41819667,M,28.0,7170,217.0,28/02/2021,11:51:53,,,,6.0,01/03/2021,12:00:2,,,
41819668,M,40.0,6980,62.0,28/02/2021,11:00:41,,,,62.0,01/03/2021,12:03:33,,,
41819669,M,26.0,15303,257.0,28/02/2021,11:52:09,,,,257.0,01/03/2021,12:04:16,,,
41819670,F,73.0,11960,36.0,28/02/2021,11:59:53,,,,57.0,01/03/2021,12:04:39,,,
41819671,M,48.0,12401,53.0,01/03/2021,12:01:50,,,,38.0,01/03/2021,12:05:37,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
58572207,F,26.0,12338,53.0,30/04/2021,11:54:07,,,,68.0,30/04/2021,11:58:22,,,
58572208,M,31.0,11398,159.0,30/04/2021,11:55:10,,,,143.0,30/04/2021,11:58:57,,,
58572209,M,64.0,11129,233.0,30/04/2021,11:28:47,,,,27.0,30/04/2021,11:59:43,,,
58572210,M,29.0,8661,297.0,30/04/2021,11:48:21,,,,405.0,30/04/2021,11:59:48,,,


In [7]:
# Explore what column "Hora Arribo" comes from

# CONCLUSION February, March and April 2021 had an error and the column Hora_Arribo was registered
# as Fecha Arribo (missing underscore)


tic = timeit.default_timer()                            # Monitor performance

filtered_df = ecobici_rides_df_test.dropna(axis=0, subset=["Hora Arribo"])

toc = timeit.default_timer() 
print(f'Processing time (in seconds): {round(toc - tic, 2)}')

filtered_df

Processing time (in seconds): 3.43


Unnamed: 0,Genero_Usuario,Edad_Usuario,Bici,Ciclo_Estacion_Retiro,Fecha_Retiro,Hora_Retiro,Ciclo_Estacion_Arribo,Fecha_Arribo,Hora_Arribo,Ciclo_EstacionArribo,Fecha Arribo,Hora Arribo,Hora_Retiro.1,Unnamed: 9,BikeID
41819667,M,28.0,7170,217.0,28/02/2021,11:51:53,,,,6.0,01/03/2021,12:00:2,,,
41819668,M,40.0,6980,62.0,28/02/2021,11:00:41,,,,62.0,01/03/2021,12:03:33,,,
41819669,M,26.0,15303,257.0,28/02/2021,11:52:09,,,,257.0,01/03/2021,12:04:16,,,
41819670,F,73.0,11960,36.0,28/02/2021,11:59:53,,,,57.0,01/03/2021,12:04:39,,,
41819671,M,48.0,12401,53.0,01/03/2021,12:01:50,,,,38.0,01/03/2021,12:05:37,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
58572207,F,26.0,12338,53.0,30/04/2021,11:54:07,,,,68.0,30/04/2021,11:58:22,,,
58572208,M,31.0,11398,159.0,30/04/2021,11:55:10,,,,143.0,30/04/2021,11:58:57,,,
58572209,M,64.0,11129,233.0,30/04/2021,11:28:47,,,,27.0,30/04/2021,11:59:43,,,
58572210,M,29.0,8661,297.0,30/04/2021,11:48:21,,,,405.0,30/04/2021,11:59:48,,,


In [20]:
# Take all the data from 'Ciclo_EstacionArribo','Fecha Arribo', 'Hora Arribo' columns and shift it 3 columns
#  to the left

tic = timeit.default_timer()                            # Monitor performance

ecobici_rides_df_test.iloc[41819667:58572212,9:12] = ecobici_rides_df_test.iloc[41819667:58572212,9:12].shift(-3,axis=1)
df

toc = timeit.default_timer() 
print(f'Processing time (in seconds): {round(toc - tic, 2)}')

ecobici_rides_df_test


Processing time (in seconds): 79.48


Unnamed: 0,Genero_Usuario,Edad_Usuario,Bici,Ciclo_Estacion_Retiro,Fecha_Retiro,Hora_Retiro,Ciclo_Estacion_Arribo,Fecha_Arribo,Hora_Arribo,Ciclo_EstacionArribo,Fecha Arribo,Hora Arribo,Hora_Retiro.1,Unnamed: 9,BikeID
0,M,44.0,4357,442.0,01/02/2020,0:00:38,116,01/02/2020,0:35:17,,,,,,
1,M,22.0,12083,66.0,01/02/2020,0:00:53,37,01/02/2020,0:06:23,,,,,,
2,M,29.0,11562,331.0,01/02/2020,0:00:55,341,01/02/2020,0:26:47,,,,,,
3,M,27.0,10206,164.0,01/02/2020,0:01:18,35,01/02/2020,0:16:51,,,,,,
4,M,27.0,10101,120.0,01/02/2020,0:01:18,47,01/02/2020,0:12:39,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
71327648,M,25.0,474,12.0,2011-07-30,23:44:04.383000,20,2011-07-31,01:42:19.887000,,,,,,
71327649,F,37.0,476,76.0,2011-07-30,23:55:36.703000,64,2011-07-31,00:01:11.147000,,,,,,
71327650,M,25.0,491,48.0,2011-07-30,23:56:50.237000,26,2011-07-31,00:01:42.333000,,,,,,
71327651,M,31.0,868,76.0,2011-07-30,23:56:51.347000,64,2011-07-31,00:01:19.100000,,,,,,


In [21]:
# Test to see if shift worked

tic = timeit.default_timer()                            # Monitor performance

filtered_df = ecobici_rides_df_test.dropna(axis=0, subset=["Fecha Arribo"])

toc = timeit.default_timer() 
print(f'Processing time (in seconds): {round(toc - tic, 2)}')

filtered_df

Processing time (in seconds): 3.61


Unnamed: 0,Genero_Usuario,Edad_Usuario,Bici,Ciclo_Estacion_Retiro,Fecha_Retiro,Hora_Retiro,Ciclo_Estacion_Arribo,Fecha_Arribo,Hora_Arribo,Ciclo_EstacionArribo,Fecha Arribo,Hora Arribo,Hora_Retiro.1,Unnamed: 9,BikeID
58572212,M,27.0,10691,463.0,09/04/2021,08:05:03,,,,480.0,09/04/2021,08:19:59,,,
58572213,M,36.0,7318,142.0,09/04/2021,08:06:39,,,,125.0,09/04/2021,08:20:04,,,
58572214,M,28.0,9575,99.0,09/04/2021,08:03:11,,,,92.0,09/04/2021,08:20:10,,,
58572215,F,37.0,15320,459.0,09/04/2021,06:37:20,,,,241.0,09/04/2021,08:20:11,,,
58572216,F,29.0,6826,30.0,09/04/2021,08:15:20,,,,28.0,09/04/2021,08:20:11,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
58829190,F,26.0,12338,53.0,30/04/2021,11:54:07,,,,68.0,30/04/2021,11:58:22,,,
58829191,M,31.0,11398,159.0,30/04/2021,11:55:10,,,,143.0,30/04/2021,11:58:57,,,
58829192,M,64.0,11129,233.0,30/04/2021,11:28:47,,,,27.0,30/04/2021,11:59:43,,,
58829193,M,29.0,8661,297.0,30/04/2021,11:48:21,,,,405.0,30/04/2021,11:59:48,,,


In [11]:
# Explore what column "Hora_Retiro.1" comes from

# CONCLUSION In May and June 2021 the column Hora_Retiro was duplicated. It can be deleted 

tic = timeit.default_timer()                            # Monitor performance

filtered_df = ecobici_rides_df_test.dropna(axis=0, subset=["Hora_Retiro.1"])

toc = timeit.default_timer() 

print(f'Processing time (in seconds): {round(toc - tic, 2)}')

filtered_df

Processing time (in seconds): 4.02


Unnamed: 0,Genero_Usuario,Edad_Usuario,Bici,Ciclo_Estacion_Retiro,Fecha_Retiro,Hora_Retiro,Ciclo_Estacion_Arribo,Fecha_Arribo,Hora_Arribo,Ciclo_EstacionArribo,Fecha Arribo,Hora Arribo,Hora_Retiro.1,Unnamed: 9,BikeID
48247850,,27.0,10961,141.0,31/05/2021,23:52:15,,,,,,,00:02:24,,
48247851,M,41.0,10381,254.0,31/05/2021,23:41:31,,,,,,,00:01:08,,
48247852,F,30.0,12040,49.0,31/05/2021,23:44:32,,,,,,,00:10:21,,
48247853,M,47.0,7709,153.0,31/05/2021,23:56:10,,,,,,,00:01:14,,
48247854,M,30.0,10255,435.0,31/05/2021,23:50:45,,,,,,,00:06:10,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
48613841,M,33.0,7199,21.0,30/06/2021,23:24:15,,,,,,,23:31:08,,
48613842,M,42.0,12232,295.0,30/06/2021,23:29:29,,,,,,,23:48:45,,
48613843,M,37.0,10093,44.0,30/06/2021,23:29:07,,,,,,,23:39:03,,
48613844,,39.0,10558,54.0,30/06/2021,22:50:14,,,,,,,23:06:11,,


In [12]:
# Explore what column "Unnamed: 9" comes from

# CONCLUSION Unclear where this comes from, doesn't seem to matter. Only one row seems to be affected and it 
# seems to have an error in the Fecha_arribo cell. Both the row and the column can can be deleted 

tic = timeit.default_timer()                            # Monitor performance

filtered_df = ecobici_rides_df_test.dropna(axis=0, subset=["Unnamed: 9"])

toc = timeit.default_timer() 
print(f'Processing time (in seconds): {round(toc - tic, 2)}')

filtered_df

Processing time (in seconds): 3.94


Unnamed: 0,Genero_Usuario,Edad_Usuario,Bici,Ciclo_Estacion_Retiro,Fecha_Retiro,Hora_Retiro,Ciclo_Estacion_Arribo,Fecha_Arribo,Hora_Arribo,Ciclo_EstacionArribo,Fecha Arribo,Hora Arribo,Hora_Retiro.1,Unnamed: 9,BikeID
51668973,M,44.0,9944,267.0,06/03/2019,08:06:23,32:11.8,10,00:00:00,,,,,08:41:11,


In [22]:
# Remove problematic row index 51668973

tic = timeit.default_timer()                            # Monitor performance

ecobici_rides_df_test.drop(51668973, inplace=True, axis=0)

toc = timeit.default_timer() 
print(f'Processing time (in seconds): {round(toc - tic, 2)}')

ecobici_rides_df_test

Processing time (in seconds): 49.79


Unnamed: 0,Genero_Usuario,Edad_Usuario,Bici,Ciclo_Estacion_Retiro,Fecha_Retiro,Hora_Retiro,Ciclo_Estacion_Arribo,Fecha_Arribo,Hora_Arribo,Ciclo_EstacionArribo,Fecha Arribo,Hora Arribo,Hora_Retiro.1,Unnamed: 9,BikeID
0,M,44.0,4357,442.0,01/02/2020,0:00:38,116,01/02/2020,0:35:17,,,,,,
1,M,22.0,12083,66.0,01/02/2020,0:00:53,37,01/02/2020,0:06:23,,,,,,
2,M,29.0,11562,331.0,01/02/2020,0:00:55,341,01/02/2020,0:26:47,,,,,,
3,M,27.0,10206,164.0,01/02/2020,0:01:18,35,01/02/2020,0:16:51,,,,,,
4,M,27.0,10101,120.0,01/02/2020,0:01:18,47,01/02/2020,0:12:39,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
71327648,M,25.0,474,12.0,2011-07-30,23:44:04.383000,20,2011-07-31,01:42:19.887000,,,,,,
71327649,F,37.0,476,76.0,2011-07-30,23:55:36.703000,64,2011-07-31,00:01:11.147000,,,,,,
71327650,M,25.0,491,48.0,2011-07-30,23:56:50.237000,26,2011-07-31,00:01:42.333000,,,,,,
71327651,M,31.0,868,76.0,2011-07-30,23:56:51.347000,64,2011-07-31,00:01:19.100000,,,,,,


In [23]:
# Explore what column "BikeID" comes from

# CONCLUSION: BikeID was only used in August 2020 and doesn't seem useful for this project. It can be deleted. 

tic = timeit.default_timer()                            # Monitor performance

filtered_df = ecobici_rides_df_test.dropna(axis=0, subset=["BikeID"])

toc = timeit.default_timer() 
print(f'Processing time (in seconds): {round(toc - tic, 2)}')

filtered_df

Processing time (in seconds): 2.03


Unnamed: 0,Genero_Usuario,Edad_Usuario,Bici,Ciclo_Estacion_Retiro,Fecha_Retiro,Hora_Retiro,Ciclo_Estacion_Arribo,Fecha_Arribo,Hora_Arribo,Ciclo_EstacionArribo,Fecha Arribo,Hora Arribo,Hora_Retiro.1,Unnamed: 9,BikeID
61197896,M,25.0,4160,465.0,01/08/2020,6:08:43,463,01/08/2020,6:14:31,,,,,,4458.0
61197897,M,41.0,8057,167.0,01/08/2020,6:40:28,286,01/08/2020,6:44:41,,,,,,8328.0
61197898,,29.0,11018,208.0,01/08/2020,6:39:21,209,01/08/2020,6:48:36,,,,,,11502.0
61197899,M,43.0,12531,156.0,01/08/2020,7:44:01,363,01/08/2020,8:05:47,,,,,,14103.0
61197900,F,41.0,9746,474.0,01/08/2020,7:55:46,211,01/08/2020,8:09:25,,,,,,10207.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
61464831,,25.0,2053,355.0,31/08/2020,19:10:10,359,31/08/2020,19:18:10,,,,,,2097.0
61464832,,27.0,6932,355.0,31/08/2020,19:03:42,154,31/08/2020,19:22:26,,,,,,7298.0
61464833,M,35.0,7184,382.0,31/08/2020,19:04:07,382,31/08/2020,19:24:55,,,,,,7537.0
61464834,F,31.0,9832,195.0,31/08/2020,18:57:51,319,31/08/2020,19:38:07,,,,,,10294.0


In [24]:
# Drop 6 columns now that they are empty or of no use. 

tic = timeit.default_timer()                            # Monitor performance

ecobici_rides_df_test.drop(['Ciclo_EstacionArribo', 
                       'Fecha Arribo', 
                       'Hora Arribo', 
                       'Hora_Retiro.1', 
                       'Unnamed: 9', 
                       'BikeID'], 
                      inplace=True, axis=1)

toc = timeit.default_timer() 
print(f'Processing time (in seconds): {round(toc - tic, 2)}')

ecobici_rides_df_test

Processing time (in seconds): 18.93


Unnamed: 0,Genero_Usuario,Edad_Usuario,Bici,Ciclo_Estacion_Retiro,Fecha_Retiro,Hora_Retiro,Ciclo_Estacion_Arribo,Fecha_Arribo,Hora_Arribo
0,M,44.0,4357,442.0,01/02/2020,0:00:38,116,01/02/2020,0:35:17
1,M,22.0,12083,66.0,01/02/2020,0:00:53,37,01/02/2020,0:06:23
2,M,29.0,11562,331.0,01/02/2020,0:00:55,341,01/02/2020,0:26:47
3,M,27.0,10206,164.0,01/02/2020,0:01:18,35,01/02/2020,0:16:51
4,M,27.0,10101,120.0,01/02/2020,0:01:18,47,01/02/2020,0:12:39
...,...,...,...,...,...,...,...,...,...
71327648,M,25.0,474,12.0,2011-07-30,23:44:04.383000,20,2011-07-31,01:42:19.887000
71327649,F,37.0,476,76.0,2011-07-30,23:55:36.703000,64,2011-07-31,00:01:11.147000
71327650,M,25.0,491,48.0,2011-07-30,23:56:50.237000,26,2011-07-31,00:01:42.333000
71327651,M,31.0,868,76.0,2011-07-30,23:56:51.347000,64,2011-07-31,00:01:19.100000


In [25]:
# Export clean CSV file to master CSV file

tic = timeit.default_timer()                            # Monitor performance

ecobici_rides_df_test.to_csv("Gitignore-data-output-files/ecobici_ride_data-unified.csv", index=False) 

toc = timeit.default_timer()                            # Monitor performance
print(f'Time (in seconds) to export unified CSV file: {round(toc - tic, 2)}')

ecobici_rides_df_test.head()

Time (in seconds) to export unified CSV file: 252.78


Unnamed: 0,Genero_Usuario,Edad_Usuario,Bici,Ciclo_Estacion_Retiro,Fecha_Retiro,Hora_Retiro,Ciclo_Estacion_Arribo,Fecha_Arribo,Hora_Arribo
0,M,44.0,4357,442.0,01/02/2020,0:00:38,116,01/02/2020,0:35:17
1,M,22.0,12083,66.0,01/02/2020,0:00:53,37,01/02/2020,0:06:23
2,M,29.0,11562,331.0,01/02/2020,0:00:55,341,01/02/2020,0:26:47
3,M,27.0,10206,164.0,01/02/2020,0:01:18,35,01/02/2020,0:16:51
4,M,27.0,10101,120.0,01/02/2020,0:01:18,47,01/02/2020,0:12:39


In [26]:
# Start again from CSV backup

tic = timeit.default_timer()                            # Monitor performance

ecobici_rides_df = pd.read_csv("Gitignore-data-output-files/ecobici_ride_data-unified.csv", index_col=None, header=0)

toc = timeit.default_timer()                            # Monitor performance
print(f'Time (in seconds) to import CSV file: {round(toc - tic, 2)}')

ecobici_rides_df.head()

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


Time (in seconds) to import CSV file: 169.61


Unnamed: 0,Genero_Usuario,Edad_Usuario,Bici,Ciclo_Estacion_Retiro,Fecha_Retiro,Hora_Retiro,Ciclo_Estacion_Arribo,Fecha_Arribo,Hora_Arribo
0,M,44.0,4357,442.0,01/02/2020,0:00:38,116,01/02/2020,0:35:17
1,M,22.0,12083,66.0,01/02/2020,0:00:53,37,01/02/2020,0:06:23
2,M,29.0,11562,331.0,01/02/2020,0:00:55,341,01/02/2020,0:26:47
3,M,27.0,10206,164.0,01/02/2020,0:01:18,35,01/02/2020,0:16:51
4,M,27.0,10101,120.0,01/02/2020,0:01:18,47,01/02/2020,0:12:39


In [27]:
# Determine data types in each column

for col in ecobici_rides_df.columns:
    print(f'{col}, type: {type(ecobici_rides_df_test.loc[0, col])}')
    

Genero_Usuario, type: <class 'str'>
Edad_Usuario, type: <class 'numpy.float64'>
Bici, type: <class 'int'>
Ciclo_Estacion_Retiro, type: <class 'numpy.float64'>
Fecha_Retiro, type: <class 'str'>
Hora_Retiro, type: <class 'str'>
Ciclo_Estacion_Arribo, type: <class 'int'>
Fecha_Arribo, type: <class 'str'>
Hora_Arribo, type: <class 'str'>


In [42]:
# # Grab a slice to experiment with

# ecobici_rides_df_slice = ecobici_rides_df.iloc[0:100000, :].copy()
# ecobici_rides_df_slice

In [None]:
# Add Retiro_Timestamp column

tic = timeit.default_timer()                            # Monitor performance

ecobici_rides_df_slice["Reitor_Timestamp"] = np.nan

toc = timeit.default_timer()                            # Monitor performance
print(f'Time (in seconds) to add column {round(toc - tic, 2)}')

ecobici_rides_df.head()

In [46]:
ecobici_rides_df

Unnamed: 0,Genero_Usuario,Edad_Usuario,Bici,Ciclo_Estacion_Retiro,Fecha_Retiro,Hora_Retiro,Ciclo_Estacion_Arribo,Fecha_Arribo,Hora_Arribo,Retiro_Timestamp
0,M,44.0,4357,442.0,01/02/2020,0:00:38,116,01/02/2020,0:35:17,
1,M,22.0,12083,66.0,01/02/2020,0:00:53,37,01/02/2020,0:06:23,
2,M,29.0,11562,331.0,01/02/2020,0:00:55,341,01/02/2020,0:26:47,
3,M,27.0,10206,164.0,01/02/2020,0:01:18,35,01/02/2020,0:16:51,
4,M,27.0,10101,120.0,01/02/2020,0:01:18,47,01/02/2020,0:12:39,
...,...,...,...,...,...,...,...,...,...,...
71327647,M,25.0,474,12.0,2011-07-30,23:44:04.383000,20,2011-07-31,01:42:19.887000,
71327648,F,37.0,476,76.0,2011-07-30,23:55:36.703000,64,2011-07-31,00:01:11.147000,
71327649,M,25.0,491,48.0,2011-07-30,23:56:50.237000,26,2011-07-31,00:01:42.333000,
71327650,M,31.0,868,76.0,2011-07-30,23:56:51.347000,64,2011-07-31,00:01:19.100000,


In [None]:
# Combine Fecha_Retiro and Hora_Retiro string and convert to datetime

tic = timeit.default_timer()                            # Monitor performance

for index, row in ecobici_rides_df.iterrows():
               
    try: 
        complete_datetime_str = row['Fecha_Retiro'] + ' ' + row['Hora_Retiro']
        
        ecobici_rides_df.at[index, 'Retiro_Timestamp'] = pd.to_datetime(complete_datetime_str)
        
        if index % 1000000 == 0: print(f'We\'re up to {index}!')
               
    except: 
        print(f'ERROR: ROW {index} DROPPED')
                                          
toc = timeit.default_timer()                            # Monitor performance
print(f'Time (in seconds) to process data: {round(toc - tic, 2)}')

ecobici_rides_df.head()

We're up to 0!
We're up to 1000000!
We're up to 2000000!
We're up to 3000000!
We're up to 4000000!
We're up to 5000000!
We're up to 6000000!
We're up to 7000000!
We're up to 8000000!
We're up to 9000000!
We're up to 10000000!
We're up to 11000000!
We're up to 12000000!
We're up to 13000000!
ERROR: ROW 13954681 DROPPED
ERROR: ROW 13958316 DROPPED
ERROR: ROW 13959136 DROPPED
ERROR: ROW 13973821 DROPPED
We're up to 14000000!
ERROR: ROW 14094556 DROPPED
ERROR: ROW 14094582 DROPPED
ERROR: ROW 14235767 DROPPED
We're up to 15000000!
We're up to 16000000!


In [35]:
# Add Arribo_Timestamp column

tic = timeit.default_timer()                            # Monitor performance

ecobici_rides_df["Arribo_Timestamp"] = np.nan

toc = timeit.default_timer()                            # Monitor performance
print(f'Time (in seconds) to add column {round(toc - tic, 2)}')

ecobici_rides_df.head()

Time (in seconds) to add column 0.0


Unnamed: 0,Genero_Usuario,Edad_Usuario,Bici,Ciclo_Estacion_Retiro,Fecha_Retiro,Hora_Retiro,Ciclo_Estacion_Arribo,Fecha_Arribo,Hora_Arribo,Retiro_Timestamp,Arribo_Timestamp
71227652,M,51.0,371,24.0,2011-07-15,12:15:42.097000,85,2011-07-15,12:27:28.323000,2011-07-15 12:15:42.097,
71227653,M,24.0,448,3.0,2011-07-15,12:16:01.347000,55,2011-07-15,12:28:25.167000,2011-07-15 12:16:01.347,
71227654,M,57.0,1210,6.0,2011-07-15,12:16:10.393000,18,2011-07-15,12:20:39.027000,2011-07-15 12:16:10.393,
71227655,M,28.0,660,52.0,2011-07-15,12:16:10.500000,55,2011-07-15,12:21:16.713000,2011-07-15 12:16:10.500,
71227656,F,31.0,437,44.0,2011-07-15,12:16:13.970000,37,2011-07-15,12:19:05.887000,2011-07-15 12:16:13.970,


In [40]:
# Combine Fecha_Arribo and Hora_Arribo strings and convert to datetime

tic = timeit.default_timer()                            # Monitor performance

for index, row in ecobici_rides_df.iterrows():
               
    try: 
        complete_datetime_str = row['Fecha_Arribo'] + ' ' + row['Hora_Arribo']
        
        ecobici_rides_df.at[index, 'Arribo_Timestamp'] = pd.to_datetime(complete_datetime_str)
        
        if index % 1000000 == 0: print(f'We\'re up to {index}!')
               
    except: 
        print(f'ERROR: ROW {index} DROPPED')
                                          
toc = timeit.default_timer()                            # Monitor performance
print(f'Time (in seconds) to process data: {round(toc - tic, 2)}')

ecobici_rides_df.head()

We're up to 0!
We're up to 10000!
We're up to 20000!
We're up to 30000!
We're up to 40000!
We're up to 50000!
We're up to 60000!
We're up to 70000!
We're up to 80000!
We're up to 90000!
Time (in seconds) to process data: 33.89


Unnamed: 0,Genero_Usuario,Edad_Usuario,Bici,Ciclo_Estacion_Retiro,Fecha_Retiro,Hora_Retiro,Ciclo_Estacion_Arribo,Fecha_Arribo,Hora_Arribo,Retiro_Timestamp,Arribo_Timestamp
0,M,44.0,4357,442.0,01/02/2020,0:00:38,116,01/02/2020,0:35:17,2020-01-02 00:00:38,2020-01-02 00:35:17
1,M,22.0,12083,66.0,01/02/2020,0:00:53,37,01/02/2020,0:06:23,2020-01-02 00:00:53,2020-01-02 00:06:23
2,M,29.0,11562,331.0,01/02/2020,0:00:55,341,01/02/2020,0:26:47,2020-01-02 00:00:55,2020-01-02 00:26:47
3,M,27.0,10206,164.0,01/02/2020,0:01:18,35,01/02/2020,0:16:51,2020-01-02 00:01:18,2020-01-02 00:16:51
4,M,27.0,10101,120.0,01/02/2020,0:01:18,47,01/02/2020,0:12:39,2020-01-02 00:01:18,2020-01-02 00:12:39


In [41]:
# Drop 4 columns now that they are of no use.

tic = timeit.default_timer()                            # Monitor performance

ecobici_rides_df_slice.drop(['Fecha_Retiro',
                            'Hora_Retiro',
                            'Fecha_Arribo', 
                            'Hora_Arribo'], 
                      inplace=True, axis=1)

toc = timeit.default_timer() 
print(f'Processing time (in seconds): {round(toc - tic, 2)}')

ecobici_rides_df_slice

Processing time (in seconds): 0.04


Unnamed: 0,Genero_Usuario,Edad_Usuario,Bici,Ciclo_Estacion_Retiro,Ciclo_Estacion_Arribo,Retiro_Timestamp,Arribo_Timestamp
0,M,44.0,4357,442.0,116,2020-01-02 00:00:38,2020-01-02 00:35:17
1,M,22.0,12083,66.0,37,2020-01-02 00:00:53,2020-01-02 00:06:23
2,M,29.0,11562,331.0,341,2020-01-02 00:00:55,2020-01-02 00:26:47
3,M,27.0,10206,164.0,35,2020-01-02 00:01:18,2020-01-02 00:16:51
4,M,27.0,10101,120.0,47,2020-01-02 00:01:18,2020-01-02 00:12:39
...,...,...,...,...,...,...,...
99995,M,34.0,11769,439.0,340,2020-06-02 12:44:45,2020-06-02 12:58:47
99996,M,42.0,12501,305.0,169,2020-06-02 12:44:50,2020-06-02 12:52:56
99997,M,36.0,11580,398.0,295,2020-06-02 12:44:54,2020-06-02 12:58:47
99998,F,40.0,11714,67.0,58,2020-06-02 12:44:55,2020-06-02 12:54:15


In [None]:
# Export clean CSV file to master CSV file

tic = timeit.default_timer()                            # Monitor performance

ecobici_rides_df.to_csv("Gitignore-data-output-files/ecobici_ride_data-unified.csv", index=False) 

toc = timeit.default_timer()                            # Monitor performance
print(f'Time (in seconds) to export unified CSV file: {round(toc - tic, 2)}')

ecobici_rides_df.head()