In [97]:
# Dependencies

import pandas as pd
import numpy as np
import glob
import os
import timeit                                # To check performance
from datetime import datetime as dt

# Instructions: 

The unified CSV file is nearly 5GBs. Github can't store it. If you try to push it, it will produce an error. 

If you want to work with the data, you can run this script, and it will save the unified CSV file in the sub folder "Gitignore-data-output-files."

BEFORE doing that, open your .gitignore file in the repository, and make sure the following is one of the line items to ignore:

"# Huge data files to ignore:

Resources/Data/Gitignore-data-output-files/ecobici_ride_data-unified.csv"

Save your .gitignore file, push that change. Then you should be able to produce the unified CSV file without worry of your machine trying to push it to the repository. -Ken

In [42]:
# Concatenate csv files

tic = timeit.default_timer()                            # Monitor performance

path = r'/Users/kennethandersen/Documents/GitHub/P2-Ecobici_insights_and_recommendations/Resources/Data'
all_files = glob.glob(path + "/*.csv")

csv_file_list = []
i = 1

for filename in all_files:
    try:
        df = pd.read_csv(filename, index_col=None, header=0)
        csv_file_list.append(df)
#         print (f'File {i} added')
    except:
        print(f'Concatenate error {filename}, could not be added')
        pass
    i+= 1

ecobici_rides_df = pd.concat(csv_file_list, axis=0, ignore_index=True)

toc = timeit.default_timer()                            # Monitor performance
print(f'Time (in seconds) to extract and concatenate excel files: {round(toc - tic, 2)}')
print(f'Number of rows: {len(ecobici_rides_df)}')

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,
  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


Time (in seconds) to extract and concatenate excel files: 108.74
Number of rows: 71327653


In [43]:
# Display dataframe
ecobici_rides_df

Unnamed: 0,Genero_Usuario,Edad_Usuario,Bici,Ciclo_Estacion_Retiro,Fecha_Retiro,Hora_Retiro,Ciclo_Estacion_Arribo,Fecha_Arribo,Hora_Arribo,Hora_Retiro.1,Unnamed: 9,BikeID
0,M,44.0,4357,442.0,01/02/2020,0:00:38,116,01/02/2020,0:35:17,,,
1,M,22.0,12083,66.0,01/02/2020,0:00:53,37,01/02/2020,0:06:23,,,
2,M,29.0,11562,331.0,01/02/2020,0:00:55,341,01/02/2020,0:26:47,,,
3,M,27.0,10206,164.0,01/02/2020,0:01:18,35,01/02/2020,0:16:51,,,
4,M,27.0,10101,120.0,01/02/2020,0:01:18,47,01/02/2020,0:12:39,,,
...,...,...,...,...,...,...,...,...,...,...,...,...
71327648,M,25.0,474,12.0,2011-07-30,23:44:04.383000,20,2011-07-31,01:42:19.887000,,,
71327649,F,37.0,476,76.0,2011-07-30,23:55:36.703000,64,2011-07-31,00:01:11.147000,,,
71327650,M,25.0,491,48.0,2011-07-30,23:56:50.237000,26,2011-07-31,00:01:42.333000,,,
71327651,M,31.0,868,76.0,2011-07-30,23:56:51.347000,64,2011-07-31,00:01:19.100000,,,


## Notes

- One file (2020-12.csv) failed entirely. 
- There are some inconsistences in columns that lead to 6 extra columns on the right.
- Data is captured inconsistently (not that date stamps have different formatting, for example. 

We'll deal with those below in the Data Cleanup section. 

# Data Cleanup

The intention of following fields is to understand where the extra columns come from, and how to clean up data. 

In [44]:
# Create a copy to experiment with

tic = timeit.default_timer()                            # Monitor performance

ecobici_rides_df_test = ecobici_rides_df.copy()

toc = timeit.default_timer() 
print(f'Processing time (in seconds): {round(toc - tic, 2)}')

ecobici_rides_df_test

Processing time (in seconds): 19.17


Unnamed: 0,Genero_Usuario,Edad_Usuario,Bici,Ciclo_Estacion_Retiro,Fecha_Retiro,Hora_Retiro,Ciclo_Estacion_Arribo,Fecha_Arribo,Hora_Arribo,Hora_Retiro.1,Unnamed: 9,BikeID
0,M,44.0,4357,442.0,01/02/2020,0:00:38,116,01/02/2020,0:35:17,,,
1,M,22.0,12083,66.0,01/02/2020,0:00:53,37,01/02/2020,0:06:23,,,
2,M,29.0,11562,331.0,01/02/2020,0:00:55,341,01/02/2020,0:26:47,,,
3,M,27.0,10206,164.0,01/02/2020,0:01:18,35,01/02/2020,0:16:51,,,
4,M,27.0,10101,120.0,01/02/2020,0:01:18,47,01/02/2020,0:12:39,,,
...,...,...,...,...,...,...,...,...,...,...,...,...
71327648,M,25.0,474,12.0,2011-07-30,23:44:04.383000,20,2011-07-31,01:42:19.887000,,,
71327649,F,37.0,476,76.0,2011-07-30,23:55:36.703000,64,2011-07-31,00:01:11.147000,,,
71327650,M,25.0,491,48.0,2011-07-30,23:56:50.237000,26,2011-07-31,00:01:42.333000,,,
71327651,M,31.0,868,76.0,2011-07-30,23:56:51.347000,64,2011-07-31,00:01:19.100000,,,


In [20]:
# Explore what column "Ciclo_EstacionArribo" comes from

# CONCLUSION February, March and April 2021 had an error and the column Ciclo_Estacion_Arribo was registered
# as Ciclo_EstacionArribo (missing underscore)

# tic = timeit.default_timer()                            # Monitor performance

# filtered_df = ecobici_rides_df_test.dropna(axis=0, subset=["Ciclo_EstacionArribo"])

# toc = timeit.default_timer() 
# print(f'{len(filtered_df)} rows')# Monitor performance
# print(f'Processing time (in seconds): {round(toc - tic, 2)}')

# filtered_df

742592 rows
Processing time (in seconds): 1.41


Unnamed: 0,Genero_Usuario,Edad_Usuario,Bici,Ciclo_Estacion_Retiro,Fecha_Retiro,Hora_Retiro,Ciclo_Estacion_Arribo,Fecha_Arribo,Hora_Arribo,Ciclo_EstacionArribo,Fecha Arribo,Hora Arribo
48504833,,27.0,10961,141.0,31/05/2021,23:52:15,,,,56.0,01/06/2021,
48504834,M,41.0,10381,254.0,31/05/2021,23:41:31,,,,103.0,01/06/2021,
48504835,F,30.0,12040,49.0,31/05/2021,23:44:32,,,,181.0,01/06/2021,
48504836,M,47.0,7709,153.0,31/05/2021,23:56:10,,,,136.0,01/06/2021,
48504837,M,30.0,10255,435.0,31/05/2021,23:50:45,,,,299.0,01/06/2021,
...,...,...,...,...,...,...,...,...,...,...,...,...
54578312,M,29.0,7972,276.0,31/05/2021,11:52:48,,,,295.0,31/05/2021,11:57:21
54578313,M,35.0,10999,276.0,31/05/2021,11:52:38,,,,295.0,31/05/2021,11:57:38
54578314,M,29.0,8512,381.0,31/05/2021,11:51:53,,,,372.0,31/05/2021,11:57:51
54578315,F,31.0,7839,146.0,31/05/2021,11:52:24,,,,73.0,31/05/2021,11:59:31


In [6]:
# Explore what column "Fecha Arribo" comes from

# CONCLUSION February, March and April 2021 had an error and the column Fecha_Arribo was registered
# as Fecha Arribo (missing underscore)

# tic = timeit.default_timer()                            # Monitor performance

# filtered_df = ecobici_rides_df_test.dropna(axis=0, subset=["Fecha Arribo"])

# toc = timeit.default_timer() 
# print(f'Processing time (in seconds): {round(toc - tic, 2)}')

# filtered_df

Processing time (in seconds): 3.87


Unnamed: 0,Genero_Usuario,Edad_Usuario,Bici,Ciclo_Estacion_Retiro,Fecha_Retiro,Hora_Retiro,Ciclo_Estacion_Arribo,Fecha_Arribo,Hora_Arribo,Ciclo_EstacionArribo,Fecha Arribo,Hora Arribo,Hora_Retiro.1,Unnamed: 9,BikeID
41819667,M,28.0,7170,217.0,28/02/2021,11:51:53,,,,6.0,01/03/2021,12:00:2,,,
41819668,M,40.0,6980,62.0,28/02/2021,11:00:41,,,,62.0,01/03/2021,12:03:33,,,
41819669,M,26.0,15303,257.0,28/02/2021,11:52:09,,,,257.0,01/03/2021,12:04:16,,,
41819670,F,73.0,11960,36.0,28/02/2021,11:59:53,,,,57.0,01/03/2021,12:04:39,,,
41819671,M,48.0,12401,53.0,01/03/2021,12:01:50,,,,38.0,01/03/2021,12:05:37,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
58572207,F,26.0,12338,53.0,30/04/2021,11:54:07,,,,68.0,30/04/2021,11:58:22,,,
58572208,M,31.0,11398,159.0,30/04/2021,11:55:10,,,,143.0,30/04/2021,11:58:57,,,
58572209,M,64.0,11129,233.0,30/04/2021,11:28:47,,,,27.0,30/04/2021,11:59:43,,,
58572210,M,29.0,8661,297.0,30/04/2021,11:48:21,,,,405.0,30/04/2021,11:59:48,,,


In [7]:
# Explore what column "Hora Arribo" comes from

# CONCLUSION February, March and April 2021 had an error and the column Hora_Arribo was registered
# as Fecha Arribo (missing underscore)


# tic = timeit.default_timer()                            # Monitor performance

# filtered_df = ecobici_rides_df_test.dropna(axis=0, subset=["Hora Arribo"])

# toc = timeit.default_timer() 
# print(f'Processing time (in seconds): {round(toc - tic, 2)}')

# filtered_df

Processing time (in seconds): 3.43


Unnamed: 0,Genero_Usuario,Edad_Usuario,Bici,Ciclo_Estacion_Retiro,Fecha_Retiro,Hora_Retiro,Ciclo_Estacion_Arribo,Fecha_Arribo,Hora_Arribo,Ciclo_EstacionArribo,Fecha Arribo,Hora Arribo,Hora_Retiro.1,Unnamed: 9,BikeID
41819667,M,28.0,7170,217.0,28/02/2021,11:51:53,,,,6.0,01/03/2021,12:00:2,,,
41819668,M,40.0,6980,62.0,28/02/2021,11:00:41,,,,62.0,01/03/2021,12:03:33,,,
41819669,M,26.0,15303,257.0,28/02/2021,11:52:09,,,,257.0,01/03/2021,12:04:16,,,
41819670,F,73.0,11960,36.0,28/02/2021,11:59:53,,,,57.0,01/03/2021,12:04:39,,,
41819671,M,48.0,12401,53.0,01/03/2021,12:01:50,,,,38.0,01/03/2021,12:05:37,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
58572207,F,26.0,12338,53.0,30/04/2021,11:54:07,,,,68.0,30/04/2021,11:58:22,,,
58572208,M,31.0,11398,159.0,30/04/2021,11:55:10,,,,143.0,30/04/2021,11:58:57,,,
58572209,M,64.0,11129,233.0,30/04/2021,11:28:47,,,,27.0,30/04/2021,11:59:43,,,
58572210,M,29.0,8661,297.0,30/04/2021,11:48:21,,,,405.0,30/04/2021,11:59:48,,,


In [20]:
# # Take all the data from 'Ciclo_EstacionArribo','Fecha Arribo', 'Hora Arribo' columns and shift it 3 columns
# #  to the left

# tic = timeit.default_timer()                            # Monitor performance

# ecobici_rides_df_test.iloc[41819667:58572212,9:12] = ecobici_rides_df_test.iloc[41819667:58572212,9:12].shift(-3,axis=1)
# df

# toc = timeit.default_timer() 
# print(f'Processing time (in seconds): {round(toc - tic, 2)}')

# ecobici_rides_df_test


Processing time (in seconds): 79.48


Unnamed: 0,Genero_Usuario,Edad_Usuario,Bici,Ciclo_Estacion_Retiro,Fecha_Retiro,Hora_Retiro,Ciclo_Estacion_Arribo,Fecha_Arribo,Hora_Arribo,Ciclo_EstacionArribo,Fecha Arribo,Hora Arribo,Hora_Retiro.1,Unnamed: 9,BikeID
0,M,44.0,4357,442.0,01/02/2020,0:00:38,116,01/02/2020,0:35:17,,,,,,
1,M,22.0,12083,66.0,01/02/2020,0:00:53,37,01/02/2020,0:06:23,,,,,,
2,M,29.0,11562,331.0,01/02/2020,0:00:55,341,01/02/2020,0:26:47,,,,,,
3,M,27.0,10206,164.0,01/02/2020,0:01:18,35,01/02/2020,0:16:51,,,,,,
4,M,27.0,10101,120.0,01/02/2020,0:01:18,47,01/02/2020,0:12:39,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
71327648,M,25.0,474,12.0,2011-07-30,23:44:04.383000,20,2011-07-31,01:42:19.887000,,,,,,
71327649,F,37.0,476,76.0,2011-07-30,23:55:36.703000,64,2011-07-31,00:01:11.147000,,,,,,
71327650,M,25.0,491,48.0,2011-07-30,23:56:50.237000,26,2011-07-31,00:01:42.333000,,,,,,
71327651,M,31.0,868,76.0,2011-07-30,23:56:51.347000,64,2011-07-31,00:01:19.100000,,,,,,


In [21]:
# Test to see if shift worked

# tic = timeit.default_timer()                            # Monitor performance

# filtered_df = ecobici_rides_df_test.dropna(axis=0, subset=["Fecha Arribo"])

# toc = timeit.default_timer() 
# print(f'Processing time (in seconds): {round(toc - tic, 2)}')

# filtered_df

Processing time (in seconds): 3.61


Unnamed: 0,Genero_Usuario,Edad_Usuario,Bici,Ciclo_Estacion_Retiro,Fecha_Retiro,Hora_Retiro,Ciclo_Estacion_Arribo,Fecha_Arribo,Hora_Arribo,Ciclo_EstacionArribo,Fecha Arribo,Hora Arribo,Hora_Retiro.1,Unnamed: 9,BikeID
58572212,M,27.0,10691,463.0,09/04/2021,08:05:03,,,,480.0,09/04/2021,08:19:59,,,
58572213,M,36.0,7318,142.0,09/04/2021,08:06:39,,,,125.0,09/04/2021,08:20:04,,,
58572214,M,28.0,9575,99.0,09/04/2021,08:03:11,,,,92.0,09/04/2021,08:20:10,,,
58572215,F,37.0,15320,459.0,09/04/2021,06:37:20,,,,241.0,09/04/2021,08:20:11,,,
58572216,F,29.0,6826,30.0,09/04/2021,08:15:20,,,,28.0,09/04/2021,08:20:11,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
58829190,F,26.0,12338,53.0,30/04/2021,11:54:07,,,,68.0,30/04/2021,11:58:22,,,
58829191,M,31.0,11398,159.0,30/04/2021,11:55:10,,,,143.0,30/04/2021,11:58:57,,,
58829192,M,64.0,11129,233.0,30/04/2021,11:28:47,,,,27.0,30/04/2021,11:59:43,,,
58829193,M,29.0,8661,297.0,30/04/2021,11:48:21,,,,405.0,30/04/2021,11:59:48,,,


In [26]:
# Explore what column "Hora_Retiro.1" comes from

# CONCLUSION In May and June 2021 the column Hora_Retiro was duplicated. It can be deleted 

tic = timeit.default_timer()                            # Monitor performance

filtered_df = ecobici_rides_df_test.dropna(axis=0, subset=["Hora_Retiro.1"])

toc = timeit.default_timer() 

print(f'Processing time (in seconds): {round(toc - tic, 2)}')

filtered_df

Processing time (in seconds): 52.11


Unnamed: 0,Genero_Usuario,Edad_Usuario,Bici,Ciclo_Estacion_Retiro,Fecha_Retiro,Hora_Retiro,Ciclo_Estacion_Arribo,Fecha_Arribo,Hora_Arribo,Hora_Retiro.1,Unnamed: 9,BikeID
48504833,,27.0,10961,141.0,31/05/2021,23:52:15,56,01/06/2021,,00:02:24,,
48504834,M,41.0,10381,254.0,31/05/2021,23:41:31,103,01/06/2021,,00:01:08,,
48504835,F,30.0,12040,49.0,31/05/2021,23:44:32,181,01/06/2021,,00:10:21,,
48504836,M,47.0,7709,153.0,31/05/2021,23:56:10,136,01/06/2021,,00:01:14,,
48504837,M,30.0,10255,435.0,31/05/2021,23:50:45,299,01/06/2021,,00:06:10,,
...,...,...,...,...,...,...,...,...,...,...,...,...
48870824,M,33.0,7199,21.0,30/06/2021,23:24:15,1,30/06/2021,,23:31:08,,
48870825,M,42.0,12232,295.0,30/06/2021,23:29:29,385,30/06/2021,,23:48:45,,
48870826,M,37.0,10093,44.0,30/06/2021,23:29:07,182,30/06/2021,,23:39:03,,
48870827,,39.0,10558,54.0,30/06/2021,22:50:14,132,30/06/2021,,23:06:11,,


In [45]:
# Explore what column "Unnamed: 9" comes from

# CONCLUSION Unclear where this comes from, doesn't seem to matter. Only one row seems to be affected and it 
# seems to have an error in the Fecha_arribo cell. Both the row and the column can can be deleted 

tic = timeit.default_timer()                            # Monitor performance

filtered_df = ecobici_rides_df_test.dropna(axis=0, subset=["Unnamed: 9"])

toc = timeit.default_timer() 
print(f'Processing time (in seconds): {round(toc - tic, 2)}')

filtered_df

Processing time (in seconds): 42.49


Unnamed: 0,Genero_Usuario,Edad_Usuario,Bici,Ciclo_Estacion_Retiro,Fecha_Retiro,Hora_Retiro,Ciclo_Estacion_Arribo,Fecha_Arribo,Hora_Arribo,Hora_Retiro.1,Unnamed: 9,BikeID
51925956,M,44.0,9944,267.0,06/03/2019,08:06:23,32:11.8,10,00:00:00,,08:41:11,


In [46]:
# Remove problematic row index 51925956

tic = timeit.default_timer()                            # Monitor performance

ecobici_rides_df_test.drop(51925956, inplace=True, axis=0)

toc = timeit.default_timer() 
print(f'Processing time (in seconds): {round(toc - tic, 2)}')

ecobici_rides_df_test

Processing time (in seconds): 15.46


Unnamed: 0,Genero_Usuario,Edad_Usuario,Bici,Ciclo_Estacion_Retiro,Fecha_Retiro,Hora_Retiro,Ciclo_Estacion_Arribo,Fecha_Arribo,Hora_Arribo,Hora_Retiro.1,Unnamed: 9,BikeID
0,M,44.0,4357,442.0,01/02/2020,0:00:38,116,01/02/2020,0:35:17,,,
1,M,22.0,12083,66.0,01/02/2020,0:00:53,37,01/02/2020,0:06:23,,,
2,M,29.0,11562,331.0,01/02/2020,0:00:55,341,01/02/2020,0:26:47,,,
3,M,27.0,10206,164.0,01/02/2020,0:01:18,35,01/02/2020,0:16:51,,,
4,M,27.0,10101,120.0,01/02/2020,0:01:18,47,01/02/2020,0:12:39,,,
...,...,...,...,...,...,...,...,...,...,...,...,...
71327648,M,25.0,474,12.0,2011-07-30,23:44:04.383000,20,2011-07-31,01:42:19.887000,,,
71327649,F,37.0,476,76.0,2011-07-30,23:55:36.703000,64,2011-07-31,00:01:11.147000,,,
71327650,M,25.0,491,48.0,2011-07-30,23:56:50.237000,26,2011-07-31,00:01:42.333000,,,
71327651,M,31.0,868,76.0,2011-07-30,23:56:51.347000,64,2011-07-31,00:01:19.100000,,,


In [29]:
# Explore what column "BikeID" comes from

# CONCLUSION: BikeID was only used in August 2020 and doesn't seem useful for this project. It can be deleted. 

tic = timeit.default_timer()                            # Monitor performance

filtered_df = ecobici_rides_df_test.dropna(axis=0, subset=["BikeID"])

toc = timeit.default_timer() 
print(f'Processing time (in seconds): {round(toc - tic, 2)}')

filtered_df

Processing time (in seconds): 1.7


Unnamed: 0,Genero_Usuario,Edad_Usuario,Bici,Ciclo_Estacion_Retiro,Fecha_Retiro,Hora_Retiro,Ciclo_Estacion_Arribo,Fecha_Arribo,Hora_Arribo,Hora_Retiro.1,Unnamed: 9,BikeID
61197896,M,25.0,4160,465.0,01/08/2020,6:08:43,463,01/08/2020,6:14:31,,,4458.0
61197897,M,41.0,8057,167.0,01/08/2020,6:40:28,286,01/08/2020,6:44:41,,,8328.0
61197898,,29.0,11018,208.0,01/08/2020,6:39:21,209,01/08/2020,6:48:36,,,11502.0
61197899,M,43.0,12531,156.0,01/08/2020,7:44:01,363,01/08/2020,8:05:47,,,14103.0
61197900,F,41.0,9746,474.0,01/08/2020,7:55:46,211,01/08/2020,8:09:25,,,10207.0
...,...,...,...,...,...,...,...,...,...,...,...,...
61464831,,25.0,2053,355.0,31/08/2020,19:10:10,359,31/08/2020,19:18:10,,,2097.0
61464832,,27.0,6932,355.0,31/08/2020,19:03:42,154,31/08/2020,19:22:26,,,7298.0
61464833,M,35.0,7184,382.0,31/08/2020,19:04:07,382,31/08/2020,19:24:55,,,7537.0
61464834,F,31.0,9832,195.0,31/08/2020,18:57:51,319,31/08/2020,19:38:07,,,10294.0


In [47]:
# Drop 6 columns now that they are empty or of no use. 

tic = timeit.default_timer()                            # Monitor performance

ecobici_rides_df_test.drop(['Hora_Retiro.1',
#                             'Ciclo_EstacionArribo', 
#                            'Fecha Arribo', 
#                            'Hora Arribo', 
                           'Unnamed: 9', 
                           'BikeID'], 
                      inplace=True, axis=1)

toc = timeit.default_timer() 
print(f'Processing time (in seconds): {round(toc - tic, 2)}')

ecobici_rides_df_test

Processing time (in seconds): 7.4


Unnamed: 0,Genero_Usuario,Edad_Usuario,Bici,Ciclo_Estacion_Retiro,Fecha_Retiro,Hora_Retiro,Ciclo_Estacion_Arribo,Fecha_Arribo,Hora_Arribo
0,M,44.0,4357,442.0,01/02/2020,0:00:38,116,01/02/2020,0:35:17
1,M,22.0,12083,66.0,01/02/2020,0:00:53,37,01/02/2020,0:06:23
2,M,29.0,11562,331.0,01/02/2020,0:00:55,341,01/02/2020,0:26:47
3,M,27.0,10206,164.0,01/02/2020,0:01:18,35,01/02/2020,0:16:51
4,M,27.0,10101,120.0,01/02/2020,0:01:18,47,01/02/2020,0:12:39
...,...,...,...,...,...,...,...,...,...
71327648,M,25.0,474,12.0,2011-07-30,23:44:04.383000,20,2011-07-31,01:42:19.887000
71327649,F,37.0,476,76.0,2011-07-30,23:55:36.703000,64,2011-07-31,00:01:11.147000
71327650,M,25.0,491,48.0,2011-07-30,23:56:50.237000,26,2011-07-31,00:01:42.333000
71327651,M,31.0,868,76.0,2011-07-30,23:56:51.347000,64,2011-07-31,00:01:19.100000


In [48]:
# Update original dataframe not that the data looks solid

ecobici_rides_df_test2 = ecobici_rides_df_test.dropna().copy()

ecobici_rides_df_test2

Unnamed: 0,Genero_Usuario,Edad_Usuario,Bici,Ciclo_Estacion_Retiro,Fecha_Retiro,Hora_Retiro,Ciclo_Estacion_Arribo,Fecha_Arribo,Hora_Arribo
0,M,44.0,4357,442.0,01/02/2020,0:00:38,116,01/02/2020,0:35:17
1,M,22.0,12083,66.0,01/02/2020,0:00:53,37,01/02/2020,0:06:23
2,M,29.0,11562,331.0,01/02/2020,0:00:55,341,01/02/2020,0:26:47
3,M,27.0,10206,164.0,01/02/2020,0:01:18,35,01/02/2020,0:16:51
4,M,27.0,10101,120.0,01/02/2020,0:01:18,47,01/02/2020,0:12:39
...,...,...,...,...,...,...,...,...,...
71327648,M,25.0,474,12.0,2011-07-30,23:44:04.383000,20,2011-07-31,01:42:19.887000
71327649,F,37.0,476,76.0,2011-07-30,23:55:36.703000,64,2011-07-31,00:01:11.147000
71327650,M,25.0,491,48.0,2011-07-30,23:56:50.237000,26,2011-07-31,00:01:42.333000
71327651,M,31.0,868,76.0,2011-07-30,23:56:51.347000,64,2011-07-31,00:01:19.100000


In [49]:
tic = timeit.default_timer()                            # Monitor performance

ecobici_rides_df_test2['Fecha_Retiro'] = pd.to_datetime(ecobici_rides_df_test2['Fecha_Retiro'])
ecobici_rides_df_test2['Fecha_Arribo'] = pd.to_datetime(ecobici_rides_df_test2['Fecha_Arribo'])

toc = timeit.default_timer() 
print(f'Processing time (in seconds): {round(toc - tic, 2)}')

ecobici_rides_df_test2

Processing time (in seconds): 20.16


Unnamed: 0,Genero_Usuario,Edad_Usuario,Bici,Ciclo_Estacion_Retiro,Fecha_Retiro,Hora_Retiro,Ciclo_Estacion_Arribo,Fecha_Arribo,Hora_Arribo
0,M,44.0,4357,442.0,2020-01-02,0:00:38,116,2020-01-02,0:35:17
1,M,22.0,12083,66.0,2020-01-02,0:00:53,37,2020-01-02,0:06:23
2,M,29.0,11562,331.0,2020-01-02,0:00:55,341,2020-01-02,0:26:47
3,M,27.0,10206,164.0,2020-01-02,0:01:18,35,2020-01-02,0:16:51
4,M,27.0,10101,120.0,2020-01-02,0:01:18,47,2020-01-02,0:12:39
...,...,...,...,...,...,...,...,...,...
71327648,M,25.0,474,12.0,2011-07-30,23:44:04.383000,20,2011-07-31,01:42:19.887000
71327649,F,37.0,476,76.0,2011-07-30,23:55:36.703000,64,2011-07-31,00:01:11.147000
71327650,M,25.0,491,48.0,2011-07-30,23:56:50.237000,26,2011-07-31,00:01:42.333000
71327651,M,31.0,868,76.0,2011-07-30,23:56:51.347000,64,2011-07-31,00:01:19.100000


In [61]:
# Grab a slice to experiment with

ecobici_rides_df_test3 = ecobici_rides_df_test2.dropna().copy()

ecobici_rides_df_test3

In [63]:
# Export clean CSV file to master CSV file

tic = timeit.default_timer()                            # Monitor performance

ecobici_rides_df_test3.to_csv("Gitignore-data-output-files/ecobici_ride_data-unified.csv", index=False) 

toc = timeit.default_timer()                            # Monitor performance
print(f'Time (in seconds) to export unified CSV file: {round(toc - tic, 2)}')

ecobici_rides_df_test3.head()

Time (in seconds) to export unified CSV file: 893.23


Unnamed: 0,Genero_Usuario,Edad_Usuario,Bici,Ciclo_Estacion_Retiro,Fecha_Retiro,Hora_Retiro,Ciclo_Estacion_Arribo,Fecha_Arribo,Hora_Arribo
0,M,44.0,4357,442.0,2020-01-02,0:00:38,116,2020-01-02,0:35:17
1,M,22.0,12083,66.0,2020-01-02,0:00:53,37,2020-01-02,0:06:23
2,M,29.0,11562,331.0,2020-01-02,0:00:55,341,2020-01-02,0:26:47
3,M,27.0,10206,164.0,2020-01-02,0:01:18,35,2020-01-02,0:16:51
4,M,27.0,10101,120.0,2020-01-02,0:01:18,47,2020-01-02,0:12:39


In [66]:
ecobici_rides_df_test3.reset_index(inplace=True)
ecobici_rides_df_test3

Unnamed: 0,index,Genero_Usuario,Edad_Usuario,Bici,Ciclo_Estacion_Retiro,Fecha_Retiro,Hora_Retiro,Ciclo_Estacion_Arribo,Fecha_Arribo,Hora_Arribo
0,0,M,44.0,4357,442.0,2020-01-02,0:00:38,116,2020-01-02,0:35:17
1,1,M,22.0,12083,66.0,2020-01-02,0:00:53,37,2020-01-02,0:06:23
2,2,M,29.0,11562,331.0,2020-01-02,0:00:55,341,2020-01-02,0:26:47
3,3,M,27.0,10206,164.0,2020-01-02,0:01:18,35,2020-01-02,0:16:51
4,4,M,27.0,10101,120.0,2020-01-02,0:01:18,47,2020-01-02,0:12:39
...,...,...,...,...,...,...,...,...,...,...
70703892,71327648,M,25.0,474,12.0,2011-07-30,23:44:04.383000,20,2011-07-31,01:42:19.887000
70703893,71327649,F,37.0,476,76.0,2011-07-30,23:55:36.703000,64,2011-07-31,00:01:11.147000
70703894,71327650,M,25.0,491,48.0,2011-07-30,23:56:50.237000,26,2011-07-31,00:01:42.333000
70703895,71327651,M,31.0,868,76.0,2011-07-30,23:56:51.347000,64,2011-07-31,00:01:19.100000


In [113]:
ecobici_rides_df_test3.drop('index', inplace=True, axis=1)
ecobici_rides_df_test3

Unnamed: 0,Genero_Usuario,Edad_Usuario,Bici,Ciclo_Estacion_Retiro,Fecha_Retiro,Hora_Retiro,Ciclo_Estacion_Arribo,Fecha_Arribo,Hora_Arribo
0,M,44.0,4357,442.0,2020-01-02,0:00:38,116,2020-01-02,0:35:17
1,M,22.0,12083,66.0,2020-01-02,0:00:53,37,2020-01-02,0:06:23
2,M,29.0,11562,331.0,2020-01-02,0:00:55,341,2020-01-02,0:26:47
3,M,27.0,10206,164.0,2020-01-02,0:01:18,35,2020-01-02,0:16:51
4,M,27.0,10101,120.0,2020-01-02,0:01:18,47,2020-01-02,0:12:39
...,...,...,...,...,...,...,...,...,...
70703892,M,25.0,474,12.0,2011-07-30,23:44:04.383000,20,2011-07-31,01:42:19.887000
70703893,F,37.0,476,76.0,2011-07-30,23:55:36.703000,64,2011-07-31,00:01:11.147000
70703894,M,25.0,491,48.0,2011-07-30,23:56:50.237000,26,2011-07-31,00:01:42.333000
70703895,M,31.0,868,76.0,2011-07-30,23:56:51.347000,64,2011-07-31,00:01:19.100000


In [114]:
ecobici_rides_df_sorted = ecobici_rides_df_test3.sort_values(by=['Fecha_Arribo']).copy()
ecobici_rides_df_sorted

Unnamed: 0,Genero_Usuario,Edad_Usuario,Bici,Ciclo_Estacion_Retiro,Fecha_Retiro,Hora_Retiro,Ciclo_Estacion_Arribo,Fecha_Arribo,Hora_Arribo
29095879,M,28.0,69,85.0,2010-02-16,12:42:32.160000,85,2010-02-16,12:45:37.427000
29095880,M,30.0,11,85.0,2010-02-16,12:53:29.950000,26,2010-02-16,13:22:23.737000
29096923,M,25.0,67,66.0,2010-02-14,12:41:04.480000,18,2010-02-16,10:24:36.550000
29095891,M,19.0,578,25.0,2010-02-16,22:40:27.770000,6,2010-02-16,22:46:54.603000
29095890,M,19.0,651,15.0,2010-02-16,21:32:38.827000,25,2010-02-16,21:40:02.347000
...,...,...,...,...,...,...,...,...,...
53791656,M,25.0,10848,183.0,2021-12-05,08:48:48,250,2021-12-05,09:02:29
53791657,M,45.0,10008,337.0,2021-12-05,08:54:46,321,2021-12-05,09:02:31
53791658,F,29.0,10266,295.0,2021-12-05,08:50:43,180,2021-12-05,09:02:39
53791660,F,30.0,9803,272.0,2021-12-05,08:22:29,317,2021-12-05,09:02:45


In [115]:
ecobici_rides_df_sorted.reset_index(inplace=True, drop=True)
ecobici_rides_df_sorted

Unnamed: 0,Genero_Usuario,Edad_Usuario,Bici,Ciclo_Estacion_Retiro,Fecha_Retiro,Hora_Retiro,Ciclo_Estacion_Arribo,Fecha_Arribo,Hora_Arribo
0,M,28.0,69,85.0,2010-02-16,12:42:32.160000,85,2010-02-16,12:45:37.427000
1,M,30.0,11,85.0,2010-02-16,12:53:29.950000,26,2010-02-16,13:22:23.737000
2,M,25.0,67,66.0,2010-02-14,12:41:04.480000,18,2010-02-16,10:24:36.550000
3,M,19.0,578,25.0,2010-02-16,22:40:27.770000,6,2010-02-16,22:46:54.603000
4,M,19.0,651,15.0,2010-02-16,21:32:38.827000,25,2010-02-16,21:40:02.347000
...,...,...,...,...,...,...,...,...,...
70703892,M,25.0,10848,183.0,2021-12-05,08:48:48,250,2021-12-05,09:02:29
70703893,M,45.0,10008,337.0,2021-12-05,08:54:46,321,2021-12-05,09:02:31
70703894,F,29.0,10266,295.0,2021-12-05,08:50:43,180,2021-12-05,09:02:39
70703895,F,30.0,9803,272.0,2021-12-05,08:22:29,317,2021-12-05,09:02:45


In [116]:
ecobici_rides_df_sorted.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 70703897 entries, 0 to 70703896
Data columns (total 9 columns):
 #   Column                 Dtype         
---  ------                 -----         
 0   Genero_Usuario         object        
 1   Edad_Usuario           float64       
 2   Bici                   object        
 3   Ciclo_Estacion_Retiro  float64       
 4   Fecha_Retiro           datetime64[ns]
 5   Hora_Retiro            object        
 6   Ciclo_Estacion_Arribo  object        
 7   Fecha_Arribo           datetime64[ns]
 8   Hora_Arribo            object        
dtypes: datetime64[ns](2), float64(2), object(5)
memory usage: 4.7+ GB


In [117]:
# Export clean CSV file to master CSV file

tic = timeit.default_timer()                            # Monitor performance

ecobici_rides_df_sorted.to_csv("Gitignore-data-output-files/ecobici_ride_data-unified.csv", index=False) 

toc = timeit.default_timer()                            # Monitor performance
print(f'Time (in seconds) to export unified CSV file: {round(toc - tic, 2)}')

ecobici_rides_df_sorted.head()

Time (in seconds) to export unified CSV file: 903.12


Unnamed: 0,Genero_Usuario,Edad_Usuario,Bici,Ciclo_Estacion_Retiro,Fecha_Retiro,Hora_Retiro,Ciclo_Estacion_Arribo,Fecha_Arribo,Hora_Arribo
0,M,28.0,69,85.0,2010-02-16,12:42:32.160000,85,2010-02-16,12:45:37.427000
1,M,30.0,11,85.0,2010-02-16,12:53:29.950000,26,2010-02-16,13:22:23.737000
2,M,25.0,67,66.0,2010-02-14,12:41:04.480000,18,2010-02-16,10:24:36.550000
3,M,19.0,578,25.0,2010-02-16,22:40:27.770000,6,2010-02-16,22:46:54.603000
4,M,19.0,651,15.0,2010-02-16,21:32:38.827000,25,2010-02-16,21:40:02.347000


In [118]:
ecobici_rides_df_sorted_med = ecobici_rides_df_sorted.dropna().copy()

ecobici_rides_df_sorted_med

Unnamed: 0,Genero_Usuario,Edad_Usuario,Bici,Ciclo_Estacion_Retiro,Fecha_Retiro,Hora_Retiro,Ciclo_Estacion_Arribo,Fecha_Arribo,Hora_Arribo
0,M,28.0,69,85.0,2010-02-16,12:42:32.160000,85,2010-02-16,12:45:37.427000
1,M,30.0,11,85.0,2010-02-16,12:53:29.950000,26,2010-02-16,13:22:23.737000
2,M,25.0,67,66.0,2010-02-14,12:41:04.480000,18,2010-02-16,10:24:36.550000
3,M,19.0,578,25.0,2010-02-16,22:40:27.770000,6,2010-02-16,22:46:54.603000
4,M,19.0,651,15.0,2010-02-16,21:32:38.827000,25,2010-02-16,21:40:02.347000
...,...,...,...,...,...,...,...,...,...
70703892,M,25.0,10848,183.0,2021-12-05,08:48:48,250,2021-12-05,09:02:29
70703893,M,45.0,10008,337.0,2021-12-05,08:54:46,321,2021-12-05,09:02:31
70703894,F,29.0,10266,295.0,2021-12-05,08:50:43,180,2021-12-05,09:02:39
70703895,F,30.0,9803,272.0,2021-12-05,08:22:29,317,2021-12-05,09:02:45


In [119]:
# Combine Fecha_Arribo and Hora_Arribo strings and convert to datetime

tic = timeit.default_timer()                            # Monitor performance

medium_list = []

for index, row in ecobici_rides_df_sorted_med.iterrows():

    if index%10000==0:medium_list.append(row)

ecobici_rides_df_med_export = pd.concat(medium_list, axis=0, ignore_index=True)

toc = timeit.default_timer()                            # Monitor performance
print(f'Time (in seconds) to extract and concatenate excel files: {round(toc - tic, 2)}')
print(f'Number of rows: {len(ecobici_rides_df)}')

ecobici_rides_df_med_export

Time (in seconds) to extract and concatenate excel files: 3939.17
Number of rows: 71327653


0                          M
1                         28
2                         69
3                         85
4        2010-02-16 00:00:00
                ...         
63634    2021-12-05 00:00:00
63635               12:25:00
63636                     79
63637    2021-12-05 00:00:00
63638               12:27:08
Length: 63639, dtype: object

In [120]:
ecobici_rides_df_med_export


0                          M
1                         28
2                         69
3                         85
4        2010-02-16 00:00:00
                ...         
63634    2021-12-05 00:00:00
63635               12:25:00
63636                     79
63637    2021-12-05 00:00:00
63638               12:27:08
Length: 63639, dtype: object