In [1]:
# Dependencies

import pandas as pd
import numpy as np
import glob
import os
import timeit                                # To check performance

# Instructions: 

The unified CSV file is nearly 5GBs. Github can't store it. If you try to push it, it will produce an error. 

If you want to work with the data, you can run this script, and it will save the unified CSV file in the sub folder "Gitignore-data-output-files."

BEFORE doing that, open your .gitignore file in the repository, and make sure the following is one of the line items to ignore:

"# Huge data files to ignore:

Resources/Data/Gitignore-data-output-files/ecobici_ride_data-unified.csv"

Save your .gitignore file, push that change. Then you should be able to produce the unified CSV file without worry of your machine trying to push it to the repository. -Ken

In [2]:
# Concatenate csv files

tic = timeit.default_timer()                            # Monitor performance

path = r'/Users/kennethandersen/Documents/GitHub/P2-Ecobici_insights_and_recommendations/Resources/Data'
all_files = glob.glob(path + "/*.csv")

csv_file_list = []
i = 1

for filename in all_files:
    try:
        df = pd.read_csv(filename, index_col=None, header=0)
        csv_file_list.append(df)
#         print (f'File {i} added')
    except:
        print(f'Concatenate rror {filename}, could not be added')
        pass
    i+= 1

ecobici_rides_df = pd.concat(csv_file_list, axis=0, ignore_index=True)

toc = timeit.default_timer()                            # Monitor performance
print(f'Time (in seconds) to extract and concatenate excel files: {round(toc - tic, 2)}')
print(f'Number of rows: {len(ecobici_rides_df)}')

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


Concatenate rror /Users/kennethandersen/Documents/GitHub/P2-Ecobici_insights_and_recommendations/Resources/Data/2020-12.csv, could not be added


  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


Time (in seconds) to extract and concatenate excel files: 81.35
Number of rows: 71070670


In [3]:
# Display dataframe
ecobici_rides_df

Unnamed: 0,Genero_Usuario,Edad_Usuario,Bici,Ciclo_Estacion_Retiro,Fecha_Retiro,Hora_Retiro,Ciclo_Estacion_Arribo,Fecha_Arribo,Hora_Arribo,Ciclo_EstacionArribo,Fecha Arribo,Hora Arribo,Hora_Retiro.1,Unnamed: 9,BikeID
0,M,44.0,4357,442.0,01/02/2020,0:00:38,116,01/02/2020,0:35:17,,,,,,
1,M,22.0,12083,66.0,01/02/2020,0:00:53,37,01/02/2020,0:06:23,,,,,,
2,M,29.0,11562,331.0,01/02/2020,0:00:55,341,01/02/2020,0:26:47,,,,,,
3,M,27.0,10206,164.0,01/02/2020,0:01:18,35,01/02/2020,0:16:51,,,,,,
4,M,27.0,10101,120.0,01/02/2020,0:01:18,47,01/02/2020,0:12:39,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
71070665,M,25.0,474,12.0,2011-07-30,23:44:04.383000,20,2011-07-31,01:42:19.887000,,,,,,
71070666,F,37.0,476,76.0,2011-07-30,23:55:36.703000,64,2011-07-31,00:01:11.147000,,,,,,
71070667,M,25.0,491,48.0,2011-07-30,23:56:50.237000,26,2011-07-31,00:01:42.333000,,,,,,
71070668,M,31.0,868,76.0,2011-07-30,23:56:51.347000,64,2011-07-31,00:01:19.100000,,,,,,


## Notes

- One file (2020-12.csv) failed entirely. 
- There are some inconsistences in columns that lead to 6 extra columns on the right.
- Data is captured inconsistently (not that date stamps have different formatting, for example. 

We'll deal with those below in the Data Cleanup section. 

In [4]:
# Export unified CSV file to master CSV file for backup

tic = timeit.default_timer()                            # Monitor performance

ecobici_rides_df.to_csv("Gitignore-data-output-files/ecobici_ride_data-unified.csv", index=False) 

toc = timeit.default_timer()                            # Monitor performance
print(f'Time (in seconds) to export unified CSV file: {round(toc - tic, 2)}')


Time (in seconds) to export unified CSV file: 316.09


# Data Cleanup

The intention of following fields is to understand where the extra columns come from, and how to clean up data. 

In [44]:
# Create a copy to experiment with

tic = timeit.default_timer()                            # Monitor performance

ecobici_rides_df_test = ecobici_rides_df.copy()

toc = timeit.default_timer() 
print(f'Processing time (in seconds): {round(toc - tic, 2)}')

ecobici_rides_df_test

Processing time (in seconds): 37.06


Unnamed: 0,Genero_Usuario,Edad_Usuario,Bici,Ciclo_Estacion_Retiro,Fecha_Retiro,Hora_Retiro,Ciclo_Estacion_Arribo,Fecha_Arribo,Hora_Arribo,Ciclo_EstacionArribo,Fecha Arribo,Hora Arribo,Hora_Retiro.1,Unnamed: 9,BikeID
0,M,44.0,4357,442.0,01/02/2020,0:00:38,116,01/02/2020,0:35:17,,,,,,
1,M,22.0,12083,66.0,01/02/2020,0:00:53,37,01/02/2020,0:06:23,,,,,,
2,M,29.0,11562,331.0,01/02/2020,0:00:55,341,01/02/2020,0:26:47,,,,,,
3,M,27.0,10206,164.0,01/02/2020,0:01:18,35,01/02/2020,0:16:51,,,,,,
4,M,27.0,10101,120.0,01/02/2020,0:01:18,47,01/02/2020,0:12:39,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
71070665,M,25.0,474,12.0,2011-07-30,23:44:04.383000,20,2011-07-31,01:42:19.887000,,,,,,
71070666,F,37.0,476,76.0,2011-07-30,23:55:36.703000,64,2011-07-31,00:01:11.147000,,,,,,
71070667,M,25.0,491,48.0,2011-07-30,23:56:50.237000,26,2011-07-31,00:01:42.333000,,,,,,
71070668,M,31.0,868,76.0,2011-07-30,23:56:51.347000,64,2011-07-31,00:01:19.100000,,,,,,


In [6]:
# Explore what column "Ciclo_EstacionArribo" comes from

# CONCLUSION February, March and April 2021 had an error and the column Ciclo_Estacion_Arribo was registered
# as Ciclo_EstacionArribo (missing underscore)

tic = timeit.default_timer()                            # Monitor performance

filtered_df = ecobici_rides_df_test.dropna(axis=0, subset=["Ciclo_EstacionArribo"])

toc = timeit.default_timer() 
print(f'{len(filtered_df)} rows')# Monitor performance
print(f'Processing time (in seconds): {round(toc - tic, 2)}')

filtered_df

1445219 rows
Processing time (in seconds): 45.67


Unnamed: 0,Genero_Usuario,Edad_Usuario,Bici,Ciclo_Estacion_Retiro,Fecha_Retiro,Hora_Retiro,Ciclo_Estacion_Arribo,Fecha_Arribo,Hora_Arribo,Ciclo_EstacionArribo,Fecha Arribo,Hora Arribo,Hora_Retiro.1,Unnamed: 9,BikeID
41819667,M,28.0,7170,217.0,28/02/2021,11:51:53,,,,6.0,01/03/2021,12:00:2,,,
41819668,M,40.0,6980,62.0,28/02/2021,11:00:41,,,,62.0,01/03/2021,12:03:33,,,
41819669,M,26.0,15303,257.0,28/02/2021,11:52:09,,,,257.0,01/03/2021,12:04:16,,,
41819670,F,73.0,11960,36.0,28/02/2021,11:59:53,,,,57.0,01/03/2021,12:04:39,,,
41819671,M,48.0,12401,53.0,01/03/2021,12:01:50,,,,38.0,01/03/2021,12:05:37,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
58572207,F,26.0,12338,53.0,30/04/2021,11:54:07,,,,68.0,30/04/2021,11:58:22,,,
58572208,M,31.0,11398,159.0,30/04/2021,11:55:10,,,,143.0,30/04/2021,11:58:57,,,
58572209,M,64.0,11129,233.0,30/04/2021,11:28:47,,,,27.0,30/04/2021,11:59:43,,,
58572210,M,29.0,8661,297.0,30/04/2021,11:48:21,,,,405.0,30/04/2021,11:59:48,,,


In [8]:
# Explore what column "Fecha Arribo" comes from

# CONCLUSION February, March and April 2021 had an error and the column Fecha_Arribo was registered
# as Fecha Arribo (missing underscore)

tic = timeit.default_timer()                            # Monitor performance

filtered_df = ecobici_rides_df_test.dropna(axis=0, subset=["Fecha Arribo"])

toc = timeit.default_timer() 
print(f'Processing time (in seconds): {round(toc - tic, 2)}')

filtered_df

Processing time (in seconds): 3.13


Unnamed: 0,Genero_Usuario,Edad_Usuario,Bici,Ciclo_Estacion_Retiro,Fecha_Retiro,Hora_Retiro,Ciclo_Estacion_Arribo,Fecha_Arribo,Hora_Arribo,Ciclo_EstacionArribo,Fecha Arribo,Hora Arribo,Hora_Retiro.1,Unnamed: 9,BikeID
41819667,M,28.0,7170,217.0,28/02/2021,11:51:53,,,,6.0,01/03/2021,12:00:2,,,
41819668,M,40.0,6980,62.0,28/02/2021,11:00:41,,,,62.0,01/03/2021,12:03:33,,,
41819669,M,26.0,15303,257.0,28/02/2021,11:52:09,,,,257.0,01/03/2021,12:04:16,,,
41819670,F,73.0,11960,36.0,28/02/2021,11:59:53,,,,57.0,01/03/2021,12:04:39,,,
41819671,M,48.0,12401,53.0,01/03/2021,12:01:50,,,,38.0,01/03/2021,12:05:37,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
58572207,F,26.0,12338,53.0,30/04/2021,11:54:07,,,,68.0,30/04/2021,11:58:22,,,
58572208,M,31.0,11398,159.0,30/04/2021,11:55:10,,,,143.0,30/04/2021,11:58:57,,,
58572209,M,64.0,11129,233.0,30/04/2021,11:28:47,,,,27.0,30/04/2021,11:59:43,,,
58572210,M,29.0,8661,297.0,30/04/2021,11:48:21,,,,405.0,30/04/2021,11:59:48,,,


In [10]:
# Explore what column "Hora Arribo" comes from

# CONCLUSION February, March and April 2021 had an error and the column Hora_Arribo was registered
# as Fecha Arribo (missing underscore)


tic = timeit.default_timer()                            # Monitor performance

filtered_df = ecobici_rides_df_test.dropna(axis=0, subset=["Hora Arribo"])

toc = timeit.default_timer() 
print(f'Processing time (in seconds): {round(toc - tic, 2)}')

filtered_df

Processing time (in seconds): 3.16


Unnamed: 0,Genero_Usuario,Edad_Usuario,Bici,Ciclo_Estacion_Retiro,Fecha_Retiro,Hora_Retiro,Ciclo_Estacion_Arribo,Fecha_Arribo,Hora_Arribo,Ciclo_EstacionArribo,Fecha Arribo,Hora Arribo,Hora_Retiro.1,Unnamed: 9,BikeID
41819667,M,28.0,7170,217.0,28/02/2021,11:51:53,,,,6.0,01/03/2021,12:00:2,,,
41819668,M,40.0,6980,62.0,28/02/2021,11:00:41,,,,62.0,01/03/2021,12:03:33,,,
41819669,M,26.0,15303,257.0,28/02/2021,11:52:09,,,,257.0,01/03/2021,12:04:16,,,
41819670,F,73.0,11960,36.0,28/02/2021,11:59:53,,,,57.0,01/03/2021,12:04:39,,,
41819671,M,48.0,12401,53.0,01/03/2021,12:01:50,,,,38.0,01/03/2021,12:05:37,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
58572207,F,26.0,12338,53.0,30/04/2021,11:54:07,,,,68.0,30/04/2021,11:58:22,,,
58572208,M,31.0,11398,159.0,30/04/2021,11:55:10,,,,143.0,30/04/2021,11:58:57,,,
58572209,M,64.0,11129,233.0,30/04/2021,11:28:47,,,,27.0,30/04/2021,11:59:43,,,
58572210,M,29.0,8661,297.0,30/04/2021,11:48:21,,,,405.0,30/04/2021,11:59:48,,,


In [11]:
# Take all the data from 'Ciclo_EstacionArribo','Fecha Arribo', 'Hora Arribo' columns and shift it 3 columns
#  to the left

tic = timeit.default_timer()                            # Monitor performance

ecobici_rides_df_test.iloc[41819667:58572212,9:12] = ecobici_rides_df_test.iloc[41819667:58572212,9:12].shift(-3,axis=1)
df

toc = timeit.default_timer() 
print(f'Processing time (in seconds): {round(toc - tic, 2)}')

ecobici_rides_df_test


Processing time (in seconds): 5.86


Unnamed: 0,Genero_Usuario,Edad_Usuario,Bici,Ciclo_Estacion_Retiro,Fecha_Retiro,Hora_Retiro,Ciclo_Estacion_Arribo,Fecha_Arribo,Hora_Arribo,Ciclo_EstacionArribo,Fecha Arribo,Hora Arribo,Hora_Retiro.1,Unnamed: 9,BikeID
0,M,44.0,4357,442.0,01/02/2020,0:00:38,116,01/02/2020,0:35:17,,,,,,
1,M,22.0,12083,66.0,01/02/2020,0:00:53,37,01/02/2020,0:06:23,,,,,,
2,M,29.0,11562,331.0,01/02/2020,0:00:55,341,01/02/2020,0:26:47,,,,,,
3,M,27.0,10206,164.0,01/02/2020,0:01:18,35,01/02/2020,0:16:51,,,,,,
4,M,27.0,10101,120.0,01/02/2020,0:01:18,47,01/02/2020,0:12:39,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
71070665,M,25.0,474,12.0,2011-07-30,23:44:04.383000,20,2011-07-31,01:42:19.887000,,,,,,
71070666,F,37.0,476,76.0,2011-07-30,23:55:36.703000,64,2011-07-31,00:01:11.147000,,,,,,
71070667,M,25.0,491,48.0,2011-07-30,23:56:50.237000,26,2011-07-31,00:01:42.333000,,,,,,
71070668,M,31.0,868,76.0,2011-07-30,23:56:51.347000,64,2011-07-31,00:01:19.100000,,,,,,


In [14]:
# Test to see if shift worked

tic = timeit.default_timer()                            # Monitor performance

filtered_df = ecobici_rides_df_test.dropna(axis=0, subset=["Fecha Arribo"])

toc = timeit.default_timer() 
print(f'Processing time (in seconds): {round(toc - tic, 2)}')

filtered_df

Processing time (in seconds): 2.96


Unnamed: 0,Genero_Usuario,Edad_Usuario,Bici,Ciclo_Estacion_Retiro,Fecha_Retiro,Hora_Retiro,Ciclo_Estacion_Arribo,Fecha_Arribo,Hora_Arribo,Ciclo_EstacionArribo,Fecha Arribo,Hora Arribo,Hora_Retiro.1,Unnamed: 9,BikeID


In [15]:
# Explore what column "Hora_Retiro.1" comes from

# CONCLUSION In May and June 2021 the column Hora_Retiro was duplicated. It can be deleted 

tic = timeit.default_timer()                            # Monitor performance

filtered_df = ecobici_rides_df_test.dropna(axis=0, subset=["Hora_Retiro.1"])

toc = timeit.default_timer() 

print(f'Processing time (in seconds): {round(toc - tic, 2)}')

filtered_df

Processing time (in seconds): 3.09


Unnamed: 0,Genero_Usuario,Edad_Usuario,Bici,Ciclo_Estacion_Retiro,Fecha_Retiro,Hora_Retiro,Ciclo_Estacion_Arribo,Fecha_Arribo,Hora_Arribo,Ciclo_EstacionArribo,Fecha Arribo,Hora Arribo,Hora_Retiro.1,Unnamed: 9,BikeID
48247850,,27.0,10961,141.0,31/05/2021,23:52:15,,,,,,,00:02:24,,
48247851,M,41.0,10381,254.0,31/05/2021,23:41:31,,,,,,,00:01:08,,
48247852,F,30.0,12040,49.0,31/05/2021,23:44:32,,,,,,,00:10:21,,
48247853,M,47.0,7709,153.0,31/05/2021,23:56:10,,,,,,,00:01:14,,
48247854,M,30.0,10255,435.0,31/05/2021,23:50:45,,,,,,,00:06:10,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
48613841,M,33.0,7199,21.0,30/06/2021,23:24:15,,,,,,,23:31:08,,
48613842,M,42.0,12232,295.0,30/06/2021,23:29:29,,,,,,,23:48:45,,
48613843,M,37.0,10093,44.0,30/06/2021,23:29:07,,,,,,,23:39:03,,
48613844,,39.0,10558,54.0,30/06/2021,22:50:14,,,,,,,23:06:11,,


In [16]:
# Explore what column "Unnamed: 9" comes from

# CONCLUSION Unclear where this comes from, doesn't seem to matter. Only one row seems to be affected and it 
# seems to have an error in the Fecha_arribo cell. Both the row and the column can can be deleted 

tic = timeit.default_timer()                            # Monitor performance

filtered_df = ecobici_rides_df_test.dropna(axis=0, subset=["Unnamed: 9"])

toc = timeit.default_timer() 
print(f'Processing time (in seconds): {round(toc - tic, 2)}')

filtered_df

Processing time (in seconds): 3.07


Unnamed: 0,Genero_Usuario,Edad_Usuario,Bici,Ciclo_Estacion_Retiro,Fecha_Retiro,Hora_Retiro,Ciclo_Estacion_Arribo,Fecha_Arribo,Hora_Arribo,Ciclo_EstacionArribo,Fecha Arribo,Hora Arribo,Hora_Retiro.1,Unnamed: 9,BikeID
51668973,M,44.0,9944,267.0,06/03/2019,08:06:23,32:11.8,10,00:00:00,,,,,08:41:11,


In [17]:
# Remove problematic row index 51668973
ecobici_rides_df_test.drop(51668973, inplace=True, axis=0)
ecobici_rides_df_test

Unnamed: 0,Genero_Usuario,Edad_Usuario,Bici,Ciclo_Estacion_Retiro,Fecha_Retiro,Hora_Retiro,Ciclo_Estacion_Arribo,Fecha_Arribo,Hora_Arribo,Ciclo_EstacionArribo,Fecha Arribo,Hora Arribo,Hora_Retiro.1,Unnamed: 9,BikeID
0,M,44.0,4357,442.0,01/02/2020,0:00:38,116,01/02/2020,0:35:17,,,,,,
1,M,22.0,12083,66.0,01/02/2020,0:00:53,37,01/02/2020,0:06:23,,,,,,
2,M,29.0,11562,331.0,01/02/2020,0:00:55,341,01/02/2020,0:26:47,,,,,,
3,M,27.0,10206,164.0,01/02/2020,0:01:18,35,01/02/2020,0:16:51,,,,,,
4,M,27.0,10101,120.0,01/02/2020,0:01:18,47,01/02/2020,0:12:39,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
71070665,M,25.0,474,12.0,2011-07-30,23:44:04.383000,20,2011-07-31,01:42:19.887000,,,,,,
71070666,F,37.0,476,76.0,2011-07-30,23:55:36.703000,64,2011-07-31,00:01:11.147000,,,,,,
71070667,M,25.0,491,48.0,2011-07-30,23:56:50.237000,26,2011-07-31,00:01:42.333000,,,,,,
71070668,M,31.0,868,76.0,2011-07-30,23:56:51.347000,64,2011-07-31,00:01:19.100000,,,,,,


In [18]:
# Explore what column "BikeID" comes from

# CONCLUSION: BikeID was only used in August 2020 and doesn't seem useful for this project. It can be deleted. 

tic = timeit.default_timer()                            # Monitor performance

filtered_df = ecobici_rides_df_test.dropna(axis=0, subset=["BikeID"])

toc = timeit.default_timer() 
print(f'Processing time (in seconds): {round(toc - tic, 2)}')

filtered_df

Processing time (in seconds): 1.85


Unnamed: 0,Genero_Usuario,Edad_Usuario,Bici,Ciclo_Estacion_Retiro,Fecha_Retiro,Hora_Retiro,Ciclo_Estacion_Arribo,Fecha_Arribo,Hora_Arribo,Ciclo_EstacionArribo,Fecha Arribo,Hora Arribo,Hora_Retiro.1,Unnamed: 9,BikeID
60940913,M,25.0,4160,465.0,01/08/2020,6:08:43,463,01/08/2020,6:14:31,,,,,,4458.0
60940914,M,41.0,8057,167.0,01/08/2020,6:40:28,286,01/08/2020,6:44:41,,,,,,8328.0
60940915,,29.0,11018,208.0,01/08/2020,6:39:21,209,01/08/2020,6:48:36,,,,,,11502.0
60940916,M,43.0,12531,156.0,01/08/2020,7:44:01,363,01/08/2020,8:05:47,,,,,,14103.0
60940917,F,41.0,9746,474.0,01/08/2020,7:55:46,211,01/08/2020,8:09:25,,,,,,10207.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
61207848,,25.0,2053,355.0,31/08/2020,19:10:10,359,31/08/2020,19:18:10,,,,,,2097.0
61207849,,27.0,6932,355.0,31/08/2020,19:03:42,154,31/08/2020,19:22:26,,,,,,7298.0
61207850,M,35.0,7184,382.0,31/08/2020,19:04:07,382,31/08/2020,19:24:55,,,,,,7537.0
61207851,F,31.0,9832,195.0,31/08/2020,18:57:51,319,31/08/2020,19:38:07,,,,,,10294.0


In [19]:
# Drop 6 columns now that they are empty or of no use. 

ecobici_rides_df_test.drop(['Ciclo_EstacionArribo', 
                       'Fecha Arribo', 
                       'Hora Arribo', 
                       'Hora_Retiro.1', 
                       'Unnamed: 9', 
                       'BikeID'], 
                      inplace=True, axis=1)
ecobici_rides_df_test

Unnamed: 0,Genero_Usuario,Edad_Usuario,Bici,Ciclo_Estacion_Retiro,Fecha_Retiro,Hora_Retiro,Ciclo_Estacion_Arribo,Fecha_Arribo,Hora_Arribo
0,M,44.0,4357,442.0,01/02/2020,0:00:38,116,01/02/2020,0:35:17
1,M,22.0,12083,66.0,01/02/2020,0:00:53,37,01/02/2020,0:06:23
2,M,29.0,11562,331.0,01/02/2020,0:00:55,341,01/02/2020,0:26:47
3,M,27.0,10206,164.0,01/02/2020,0:01:18,35,01/02/2020,0:16:51
4,M,27.0,10101,120.0,01/02/2020,0:01:18,47,01/02/2020,0:12:39
...,...,...,...,...,...,...,...,...,...
71070665,M,25.0,474,12.0,2011-07-30,23:44:04.383000,20,2011-07-31,01:42:19.887000
71070666,F,37.0,476,76.0,2011-07-30,23:55:36.703000,64,2011-07-31,00:01:11.147000
71070667,M,25.0,491,48.0,2011-07-30,23:56:50.237000,26,2011-07-31,00:01:42.333000
71070668,M,31.0,868,76.0,2011-07-30,23:56:51.347000,64,2011-07-31,00:01:19.100000


In [20]:
# Export clean CSV file to master CSV file

tic = timeit.default_timer()                            # Monitor performance

ecobici_rides_df_test.to_csv("Gitignore-data-output-files/ecobici_ride_data-unified.csv", index=False) 

toc = timeit.default_timer()                            # Monitor performance
print(f'Time (in seconds) to export unified CSV file: {round(toc - tic, 2)}')


Time (in seconds) to export unified CSV file: 228.69


In [35]:
# Determine data types in each column

for col in ecobici_rides_df_test.columns:
    print(f'{col}, type: {type(ecobici_rides_df_test.loc[0, col])}')
    

Genero_Usuario, type: <class 'str'>
Edad_Usuario, type: <class 'numpy.float64'>
Bici, type: <class 'int'>
Ciclo_Estacion_Retiro, type: <class 'numpy.float64'>
Fecha_Retiro, type: <class 'str'>
Hora_Retiro, type: <class 'str'>
Ciclo_Estacion_Arribo, type: <class 'int'>
Fecha_Arribo, type: <class 'str'>
Hora_Arribo, type: <class 'str'>


In [37]:
# Convert the Fecha columns into datetime

tic = timeit.default_timer()                            # Monitor performance

ecobici_rides_df_test['Fecha_Arribo'] = pd.to_datetime(ecobici_rides_df_test['Fecha_Retiro'])

toc = timeit.default_timer()                            # Monitor performance
print(f'Time (in seconds) to convert column data type: {round(toc - tic, 2)}')
print (type(ecobici_rides_df_test.loc[0, 'Fecha_Arribo']) )

ecobici_rides_df_test

Time (in seconds) to convert column data type: 9.99
<class 'str'>


Unnamed: 0,Genero_Usuario,Edad_Usuario,Bici,Ciclo_Estacion_Retiro,Fecha_Retiro,Hora_Retiro,Ciclo_Estacion_Arribo,Fecha_Arribo,Hora_Arribo
0,M,44.0,4357,442.0,2020-01-02,0:00:38,116,2020-01-02,0:35:17
1,M,22.0,12083,66.0,2020-01-02,0:00:53,37,2020-01-02,0:06:23
2,M,29.0,11562,331.0,2020-01-02,0:00:55,341,2020-01-02,0:26:47
3,M,27.0,10206,164.0,2020-01-02,0:01:18,35,2020-01-02,0:16:51
4,M,27.0,10101,120.0,2020-01-02,0:01:18,47,2020-01-02,0:12:39
...,...,...,...,...,...,...,...,...,...
71070665,M,25.0,474,12.0,2011-07-30,23:44:04.383000,20,2011-07-30,01:42:19.887000
71070666,F,37.0,476,76.0,2011-07-30,23:55:36.703000,64,2011-07-30,00:01:11.147000
71070667,M,25.0,491,48.0,2011-07-30,23:56:50.237000,26,2011-07-30,00:01:42.333000
71070668,M,31.0,868,76.0,2011-07-30,23:56:51.347000,64,2011-07-30,00:01:19.100000


In [47]:
for i in range(len(ecobici_rides_df_test):
    try: i['Timestamp'] = pd.to_datetime(ecobici_rides_df_test.loc[row,'Fecha_Retiro'] + ' ' + row.loc[:,'Hora_Retiro'] )
    except: 
        ecobici_rides_df_test.drop(row, inplace=True, axis=0)
        print(f'Row {row} dropped')
                                          
ecobici_rides_df_test                                                 

KeyError: "['Genero_Usuario'] not found in axis"