In [3]:
# Dependencies

import pandas as pd
import numpy as np
import glob
import os
import timeit                                # To check performance
from datetime import datetime, date, time, timedelta

In [4]:
# Load the rides CSV (the one-in-ten-thousand file, per its name) into a
# DataFrame and report how long the read took.

tic = timeit.default_timer()                            # Monitor performance

ecobici_rides_df = pd.read_csv("dataset_export_files/ecobici_ride_data-oneInTenThousand.csv", index_col=None, header=0)

toc = timeit.default_timer()                            # Monitor performance
print(f'Time (in seconds) to load CSV file: {round(toc - tic, 2)}')

Time (in seconds) to export unified CSV file: 0.02


In [6]:
# Print, for every column, the Python type of the value stored in the row
# labelled 0. This inspects a single sample value, not the column's dtype.
first_row_label = 0
for col in ecobici_rides_df.columns:
    sample_value = ecobici_rides_df.loc[first_row_label, col]
    print(f'{col}, type: {type(sample_value)}')
    


Genero_Usuario, type: <class 'str'>
Edad_Usuario, type: <class 'numpy.float64'>
Bici, type: <class 'numpy.float64'>
Ciclo_Estacion_Retiro, type: <class 'numpy.float64'>
Fecha_Retiro, type: <class 'str'>
Hora_Retiro, type: <class 'str'>
Ciclo_Estacion_Arribo, type: <class 'numpy.int64'>
Fecha_Arribo, type: <class 'str'>
Hora_Arribo, type: <class 'str'>


In [7]:
# Keep only the rows that have no missing value in any column, as an
# independent copy so the frame loaded from disk stays untouched.
ecobici_rides_df_test = ecobici_rides_df.dropna(axis=0, how='any').copy()
ecobici_rides_df_test.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 7071 entries, 0 to 7070
Data columns (total 9 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   Genero_Usuario         7071 non-null   object 
 1   Edad_Usuario           7071 non-null   float64
 2   Bici                   7071 non-null   float64
 3   Ciclo_Estacion_Retiro  7071 non-null   float64
 4   Fecha_Retiro           7071 non-null   object 
 5   Hora_Retiro            7071 non-null   object 
 6   Ciclo_Estacion_Arribo  7071 non-null   int64  
 7   Fecha_Arribo           7071 non-null   object 
 8   Hora_Arribo            7071 non-null   object 
dtypes: float64(3), int64(1), object(5)
memory usage: 552.4+ KB


In [8]:
# Add an empty Retiro_Timestamp column that will later hold the ride start time.
# The placeholder is pd.NaT rather than np.nan so the column is datetime-typed
# from the start: Timestamps can then be stored in it without the column
# falling back to a generic object dtype.

tic = timeit.default_timer()                            # Monitor performance

ecobici_rides_df_test["Retiro_Timestamp"] = pd.NaT

toc = timeit.default_timer()                            # Monitor performance
print(f'Time (in seconds) to add column {round(toc - tic, 2)}')

ecobici_rides_df_test

Time (in seconds) to add column 0.0


Unnamed: 0,Genero_Usuario,Edad_Usuario,Bici,Ciclo_Estacion_Retiro,Fecha_Retiro,Hora_Retiro,Ciclo_Estacion_Arribo,Fecha_Arribo,Hora_Arribo,Retiro_Timestamp
0,M,28.0,69.0,85.0,2010-02-16,12:42:32.160000,85,2010-02-16,12:45:37.427000,
1,M,46.0,382.0,25.0,2010-03-19,08:31:16.170000,31,2010-03-19,08:37:12.663000,
2,M,42.0,1008.0,71.0,2010-03-30,20:47:13.610000,82,2010-03-30,21:04:39.033000,
3,M,66.0,652.0,73.0,2010-04-08,15:02:37.003000,54,2010-04-08,15:09:24.123000,
4,F,51.0,994.0,17.0,2010-04-15,18:57:13.127000,20,2010-04-15,19:03:19.010000,
...,...,...,...,...,...,...,...,...,...,...
7066,M,49.0,7745.0,68.0,2021-12-02,17:47:22,128,2021-12-02,18:08:18,
7067,M,39.0,11330.0,135.0,2021-12-03,09:37:51,59,2021-12-03,09:51:42,
7068,M,41.0,10403.0,41.0,2021-12-03,02:09:54,274,2021-12-03,02:24:58,
7069,F,53.0,10228.0,237.0,2021-12-04,04:31:36,208,2021-12-04,04:49:36,


In [9]:
# Build the ride start ('Retiro_Timestamp') and end ('Usage Timestamp') columns
# from the separate date and time text columns.
#
# Each value is parsed on its own because the time strings appear in more than
# one layout (with and without fractional seconds). Rows whose start or end
# cannot be parsed are reported and removed only after all parsing is done, so
# the frame is never modified while it is being walked and no row is left
# half-filled.

tic = timeit.default_timer()                            # Monitor performance


def _parse_timestamp(date_text, time_text):
    """Return the Timestamp for '<date> <time>', or NaT when it cannot be parsed."""
    try:
        return pd.to_datetime(date_text + ' ' + time_text)
    except (TypeError, ValueError, OverflowError):
        # TypeError: a non-string value; ValueError/OverflowError: unparseable text.
        return pd.NaT


retiro_timestamps = pd.Series(
    [_parse_timestamp(date_text, time_text)
     for date_text, time_text in zip(ecobici_rides_df_test['Fecha_Retiro'],
                                     ecobici_rides_df_test['Hora_Retiro'])],
    index=ecobici_rides_df_test.index,
    dtype='datetime64[ns]',
)
arribo_timestamps = pd.Series(
    [_parse_timestamp(date_text, time_text)
     for date_text, time_text in zip(ecobici_rides_df_test['Fecha_Arribo'],
                                     ecobici_rides_df_test['Hora_Arribo'])],
    index=ecobici_rides_df_test.index,
    dtype='datetime64[ns]',
)

# A ride is unusable if either end of it failed to parse.
unparsed_rows = ecobici_rides_df_test.index[retiro_timestamps.isna() | arribo_timestamps.isna()]
for index in unparsed_rows:
    print(f'ERROR: ROW {index} DROPPED')

ecobici_rides_df_test['Retiro_Timestamp'] = retiro_timestamps
ecobici_rides_df_test['Usage Timestamp'] = arribo_timestamps
ecobici_rides_df_test.drop(index=unparsed_rows, inplace=True)

toc = timeit.default_timer()                            # Monitor performance
print(f'Time (in seconds) to process data: {round(toc - tic, 2)}')

ecobici_rides_df_test

We're up to 0!
Time (in seconds) to process data: 2.07


Unnamed: 0,Genero_Usuario,Edad_Usuario,Bici,Ciclo_Estacion_Retiro,Fecha_Retiro,Hora_Retiro,Ciclo_Estacion_Arribo,Fecha_Arribo,Hora_Arribo,Retiro_Timestamp,Usage Timestamp
0,M,28.0,69.0,85.0,2010-02-16,12:42:32.160000,85,2010-02-16,12:45:37.427000,2010-02-16 12:42:32.160000,2010-02-16 12:45:37.427
1,M,46.0,382.0,25.0,2010-03-19,08:31:16.170000,31,2010-03-19,08:37:12.663000,2010-03-19 08:31:16.170000,2010-03-19 08:37:12.663
2,M,42.0,1008.0,71.0,2010-03-30,20:47:13.610000,82,2010-03-30,21:04:39.033000,2010-03-30 20:47:13.610000,2010-03-30 21:04:39.033
3,M,66.0,652.0,73.0,2010-04-08,15:02:37.003000,54,2010-04-08,15:09:24.123000,2010-04-08 15:02:37.003000,2010-04-08 15:09:24.123
4,F,51.0,994.0,17.0,2010-04-15,18:57:13.127000,20,2010-04-15,19:03:19.010000,2010-04-15 18:57:13.127000,2010-04-15 19:03:19.010
...,...,...,...,...,...,...,...,...,...,...,...
7066,M,49.0,7745.0,68.0,2021-12-02,17:47:22,128,2021-12-02,18:08:18,2021-12-02 17:47:22,2021-12-02 18:08:18.000
7067,M,39.0,11330.0,135.0,2021-12-03,09:37:51,59,2021-12-03,09:51:42,2021-12-03 09:37:51,2021-12-03 09:51:42.000
7068,M,41.0,10403.0,41.0,2021-12-03,02:09:54,274,2021-12-03,02:24:58,2021-12-03 02:09:54,2021-12-03 02:24:58.000
7069,F,53.0,10228.0,237.0,2021-12-04,04:31:36,208,2021-12-04,04:49:36,2021-12-04 04:31:36,2021-12-04 04:49:36.000


In [10]:
# Ride duration = arrival timestamp minus checkout timestamp, row by row.
arrival_times = ecobici_rides_df_test['Usage Timestamp']
checkout_times = ecobici_rides_df_test['Retiro_Timestamp']
ecobici_rides_df_test['Duration'] = arrival_times - checkout_times
ecobici_rides_df_test.head()



Unnamed: 0,Genero_Usuario,Edad_Usuario,Bici,Ciclo_Estacion_Retiro,Fecha_Retiro,Hora_Retiro,Ciclo_Estacion_Arribo,Fecha_Arribo,Hora_Arribo,Retiro_Timestamp,Usage Timestamp,Duration
0,M,28.0,69.0,85.0,2010-02-16,12:42:32.160000,85,2010-02-16,12:45:37.427000,2010-02-16 12:42:32.160000,2010-02-16 12:45:37.427,0 days 00:03:05.267000
1,M,46.0,382.0,25.0,2010-03-19,08:31:16.170000,31,2010-03-19,08:37:12.663000,2010-03-19 08:31:16.170000,2010-03-19 08:37:12.663,0 days 00:05:56.493000
2,M,42.0,1008.0,71.0,2010-03-30,20:47:13.610000,82,2010-03-30,21:04:39.033000,2010-03-30 20:47:13.610000,2010-03-30 21:04:39.033,0 days 00:17:25.423000
3,M,66.0,652.0,73.0,2010-04-08,15:02:37.003000,54,2010-04-08,15:09:24.123000,2010-04-08 15:02:37.003000,2010-04-08 15:09:24.123,0 days 00:06:47.120000
4,F,51.0,994.0,17.0,2010-04-15,18:57:13.127000,20,2010-04-15,19:03:19.010000,2010-04-15 18:57:13.127000,2010-04-15 19:03:19.010,0 days 00:06:05.883000


In [11]:
# Summary statistics of the ride durations. Despite its name, the variable
# holds the whole describe() result, not only the mean.
duration_average = ecobici_rides_df_test.loc[:, 'Duration'].describe()
duration_average

count                         7071
mean     0 days 00:45:02.089011030
std      0 days 19:09:50.615768287
min              -1 days +12:11:47
25%                0 days 00:06:32
50%         0 days 00:10:31.213000
75%                0 days 00:17:09
max               30 days 02:23:47
Name: Duration, dtype: object

In [12]:
# Continue on a separate deep copy so the frame that still carries the raw
# date/time columns is preserved.
ecobici_rides_df_test2 = ecobici_rides_df_test.copy(deep=True)
ecobici_rides_df_test2

Unnamed: 0,Genero_Usuario,Edad_Usuario,Bici,Ciclo_Estacion_Retiro,Fecha_Retiro,Hora_Retiro,Ciclo_Estacion_Arribo,Fecha_Arribo,Hora_Arribo,Retiro_Timestamp,Usage Timestamp,Duration
0,M,28.0,69.0,85.0,2010-02-16,12:42:32.160000,85,2010-02-16,12:45:37.427000,2010-02-16 12:42:32.160000,2010-02-16 12:45:37.427,0 days 00:03:05.267000
1,M,46.0,382.0,25.0,2010-03-19,08:31:16.170000,31,2010-03-19,08:37:12.663000,2010-03-19 08:31:16.170000,2010-03-19 08:37:12.663,0 days 00:05:56.493000
2,M,42.0,1008.0,71.0,2010-03-30,20:47:13.610000,82,2010-03-30,21:04:39.033000,2010-03-30 20:47:13.610000,2010-03-30 21:04:39.033,0 days 00:17:25.423000
3,M,66.0,652.0,73.0,2010-04-08,15:02:37.003000,54,2010-04-08,15:09:24.123000,2010-04-08 15:02:37.003000,2010-04-08 15:09:24.123,0 days 00:06:47.120000
4,F,51.0,994.0,17.0,2010-04-15,18:57:13.127000,20,2010-04-15,19:03:19.010000,2010-04-15 18:57:13.127000,2010-04-15 19:03:19.010,0 days 00:06:05.883000
...,...,...,...,...,...,...,...,...,...,...,...,...
7066,M,49.0,7745.0,68.0,2021-12-02,17:47:22,128,2021-12-02,18:08:18,2021-12-02 17:47:22,2021-12-02 18:08:18.000,0 days 00:20:56
7067,M,39.0,11330.0,135.0,2021-12-03,09:37:51,59,2021-12-03,09:51:42,2021-12-03 09:37:51,2021-12-03 09:51:42.000,0 days 00:13:51
7068,M,41.0,10403.0,41.0,2021-12-03,02:09:54,274,2021-12-03,02:24:58,2021-12-03 02:09:54,2021-12-03 02:24:58.000,0 days 00:15:04
7069,F,53.0,10228.0,237.0,2021-12-04,04:31:36,208,2021-12-04,04:49:36,2021-12-04 04:31:36,2021-12-04 04:49:36.000,0 days 00:18:00


In [13]:
# Remove the five date/time columns that are no longer needed now that
# 'Usage Timestamp' and 'Duration' have been derived from them.

tic = timeit.default_timer()                            # Monitor performance

obsolete_columns = [
    'Fecha_Retiro',
    'Hora_Retiro',
    'Fecha_Arribo',
    'Hora_Arribo',
    'Retiro_Timestamp',
]
ecobici_rides_df_test2.drop(columns=obsolete_columns, inplace=True)

toc = timeit.default_timer()
print(f'Processing time (in seconds): {round(toc - tic, 2)}')

ecobici_rides_df_test2

Processing time (in seconds): 0.0


Unnamed: 0,Genero_Usuario,Edad_Usuario,Bici,Ciclo_Estacion_Retiro,Ciclo_Estacion_Arribo,Usage Timestamp,Duration
0,M,28.0,69.0,85.0,85,2010-02-16 12:45:37.427,0 days 00:03:05.267000
1,M,46.0,382.0,25.0,31,2010-03-19 08:37:12.663,0 days 00:05:56.493000
2,M,42.0,1008.0,71.0,82,2010-03-30 21:04:39.033,0 days 00:17:25.423000
3,M,66.0,652.0,73.0,54,2010-04-08 15:09:24.123,0 days 00:06:47.120000
4,F,51.0,994.0,17.0,20,2010-04-15 19:03:19.010,0 days 00:06:05.883000
...,...,...,...,...,...,...,...
7066,M,49.0,7745.0,68.0,128,2021-12-02 18:08:18.000,0 days 00:20:56
7067,M,39.0,11330.0,135.0,59,2021-12-03 09:51:42.000,0 days 00:13:51
7068,M,41.0,10403.0,41.0,274,2021-12-03 02:24:58.000,0 days 00:15:04
7069,F,53.0,10228.0,237.0,208,2021-12-04 04:49:36.000,0 days 00:18:00


In [91]:
# NOTE(review): no cell shown in this notebook creates a column named
# "Arribo_Timestamp" (the arrival column is called "Usage Timestamp" at this
# point), so with rename's default handling of missing labels this call leaves
# the columns unchanged. This cell's execution count (91) is also out of
# sequence with its neighbours -- confirm whether it is a leftover from an
# earlier version of the notebook.
ecobici_rides_df_test2 = ecobici_rides_df_test2.rename(
    {"Arribo_Timestamp": "Usage_Timestamp"},
    axis="columns",
)
ecobici_rides_df_test2

Unnamed: 0,Genero_Usuario,Edad_Usuario,Bici,Ciclo_Estacion_Retiro,Ciclo_Estacion_Arribo,Usage_Timestamp,Duration
0,M,28.0,69,85.0,85,2010-02-16 12:45:37.427,0 days 00:03:05.267000
1,M,35.0,694,78.0,63,2010-02-19 13:52:30.700,0 days 00:12:08.983000
2,M,34.0,35,85.0,23,2010-02-21 14:30:46.390,0 days 00:26:48.800000
3,M,46.0,474,20.0,17,2010-02-22 16:17:10.470,0 days 00:06:23.507000
4,F,28.0,490,74.0,46,2010-02-23 18:51:57.057,0 days 00:14:56.160000
...,...,...,...,...,...,...,...
707034,M,27.0,7221.0,316.0,54,2021-12-05 09:29:40.000,0 days 00:13:21
707035,M,49.0,8699.0,158.0,83,2021-12-05 09:24:30.000,0 days 00:10:46
707036,M,33.0,8805.0,417.0,313,2021-12-05 08:55:46.000,0 days 00:12:03
707037,F,23.0,15286.0,467.0,260,2021-12-05 08:54:12.000,0 days 00:33:40


In [14]:
# Remove the 'Bici' column from the working frame.

tic = timeit.default_timer()                            # Monitor performance

ecobici_rides_df_test2.drop(columns=['Bici'], inplace=True)

toc = timeit.default_timer()
print(f'Processing time (in seconds): {round(toc - tic, 2)}')

ecobici_rides_df_test2

Processing time (in seconds): 0.0


Unnamed: 0,Genero_Usuario,Edad_Usuario,Ciclo_Estacion_Retiro,Ciclo_Estacion_Arribo,Usage Timestamp,Duration
0,M,28.0,85.0,85,2010-02-16 12:45:37.427,0 days 00:03:05.267000
1,M,46.0,25.0,31,2010-03-19 08:37:12.663,0 days 00:05:56.493000
2,M,42.0,71.0,82,2010-03-30 21:04:39.033,0 days 00:17:25.423000
3,M,66.0,73.0,54,2010-04-08 15:09:24.123,0 days 00:06:47.120000
4,F,51.0,17.0,20,2010-04-15 19:03:19.010,0 days 00:06:05.883000
...,...,...,...,...,...,...
7066,M,49.0,68.0,128,2021-12-02 18:08:18.000,0 days 00:20:56
7067,M,39.0,135.0,59,2021-12-03 09:51:42.000,0 days 00:13:51
7068,M,41.0,41.0,274,2021-12-03 02:24:58.000,0 days 00:15:04
7069,F,53.0,237.0,208,2021-12-04 04:49:36.000,0 days 00:18:00


In [15]:
# Cast the age and the two station-id columns to int.
whole_number_columns = ['Edad_Usuario', 'Ciclo_Estacion_Retiro', 'Ciclo_Estacion_Arribo']
ecobici_rides_df_test2[whole_number_columns] = (
    ecobici_rides_df_test2[whole_number_columns].astype(int)
)
ecobici_rides_df_test2

Unnamed: 0,Genero_Usuario,Edad_Usuario,Ciclo_Estacion_Retiro,Ciclo_Estacion_Arribo,Usage Timestamp,Duration
0,M,28,85,85,2010-02-16 12:45:37.427,0 days 00:03:05.267000
1,M,46,25,31,2010-03-19 08:37:12.663,0 days 00:05:56.493000
2,M,42,71,82,2010-03-30 21:04:39.033,0 days 00:17:25.423000
3,M,66,73,54,2010-04-08 15:09:24.123,0 days 00:06:47.120000
4,F,51,17,20,2010-04-15 19:03:19.010,0 days 00:06:05.883000
...,...,...,...,...,...,...
7066,M,49,68,128,2021-12-02 18:08:18.000,0 days 00:20:56
7067,M,39,135,59,2021-12-03 09:51:42.000,0 days 00:13:51
7068,M,41,41,274,2021-12-03 02:24:58.000,0 days 00:15:04
7069,F,53,237,208,2021-12-04 04:49:36.000,0 days 00:18:00


In [16]:
# Express every ride's Duration in minutes, rounded to one decimal place.
#
# total_seconds() is used rather than Timedelta.seconds: `.seconds` only holds
# the seconds component within a single day, so it silently ignores whole days
# (a 30-day ride would look like a couple of hours) and misreports negative
# durations. With total_seconds(), multi-day rides keep their full length and
# negative durations stay negative, so they can be spotted downstream.

tic = timeit.default_timer()                            # Monitor performance

# to_timedelta normalises the column to a timedelta dtype even if it is
# currently stored as generic Python objects.
ride_durations = pd.to_timedelta(ecobici_rides_df_test2['Duration'])
ecobici_rides_df_test2['Duration - minutes'] = (ride_durations.dt.total_seconds() / 60).round(1)

toc = timeit.default_timer()                            # Monitor performance
print(f'Time (in seconds) to process data: {round(toc - tic, 2)}')

ecobici_rides_df_test2

We're up to 0!
Time (in seconds) to process data: 0.76


Unnamed: 0,Genero_Usuario,Edad_Usuario,Ciclo_Estacion_Retiro,Ciclo_Estacion_Arribo,Usage Timestamp,Duration,Duration - minutes
0,M,28,85,85,2010-02-16 12:45:37.427,0 days 00:03:05.267000,3.1
1,M,46,25,31,2010-03-19 08:37:12.663,0 days 00:05:56.493000,5.9
2,M,42,71,82,2010-03-30 21:04:39.033,0 days 00:17:25.423000,17.4
3,M,66,73,54,2010-04-08 15:09:24.123,0 days 00:06:47.120000,6.8
4,F,51,17,20,2010-04-15 19:03:19.010,0 days 00:06:05.883000,6.1
...,...,...,...,...,...,...,...
7066,M,49,68,128,2021-12-02 18:08:18.000,0 days 00:20:56,20.9
7067,M,39,135,59,2021-12-03 09:51:42.000,0 days 00:13:51,13.8
7068,M,41,41,274,2021-12-03 02:24:58.000,0 days 00:15:04,15.1
7069,F,53,237,208,2021-12-04 04:49:36.000,0 days 00:18:00,18.0


In [17]:
# Drop the Timedelta-valued 'Duration' column; the numeric
# 'Duration - minutes' column is kept.
ecobici_rides_df_test2.drop(columns=['Duration'], inplace=True)

In [18]:
# Give the two derived columns their final names.
final_column_names = {
    "Usage Timestamp": "Usage_Timestamp",
    'Duration - minutes': 'Duration(Min)',
}
ecobici_rides_df_test2 = ecobici_rides_df_test2.rename(columns=final_column_names)
ecobici_rides_df_test2

Unnamed: 0,Genero_Usuario,Edad_Usuario,Ciclo_Estacion_Retiro,Ciclo_Estacion_Arribo,Usage_Timestamp,Duration(Min)
0,M,28,85,85,2010-02-16 12:45:37.427,3.1
1,M,46,25,31,2010-03-19 08:37:12.663,5.9
2,M,42,71,82,2010-03-30 21:04:39.033,17.4
3,M,66,73,54,2010-04-08 15:09:24.123,6.8
4,F,51,17,20,2010-04-15 19:03:19.010,6.1
...,...,...,...,...,...,...
7066,M,49,68,128,2021-12-02 18:08:18.000,20.9
7067,M,39,135,59,2021-12-03 09:51:42.000,13.8
7068,M,41,41,274,2021-12-03 02:24:58.000,15.1
7069,F,53,237,208,2021-12-04 04:49:36.000,18.0


In [20]:
# Write the cleaned rides table to a new CSV file (row index omitted) and
# report how long the write took.

tic = timeit.default_timer()                            # Monitor performance

ecobici_rides_df_test2.to_csv(
    "dataset_export_files/ecobici_ride_data-oneInTenThousandv2.csv",
    index=False,
)

toc = timeit.default_timer()                            # Monitor performance
elapsed_seconds = round(toc - tic, 2)
print(f'Time (in seconds) to export CSV file: {elapsed_seconds}')

ecobici_rides_df_test2.head()

Time (in seconds) to export CSV file: 0.03


Unnamed: 0,Genero_Usuario,Edad_Usuario,Ciclo_Estacion_Retiro,Ciclo_Estacion_Arribo,Usage_Timestamp,Duration(Min)
0,M,28,85,85,2010-02-16 12:45:37.427,3.1
1,M,46,25,31,2010-03-19 08:37:12.663,5.9
2,M,42,71,82,2010-03-30 21:04:39.033,17.4
3,M,66,73,54,2010-04-08 15:09:24.123,6.8
4,F,51,17,20,2010-04-15 19:03:19.010,6.1


# Continue exploration

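- TODO: describe the next exploration step (placeholder text was left here)
- TODO: describe the next exploration step (placeholder text was left here)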