In [1]:
# Import dependencies
import glob
import os

import numpy as np
import pandas as pd

In [2]:
# Load a single monthly trip file and preview its first rows to get a feel
# for the columns before merging all files.
sample_path = "data/201912-citibike-tripdata.csv"
df_201912 = pd.read_csv(sample_path)
df_201912.head(n=5)

Unnamed: 0,tripduration,starttime,stoptime,start station id,start station name,start station latitude,start station longitude,end station id,end station name,end station latitude,end station longitude,bikeid,usertype,birth year,gender
0,602,2019-12-01 00:00:05.5640,2019-12-01 00:10:07.8180,3382,Carroll St & Smith St,40.680611,-73.994758,3304,6 Ave & 9 St,40.668127,-73.983776,41932,Subscriber,1970,1
1,1206,2019-12-01 00:00:10.9630,2019-12-01 00:20:17.8820,362,Broadway & W 37 St,40.751726,-73.987535,500,Broadway & W 51 St,40.762288,-73.983362,18869,Customer,1999,1
2,723,2019-12-01 00:00:11.8180,2019-12-01 00:12:14.8310,146,Hudson St & Reade St,40.71625,-74.009106,238,Bank St & Washington St,40.736197,-74.008592,15334,Subscriber,1997,1
3,404,2019-12-01 00:00:12.2200,2019-12-01 00:06:56.8860,3834,Irving Ave & Halsey St,40.69467,-73.90663,3827,Halsey St & Broadway,40.68565,-73.91564,41692,Customer,1995,1
4,1059,2019-12-01 00:00:14.7230,2019-12-01 00:17:54.1860,500,Broadway & W 51 St,40.762288,-73.983362,3323,W 106 St & Central Park West,40.798186,-73.960591,40156,Subscriber,1961,1


In [3]:
# Collect the CSV files to merge.
# - sorted() makes the merge order deterministic; the order glob returns
#   depends on the file system.
# - The consolidated output (conso_file.csv) is written into this same folder,
#   so it is excluded here; otherwise a re-run would merge the previous result
#   into itself.
files_to_merge = sorted(
    file for file in glob.glob(r'data/*.csv')
    if os.path.basename(file) != "conso_file.csv"
)

In [4]:
# Count the columns of every file by parsing only its header row (nrows=0),
# then report the extremes: equal max and min means all files have the same width.
number_of_cols = [
    len(pd.read_csv(csv_path, nrows=0).columns) for csv_path in files_to_merge
]

widest = max(number_of_cols)
narrowest = min(number_of_cols)
print(f"Maximum number of columns: {widest}")
print(f"Minimum number of columns: {narrowest}")

Maximum number of columns: 15
Minimum number of columns: 15


In [5]:
# Stack the header-only frames of all files; the resulting column index is the
# union of every column name that appears in any file.
header_frames = [pd.read_csv(csv_path, nrows=0) for csv_path in files_to_merge]
col_names = pd.concat(header_frames, ignore_index=True).columns
col_names

Index(['tripduration', 'starttime', 'stoptime', 'start station id',
       'start station name', 'start station latitude',
       'start station longitude', 'end station id', 'end station name',
       'end station latitude', 'end station longitude', 'bikeid', 'usertype',
       'birth year', 'gender', 'Trip Duration', 'Start Time', 'Stop Time',
       'Start Station ID', 'Start Station Name', 'Start Station Latitude',
       'Start Station Longitude', 'End Station ID', 'End Station Name',
       'End Station Latitude', 'End Station Longitude', 'Bike ID', 'User Type',
       'Birth Year', 'Gender'],
      dtype='object')

As the output above shows, the files use two different naming conventions for the same 15 columns (e.g. `tripduration` vs. `Trip Duration`). Hence, we'll rename them to a single, consistent set of column names.

In [6]:
# Unified column names applied to every file when merging, listed in the same
# positional order as the columns in the source files.
# NOTE(review): "bikeid" and "usertype" are not snake_cased like the others;
# kept as-is so the output schema does not change.
column_names = [
    "trip_duration",
    "start_time",
    "stop_time",
    "start_station_id",
    "start_station_name",
    "start_station_latitude",
    "start_station_longitude",
    "end_station_id",
    "end_station_name",
    "end_station_latitude",
    "end_station_longitude",
    "bikeid",
    "usertype",
    "birth_year",
    "gender",
]

In [7]:
# Merge all monthly files into one consolidated CSV, streaming in chunks so
# that no file has to be held in memory in full.
CHUNK_SIZE = 200000
OUTPUT_FILE = "data/conso_file.csv"

# Becomes True after the first chunk is written. It controls two things:
#   * the header row is written exactly once (to_csv writes a header by
#     default, which would otherwise repeat it for every chunk), and
#   * the first write truncates the output ("w") so that re-running this cell
#     does not append a second copy of the data to an existing file.
header_written = False

for file in files_to_merge:

    # The output lives in the same folder as the inputs; never read it back in.
    if os.path.basename(file) == os.path.basename(OUTPUT_FILE):
        continue

    # header=0 together with names=... discards each file's own header row and
    # applies the unified column names, regardless of which naming convention
    # the file uses.
    # The result is a TextFileReader (an iterator of DataFrames), not a DataFrame.
    df_chunk = pd.read_csv(file, chunksize=CHUNK_SIZE, header=0, names=column_names)

    for chunk in df_chunk:
        # Append each chunk to the consolidated csv file
        chunk.to_csv(
            OUTPUT_FILE,
            mode="a" if header_written else "w",
            header=not header_written,
            index=False,
        )
        header_written = True