In [2]:
import requests
import pandas as pd
from zipfile import ZipFile
import shutil
from shapely.ops import unary_union
from tqdm.notebook import tqdm

import os

# NYC Bikes #

In [3]:
city = 'NYC'
transp = 'Bike'
directory = "./data/"+transp+city+"/"

In [11]:
%%time


for year in range (2018,2020):
    filename =str(year)+'-citibike-tripdata.zip'
    url = "https://s3.amazonaws.com/tripdata/" + filename
    req = requests.get(url)
    url_content = req.content
    if not os.path.exists(directory):
        os.makedirs(directory)

    csv_file = open(directory + filename, 'wb')
    csv_file.write(url_content)
    csv_file.close()

https://s3.amazonaws.com/tripdata/2018-citibike-tripdata.zip
https://s3.amazonaws.com/tripdata/2019-citibike-tripdata.zip


In [12]:
%%time
for filename in os.listdir(directory):
    if filename != '.DS_Store':
        print(directory + filename)
        with ZipFile(directory + filename, 'r') as zipObject:
            listOfFileNames = zipObject.namelist()
            for fileName in listOfFileNames:
                if fileName.endswith('.csv'):
                    # Extract a single file from zip
                    zipObject.extract(fileName, directory)
            print('All the csv files are extracted')


./data/BikeNYC/2019-citibike-tripdata.zip
./data/BikeNYC/2019-citibike-tripdata.zip
All the csv files are extracted
./data/BikeNYC/2018-citibike-tripdata.zip
./data/BikeNYC/2018-citibike-tripdata.zip
All the csv files are extracted


In [13]:
for filename in os.listdir(directory):
    if filename.endswith(".zip"):
            os.remove(os.path.join(directory, filename))
    if filename == '__MACOSX':
        shutil.rmtree(os.path.join(directory, filename))
    if filename == '.DS_Store':
        os.remove(os.path.join(directory, filename))

In [20]:

def flatten_folder_structure(source_folder, target_folder):
    # Ensure the target folder exists
    os.makedirs(target_folder, exist_ok=True)
    
    # Walk through all directories and subdirectories
    for root, dirs, files in os.walk(source_folder):
        for file in files:
            # Check if the file is a CSV
            if file.endswith('.csv'):
                # Construct full file path
                file_path = os.path.join(root, file)
                # Construct target path
                target_path = os.path.join(target_folder, file)
                # Move file
                shutil.move(file_path, target_path)
                print(f'Moved: {file_path} to {target_path}')
                
    # Optional: Remove empty directories
    for root, dirs, files in os.walk(source_folder, topdown=False):
        for dir in dirs:
            dir_path = os.path.join(root, dir)
            # Remove the directory if empty
            if not os.listdir(dir_path):
                os.rmdir(dir_path)
                print(f'Removed empty directory: {dir_path}')

# Example usage
source = directory
target = directory  # If you want them in the same main folder
flatten_folder_structure(source, target)


Moved: ./data/BikeNYC/2019-citibike-tripdata/7_July/201907-citibike-tripdata_1.csv to ./data/BikeNYC/201907-citibike-tripdata_1.csv
Moved: ./data/BikeNYC/2019-citibike-tripdata/7_July/201907-citibike-tripdata_2.csv to ./data/BikeNYC/201907-citibike-tripdata_2.csv
Moved: ./data/BikeNYC/2019-citibike-tripdata/7_July/201907-citibike-tripdata_3.csv to ./data/BikeNYC/201907-citibike-tripdata_3.csv
Moved: ./data/BikeNYC/2019-citibike-tripdata/12_December/201912-citibike-tripdata_1.csv to ./data/BikeNYC/201912-citibike-tripdata_1.csv
Moved: ./data/BikeNYC/2019-citibike-tripdata/6_June/201906-citibike-tripdata_2.csv to ./data/BikeNYC/201906-citibike-tripdata_2.csv
Moved: ./data/BikeNYC/2019-citibike-tripdata/6_June/201906-citibike-tripdata_3.csv to ./data/BikeNYC/201906-citibike-tripdata_3.csv
Moved: ./data/BikeNYC/2019-citibike-tripdata/6_June/201906-citibike-tripdata_1.csv to ./data/BikeNYC/201906-citibike-tripdata_1.csv
Moved: ./data/BikeNYC/2019-citibike-tripdata/1_January/201901-citibike-

In [4]:
#in directory merge the csv files with the same 6 characters in the name

def merge_csv_files(directory):
    # Create a dictionary to store the dataframes
    dfs = {}
    # Loop through all files in the directory
    for file in os.listdir(directory):
        # Check if the file is a CSV
        if file.endswith('.csv'):
            # Extract the key
            key = file[:6]
            # Read the CSV
            df = pd.read_csv(os.path.join(directory, file))
            # Store the dataframe in the dictionary
            if key in dfs:
                dfs[key] = pd.concat([dfs[key], df], ignore_index=True)
            else:
                dfs[key] = df
    # Return the dictionary
    return dfs

# Merge the CSV files
dfs = merge_csv_files(directory)


{'201903':          tripduration                 starttime                  stoptime  \
 0                 484  2019-03-25 22:44:13.9760  2019-03-25 22:52:18.7630   
 1                 254  2019-03-25 22:44:21.9740  2019-03-25 22:48:36.0380   
 2                 699  2019-03-25 22:44:22.2130  2019-03-25 22:56:02.1460   
 3                 139  2019-03-25 22:44:22.4230  2019-03-25 22:46:41.5290   
 4                 528  2019-03-25 22:44:24.5950  2019-03-25 22:53:13.3290   
 ...               ...                       ...                       ...   
 1327955           215  2019-03-25 22:43:55.1790  2019-03-25 22:47:30.3530   
 1327956           328  2019-03-25 22:43:53.9690  2019-03-25 22:49:22.1480   
 1327957           280  2019-03-25 22:43:56.1810  2019-03-25 22:48:36.1940   
 1327958          1173  2019-03-25 22:44:08.6790  2019-03-25 23:03:41.8370   
 1327959           231  2019-03-25 22:44:09.5690  2019-03-25 22:48:00.8780   
 
          start station id               start stati

In [8]:
#delete all old csv files
for file in os.listdir(directory):
    if file.endswith('.csv'):
        os.remove(os.path.join(directory, file))
        print(f'Removed: {file}')

# Save the merged dataframes
for key, df in tqdm(dfs.items()):
    #the name of the csv must be the key + "-citibike-tripdata.csv"
    df.to_csv(os.path.join(directory, f'{key}-citibike-tripdata.csv'), index=False)
    print(f'Saved: {key}-citibike-tripdata.csv')

Removed: 201903-citibike-tripdata_2.csv
Removed: 201801-citibike-tripdata.csv
Removed: 201805-citibike-tripdata_1.csv
Removed: 201810-citibike-tripdata_1.csv
Removed: 201905-citibike-tripdata_1.csv
Removed: 201910-citibike-tripdata_1.csv
Removed: 201806-citibike-tripdata_2.csv
Removed: 201812-citibike-tripdata.csv
Removed: 201909-citibike-tripdata_1.csv
Removed: 201807-citibike-tripdata_2.csv
Removed: 201812-citibike-tripdata_1.csv
Removed: 201905-citibike-tripdata_2.csv
Removed: 201904-citibike-tripdata_2.csv
Removed: 201909-citibike-tripdata_3.csv
Removed: 201809-citibike-tripdata_2.csv
Removed: 201810-citibike-tripdata.csv
Removed: 201906-citibike-tripdata_2.csv
Removed: 201906-citibike-tripdata_3.csv
Removed: 201807-citibike-tripdata.csv
Removed: 201907-citibike-tripdata_1.csv
Removed: 201902-citibike-tripdata_1.csv
Removed: 201803-citibike-tripdata.csv
Removed: 201812-citibike-tripdata_2.csv
Removed: 201903-citibike-tripdata_1.csv
Removed: 201808-citibike-tripdata.csv
Removed: 201

In [7]:
dfs.keys()

dict_keys(['201903', '201801', '201805', '201810', '201905', '201910', '201806', '201812', '201909', '201807', '201904', '201809', '201906', '201907', '201902', '201803', '201808', '201811', '201912', '201911', '201804', '201908', '201802', '201901'])

In [9]:
#open the first file to create the dataframe
df = pd.read_csv(directory + os.listdir(directory)[0])
df

Unnamed: 0,tripduration,starttime,stoptime,start station id,start station name,start station latitude,start station longitude,end station id,end station name,end station latitude,end station longitude,bikeid,usertype,birth year,gender
0,95,2019-11-19 19:13:18.4770,2019-11-19 19:14:54.1580,350,Clinton St & Grand St,40.715595,-73.987030,307,Canal St & Rutgers St,40.714275,-73.989900,38120,Subscriber,1992,1
1,468,2019-11-19 19:13:19.4350,2019-11-19 19:21:07.8600,458,11 Ave & W 27 St,40.751396,-74.005226,3255,8 Ave & W 31 St,40.750585,-73.994685,15219,Subscriber,1997,2
2,692,2019-11-19 19:13:22.3230,2019-11-19 19:24:54.4800,164,E 47 St & 2 Ave,40.753231,-73.970325,526,E 33 St & 5 Ave,40.747659,-73.984907,41109,Subscriber,1984,2
3,403,2019-11-19 19:13:24.1570,2019-11-19 19:20:07.3500,2000,Front St & Washington St,40.702551,-73.989402,406,Hicks St & Montague St,40.695128,-73.995951,26811,Subscriber,1964,1
4,454,2019-11-19 19:13:24.1640,2019-11-19 19:20:59.0980,150,E 2 St & Avenue C,40.720874,-73.980858,438,St Marks Pl & 1 Ave,40.727791,-73.985649,38691,Subscriber,1954,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1478703,386,2019-11-19 19:13:10.4740,2019-11-19 19:19:36.9900,285,Broadway & E 14 St,40.734546,-73.990741,250,Lafayette St & Jersey St,40.724561,-73.995653,33815,Subscriber,1995,2
1478704,695,2019-11-19 19:13:10.5130,2019-11-19 19:24:46.4400,367,E 53 St & Lexington Ave,40.758281,-73.970694,2021,W 45 St & 8 Ave,40.759291,-73.988597,41144,Subscriber,1978,1
1478705,454,2019-11-19 19:13:10.6310,2019-11-19 19:20:45.2930,284,Greenwich Ave & 8 Ave,40.739017,-74.002638,3788,E 12 St & 4 Av,40.732690,-73.989740,28549,Subscriber,1994,2
1478706,298,2019-11-19 19:13:11.0750,2019-11-19 19:18:09.3520,195,Liberty St & Broadway,40.709056,-74.010434,327,Vesey Pl & River Terrace,40.715338,-74.016584,20870,Subscriber,1979,1
