In [1]:
import os
from tqdm import tqdm
import pandas         as pd
import zipfile
import requests

from bs4              import BeautifulSoup
from time             import sleep
from datetime         import datetime
from xml.etree        import ElementTree

# Dados das Estações

In [34]:
# Download the station catalogue from the Bay Wheels GBFS feed and load it
# into `df_station`, one row per station.
url = 'https://gbfs.lyft.com/gbfs/2.3/bay/en/station_information.json'

# A timeout keeps the cell from hanging forever if the feed stops responding.
response = requests.get( url, timeout=30 )

if response.status_code == 200:
    data = response.json()
    stations = data['data']['stations']

    # These columns always come first, in this order; any other field present
    # in the feed is appended after them in order of first appearance.
    base_columns = ['station_id', 'capacity', 'name', 'region_id', 'short_name', 'lat', 'lon']

    # Leave out `rental_uris` without mutating the payload. Stations that do
    # not carry the key are accepted as they are instead of raising KeyError.
    records = [
        { key: value for key, value in station.items() if key != 'rental_uris' }
        for station in stations
    ]

    # Build the frame in one call instead of concatenating one row at a time,
    # which copies the whole frame on every iteration.
    df_station = pd.DataFrame( records, dtype='object' )
    extra_columns = [ column for column in df_station.columns if column not in base_columns ]
    df_station = df_station.reindex( columns=base_columns + extra_columns )
else:
    print( 'Não foi possível encontrar as informações, verifique o que pode estar ocorrendo' )

In [40]:
# Download the region catalogue from the Bay Wheels GBFS feed and load it
# into `df_regions`, one row per distinct region_id.
url = 'https://gbfs.lyft.com/gbfs/2.3/bay/en/system_regions.json'

# A timeout keeps the cell from hanging forever if the feed stops responding.
response = requests.get( url, timeout=30 )

if response.status_code == 200:
    data = response.json()

    regions = data['data']['regions']

    # `region_id` and `name` always come first; any other field present in the
    # feed is appended after them in order of first appearance.
    base_columns = ['region_id', 'name']

    # Build the frame in one call and de-duplicate once, instead of
    # concatenating and de-duplicating on every loop iteration.
    df_regions = pd.DataFrame( regions, dtype='object' )
    extra_columns = [ column for column in df_regions.columns if column not in base_columns ]
    df_regions = (
        df_regions
        .reindex( columns=base_columns + extra_columns )
        .drop_duplicates( subset=['region_id'] )   # keeps the first occurrence
        .reset_index( drop=True )
    )
else:
    print( 'Não foi possível encontrar as informações, verifique o que pode estar ocorrendo' )

In [42]:
# Attach the region attributes to each station. With an inner join, only
# stations whose region_id also appears in df_regions are kept.
df = pd.merge( df_station, df_regions, on='region_id', how='inner' )

In [44]:
# Both sides of the merge had a `name` column, so pandas suffixed them:
# `name_x` comes from the station table and `name_y` from the region table.
# Rename by label instead of overwriting every header by position, so an
# added or reordered field in the feed cannot end up under the wrong name.
df = df.rename( columns={'name_x': 'station_name', 'name_y': 'region'} )

In [45]:
# `data` is whichever feed payload was loaded last. Its `last_updated` value is
# passed to datetime.fromtimestamp (local time zone) and formatted as YYYYMMDD.
last_updated = datetime.fromtimestamp( data['last_updated'] )
date_time = last_updated.strftime( '%Y%m%d' )

In [46]:
# Preview the station table (rendered as the cell output).
df_station

Unnamed: 0,station_id,capacity,name,region_id,short_name,lat,lon,address
0,30429aed-9a47-4fd9-87fa-0db835aa8265,19,17th St at Santa Clara St,5,SJ-L13,37.343985,-121.874385,
1,46b4ef45-b06b-40eb-9fdf-9bc8ff104a4f,15,Bestor Art Park,5,SJ-Q11,37.323678,-121.874119,
2,c53990d7-f965-40f4-b305-3435e1c95a71,19,23rd St at Santa Clara St,5,SJ-M14,37.34648,-121.86857,
3,ed707a89-a68d-4921-a4cb-16c268e45a5b,23,San Fernando St at 7th St,5,SJ-M11-2,37.337122,-121.883215,
4,bae9be55-04d4-4641-9781-3d1c4b6950f1,15,Saint James Park,5,SJ-L10,37.339301,-121.889937,
...,...,...,...,...,...,...,...,...
544,1880193329986742718,4,Taraval St: 41st St to 40th St,,,37.742093,-122.498529,"3008 Taraval St, San Francisco, CA 94116, Unit..."
545,1880193329986742728,2,Quintara at 21st,,,37.748508,-122.478419,"2101 21st Ave, San Francisco, CA 94116, United..."
546,1841972814951361202,8,Stowe Lake,,,37.770938,-122.477116,"50 Stow Lake Dr, San Francisco, CA 94118, Unit..."
547,1841972814951361200,5,Bowling Green,,,37.76873,-122.459429,"Bowling Green Dr, San Francisco, CA 94118, Uni..."


In [48]:
# Persist the merged station/region table; the row index is not written.
station_csv_path = '../data/station_info.csv'
df.to_csv( station_csv_path, index=False )

# Dados Meteorológicos

In [49]:
# City -> (latitude, longitude) used to query the Open-Meteo archive API.
local_coordenate = {
    'San Francisco': ('37.7749', '-122.4194'),
    'San Jose': ('37.3394', '-121.895'),
    'Oakland': ('37.8044', '-122.2708'),
    'Berkeley': ('37.8716', '-122.2728'),
    'Emeryville': ('37.8313', '-122.2853')
}

# One merged hourly+daily frame per city. They are concatenated once after the
# loop instead of re-copying the accumulated frame on every iteration.
weather_frames = []

for city, coordenate in local_coordenate.items():
    lat, lon = coordenate
    url = f'https://archive-api.open-meteo.com/v1/archive?latitude={lat}&longitude={lon}&start_date=2022-01-01&end_date=2024-01-07&hourly=temperature_2m,relative_humidity_2m,precipitation,snowfall,wind_speed_10m,soil_temperature_7_to_28cm,is_day,sunshine_duration&daily=sunrise,sunset,shortwave_radiation_sum&timezone=America%2FLos_Angeles'

    # A timeout keeps the cell from hanging forever on an unresponsive server.
    response = requests.get( url, timeout=60 )

    if response.status_code != 200:
        # Report the city that is skipped so an incomplete CSV does not go unnoticed.
        print( f'Não foi possível obter os dados meteorológicos de {city} (status {response.status_code})' )
        continue

    data = response.json()

    # Hourly block: split the timestamp into a midnight-normalised `date`
    # (the join key) and a time-of-day `time`.
    df_hourly = pd.DataFrame( data['hourly'] )
    df_hourly['time'] = pd.to_datetime( df_hourly['time'] )
    df_hourly['date'] = df_hourly['time'].dt.normalize()
    df_hourly['time'] = df_hourly['time'].dt.time

    # Daily block: its `time` field holds the day, so it becomes the `date` key.
    df_daily = pd.DataFrame( data['daily'] ).rename( columns={'time': 'date'} )
    df_daily['date'] = pd.to_datetime( df_daily['date'], format='%Y-%m-%d' )

    # Repeat the daily values on every hourly row of the same day.
    df_raw = pd.merge( df_hourly, df_daily, on='date', how='left' )
    df_raw['city'] = city

    weather_frames.append( df_raw )

# pd.concat rejects an empty list; fall back to an empty frame when every
# request failed, so an (empty) CSV is still written.
df = pd.concat( weather_frames ) if weather_frames else pd.DataFrame()

df.to_csv( '../data/weather_data.csv', index=False )

# Dados das Viagens

In [4]:
# Download the monthly Bay Wheels trip archives and consolidate them into a
# single CSV. Only archives whose YYYYMM key prefix falls inside
# [start_date, end_date] are used.
start_date = datetime.strptime('20220101', "%Y%m%d").date()
end_date = datetime.now().date()

url = 'https://s3.amazonaws.com/baywheels-data'
output_csv = '../data/trip_data.csv'

# Header of the consolidated file; every appended chunk is aligned to it.
trip_columns = [
    'ride_id', 'rideable_type', 'started_at', 'ended_at',
    'start_station_name', 'start_station_id', 'end_station_name', 'end_station_id',
    'start_lat', 'start_lng', 'end_lat', 'end_lng', 'member_casual',
]

# The bucket root answers with an XML listing; each <Key> is one object name.
response = requests.get( url, timeout=60 )

if response.status_code == 200:
    soup = BeautifulSoup( response.content, 'xml' )
    files = soup.find_all('Key')

    # Start from a fresh file holding only the header. Every run downloads all
    # months in range again, so appending to a file left by a previous run
    # would duplicate every trip.
    with open( output_csv, 'w' ) as f:
        f.write( ','.join( trip_columns ) + '\n' )
else:
    files = []
    print( 'Não foi possível encontrar as informações, verifique o que pode estar ocorrendo' )

for file in tqdm(files):
    zip_file = file.text

    # Keys that do not start with a YYYYMM prefix are not monthly archives.
    # Only this parse is guarded, so errors raised while downloading or
    # reading the data are no longer silently swallowed.
    try:
        ref = datetime.strptime( zip_file.split('-')[0], "%Y%m" ).date()
    except ValueError:
        continue

    if not ( start_date <= ref <= end_date ):
        continue

    download_file = f'{url}/{zip_file}'
    zip_filepath = f'../data/{zip_file}'

    # Assumes the archive holds one CSV named like the key minus its 4-char
    # extension ('.zip') — TODO confirm for every month in range.
    csv_file = zip_file[:-4]
    csv_filepath = f'../data/{csv_file}'

    try:
        # Stream the archive to disk with requests instead of shelling out to
        # wget: no remote key name is interpolated into a shell command, and
        # an HTTP error stops the cell instead of being ignored.
        with requests.get( download_file, stream=True, timeout=60 ) as download:
            download.raise_for_status()
            with open( zip_filepath, 'wb' ) as zip_handle:
                for block in download.iter_content( chunk_size=1024 * 1024 ):
                    zip_handle.write( block )

        with zipfile.ZipFile( zip_filepath, 'r' ) as archive:
            archive.extract( csv_file, '../data' )

        # Append in chunks so a whole month is never held in memory at once.
        # reindex pins the column order to the header written above.
        for chunk in pd.read_csv( csv_filepath, chunksize=50000 ):
            chunk.reindex( columns=trip_columns ).to_csv( output_csv, mode='a', header=False, index=False )
    finally:
        # Remove the temporary files even when a step above failed.
        for temp_path in ( zip_filepath, csv_filepath ):
            if os.path.exists( temp_path ):
                os.remove( temp_path )

100%|███████████████████████████████████████████| 80/80 [05:39<00:00,  4.24s/it]


In [2]:
# Convert the consolidated trip CSV into a typed parquet file.
output_csv = '../data/trip_data.csv'

df = pd.read_csv( output_csv )

# Columns stored as pandas categoricals.
category_columns = [
    'member_casual',
    'start_station_name',
    'end_station_name',
    'start_station_id',
    'end_station_id',
    'rideable_type',
]
for column in category_columns:
    df[column] = df[column].astype('category')

# Timestamp columns; format='mixed' lets pandas infer the format per value.
for column in ( 'started_at', 'ended_at' ):
    df[column] = pd.to_datetime( df[column], format='mixed' )

df.to_parquet( '../data/tripdata.parquet', engine='pyarrow' )

# Drop the reference to the full frame now that it has been written.
del df

## Verificando as Estações

In [3]:
# Load only the first 100,000 rows of the consolidated trip CSV for a quick
# inspection. pandas has no `pd.read`; the CSV reader is `pd.read_csv`.
df = pd.read_csv( output_csv, nrows=100000 )
df.head()

Unnamed: 0,ride_id,rideable_type,started_at,ended_at,start_station_name,start_station_id,end_station_name,end_station_id,start_lat,start_lng,end_lat,end_lng,member_casual
0,0DD008BC62836D27,classic_bike,2022-01-13 19:12:23,2022-01-13 19:22:13,Washington St at Van Ness Ave,SF-E22,Natoma St at New Montgomery St,SF-G28-2,37.79298,-122.423302,37.786456,-122.399749,member
1,BE9F5C77F1BE0FBF,classic_bike,2022-01-15 15:58:55,2022-01-15 16:10:03,Washington St at Van Ness Ave,SF-E22,Natoma St at New Montgomery St,SF-G28-2,37.79298,-122.423302,37.786456,-122.399749,member
2,F0826402062D5A44,classic_bike,2022-01-07 20:34:47,2022-01-07 20:55:25,17th St at Dolores St,SF-N21,Scott St at Golden Gate Ave,SF-I19,37.763015,-122.426497,37.778999,-122.436861,member
3,EE9A8BB189061CFD,classic_bike,2022-01-31 16:38:25,2022-01-31 17:13:05,El Embarcadero at Grand Ave,OK-I9,Ninth St at Heinz Ave,BK-H3,37.808715,-122.249251,37.853907,-122.289698,member
4,772A8B545A8525C0,classic_bike,2022-01-28 17:28:22,2022-01-28 17:39:55,17th St at Dolores St,SF-N21,Octavia Blvd at Page St,SF-J22-1,37.763015,-122.426497,37.774018,-122.423809,casual


In [4]:
# Number of missing values in each column of the sample.
missing_per_column = df.isna().sum()
missing_per_column

ride_id                   0
rideable_type             0
started_at                0
ended_at                  0
start_station_name    12557
start_station_id      12557
end_station_name      11103
end_station_id        11103
start_lat                 0
start_lng                 0
end_lat                  56
end_lng                  56
member_casual             0
dtype: int64

In [8]:
# Rows of the sample whose start station name is missing.
missing_start_station = df['start_station_name'].isna()
df.loc[missing_start_station]

Unnamed: 0,ride_id,rideable_type,started_at,ended_at,start_station_name,start_station_id,end_station_name,end_station_id,start_lat,start_lng,end_lat,end_lng,member_casual
1091,95B9A7847720F560,electric_bike,2022-01-18 20:14:43,2022-01-18 20:19:39,,,18th St at Noe St,SF-O19,37.76,-122.42,37.761047,-122.432642,member
1092,2A653EF4CDCE1342,electric_bike,2022-01-10 08:54:10,2022-01-10 09:13:52,,,Terry Francois Blvd at Mission Bay Blvd N,SF-L31-1,37.81,-122.42,37.771767,-122.386689,member
1093,0665F2FA09B5C666,electric_bike,2022-01-08 20:33:19,2022-01-08 20:36:18,,,Terry Francois Blvd at Mission Bay Blvd N,SF-L31-1,37.77,-122.39,37.771767,-122.386689,casual
1547,6CAE891EE8D630A5,electric_bike,2022-01-30 10:59:49,2022-01-30 11:28:04,,,48th Ave at Cabrillo St,SF-J1,37.80,-122.43,37.772954,-122.509071,casual
1548,39568245D8151714,electric_bike,2022-01-20 17:49:45,2022-01-20 17:55:13,,,Laguna St at Hayes St,SF-J21,37.78,-122.42,37.776247,-122.426203,casual
...,...,...,...,...,...,...,...,...,...,...,...,...,...
99995,D1DE1F02C8764742,electric_bike,2022-01-03 11:11:53,2022-01-03 11:15:42,,,,,37.75,-122.43,37.760000,-122.430000,member
99996,E8C064DB8DAAB4CA,electric_bike,2022-01-03 12:24:21,2022-01-03 12:31:43,,,,,37.76,-122.43,37.750000,-122.440000,member
99997,B57B57704EF7FD33,electric_bike,2022-01-11 18:19:10,2022-01-11 18:27:01,,,,,37.76,-122.43,37.750000,-122.440000,member
99998,21DECB42D48AAD89,electric_bike,2022-01-11 17:08:20,2022-01-11 17:13:26,,,,,37.75,-122.44,37.760000,-122.430000,member
