In [14]:
import pandas as pd

# Load the raw weather events and remember how many rows the source file held,
# so a later cell can report how many rows were dropped.
df = pd.read_csv('data/source/divvy_weather_data.csv')
original_len = df.shape[0]
print(f"\n Number of weather records in source: {original_len}")

# Columns this notebook does not use; they are discarded up front.
unused_columns = [
    'EventId', 'TimeZone', 'AirportCode',
    'LocationLat', 'LocationLng',
    'City', 'County', 'State', 'ZipCode',
]

# Source header -> snake_case name for the two timestamp columns.
time_column_names = {
    'StartTime(UTC)': 'start_time',
    'EndTime(UTC)': 'end_time',
}

df = df.drop(columns=unused_columns).rename(columns=time_column_names)

# Casting to datetime doubles as a format check: a malformed date string
# makes pd.to_datetime raise instead of passing through silently.
for time_column in time_column_names.values():
    df[time_column] = pd.to_datetime(df[time_column])

# Keep only weather events that start after the first bike ride in the ride
# data set.
# NOTE(review): the cutoff is compared as-is against a column the source labels
# as UTC -- confirm the ride timestamp it was taken from is in UTC as well.
mask = df['start_time'] > '2021-12-31 19:13:32'
df = df.loc[mask]

# Preview the first rows of the filtered frame
df.head(10)


 Number of weather records in source: 2894


Unnamed: 0,Type,Severity,start_time,end_time
1751,Snow,Light,2022-01-01 03:20:36,2022-01-01 06:20:36
1752,Snow,Light,2022-01-01 09:20:36,2022-01-02 02:54:36
1753,Snow,Light,2022-01-02 05:20:36,2022-01-02 20:20:36
1754,Rain,Light,2022-01-07 07:20:36,2022-01-07 11:20:36
1755,Rain,Moderate,2022-01-07 11:20:36,2022-01-07 12:20:36
1756,Rain,Light,2022-01-07 12:20:36,2022-01-07 13:20:36
1757,Rain,Light,2022-01-07 13:57:36,2022-01-07 15:20:36
1758,Rain,Light,2022-01-08 10:20:36,2022-01-08 11:20:36
1759,Snow,Light,2022-01-08 22:20:36,2022-01-09 01:20:36
1760,Snow,Light,2022-01-09 05:17:36,2022-01-09 12:20:36


In [13]:
# Compare the cleaned frame's row count with the source file's row count and
# report how many records the cleaning cell removed.
target_len = len(df)
number_removed = original_len - target_len

print(f"\n Number of weather records in target: {target_len}")
print(f"\n {number_removed} lines have been removed")


 Number of weather records in target: 1143

 1751 lines have been removed


In [9]:
# Print each column's dtype as a last visual check, then save the cleaned
# frame to CSV. index=False keeps the DataFrame index out of the file.
target_path = 'data/target/cleaned_weather_data.csv'

print(df.dtypes)
df.to_csv(target_path, index=False)

Type                  object
Severity              object
start_time    datetime64[ns]
end_time      datetime64[ns]
dtype: object
