In [None]:
from pathlib import Path

import pandas as pd

# Load the raw weather events and record how many rows the source file holds
df = pd.read_csv('data/source/divvy_weather_data.csv')
original_len = len(df)
print(f"\n Number of weather records in source: {original_len}")

# Columns that are discarded before the cleaned data is used
unused_columns = [
    'EventId', 'TimeZone', 'AirportCode', 'LocationLat', 'LocationLng',
    'City', 'County', 'State', 'ZipCode',
]

# One pass over the frame:
#   1. drop the unused columns
#   2. rename the two timestamp columns to snake_case
#   3. parse both timestamps to datetime (to_datetime raises on unparseable values,
#      so this doubles as a format check)
#   4. keep only events whose start is strictly after the cutoff, which the
#      original author identified as the first bike ride in the ride data set
df = (
    df
    .drop(columns=unused_columns)
    .rename(columns={'StartTime(UTC)': 'start_time', 'EndTime(UTC)': 'end_time'})
    .assign(
        start_time=lambda frame: pd.to_datetime(frame['start_time']),
        end_time=lambda frame: pd.to_datetime(frame['end_time']),
    )
    .loc[lambda frame: frame['start_time'] > '2021-12-31 19:13:32']
)

# Preview the first ten rows of the cleaned, filtered frame
df.head(10)

In [None]:
# Report how many rows remain in the frame and how many were dropped
# relative to the row count recorded when the source file was loaded
target_len = len(df)
number_removed = original_len - target_len

print(f"\n Number of weather records in target: {target_len}")
print(f"\n {number_removed} lines have been removed")

In [None]:
# Show the column dtypes as a final formatting check before export
print(df.dtypes)

# DataFrame.to_csv does not create missing parent directories, so make sure
# the target folder exists; exist_ok=True keeps re-runs of this cell harmless
output_path = Path('data/target/cleaned_weather_data.csv')
output_path.parent.mkdir(parents=True, exist_ok=True)

# index=False: the positional index carries no information worth persisting
df.to_csv(output_path, index=False)