In [None]:
import pandas as pd
from dateutil import parser

# Load the raw CSV and, in one pipeline: fill missing name/email/birthyear cells
# with a default of the matching type, drop the duplicate formatted-time columns,
# and rename the mis-named / mis-cased columns
df = (
    pd.read_csv('data/source/collated_bike_data.csv')
    .assign(
        name=lambda frame: frame['name'].fillna('Unknown'),
        email=lambda frame: frame['email'].fillna('Unknown'),
        birthyear=lambda frame: frame['birthyear'].fillna(0),
    )
    .drop(columns=['start_time_formatted', 'end_time_formatted'])
    .rename(
        columns={
            'to_station_id.1': 'to_station_name',
            'birthyear': 'birth_year',
            'usertype': 'user_type',
            'tripduration': 'trip_duration',
        }
    )
)

# Record the starting row count for the later sanity check, then preview the first 50 records
original_len = len(df)
df.head(50)

In [None]:
# Defaults for missing station cells: 0 for the id columns, 'Unknown' for the name columns
station_defaults = {
    'from_station_id': 0,
    'from_station_name': 'Unknown',
    'to_station_id': 0,
    'to_station_name': 'Unknown',
}
station_columns = list(station_defaults)
df[station_columns] = df[station_columns].fillna(station_defaults)

# With the missing ids filled in, cast both id columns to int
station_id_columns = ['from_station_id', 'to_station_id']
df[station_id_columns] = df[station_id_columns].astype(int)

# Check data
df.head(50)

In [None]:
# Parse both timestamp columns into datetime values; format='mixed' lets each
# element's format be inferred individually
for time_column in ('start_time', 'end_time'):
    df[time_column] = pd.to_datetime(df[time_column], format='mixed')

# Check data
df.head(15)

In [None]:
# Trip duration in minutes: elapsed time divided by a one-minute Timedelta,
# then cast to int (the cast discards any fractional minute)
df['trip_duration'] = (
    (df['end_time'] - df['start_time'])
    .div(pd.Timedelta(minutes=1))
    .astype(int)
)
df.head(15)

In [None]:
# birth_year cells that are strings are parsed as dates and reduced to their year;
# every other value is kept exactly as it is
def _birth_year_from(value):
    """Return the year parsed from a string value, or the value itself when it is not a string."""
    if not isinstance(value, str):
        return value
    # fuzzy=True lets dateutil skip tokens it cannot interpret as part of a date
    return int(parser.parse(value, fuzzy=True).year)

df['birth_year'] = df['birth_year'].apply(_birth_year_from)
df.head(25)

In [None]:
# Helper func to set every member/casual as a Subscriber/Customer - will also capitalise Subscriber/Customers with wrong casing

def clean_user_type(user):
    """Normalise a raw user type label to 'Subscriber' / 'Customer' casing.

    Args:
        user: Raw cell value from the user type column.

    Returns:
        'Unknown' when ``user`` is not a string (e.g. the float NaN of a
        missing cell); otherwise the trimmed label with 'member' replaced by
        'Subscriber', 'casual' replaced by 'Customer', and only its first
        letter upper-cased.
    """
    # Missing cells arrive as float NaN, which has no .lower(); fall back to the
    # same 'Unknown' placeholder used for the other text columns.
    if not isinstance(user, str):
        return 'Unknown'
    # Trim before normalising: capitalize() only upper-cases the first
    # character, so a leading space would leave the label fully lower-case.
    return (
        user.strip()
        .lower()
        .replace('member', 'Subscriber')
        .replace('casual', 'Customer')
        .capitalize()
    )
# Run every user_type value through the helper, store the result, then preview it
cleaned_user_types = df['user_type'].apply(clean_user_type)
df['user_type'] = cleaned_user_types

df.head(50)

In [None]:
# Compare the current row count with the one recorded after loading, report it,
# and list the dtype of every column

number_removed = original_len - df.shape[0]
print(f"{number_removed} lines have been removed")
print(df.dtypes)

# Write the cleaned frame to CSV without the index column
df.to_csv('data/target/cleaned_bike_data.csv', index=False)