In [None]:
import os
import numpy as np
import pandas as pd

In [None]:
def data_for_tableau(file_list=None):
    """Merge monthly trip files and export a full-year .csv file for Tableau.

    Each input file is de-duplicated, stripped of rows without start/end
    timestamps, trimmed of non-positive and unusually long trips, and
    enriched with ``part_of_day`` and ``season`` columns derived from
    ``started_at``. The cleaned months are concatenated and written to
    ``tripdata_tableau.csv`` in the current working directory.

    Args:
        file_list: Optional iterable of CSV paths to merge, processed in the
            given order. When omitted (or ``None``), every file in the
            current working directory whose name starts with ``'2022'`` is
            used, in sorted name order.

    Returns:
        The string ``'File exported successfully'``.

    Raises:
        ValueError: If there are no input files to merge.
    """
    if file_list is None:
        # move all the files to the current directory before running this code
        # os.listdir() returns names in arbitrary order; sort them so the
        # merged output is deterministic.
        file_list = sorted(
            name for name in os.listdir(os.getcwd())
            if name.startswith('2022') and os.path.isfile(name)
        )
    else:
        file_list = list(file_list)

    if not file_list:
        raise ValueError(
            "No input files to merge: expected files starting with '2022' "
            "in the current directory"
        )

    # Lookup tables indexed by hour (0-23) and by month - 1 (0-11).
    part_of_day_by_hour = np.array(
        ['night'] * 6 + ['morning'] * 6 + ['afternoon'] * 6 + ['evening'] * 6,
        dtype=object,
    )
    season_by_month = np.array(
        ['winter'] * 2 + ['spring'] * 3 + ['summer'] * 3 + ['fall'] * 3 + ['winter'],
        dtype=object,
    )

    # trimming function (0.95 quantile)
    def trimmed_095(trip_data):
        """Keep trips with a positive length not above their group's 0.95 quantile.

        The quantile is computed per (member_casual, rideable_type) group.
        """
        length = trip_data['trip_length_(min)']
        outlier_value = (
            trip_data
            .groupby(['member_casual', 'rideable_type'])['trip_length_(min)']
            .transform(lambda x: x.quantile(0.95))
        )
        return trip_data.loc[(length > 0) & (length <= outlier_value)].copy(deep=True)

    df_list = []
    for file in file_list:
        # prepare data
        df = pd.read_csv(file, parse_dates=['started_at', 'ended_at'])
        df = (
            df.drop_duplicates()
            .dropna(subset=['started_at', 'ended_at'])
            .sort_values(by='started_at', ascending=True)
        )

        # remove outliers, invalid values and round trip time
        # total_seconds() gives float seconds regardless of pandas version;
        # astype('timedelta64[s]') stopped returning floats in pandas 2.0.
        df['trip_length_(min)'] = (df['ended_at'] - df['started_at']).dt.total_seconds() / 60
        df = trimmed_095(df)
        df['trip_length_(min)'] = df['trip_length_(min)'].round(2)

        # create columns for times of day and seasons
        hour = df['started_at'].dt.hour.to_numpy()
        month = df['started_at'].dt.month.to_numpy()
        df['part_of_day'] = part_of_day_by_hour[hour]
        df['season'] = season_by_month[month - 1]

        # rename some columns and drop unnecessary ones
        df = df.rename(columns={'rideable_type': 'bike_type', 'member_casual': 'customer_type'})
        df = df.reindex(columns=['bike_type', 'customer_type', 'started_at', 'part_of_day', 'season', 'trip_length_(min)'])

        df_list.append(df)

    # merge all dataframes and export to .csv
    trip_clean = pd.concat(df_list, ignore_index=True)
    trip_clean.to_csv('tripdata_tableau.csv', index=False)

    return 'File exported successfully'

In [None]:
# The function locates the 2022 files in the working directory itself, so no
# explicit file list is supplied; None is passed to satisfy the parameter.
data_for_tableau(None)