In [1]:
import numpy as np
import pandas as pd
import datetime
from sklearn.preprocessing import minmax_scale

In [2]:
roads = ["A. Maceda", "Anda Circle", "Antipolo", "Bluementritt", "Buendia", "Edsa Extension", "Finance Road", "Gov. Forbes - Lacson", "Lerma", "Magsaysay Ave", "P.Noval", "Pablo Ocampo", "Pedro Gil", "Quezon Ave.", "Quirino", "Rajah Sulayman", "Taft Ave.", "U.N. Avenue", "Vicente Cruz"]

In [3]:
def normalize(data):
    data = minmax_scale(data)
    return data

In [4]:
def getWorkingDayPeakHour(dataset):
    # Duplicate dataset
    result_df = dataset.copy()

    # Converting the index as date
    result_df.index = pd.to_datetime(result_df.index)

    # Create column work_day
    result_df['work_day'] = ((result_df.index.dayofweek) < 5).astype(int)

    # Consider non-working holiday
    result_df.loc['2015-01-01', 'work_day'] = 0
    result_df.loc['2015-01-02', 'work_day'] = 0
    result_df.loc['2015-01-15', 'work_day'] = 0
    result_df.loc['2015-01-16', 'work_day'] = 0
    result_df.loc['2015-01-19', 'work_day'] = 0
    result_df.loc['2015-02-19', 'work_day'] = 0
    result_df.loc['2015-02-25', 'work_day'] = 0
    result_df.loc['2015-03-16', 'work_day'] = 0
    result_df.loc['2015-04-02', 'work_day'] = 0
    result_df.loc['2015-04-03', 'work_day'] = 0
    result_df.loc['2015-04-04', 'work_day'] = 0
    result_df.loc['2015-04-09', 'work_day'] = 0
    result_df.loc['2015-05-01', 'work_day'] = 0
    result_df.loc['2015-06-11', 'work_day'] = 0
    result_df.loc['2015-06-12', 'work_day'] = 0
    result_df.loc['2015-06-24', 'work_day'] = 0
    result_df.loc['2015-07-17', 'work_day'] = 0
    result_df.loc['2015-08-21', 'work_day'] = 0
    result_df.loc['2015-08-31', 'work_day'] = 0
    result_df.loc['2015-09-25', 'work_day'] = 0
    result_df.loc['2015-11-01', 'work_day'] = 0
    result_df.loc['2015-11-17', 'work_day'] = 0
    result_df.loc['2015-11-18', 'work_day'] = 0
    result_df.loc['2015-11-19', 'work_day'] = 0
    result_df.loc['2015-11-20', 'work_day'] = 0
    result_df.loc['2015-11-30', 'work_day'] = 0
    result_df.loc['2015-12-24', 'work_day'] = 0
    result_df.loc['2015-12-25', 'work_day'] = 0
    result_df.loc['2015-12-30', 'work_day'] = 0
    result_df.loc['2015-12-31', 'work_day'] = 0

    # # Consider class suspension
    result_df.loc['2015-01-09', 'work_day'] = 0
    result_df.loc['2015-01-14', 'work_day'] = 0
    result_df.loc['2015-03-17', 'work_day'] = 0
    result_df.loc['2015-07-06', 'work_day'] = 0
    result_df.loc['2015-07-07', 'work_day'] = 0
    result_df.loc['2015-07-08', 'work_day'] = 0
    result_df.loc['2015-07-09', 'work_day'] = 0
    result_df.loc['2015-07-10', 'work_day'] = 0
    result_df.loc['2015-07-27', 'work_day'] = 0
    result_df.loc['2015-07-28', 'work_day'] = 0
    result_df.loc['2015-07-29', 'work_day'] = 0
    result_df.loc['2015-10-02', 'work_day'] = 0
    result_df.loc['2015-10-13', 'work_day'] = 0
    result_df.loc['2015-10-19', 'work_day'] = 0
    result_df.loc['2015-10-20', 'work_day'] = 0
    result_df.loc['2015-10-30', 'work_day'] = 0
    result_df.loc['2015-10-31', 'work_day'] = 0
    result_df.loc['2015-11-02', 'work_day'] = 0
    result_df.loc['2015-11-03', 'work_day'] = 0
    result_df.loc['2015-12-04', 'work_day'] = 0
    result_df.loc['2015-12-14', 'work_day'] = 0
    result_df.loc['2015-12-15', 'work_day'] = 0
    result_df.loc['2015-12-16', 'work_day'] = 0

    result_df['peak_hour'] = 0

    # Set morning peak hour

    start = datetime.time(7,0,0)
    end = datetime.time(10,0,0)

    result_df.loc[result_df.between_time(start, end).index, 'peak_hour'] = 1

    # Set afternoon peak hour

    start = datetime.time(16,0,0)
    end = datetime.time(19,0,0)

    result_df.loc[result_df.between_time(start, end).index, 'peak_hour'] = 1

    # Parameters
    IS_WORKDAY = 1
    IS_PEAKHOUR = 1

    # Filter based on parameters
    work_day_peak_hour_df = result_df[(result_df['work_day'] == IS_WORKDAY) & (result_df['peak_hour'] == IS_PEAKHOUR)]

    # Remove work_day and peak_hour columns
    work_day_peak_hour_df.drop(['work_day', 'peak_hour'], axis=1, inplace=True)
    
    return work_day_peak_hour_df

# Get Weather Dataset

In [5]:
weather_dataset = pd.read_csv('merged_weather_manila_2015.csv', skipinitialspace=True, encoding='cp1252')
weather_dataset = weather_dataset.drop(['datetime'], axis=1)

# Merge and normalize traffic and weather dataset

In [6]:
for road in roads:
    # obtain raw data
    dataset = pd.read_csv('raw/raw_mmda_wwo_2015/raw_mmda_wwo_' + road + '_2015.csv', index_col=['dt'], skipinitialspace=True, encoding='cp1252')
    
    # ensure is datetime
    dataset.index = pd.to_datetime(dataset.index) 
    
    # get working day and peak hour
    dataset = getWorkingDayPeakHour(dataset)
    
    # save RAW to csv
    dataset.to_csv('raw/raw_mmda_wwo_2015_workingDayPeakHour/raw_mmda_wwo_' + road + '_2015_workingDayPeakHour.csv')
    
    # normalize
    columns = dataset.columns
    for column in columns:
        dataset[column] = normalize(dataset[column])
    
    # save NORMALIZED to csv
    dataset.to_csv('normalized/normalized_mmda_wwo_2015_workingDayPeakHour/normalized_mmda_wwo_' + road + '_2015_workingDayPeakHour.csv')

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
