In [1]:
import pandas as pd
from datetime import datetime
import os


def parse(x):
    """Convert a space-separated ``'year month day hour'`` string to a datetime.

    Parameters
    ----------
    x : str
        Text laid out as ``'%Y %m %d %H'``, for example ``'2010 1 2 3'``.

    Returns
    -------
    datetime.datetime
        The parsed timestamp.

    Raises
    ------
    ValueError
        Propagated from ``datetime.strptime`` when ``x`` does not match the
        expected layout.
    """
    layout = '%Y %m %d %H'
    moment = datetime.strptime(x, layout)
    return moment


def data_preprocess(csv):
    """Load one raw CSV and return a cleaned frame with a leading ``date`` column.

    The file must contain ``year``, ``month``, ``day``, ``hour``, ``No`` and
    ``season`` columns plus exactly nine further columns, which are renamed
    positionally to ``pm_25, dew, humi, pres, temp, wind_dir, wind_spd,
    precipitation, lprec``.

    Parameters
    ----------
    csv : str or path-like or file-like
        Anything accepted by ``pandas.read_csv``.

    Returns
    -------
    pandas.DataFrame
        Columns ``date`` followed by the nine renamed columns, with rows that
        contain a missing value in any of those nine columns removed and a
        fresh 0..n-1 index.

    Raises
    ------
    KeyError
        If one of the required columns is missing.
    ValueError
        If the number of remaining columns is not nine.
    """
    date_parts = ['year', 'month', 'day', 'hour']
    raw = pd.read_csv(csv)

    # Assemble the timestamp from its component columns in one vectorized
    # call.  This replaces ``read_csv(parse_dates=[[...]], date_parser=...)``:
    # both keywords are deprecated in recent pandas releases, and the custom
    # parser forced a slow per-row ``strptime`` fallback.
    timestamps = pd.to_datetime(raw[date_parts])

    dataset = raw.drop(columns=date_parts + ['No', 'season'])
    dataset.columns = ['pm_25', 'dew', 'humi', 'pres', 'temp',
                       'wind_dir', 'wind_spd', 'precipitation', 'lprec']

    # Keep the timestamp in the index while dropping NAs so that only the
    # measurement columns decide which rows are removed.
    dataset.index = pd.DatetimeIndex(timestamps, name='date')
    # remove rows containing NAs, then move ``date`` back to the first column
    return dataset.dropna().reset_index()


def main(data_dir='Data/', processed_dir='processedData/'):
    """Preprocess every CSV in ``data_dir`` and write per-file and combined outputs.

    For each CSV file the cleaned frame returned by ``data_preprocess`` is
    written to ``processed_dir`` as ``processed_<file name>``.  The ``date``
    and ``pm_25`` columns of every file, tagged with a ``city`` label, are
    stacked and written to ``processed_allcities.csv`` in the same directory.

    Parameters
    ----------
    data_dir : str, optional
        Directory holding the raw CSV files (default ``'Data/'``).
    processed_dir : str, optional
        Directory receiving the outputs (default ``'processedData/'``); it is
        created if it does not exist.

    Raises
    ------
    ValueError
        If ``data_dir`` contains no CSV files.
    """
    # os.listdir returns every entry in arbitrary order; keep only CSV files
    # (skipping things like .DS_Store or checkpoint folders) and sort them so
    # the combined output has a reproducible row order.
    csv_names = sorted(
        name for name in os.listdir(data_dir)
        if name.lower().endswith('.csv')
        and os.path.isfile(os.path.join(data_dir, name))
    )
    if not csv_names:
        raise ValueError('No CSV files found in {!r}'.format(data_dir))

    # to_csv does not create missing parent directories.
    os.makedirs(processed_dir, exist_ok=True)

    appended_data = []
    for csv in csv_names:
        dataset = data_preprocess(os.path.join(data_dir, csv))
        processed_path = os.path.join(processed_dir, 'processed_' + csv)
        dataset.to_csv(processed_path, index=False)

        # The city label is the file name minus its last six characters
        # (presumably a two-character suffix plus '.csv' -- TODO confirm
        # against the actual file naming).
        city_pm = dataset[['date', 'pm_25']].copy()
        city_pm.insert(0, 'city', csv[:-6])
        appended_data.append(city_pm)

    dataset_allcities = pd.concat(appended_data, axis=0, ignore_index=True)
    dataset_allcities.to_csv(
        os.path.join(processed_dir, 'processed_allcities.csv'), index=False)


In [2]:

# Run the full preprocessing pipeline when this cell/script is executed
# directly; nothing happens when the code is imported as a module.
if __name__ == '__main__':
    main()
