# Data processing

In [None]:
# Mount Google Drive at /content/drive/ so the project folder is reachable from the Colab runtime.
# force_remount=True is passed so the mount is redone even if one already exists.
from google.colab import drive
drive.mount('/content/drive/', force_remount=True)

In [None]:
cd '/content/drive/MyDrive/Class/Energy Technology and Management/Topic 05 - Project'

In [None]:
# List the contents of the current working directory (shell escape).
!ls

# Read data

In [None]:
import numpy as np
import pandas as pd
# Load the raw CSV; the relative path is resolved against the current working directory.
df_all = pd.read_csv('city_day.csv')

In [None]:
# Preview the first rows of the raw table.
df_all.head()

# Filter data

In [None]:
# Boolean mask selecting the rows whose City is exactly 'Delhi'.
filt = df_all['City'].eq('Delhi')
# Take an independent copy so later column edits never touch df_all.
df_delhi = df_all[filt].copy()
df_delhi.head()

# Choose particular columns

In [None]:
# Keep only the date and the AQI reading; every other column is dropped.
df_delhi = df_delhi.loc[:, ['Date', 'AQI']]
df_delhi.head()

# Rename columns to 'datetime' and 'y'

In [None]:
# Standardise the column names: the date becomes 'datetime', the AQI value becomes 'y'.
df_delhi = df_delhi.rename(
    columns={
        'Date': 'datetime',
        'AQI': 'y',
    }
)
df_delhi.head()

# Change date (string) column into DateTime index

In [None]:
# Set index
# Parse the date strings into pandas datetimes in place (df_delhi itself is modified) ...
df_delhi['datetime'] = pd.to_datetime(df_delhi['datetime'])
# ... then build a new frame indexed by that column; df_delhi keeps 'datetime' as a regular column.
df = df_delhi.set_index('datetime')
df.head()

# Resample data
- https://pandas.pydata.org/pandas-docs/stable/user_guide/timeseries.html#dateoffset-objects

In [None]:
# Resample 'y' onto a daily ('D') grid, averaging the readings in each day.
# Days without any non-missing reading come out as NaN.
# to_frame() turns the resulting Series back into a single-column DataFrame.
df_avg = df['y'].resample('D').mean().to_frame()
df_avg.head()

# Treating missing data

In [None]:
# Collect the rows in which at least one column is NaN.
df_missing = df_avg.loc[df_avg.isna().any(axis='columns')]
# (number of rows with missing data, number of columns)
df_missing.shape

In [None]:
import matplotlib.pyplot as plt

# Overlay a red vertical line at every timestamp that has no value.
x_missing = df_missing.index
fig, ax = plt.subplots(figsize=(10, 3))
df_avg.plot(ax=ax)
# Span the markers over the axis' actual y-range instead of a hard-coded 0-800,
# then pin the limits so that adding the lines does not rescale the axis.
y_low, y_high = ax.get_ylim()
ax.vlines(x_missing, y_low, y_high, color='r')
ax.set_ylim(y_low, y_high)
plt.show()

In [None]:
# Trailing rolling mean, used afterwards as the fill value for missing entries.
# min_periods=1 yields a value as soon as one non-NaN observation is in the window.
# NOTE(review): the series is resampled daily, so 24 is a 24-day window - confirm this is intended.
df_rolling = df_avg['y'].rolling(24, min_periods=1).mean()

# Plotting: both series are named 'y', so label them explicitly to tell the lines apart.
fig, ax = plt.subplots(figsize=(10, 3))
df_avg['y'].plot(ax=ax, label='daily mean')
df_rolling.plot(ax=ax, label='rolling mean')
ax.legend()
plt.show()

In [None]:
# Replace each NaN in 'y' with the rolling-mean value at the same timestamp;
# existing readings are left untouched.
df_avg['y'] = df_avg['y'].fillna(value=df_rolling)

In [None]:
# Check the result: the non-null count reported for 'y' should equal the number of rows.
# NOTE(review): a run of NaNs at least as long as the rolling window would still be NaN after the fill.
df_avg.info()

# Write to file

In [None]:
# Save the processed series to the current working directory; the datetime index is written as the first column.
df_avg.to_csv('data_processed.csv')

# Read from file

In [None]:
# Reload the processed file, using the 'datetime' column as the index and parsing it as dates.
df_read = pd.read_csv(
    'data_processed.csv',
    index_col='datetime',
    parse_dates=True,
)

In [None]:
# Preview the first rows of the reloaded data.
df_read.head()