In [None]:
import numpy as np
import pandas as pd
from IPython.display import clear_output
import matplotlib.pyplot as plt
from matplotlib import rc
from matplotlib.animation import FuncAnimation

# Set matplotlib's default font size to 16 for the figures drawn below.
rc('font', size=16)

## Loading dataset

In [None]:
# Air-quality readings: read the crawled file and keep only the rows that
# actually carry a PM2.5 value.
data_df = (
    pd.read_csv('../raw_data/Air20Quality20Data/Beijing/CrawledData.txt')
    .dropna(subset=['PM25_AQI_value'])
)

# Station metadata, keyed by station_id; the source file misspells the
# longitude column, so it is renamed here.
station_df = (
    pd.read_csv('../raw_data/Air20Quality20Data/Beijing/Station.txt')
    .set_index('station_id')
    .rename(columns={'longtitude': 'longitude'})
)

print(data_df.shape)
data_df.head(2)

In [None]:
# Preview the first two rows of the station metadata table.
station_df.head(2)

## Data sanity check

In [None]:
pm25_values = data_df['PM25_AQI_value']

# Eyeball the raw series for obviously bad readings.
plt.plot(pm25_values);
# PM2.5 is a concentration-derived index, so it can never be negative.
assert (pm25_values >= 0).all()

## Sort by time-stamp

In [None]:
# Parse the time column into real datetimes (stored back on data_df).
data_df['time'] = pd.to_datetime(data_df['time'])

# Move 'time' into the index (set_index drops the column by default) and
# order the readings chronologically.
time_indexed_df = data_df.set_index('time')
data_df_sorted = time_indexed_df.sort_index()
data_df_sorted.head(2)

In [None]:
# Preview the last two rows of the time-sorted readings.
data_df_sorted.tail(2)

## Combine station data with AQ+Met data

In [None]:
# Attach station metadata to every reading by looking up its station_id in
# station_df (which is indexed by station_id).

# Fail loudly instead of silently producing NaN / ambiguous lookups.
assert station_df.index.is_unique, 'station_df has duplicate station_id values'
assert data_df_sorted['station_id'].isin(station_df.index).all(), \
    'some readings reference a station_id that is missing from station_df'

for new_col in ['station_name', 'longitude', 'latitude']:
    # Series.map performs one vectorised index lookup per column instead of
    # a Python-level station_df.loc[...] call for every single row.
    data_df_sorted[new_col] = data_df_sorted['station_id'].map(station_df[new_col])

data_df_sorted.head(2)

## Plotting station locations

In [None]:
# One point per station, positioned by its coordinates.
print(station_df.shape)
plt.scatter(station_df['longitude'], station_df['latitude'])
plt.xlabel('Longitude')
plt.ylabel('Latitude');

## Checking info available per time-stamp

In [None]:
# Number of rows (one per reporting station) available at each time-stamp.
# groupby(...).size() counts rows per index value directly. The previous
# per-time-stamp `len(data_df_sorted.loc[idx])` returned the number of
# *columns* whenever a time-stamp had exactly one row (because .loc then
# yields a Series), and it left 'cnt' as an object-dtype column.
count_df = data_df_sorted.groupby(level=0).size().to_frame('cnt')

plt.figure(figsize=(15,4))
plt.scatter(count_df.index, count_df.cnt);
plt.xlabel('time-frame');plt.ylabel('Number of stations');

## Finding longest continuous data with 36 stations

In [None]:
# Rows available per time-stamp, i.e. how many stations reported at that time.
# groupby(...).size() is used instead of `len(data_df_sorted.loc[idx])`
# because the latter returns the number of columns (not 1) when a time-stamp
# has a single row, and it yields an integer column rather than object dtype.
station_c = data_df_sorted.groupby(level=0).size().to_frame('station_count')
station_c.head(2)

In [None]:
# Locate the first run of `delta` consecutive time-stamps in which every
# time-stamp has data from all 36 stations.
# NOTE(review): "consecutive" means consecutive rows of station_c; gaps in
# the time-stamps themselves are not checked here.
delta = 24*20  # window length in time-stamps; presumably 20 days of hourly data -- TODO confirm

# 1 where a time-stamp has exactly 36 station rows, 0 otherwise.
is_complete = (station_c['station_count'] == 36).to_numpy().astype(int)

# running_total[k] = number of complete time-stamps among the first k rows, so
# the window starting at row s holds running_total[s+delta] - running_total[s]
# complete time-stamps. This checks every window (including the last one,
# which the previous range(len - delta) loop skipped) in a single pass.
running_total = np.concatenate(([0], np.cumsum(is_complete)))
complete_in_window = running_total[delta:] - running_total[:-delta]
window_starts = np.flatnonzero(complete_in_window == delta)

if window_starts.size == 0:
    # Previously `i` silently kept its last loop value (or was undefined),
    # and a wrong window was plotted and later saved.
    raise ValueError(
        f'No run of {delta} consecutive time-stamps with all 36 stations was found'
    )

# `i` is the positional start of the chosen window; later cells read it.
i = int(window_starts[0])
print('Best start index=', i)

best_idx = station_c.index[i:i+delta]
plt.figure(figsize=(15,5))
plt.scatter(best_idx, np.ones(len(best_idx)));
plt.tight_layout();

#### Let us choose the first chunk of data from the above plot.

In [None]:
# Time-stamp at which the selected window begins (`i` comes from the
# window-search cell above).
first_idx = station_c.index[i]

# Label-based slice from the window start up to the hard-coded end date.
best_idxs = station_c.loc[first_idx:'2013-11-25'].index
print("Number of time-stamps chosen =", best_idxs.shape)
best_idxs

## Saving processed dataset

In [None]:
# Keep every reading whose time-stamp is in the selected window and persist
# the result for downstream notebooks.
output_path = '../processed_data/best36_beijing_pm25.pickle'
final_df = data_df_sorted.loc[best_idxs]
final_df.to_pickle(output_path)
final_df.head(2)

### Visualize trend across stations

In [None]:
# Order rows by time-stamp and then by station so that each animation frame
# draws the stations in the same order.
double_sorted_df = final_df.sort_values(['time','station_id'])
fig, ax = plt.subplots(figsize=(10,4))


def update(ts):
    """Redraw the axes with the PM2.5 values of all rows at time-stamp `ts`."""
    ax.cla()
    frame_values = double_sorted_df.loc[ts, 'PM25_AQI_value'].values
    ax.plot(frame_values)
    ax.set_ylim(0, 500)  # fixed y-range so frames are comparable


# Close the static figure so only the animation is rendered by the notebook.
plt.close(fig)
anim = FuncAnimation(fig, update, frames=double_sorted_df.index.unique())
rc('animation', html='jshtml', embed_limit=100)
anim