## Air Quality in Dar es Salaam, Tanzania

In [2]:
import time
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import pytz
from pymongo import MongoClient
from sklearn.metrics import mean_absolute_error
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
from statsmodels.tsa.ar_model import AutoReg
from pprint import PrettyPrinter
import plotly.express as px
from sklearn.linear_model import LinearRegression
from statsmodels.tsa.arima.model import ARIMA

#### Prepare Data

Connect to MongoDB

In [None]:
# Open a connection to the local MongoDB server, then grab handles to the
# air-quality database and its Dar es Salaam readings collection.
client = MongoClient("localhost", 27017)
db = client.get_database("air-quality")
dar = db.get_collection("dar-es-salaam")

In [None]:
# Print the name of every collection in the database, one per line.
for collection_info in db.list_collections():
    collection_name = collection_info["name"]
    print(collection_name)

Explore

In [None]:
# Fetch a single document (no filter) and pretty-print it to inspect the schema.
result = dar.find_one()
pp = PrettyPrinter(indent=2)
pp.pprint(result)

What are the sites in the data?

In [None]:
# Collect the unique site identifiers stored under metadata.site.
sites = dar.distinct(key="metadata.site")
sites

Which site has the largest document count?

In [None]:
# Count the documents recorded for each of the two sites (23 first, then 11),
# producing one {"_id": site, "count": n} record per site.
result = [
    {"_id": site_id, "count": dar.count_documents({"metadata.site": site_id})}
    for site_id in (23, 11)
]
readings_per_site = list(result)
readings_per_site

#### Import Data with the wrangle function

In [3]:
def wrangle(collection):
    """Load site 11's PM2.5 ("P2") readings as a clean hourly time series.

    Parameters
    ----------
    collection
        Object exposing a pymongo-style ``find(filter, projection=...)``
        method that yields documents with ``"timestamp"`` and ``"P2"`` fields.

    Returns
    -------
    pandas.Series
        Series named ``"P2"`` holding the hourly mean reading, indexed by
        timestamps in the ``Africa/Dar_es_Salaam`` time zone. Hours with no
        reading are forward-filled from the previous hour.

    Raises
    ------
    KeyError
        If the query returns no documents, because the resulting frame has
        no ``"timestamp"`` column to index on.
    """
    results = collection.find(
        {"metadata.site": 11, "metadata.measurement": "P2"},
        projection={"P2": 1, "timestamp": 1, "_id": 0},
    )

    readings = pd.DataFrame(results).set_index("timestamp")

    # Localize time zone. NOTE(review): this treats stored timestamps as
    # naive UTC; tz_localize raises if they are already tz-aware -- confirm.
    readings.index = readings.index.tz_localize("UTC").tz_convert(
        "Africa/Dar_es_Salaam"
    )

    # Remove outliers: keep only readings below 100.
    readings = readings[readings["P2"] < 100]

    # Resample to hourly means and forward-fill the gaps. The lowercase "1h"
    # alias and .ffill() replace the deprecated "1H" / fillna(method="ffill").
    y = readings["P2"].resample("1h").mean().ffill()
    return y

Explore: Plots

In [None]:
# Build the cleaned hourly PM2.5 series. `y` is not assigned anywhere earlier
# in the notebook, so without this line the cell fails on a fresh kernel.
y = wrangle(dar)

# Time-series plot of the hourly readings.
fig, ax = plt.subplots(figsize=(15, 6))
y.plot(ax=ax, xlabel="Date", ylabel="PM2.5 Level", title="Dar es Salaam PM2.5 Levels");

Rolling Averages plot:

In [None]:
# A 168-observation window over the hourly series spans 7 days (7 * 24).
fig, ax = plt.subplots(figsize=(15, 6))
weekly_rolling_mean = y.rolling(window=168).mean()
weekly_rolling_mean.plot(
    ax=ax,
    xlabel="Date",
    ylabel="PM2.5 Level",
    title="Dar es Salaam PM2.5 Levels, 7-Day Rolling Average",
)