# Using the MODA framework to analyze the SF 311 data

Source of data: https://data.sfgov.org/City-Infrastructure/311-Cases/vw6y-z8j6/data

In [1]:
%matplotlib inline
import matplotlib.pyplot as plt

import pandas as pd
pd.options.mode.chained_assignment = None  # default='warn'

import numpy as np

## The time window to bucket samples
TIME_RANGE = '1H'

## File path (original data is ~1GB, this is a reduced version with only categories and dates)
#Original file:
#DATAPATH = "SF311_simplified.csv"

#Sample data:
DATAPATH = "SF_data/SF-311-categories-2018.csv"

In [2]:
#Raw data sample:
raw = pd.read_csv(DATAPATH,nrows=100)
raw.head(10)

Unnamed: 0,date,date.1,category,timestamp
0,2018-03-01 15:27:00,2018-03-01 15:27:00,Color Curb,1519918020
1,2018-03-04 17:00:00,2018-03-04 17:00:00,General Request - COUNTY CLERK,1520182800
2,2018-01-16 13:02:00,2018-01-16 13:02:00,Sidewalk or Curb,1516107720
3,2018-03-08 14:54:00,2018-03-08 14:54:00,Litter Receptacles,1520520840
4,2018-03-19 15:25:00,2018-03-19 15:25:00,Sign Repair,1521473100
5,2018-06-01 08:33:00,2018-06-01 08:33:00,Abandoned Vehicle,1527841980
6,2018-03-05 07:30:00,2018-03-05 07:30:00,General Request - COUNTY CLERK,1520235000
7,2018-01-04 12:34:00,2018-01-04 12:34:00,Tree Maintenance,1515069240
8,2018-01-26 09:18:00,2018-01-26 09:18:00,Sidewalk or Curb,1516958280
9,2018-02-04 12:33:00,2018-02-04 12:33:00,Litter Receptacles,1517747580


In [3]:
from moda.dataprep.raw_to_ts import raw_to_ts
from moda.dataprep.ts_to_range import ts_to_range

def prep_data(datapath, time_range='24H', nrows=None, min_date=None, max_date=None, save_files=False,file_prefix = "", usecols=None):
    """
    Takes a raw data with timestamps (date column), categories (category column) and additional columns,
    and turns it into a ranged time-series: Group the original raw data by time interval (time_range) and category.
    Result is the number of samples per category in each time range.
    :param datapath: the path to the csv file
    :param time_range: the time_range according to which the data is grouped by
    :param nrows: limits the number of rows read from the csv
    :param min_date: filters out ranges prior to min_date
    :param max_date: filters out ranges after max_date
    :param save_files: Whether to save intermediate csvs
    :returns a pd.DataFrame with one value per time_range and category.
    This value is the number of samples within this range for a specific category
    """

    if nrows is None:
        raw = pd.read_csv(datapath, usecols=usecols)
    else:
        raw = pd.read_csv(datapath, usecols=usecols, nrows=nrows)

    raw = raw.rename(columns={'Opened': 'date', 'Category': 'category'})

    # Create a time series dataframe
    ts = raw_to_ts(raw, min_date=min_date, max_date=max_date)

    # Divide time series to ranges and categories
    ranged_ts = ts_to_range(ts, time_range=time_range)

    if save_files:
        if nrows is None:
            ts.to_csv("{}_ts.csv".format(file_prefix))
            ranged_ts.to_csv("{0}_ranged_ts_{1}.csv".format(file_prefix,time_range))
        else:
            ts.to_csv("{0}_ts_{1}_rows.csv".format(file_postfix,nrows))
            ranged_ts.to_csv("{0}_ranged_ts_{1}_{2}_rows.csv".format(file_prefix,time_range,nrows))
    return ranged_ts



In [4]:
# This might take some time to load
ranged_ts_24H_2018 = prep_data(DATAPATH,time_range='24H',min_date="01-01-2018",save_files = True,file_prefix = "SF")

  range_grp = ts.groupby([pd.TimeGrouper(time_range), 'category']).agg('count')


In [16]:
# This might take some time to load
ranged_ts_1H_2018 = prep_data(DATAPATH,time_range='1H',min_date="01-01-2018",save_files = True,file_prefix = "SF")

  range_grp = ts.groupby([pd.TimeGrouper(time_range), 'category']).agg('count')


Now that we have a file with the number of events per category per time, we can start modeling.

First, in order to be able to estimate our models, we use [TagAnomaly](https://github.com/Microsoft/TagAnomaly) to tag the points we think are showing trends in the data. Taganomaly can be found here: https://github.com/Microsoft/TagAnomaly
Second, we join the tagged dataset with the time series dataset. Each sample which isn't included in the tagged dataset is assumed to be non-trending (or normal)


In [17]:
labels24H = pd.read_csv('SF_24H_anomalies_only.csv',usecols=['date','category','value'])
labels24H.date = pd.to_datetime(labels24H.date)
labels24H.head()

Unnamed: 0,date,category,value
0,2018-04-17,Blocked Street or SideWalk,36
1,2018-04-18,Blocked Street or SideWalk,40
2,2018-04-19,Blocked Street or SideWalk,31
3,2018-04-20,Blocked Street or SideWalk,38
4,2018-04-17,Blocked Street or SideWalk,36


In [18]:
labels1H = pd.read_csv('SF_1H_anomalies_only.csv',usecols=['date','category','value'])
labels1H.date = pd.to_datetime(labels1H.date)
labels1H.head()

Unnamed: 0,date,category,value
0,2018-01-30 08:00:00,Abandoned Vehicle,16
1,2018-01-30 09:00:00,Abandoned Vehicle,21
2,2018-02-01 17:00:00,Abandoned Vehicle,18
3,2018-02-05 08:00:00,Abandoned Vehicle,18
4,2018-02-12 15:00:00,Abandoned Vehicle,16


In [19]:
ranged_ts_24H_2018.reset_index().head()

Unnamed: 0,date,category,value
0,2018-01-01,Abandoned Vehicle,52
1,2018-01-01,Blocked Street or SideWalk,14
2,2018-01-01,Catch Basin Maintenance,1
3,2018-01-01,Color Curb,2
4,2018-01-01,Damaged Property,18


In [20]:
ranged_ts_1H_2018.reset_index().head()

Unnamed: 0,date,category,value
0,2018-01-01,Abandoned Vehicle,1
1,2018-01-01,General Request - ANIMAL CARE CONTROL,1
2,2018-01-01,General Request - PUC,1
3,2018-01-01,Graffiti,1
4,2018-01-01,MUNI Feedback,1


In [21]:
df24H = pd.merge(ranged_ts_24H_2018.reset_index(),labels24H,how='left',on=['date','category'])
df24H['label'] = np.where(np.isnan(df24H['value_y']),0,1)
df24H = df24H.drop(columns = 'value_y').rename(columns = {'value_x':'value'})
df24H.to_csv("SF24H_labeled.csv")

df1H = pd.merge(ranged_ts_1H_2018.reset_index(),labels1H,how='left',on=['date','category'])
df1H['label'] = np.where(np.isnan(df1H['value_y']),0,1)
df1H = df1H.drop(columns = 'value_y').rename(columns = {'value_x':'value'})
df1H.to_csv("SF24H_labeled.csv")

All examples:

In [22]:
df24H.head()

Unnamed: 0,date,category,value,label
0,2018-01-01,Abandoned Vehicle,52,0
1,2018-01-01,Blocked Street or SideWalk,14,0
2,2018-01-01,Catch Basin Maintenance,1,0
3,2018-01-01,Color Curb,2,0
4,2018-01-01,Damaged Property,18,0


Positive examples:

In [23]:
df24H[df24H.label > 0].head()

Unnamed: 0,date,category,value,label
237,2018-01-07,Sidewalk or Curb,34,1
247,2018-01-08,Catch Basin Maintenance,19,1
261,2018-01-08,General Request - PUC,38,1
273,2018-01-08,Sewer Issues,168,1
318,2018-01-09,Temporary Sign Request,62,1


Evaluating different models

In [31]:
from moda.evaluators.eval import get_evaluation_metrics, get_final_metrics, eval_models
from moda.models.azure_anomaly_detection.azure_ad import AzureAnomalyTrendinessDetector
from moda.models.data_reader import read_data
from moda.models.ma_seasonal.ma_seasonal_model import MovingAverageSeasonalTrendinessDetector
from moda.models.stl.stl_model import STLTrendinessDetector
from moda.models.twitter.anomaly_detect_multicategory import TwitterAnomalyTrendinessDetector


def run_model(dataset, freq, min_date='01-01-2018', plot=True, model_name='stl', min_value=10):


    if len(dataset.index.levels) > 1:
        categories = dataset.index.levels[1]
        print("categories found = {}".format(categories))

    if model_name == 'twitter':
        model = TwitterAnomalyTrendinessDetector(is_multicategory=True, freq=freq, min_value=min_value, threshold=None,
                                                 max_anoms=0.49, seasonality_freq=7)

    if model_name == 'ma_seasonal':
        model = MovingAverageSeasonalTrendinessDetector(is_multicategory=True, freq=freq, min_value=min_value,
                                                        anomaly_type='or',
                                                        num_of_std=3)

    if model_name == 'stl':
        model = STLTrendinessDetector(is_multicategory=True, freq=freq, min_value=min_value,
                                      anomaly_type='or',
                                      num_of_std=2.5, lo_delta=0)

    if model_name == 'azure':
        dirname = os.path.dirname(__file__)
        filename = os.path.join(dirname, 'config/config.json')
        subscription_key = get_azure_subscription_key(filename)
        model = AzureAnomalyTrendinessDetector(is_multicategory=True, freq=freq, min_value=min_value,
                                               subscription_key=subscription_key)

    prediction = model.predict(dataset, verbose=True)
    raw_metrics = get_evaluation_metrics(dataset[['value']], prediction[['prediction']], dataset[['label']],
                                         window_size_for_metrics=5)
    metrics = get_final_metrics(raw_metrics)
    print(metrics)

    ## Plot each category
    if plot:
        _, file = os.path.split(datapath)
        print("Plotting...")
        model.plot(labels=dataset['label'], postfix=file)

    return prediction


ModuleNotFoundError: No module named 'moda.models.azure_anomaly_detection'

Object `moda` not found.


In [29]:
prediction = run_model(df24H,freq='24H',model_name='stl')

NameError: name 'run_model' is not defined