# Using the MODA framework to analyze the SF 311 data

Source of data: https://data.sfgov.org/City-Infrastructure/311-Cases/vw6y-z8j6/data

In [1]:
%matplotlib inline
import matplotlib.pyplot as plt

import pandas as pd
pd.options.mode.chained_assignment = None  # default='warn'

import numpy as np

## The time window to bucket samples
TIME_RANGE = '24H'

## File path (original data is ~1GB, this is a reduced version with only categories and dates)
#Original file:
#DATAPATH = "SF311_simplified.csv"

#Sample raw data:
DATAPATH = "SF_data/SF-311-categories-2018.csv"

In [2]:
#Raw data sample:
raw = pd.read_csv(DATAPATH,nrows=100)
raw.head(10)

Unnamed: 0,date,date.1,category,timestamp
0,2018-03-01 15:27:00,2018-03-01 15:27:00,Color Curb,1519918020
1,2018-03-04 17:00:00,2018-03-04 17:00:00,General Request - COUNTY CLERK,1520182800
2,2018-01-16 13:02:00,2018-01-16 13:02:00,Sidewalk or Curb,1516107720
3,2018-03-08 14:54:00,2018-03-08 14:54:00,Litter Receptacles,1520520840
4,2018-03-19 15:25:00,2018-03-19 15:25:00,Sign Repair,1521473100
5,2018-06-01 08:33:00,2018-06-01 08:33:00,Abandoned Vehicle,1527841980
6,2018-03-05 07:30:00,2018-03-05 07:30:00,General Request - COUNTY CLERK,1520235000
7,2018-01-04 12:34:00,2018-01-04 12:34:00,Tree Maintenance,1515069240
8,2018-01-26 09:18:00,2018-01-26 09:18:00,Sidewalk or Curb,1516958280
9,2018-02-04 12:33:00,2018-02-04 12:33:00,Litter Receptacles,1517747580


In [3]:
from moda.dataprep.raw_to_ts import raw_to_ts
from moda.dataprep.ts_to_range import ts_to_range

def prep_data(datapath, time_range='24H', nrows=None, min_date=None, max_date=None, usecols=None):
    """
    Takes a raw data with timestamps (date column), categories (category column) and additional columns,
    and turns it into a ranged time-series: Group the original raw data by time interval (time_range) and category.
    Result is the number of samples per category in each time range.
    :param datapath: the path to the csv file
    :param time_range: the time_range according to which the data is grouped by
    :param nrows: limits the number of rows read from the csv
    :param min_date: filters out ranges prior to min_date
    :param max_date: filters out ranges after max_date
    :param save_files: Whether to save intermediate csvs
    :returns a pd.DataFrame with one value per time_range and category.
    This value is the number of samples within this range for a specific category
    """

    if nrows is None:
        raw = pd.read_csv(datapath, usecols=usecols)
    else:
        raw = pd.read_csv(datapath, usecols=usecols, nrows=nrows)

    raw = raw.rename(columns={'Opened': 'date', 'Category': 'category'})

    # Create a time series dataframe
    ts = raw_to_ts(raw, min_date=min_date, max_date=max_date)

    # Divide time series to ranges and categories
    ranged_ts = ts_to_range(ts, time_range=time_range)

    return ranged_ts



In [4]:
# Creating a dataset from raw SF data
ranged_ts_24H_2018 = prep_data(DATAPATH,time_range=TIME_RANGE,min_date="01-01-2018")


Now that we have a file with the number of events per category per time, we can start modeling.

First, in order to be able to estimate our models, we use [TagAnomaly](https://github.com/Microsoft/TagAnomaly) to tag the points we think are showing trends in the data. Taganomaly can be found here: https://github.com/Microsoft/TagAnomaly
Second, we join the tagged dataset with the time series dataset. Each sample which isn't included in the tagged dataset is assumed to be non-trending (or normal)


In [5]:
labels24H = pd.read_csv('SF_24H_anomalies_only.csv',usecols=['date','category','value'])
labels24H.date = pd.to_datetime(labels24H.date)
labels24H.sort_values(by='date').head()

Unnamed: 0,date,category,value
52,2018-01-07,Sidewalk or Curb,34
41,2018-01-08,Sewer Issues,168
8,2018-01-08,Catch Basin Maintenance,19
24,2018-01-08,General Request - PUC,38
73,2018-01-09,Temporary Sign Request,62


In [6]:
ranged_ts_24H_2018.reset_index().head()

Unnamed: 0,date,category,value
0,2018-01-01,Abandoned Vehicle,52
1,2018-01-01,Blocked Street or SideWalk,14
2,2018-01-01,Catch Basin Maintenance,1
3,2018-01-01,Color Curb,2
4,2018-01-01,Damaged Property,18


In [8]:
df24H = pd.merge(ranged_ts_24H_2018.reset_index(),labels24H,how='left',on=['date','category'])
df24H['label'] = np.where(np.isnan(df24H['value_y']),0,1)
df24H = df24H.drop(columns = 'value_y').rename(columns = {'value_x':'value'})
#df24H.to_csv("SF24H_labeled.csv")


Sample of positive examples:

In [9]:
df24H[df24H.label > 0].head()

Unnamed: 0,date,category,value,label
237,2018-01-07,Sidewalk or Curb,34,1
247,2018-01-08,Catch Basin Maintenance,19,1
261,2018-01-08,General Request - PUC,38,1
273,2018-01-08,Sewer Issues,168,1
318,2018-01-09,Temporary Sign Request,62,1


Evaluating different models

In [11]:
from moda.evaluators.eval import get_evaluation_metrics, get_final_metrics, eval_models
from moda.models.azure_anomaly_detection.azure_ad import AzureAnomalyTrendinessDetector
from moda.models.data_reader import read_data
from moda.models.ma_seasonal.ma_seasonal_model import MovingAverageSeasonalTrendinessDetector
from moda.models.stl.stl_model import STLTrendinessDetector
from moda.models.twitter.anomaly_detect_multicategory import TwitterAnomalyTrendinessDetector


def run_model(dataset, freq, min_date='01-01-2018', plot=True, model_name='stl', min_value=10):


    if len(dataset.index.levels) > 1:
        categories = dataset.index.levels[1]

    if model_name == 'twitter':
        model = TwitterAnomalyTrendinessDetector(is_multicategory=True, freq=freq, min_value=min_value, threshold=None,
                                                 max_anoms=0.49, seasonality_freq=7)

    if model_name == 'ma_seasonal':
        model = MovingAverageSeasonalTrendinessDetector(is_multicategory=True, freq=freq, min_value=min_value,
                                                        anomaly_type='or',
                                                        num_of_std=3)

    if model_name == 'stl':
        print("Running STL model")
        model = STLTrendinessDetector(is_multicategory=True, freq=freq, min_value=min_value,
                                      anomaly_type='or',
                                      num_of_std=2.5, lo_delta=0)

    if model_name == 'azure':
        dirname = os.path.dirname(__file__)
        filename = os.path.join(dirname, 'config/config.json')
        subscription_key = get_azure_subscription_key(filename)
        model = AzureAnomalyTrendinessDetector(is_multicategory=True, freq=freq, min_value=min_value,
                                               subscription_key=subscription_key)

    prediction = model.predict(dataset, verbose=True)
    raw_metrics = get_evaluation_metrics(dataset[['value']], prediction[['prediction']], dataset[['label']],
                                         window_size_for_metrics=5)
    metrics = get_final_metrics(raw_metrics)
    print(metrics)

    ## Plot each category
    if plot:
        _, file = os.path.split(datapath)
        print("Plotting...")
        model.plot(labels=dataset['label'], postfix=file)

    return prediction


ImportError: cannot import name 'get_final_metrics'

In [32]:
df = df24H.set_index([pd.DatetimeIndex(df24H['date']), 'category']).drop(columns='date')

dataset.to_csv("notebook_dataset.csv")
prediction = run_model(df,freq='24H',model_name='stl')

Running STL model
categories found = ['Abandoned Vehicle' 'Blocked Street or SideWalk'
 'Catch Basin Maintenance' 'Color Curb' 'Damaged Property' 'Encampments'
 'General Request - ANIMAL CARE CONTROL'
 'General Request - ART COMMISSION' 'General Request - COUNTY CLERK'
 'General Request - DPH' 'General Request - MTA'
 'General Request - PLANNING' 'General Request - PUBLIC WORKS'
 'General Request - PUC' 'General Request - RPD' 'Graffiti'
 'Homeless Concerns' 'Illegal Postings' 'Litter Receptacles'
 'MUNI Feedback' 'Noise Report' 'Rec and Park Requests'
 'Residential Building Request' 'SFHA Requests' 'Sewer Issues'
 'Sidewalk or Curb' 'Sign Repair' 'Street Defects'
 'Street and Sidewalk Cleaning' 'Streetlights' 'Tree Maintenance'
 '311 External Request' 'General Request - 311CUSTOMERSERVICECENTER'
 'General Request - ASSESSOR RECORDER'
 'General Request - BUILDING INSPECTION'
 'General Request - PORT AUTHORITY'
 'General Request - REAL ESTATE DEPARTMENT'
 'General Request - TT COLLECTOR

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=True'.


  output = pd.concat([output, res])


General Request - CONTROLLER
Adjusted seasonality = 7.0
General Request - CHILDREN YOUTH FAMILIES
Adjusted seasonality = 7.0
General Request - CITY ATTORNEY
Adjusted seasonality = 7.0
General Request - HSH
Adjusted seasonality = 7.0
General Request - MOCD
Adjusted seasonality = 7.0
General Request - ELECTIONS
Adjusted seasonality = 7.0
General Request - MOD
Adjusted seasonality = 7.0
General Request - MOH
Adjusted seasonality = 7.0
General Request - AGING ADULT SERVICES
Adjusted seasonality = 7.0
General Request - AIRPORT SFO
Adjusted seasonality = 7.0
General Request - ETHICS COMMISSION
Adjusted seasonality = 7.0
General Request - MEDICAL EXAMINER
Adjusted seasonality = 7.0
Parking Enforcement
Adjusted seasonality = 7.0
General Request - SHORT TERM RENTALS
Adjusted seasonality = 7.0
General Request - OCC
Adjusted seasonality = 7.0
General Request - ADULT PROBATION
Adjusted seasonality = 7.0
General Request - MOCJ
Adjusted seasonality = 7.0
Entertainment Commission
{'precision': 0.3969

NameError: name 'os' is not defined