In [7]:
from fbprophet import Prophet
import pandas as pd
import altair as alt
import plotly
from IPython.display import HTML
# Select Altair's 'notebook' renderer for displaying charts.
# NOTE(review): presumably required by the classic Jupyter Notebook front end
# used when this was written — confirm for the environment in use.
alt.renderers.enable('notebook')
# Data locations, given relative to the notebook's working directory.
# `directory` is not referenced by any of the cells visible here.
directory = '../data/json_news_by_years_classified'
# load_data() reads grouped_data_day_mean.csv from this directory.
output_dir = '../data/csv_grouped_data'

In [12]:
def fit_predict_model(dataframe, interval_width = 0.99, changepoint_range = 0.8):
    """Fit a Prophet model on ``dataframe`` and return its in-sample forecast.

    The model is created with daily, weekly and yearly seasonality disabled
    and ``seasonality_mode='multiplicative'``. Predictions are made for the
    same rows that were used for fitting.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Training data with a ``ds`` column (dates) and a ``y`` column
        (observed values). The caller's frame is not modified.
    interval_width : float, default 0.99
        Forwarded to ``Prophet(interval_width=...)``.
    changepoint_range : float, default 0.8
        Forwarded to ``Prophet(changepoint_range=...)``.

    Returns
    -------
    pandas.DataFrame
        The frame returned by ``Prophet.predict`` with an extra ``fact``
        column holding the observed ``y`` for each row.
    """
    # NOTE(review): Prophet is understood to return its predictions ordered
    # chronologically by `ds` with a fresh 0..n-1 index — confirm against the
    # installed version. Attaching `y` by position in the caller's row order
    # would then pair observations with the wrong forecast rows whenever the
    # input is not already date-sorted, so the rows are ordered up front.
    # mergesort is stable: rows sharing a date keep their relative order.
    chronological = pd.to_datetime(dataframe['ds']).values.argsort(kind='mergesort')
    ordered = dataframe.iloc[chronological].reset_index(drop = True)

    model = Prophet(daily_seasonality = False, yearly_seasonality = False, weekly_seasonality = False,
                    seasonality_mode = 'multiplicative',
                    interval_width = interval_width,
                    changepoint_range = changepoint_range)
    model = model.fit(ordered)
    forecast = model.predict(ordered)
    # `ordered` carries a 0..n-1 index, so this aligns row-for-row.
    forecast['fact'] = ordered['y']
    return forecast

def detect_anomalies(forecast):
    """Flag observations that fall outside the forecast interval.

    Parameters
    ----------
    forecast : pandas.DataFrame
        Must contain the columns ``ds``, ``trend``, ``yhat``, ``yhat_lower``,
        ``yhat_upper`` and ``fact``. It is not modified.

    Returns
    -------
    pandas.DataFrame
        A copy of those six columns plus:

        * ``anomaly``: ``1`` where ``fact > yhat_upper``, ``-1`` where
          ``fact < yhat_lower``, otherwise ``0``.
        * ``importance``: for anomalous rows, the distance between ``fact``
          and the violated bound divided by ``fact``; ``0.0`` elsewhere.
          Always a float column.

    Notes
    -----
    The ratio is divided by ``fact`` as-is, so a ``fact`` of exactly zero on
    an anomalous row yields an infinite importance.
    """
    forecasted = forecast[['ds', 'trend', 'yhat', 'yhat_lower', 'yhat_upper', 'fact']].copy()

    above = forecasted['fact'] > forecasted['yhat_upper']
    below = forecasted['fact'] < forecasted['yhat_lower']

    forecasted['anomaly'] = 0
    forecasted.loc[above, 'anomaly'] = 1
    forecasted.loc[below, 'anomaly'] = -1

    # Start from a float column: the ratios written below are floats, and
    # assigning them into an integer column is an upcasting setitem that
    # pandas has deprecated. It also keeps the dtype the same whether or not
    # any anomaly is found.
    forecasted['importance'] = 0.0
    # All operands come from the local copy (the original divided by the
    # caller's frame, which holds the same values but was inconsistent).
    forecasted.loc[above, 'importance'] = \
        (forecasted['fact'] - forecasted['yhat_upper']) / forecasted['fact']
    forecasted.loc[below, 'importance'] = \
        (forecasted['yhat_lower'] - forecasted['fact']) / forecasted['fact']

    return forecasted

def plot_anomalies(forecasted, y_title='sales', title='Anomaly Detection'):
    """Build a layered Altair chart of the forecast band, observations and anomalies.

    Three layers are drawn over a shared date axis:

    * a green area between ``yhat_lower`` and ``yhat_upper``;
    * small black circles for rows with ``anomaly == 0``;
    * red circles for rows with ``anomaly != 0``, sized by ``importance``.

    Parameters
    ----------
    forecasted : pandas.DataFrame
        Must contain ``ds``, ``fact``, ``yhat_lower``, ``yhat_upper``,
        ``anomaly`` and ``importance`` columns.
    y_title : str, default 'sales'
        Axis title used for the ``fact`` values. The default keeps the
        previously hard-coded label; pass a label that matches the series
        actually being plotted.
    title : str, default 'Anomaly Detection'
        Chart title.

    Returns
    -------
    The layered chart (870x450) with the title font size set to 20.
    """
    hover_fields = ['ds', 'fact', 'yhat_lower', 'yhat_upper']

    interval = alt.Chart(forecasted).mark_area(interpolate="basis", color = '#7FC97F').encode(
        x=alt.X('ds:T',  title ='date'),
        y='yhat_upper',
        y2='yhat_lower',
        tooltip=hover_fields
    ).interactive().properties(
        title=title
    )

    regular_rows = forecasted[forecasted.anomaly==0]
    fact = alt.Chart(regular_rows).mark_circle(size=15, opacity=0.7, color = 'Black').encode(
        x='ds:T',
        y=alt.Y('fact', title=y_title),
        tooltip=hover_fields
    ).interactive()

    anomalous_rows = forecasted[forecasted.anomaly!=0]
    anomalies = alt.Chart(anomalous_rows).mark_circle(size=30, color = 'Red').encode(
        x='ds:T',
        y=alt.Y('fact', title=y_title),
        tooltip=hover_fields,
        size = alt.Size( 'importance', legend=None)
    ).interactive()

    return alt.layer(interval, fact, anomalies)\
              .properties(width=870, height=450)\
              .configure_title(fontSize=20)

In [13]:
def load_data():
    """Read ``grouped_data_day_mean.csv`` from ``output_dir`` into a DataFrame.

    The file is parsed with ``;`` as the field separator. A progress message
    and the loaded frame are printed as a side effect before it is returned.
    """
    print("loading news")
    csv_path = output_dir + "/grouped_data_day_mean.csv"
    news_frame = pd.read_csv(csv_path, sep=";")
    print(news_frame)
    return news_frame

In [14]:
df = load_data()
# .copy() makes this an independent frame, so the column assignment below does
# not write into a slice of the loaded frame (SettingWithCopyWarning).
df = df[["crime_probability", "normalised_date"]].copy()
# Parse as UTC and then drop the timezone, leaving naive timestamps.
# utc=True also copes with naive or mixed-offset strings, on which a bare
# to_datetime(...).dt.tz_convert(None) would fail.
df['normalised_date'] = pd.to_datetime(df['normalised_date'], utc=True).dt.tz_convert(None)
print(df)
# Rename by name rather than by position to the column names that
# fit_predict_model reads ('ds' and 'y').
df = df.rename(columns={'crime_probability': 'y', 'normalised_date': 'ds'})
pred = fit_predict_model(df)
pred = detect_anomalies(pred)


loading news
                normalised_date    Unnamed: 0  crime_probability  sentiment  \
0     2005-01-01 00:00:00+00:00   9855.000000           0.166289   0.166884   
1     2005-01-02 00:00:00+00:00   9828.000000           0.138094   0.283870   
2     2005-01-03 00:00:00+00:00   9801.000000           0.123617   0.170715   
3     2005-01-04 00:00:00+00:00   9774.000000           0.202801   0.075195   
4     2005-01-05 00:00:00+00:00   9747.000000           0.207076   0.147576   
...                         ...           ...                ...        ...   
5590  2020-05-02 00:00:00+00:00  50057.042965           0.063573   0.195940   
5591  2020-05-03 00:00:00+00:00  50934.437220           0.074751   0.192142   
5592  2020-05-04 00:00:00+00:00  51954.621723           0.057100   0.204034   
5593  2020-05-05 00:00:00+00:00  53333.553621           0.079166   0.185626   
5594  2020-05-06 00:00:00+00:00  53567.359551           0.017864   0.168754   

      probability_domestic_violence  t

In [15]:

# Chart only the last 4000 rows of the anomaly table; as the cell's final
# expression, the returned chart is what the notebook displays.
plot_anomalies(pred.tail(4000))

<vega.vegalite.VegaLite at 0x10acf8310>

