### Objectifs de ce notebook :
>Utilisation des données du site https://data.cityofchicago.org/ sur la criminalité

## Prédiction des crimes par type et par région 
- Le nombre de crime est vu comme une série temporelle
- Utilisation de la librairie PROPHET https://facebook.github.io/prophet/docs/quick_start.html
- Visualisation des résultats

In [33]:
%load_ext autoreload
%autoreload 2

### Import des package

In [34]:
import numpy as np
import pandas as pd
from datetime import datetime, timedelta
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
from sklearn.metrics import explained_variance_score
from sklearn.metrics import mean_absolute_error
from prophet.serialize import model_to_json, model_from_json
from prophet import Prophet
import joblib
from pandas.tseries.offsets import MonthEnd

## Chemin des données

In [35]:
Path_Socio = "/home/mlou/Chicago_Crime/data/raw/Census_Data_-_Selected_socioeconomic_indicators_in_Chicago__2008___2012.csv"
Path_Crime = "/home/mlou/Chicago_Crime/data/raw/Crimes_-_2001_to_Present_20231130.csv"

## Chargment des données

In [36]:
df_Socio = pd.read_csv(Path_Socio)
df_Crime = pd.read_csv(Path_Crime, parse_dates=['Date'])

  df_Crime = pd.read_csv(Path_Crime, parse_dates=['Date'])


In [37]:
start_date = df_Crime.Date.min()

In [38]:
end_date = df_Crime.Date.max()

In [39]:
start_date

Timestamp('2019-11-23 00:00:00')

In [40]:
end_date

Timestamp('2023-11-22 00:00:00')

In [41]:
def rename_columns_socio():
    """
    
    """
    return {
'Community Area Number':'community_area_number',
'COMMUNITY AREA NAME':'community_area_name', 
'PERCENT OF HOUSING CROWDED':'pct_housing_crowded', 
'PERCENT HOUSEHOLDS BELOW POVERTY':'pct_households_below_poverty', 
'PERCENT AGED 16+ UNEMPLOYED':'pct_age16_unemployed',
'PERCENT AGED 25+ WITHOUT HIGH SCHOOL DIPLOMA':'pct_age25_no_highschool',
'PERCENT AGED UNDER 18 OR OVER 64': 'pct_not_working_age',
'per_capita_income':'per_capita_income',
'HARDSHIP INDEX' : 'hardship_index'}

def rename_columns_crimes():
    """
    
    """
    return {
        'ID': 'id',
        'Case Number': 'cas_number', 
        'Date':'date',
        'Block':'block', 
        'IUCR':'iucr', 
        'Primary Type':'primary_type',
        'Description':'description', 
        'Location Description':'location_description', 
        'Arrest':'arrest', 
        'Domestic':'domestic', 
        'Beat':'beat',
        'District':'district', 
        'Ward':'ward', 
        'Community Area': 'community_area_number', 
        'FBI Code':'fbi_code', 
        'X Coordinate':'x_coordinate',
        'Y Coordinate':'y_coordinate', 
        'Year':'year', 
        'Updated On':'updated_on', 
        'Latitude':'latitude', 
        'Longitude':'longitude',
        'Location':'location'   
    }

df_Socio.rename(columns=rename_columns_socio(), inplace=True)
df_Crime.rename(columns=rename_columns_crimes(), inplace=True)

In [42]:
df_src = pd.merge(df_Crime, df_Socio, on='community_area_number', how='left')
df_src.drop(df_src.columns.difference(['primary_type','date', 'community_area_name']), inplace=True, axis=1)

In [43]:
list_date = df_src['date'].apply(lambda x: x.strftime('%Y-%m'))

In [44]:
list_primary_type = list(df_src.primary_type.unique())
list_community_area = list(df_src.community_area_name.unique())

In [45]:
list_date_sorted = sorted(set(list_date), key=lambda x: pd.to_datetime(x))

In [46]:
def return_df(type_incident,start_date_train, end_date_train, community_area=None):

    start_date_train = pd.to_datetime(start_date_train)
    end_date_train = pd.to_datetime(end_date_train)
    if community_area != None:
        df = df_src[(df_src.primary_type==type_incident) & (df_src.community_area_name==community_area)]
    else:  
        df = df_Crime[df_Crime.primary_type==type_incident]
    df['year_month'] = df['date'].apply(lambda x: x.strftime('%Y-%m'))
    df_group = df.groupby(['year_month'], as_index=False).agg({'primary_type':'count'})
    df_group.rename(columns={"primary_type":"nb_crime"}, inplace=True)
    df_group['year_month']= pd.to_datetime(df_group['year_month'])
    df_group.sort_values(by='year_month', inplace=True)
    df_group.reset_index(inplace=True, drop=True)
    del df
    df_group['year_month'] = pd.to_datetime(df_group['year_month'], format="%Y%m") + MonthEnd(1)
    df_group.columns = ['ds', 'y']
    return df_group[(df_group['ds'] >=start_date_train) & (df_group['ds'] <= end_date_train)]

In [47]:
start_date_train = "2019-11"
end_date_train = "2022-10"

In [48]:
df_ml_train = return_df("THEFT", start_date_train, end_date_train, "Austin")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['year_month'] = df['date'].apply(lambda x: x.strftime('%Y-%m'))


In [50]:
def return_model(df_train):
    m = Prophet(
        growth="linear",
        seasonality_mode='multiplicative',
        daily_seasonality=False,
        weekly_seasonality=False,
        yearly_seasonality=False,
        ).add_seasonality(
        name='yearly',
        period = 365.25,
        fourier_order=20)
    m.fit(df_train)
    return m

In [51]:
model_theft = return_model(df_ml_train)

17:52:51 - cmdstanpy - INFO - Chain [1] start processing
17:52:51 - cmdstanpy - INFO - Chain [1] done processing


In [52]:
def predict_model(m):
    
    # Créer un DataFrame pour les dates futures que vous souhaitez prédire (la période restante)
    start_date_future = pd.to_datetime(end_date_train)
    end_date_future = end_date  # Fin des données
    future_dates = pd.date_range(start=start_date_future, end=end_date_future, freq='1M')
    future_dates = pd.DataFrame({'ds': future_dates})
    return m.predict(future_dates)


In [53]:
forecast = predict_model(model_theft)

In [54]:
forecast.ds.min()

Timestamp('2022-10-31 00:00:00')

In [55]:
forecast.ds.max()

Timestamp('2023-10-31 00:00:00')

In [56]:
data = return_df("THEFT", forecast.ds.min(), forecast.ds.max(),"Austin")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['year_month'] = df['date'].apply(lambda x: x.strftime('%Y-%m'))


In [61]:
merged_data = data.merge(forecast[['ds', 'yhat', 'yhat_lower', 'yhat_upper']], on='ds')

In [63]:
# Fonction pour enregistrer le modèle Prophet dans un fichier
def save_prophet_model(model, filename):
    try:
        joblib.dump(model, filename)
        print(f"Model saved as {filename}")
    except Exception as e:
        print(f"Error saving model: {e}")


In [None]:
# Fonction pour charger le modèle Prophet depuis un fichier
def load_prophet_model(filename):
    try:
        loaded_model = joblib.load(filename)
        print(f"Model loaded from {filename}")
        return loaded_model
    except Exception as e:
        print(f"Error loading model: {e}")
        return None
