### Objectifs de ce notebook :
>Utilisation des données du site https://data.cityofchicago.org/ sur la criminalité

## Prédiction des crimes par type et par région 
- Le nombre de crime est vu comme une série temporelle
- Utilisation de la librairie PROPHET https://facebook.github.io/prophet/docs/quick_start.html
- Visualisation des résultats

In [19]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


### Import des package

In [20]:
import numpy as np
import pandas as pd
from datetime import datetime, timedelta
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
from sklearn.metrics import explained_variance_score
from sklearn.metrics import mean_absolute_error
from prophet.serialize import model_to_json, model_from_json
from prophet import Prophet
import joblib
from pandas.tseries.offsets import MonthEnd

## Chemin des données

In [21]:
Path_Socio = "/home/mlou/Chicago_Crime/data/raw/Census_Data_-_Selected_socioeconomic_indicators_in_Chicago__2008___2012.csv"
Path_Crime = "/home/mlou/Chicago_Crime/data/raw/Crimes_-_2001_to_Present_20231130.csv"

## Chargment des données

In [22]:
df_Socio = pd.read_csv(Path_Socio)
df_Crime = pd.read_csv(Path_Crime, parse_dates=['Date'])

  df_Crime = pd.read_csv(Path_Crime, parse_dates=['Date'])


In [23]:
start_date = df_Crime.Date.min()

In [24]:
end_date = df_Crime.Date.max()

In [25]:
start_date

Timestamp('2019-11-23 00:00:00')

In [26]:
end_date

Timestamp('2023-11-22 00:00:00')

In [27]:
def rename_columns_socio():
    """
    
    """
    return {
'Community Area Number':'community_area_number',
'COMMUNITY AREA NAME':'community_area_name', 
'PERCENT OF HOUSING CROWDED':'pct_housing_crowded', 
'PERCENT HOUSEHOLDS BELOW POVERTY':'pct_households_below_poverty', 
'PERCENT AGED 16+ UNEMPLOYED':'pct_age16_unemployed',
'PERCENT AGED 25+ WITHOUT HIGH SCHOOL DIPLOMA':'pct_age25_no_highschool',
'PERCENT AGED UNDER 18 OR OVER 64': 'pct_not_working_age',
'per_capita_income':'per_capita_income',
'HARDSHIP INDEX' : 'hardship_index'}

def rename_columns_crimes():
    """
    
    """
    return {
        'ID': 'id',
        'Case Number': 'cas_number', 
        'Date':'date',
        'Block':'block', 
        'IUCR':'iucr', 
        'Primary Type':'primary_type',
        'Description':'description', 
        'Location Description':'location_description', 
        'Arrest':'arrest', 
        'Domestic':'domestic', 
        'Beat':'beat',
        'District':'district', 
        'Ward':'ward', 
        'Community Area': 'community_area_number', 
        'FBI Code':'fbi_code', 
        'X Coordinate':'x_coordinate',
        'Y Coordinate':'y_coordinate', 
        'Year':'year', 
        'Updated On':'updated_on', 
        'Latitude':'latitude', 
        'Longitude':'longitude',
        'Location':'location'   
    }

df_Socio.rename(columns=rename_columns_socio(), inplace=True)
df_Crime.rename(columns=rename_columns_crimes(), inplace=True)

In [28]:
df_src = pd.merge(df_Crime, df_Socio, on='community_area_number', how='left')
df_src.drop(df_src.columns.difference(['primary_type','date', 'community_area_name']), inplace=True, axis=1)

In [29]:
list_date = df_src['date'].apply(lambda x: x.strftime('%Y-%m'))

In [30]:
list_primary_type = list(df_src.primary_type.unique())
list_community_area = list(df_src.community_area_name.unique())

In [31]:
list_date_sorted = sorted(set(list_date), key=lambda x: pd.to_datetime(x))

In [32]:
def return_df(type_incident,start_date_train, end_date_train, community_area=None):

    start_date_train = pd.to_datetime(start_date_train)
    end_date_train = pd.to_datetime(end_date_train)
    if community_area != None:
        df = df_src[(df_src.primary_type==type_incident) & (df_src.community_area_name==community_area)]
    else:  
        df = df_Crime[df_Crime.primary_type==type_incident]
    df['year_month'] = df['date'].apply(lambda x: x.strftime('%Y-%m'))
    df_group = df.groupby(['year_month'], as_index=False).agg({'primary_type':'count'})
    df_group.rename(columns={"primary_type":"nb_crime"}, inplace=True)
    df_group['year_month']= pd.to_datetime(df_group['year_month'])
    df_group.sort_values(by='year_month', inplace=True)
    df_group.reset_index(inplace=True, drop=True)
    del df
    df_group['year_month'] = pd.to_datetime(df_group['year_month'], format="%Y%m") + MonthEnd(1)
    df_group.columns = ['ds', 'y']
    return df_group[(df_group['ds'] >=start_date_train) & (df_group['ds'] <= end_date_train)]

In [33]:
start_date_train = "2019-11"
end_date_train = "2022-10"

In [34]:
df_ml_train = return_df("THEFT", start_date_train, end_date_train, "Austin")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['year_month'] = df['date'].apply(lambda x: x.strftime('%Y-%m'))


In [44]:
def return_model(df_train):
    m = Prophet()
    m.fit(df_train)
    return m

In [45]:
model_theft = return_model(df_ml_train)

11:59:27 - cmdstanpy - INFO - Chain [1] start processing
11:59:27 - cmdstanpy - INFO - Chain [1] done processing


In [46]:
def predict_model(m):
    
    # Créer un DataFrame pour les dates futures que vous souhaitez prédire (la période restante)
    start_date_future = pd.to_datetime(end_date_train)
    end_date_future = end_date  # Fin des données
    future_dates = pd.date_range(start=start_date_future, end=end_date_future, freq='1M')
    future_dates = pd.DataFrame({'ds': future_dates})
    return m.predict(future_dates)


In [47]:
forecast = predict_model(model_theft)

In [48]:
forecast.ds.min()

Timestamp('2022-10-31 00:00:00')

In [49]:
forecast.ds.max()

Timestamp('2023-10-31 00:00:00')

In [50]:
data = return_df("THEFT", forecast.ds.min(), forecast.ds.max(),"Austin")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['year_month'] = df['date'].apply(lambda x: x.strftime('%Y-%m'))


In [51]:
merged_data = data.merge(forecast[['ds', 'yhat', 'yhat_lower', 'yhat_upper']], on='ds')

In [52]:
merged_data

Unnamed: 0,ds,y,yhat,yhat_lower,yhat_upper
0,2022-10-31,193,148.778816,132.467895,165.337117
1,2022-11-30,154,105.553857,89.196618,122.775262
2,2022-12-31,115,182.282155,166.145452,198.811251
3,2023-01-31,134,170.931392,153.451717,187.204391
4,2023-02-28,113,79.760647,62.804375,97.408208
5,2023-03-31,167,109.479644,92.629539,126.591147
6,2023-04-30,150,190.705153,174.060594,206.333409
7,2023-05-31,163,173.444906,155.822933,190.356542
8,2023-06-30,158,175.266424,158.069971,192.802395
9,2023-07-31,154,189.96264,173.101926,207.183078


In [53]:
import pandas as pd
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# Assuming df is your dataframe with actual values 'y' and predicted values 'yhat'
# You should have df defined as in the previous example

# Calculate Mean Squared Error (MSE)
mse = mean_squared_error(merged_data['y'], merged_data['yhat'])

# Calculate Mean Absolute Error (MAE)
mae = mean_absolute_error(merged_data['y'], merged_data['yhat'])

# Calculate R-squared value
r2 = r2_score(merged_data['y'], merged_data['yhat'])

# Print the evaluation metrics
print(f'Mean Squared Error (MSE): {mse}')
print(f'Mean Absolute Error (MAE): {mae}')
print(f'R-squared: {r2}')

Mean Squared Error (MSE): 1487.3043365434692
Mean Absolute Error (MAE): 34.054544353134744
R-squared: -2.5851438150883803


In [63]:
# Fonction pour enregistrer le modèle Prophet dans un fichier
def save_prophet_model(model, filename):
    try:
        joblib.dump(model, filename)
        print(f"Model saved as {filename}")
    except Exception as e:
        print(f"Error saving model: {e}")


In [None]:
# Fonction pour charger le modèle Prophet depuis un fichier
def load_prophet_model(filename):
    try:
        loaded_model = joblib.load(filename)
        print(f"Model loaded from {filename}")
        return loaded_model
    except Exception as e:
        print(f"Error loading model: {e}")
        return None
