In [50]:
import pandas as pd
import numpy as np

from sklearn.linear_model import ElasticNet
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction import DictVectorizer
from sklearn.ensemble import RandomForestRegressor

In [3]:
# Location of the input CSV, given relative to the notebook's working directory.
path = "../data/MPS/agg_filtered_boroughs_weekends.csv"

In [18]:
# Read the CSV and convert TimePeriod to datetime in one chained expression,
# so the `.dt` accessor can be used on that column afterwards.
df = pd.read_csv(path).assign(
    TimePeriod=lambda frame: pd.to_datetime(frame["TimePeriod"])
)

In [36]:
# Preview the first rows to sanity-check the loaded columns and the parsed dates.
df.head()

Unnamed: 0,BoroughName,CrimeType,CrimeSubtype,TimePeriod,Value,WorkingDays,Weekends
0,Barking and Dagenham,Arson and Criminal Damage,Arson,2010-04-01,6,20,10
1,Barking and Dagenham,Arson and Criminal Damage,Criminal Damage,2010-04-01,208,20,10
2,Barking and Dagenham,Burglary,Burglary Business and Community,2010-04-01,49,20,10
3,Barking and Dagenham,Burglary,Domestic Burglary,2010-04-01,118,20,10
4,Barking and Dagenham,Drug Offences,Drug Trafficking,2010-04-01,9,20,10


In [6]:
# Summarise column dtypes, non-null counts and memory usage of the loaded frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 266921 entries, 0 to 266920
Data columns (total 7 columns):
 #   Column        Non-Null Count   Dtype         
---  ------        --------------   -----         
 0   BoroughName   266921 non-null  object        
 1   CrimeType     266921 non-null  object        
 2   CrimeSubtype  266921 non-null  object        
 3   TimePeriod    266921 non-null  datetime64[ns]
 4   Value         266921 non-null  int64         
 5   WorkingDays   266921 non-null  int64         
 6   Weekends      266921 non-null  int64         
dtypes: datetime64[ns](1), int64(3), object(3)
memory usage: 14.3+ MB


# Transform dataframe

### Build features: one-hot encode BoroughName and CrimeSubtype with DictVectorizer, plus year/month date features

In [37]:
def makeX(df):
    """Build the feature matrix used by the models.

    The result contains, in this order:
      * ``WorkingDays`` and ``Weekends`` copied from ``df``;
      * ``year`` and ``month`` extracted from ``df.TimePeriod``;
      * one indicator column per distinct ``BoroughName`` / ``CrimeSubtype``
        value, named by ``DictVectorizer`` (``"<column>=<value>"``).

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide the columns ``WorkingDays``, ``Weekends``,
        ``TimePeriod`` (datetime dtype, since ``.dt`` is used),
        ``BoroughName`` and ``CrimeSubtype``.

    Returns
    -------
    pandas.DataFrame
        One row per row of ``df``, carrying the same index as ``df``.

    Notes
    -----
    A new ``DictVectorizer`` is fitted on every call, so the set of indicator
    columns depends on the values present in the frame that is passed in.
    """
    features = df[["WorkingDays", "Weekends"]].copy()
    features["year"] = df["TimePeriod"].dt.year
    features["month"] = df["TimePeriod"].dt.month

    vec = DictVectorizer()

    # One dict per row; DictVectorizer turns string values into indicator columns.
    records = df[["BoroughName", "CrimeSubtype"]].to_dict("records")
    encoded = vec.fit_transform(records).toarray()

    # Build the encoded frame on df's own index. With a fresh RangeIndex the
    # column-wise concat below would align on mismatched labels and produce
    # NaN-filled (and extra) rows whenever df does not have a default index,
    # e.g. after filtering or slicing.
    df_vectorized = pd.DataFrame(
        encoded, columns=vec.feature_names_, index=df.index
    )

    return pd.concat([features, df_vectorized], axis=1)

### Prepare X and y

In [39]:
# Features come from makeX; the target is the raw `Value` column.
X, y = makeX(df), df["Value"]

# shuffle=False keeps the rows in file order: the first 80% are used for
# training and the trailing 20% are held out for testing.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, shuffle=False, test_size=0.2
)

### Train the model

In [48]:
def eval_metrics(actual, pred):
    """Score predictions against ground truth.

    Returns a ``(rmse, mae, r2)`` tuple: the square root of the mean squared
    error, the mean absolute error, and the R^2 score of ``pred`` versus
    ``actual``.
    """
    mse = mean_squared_error(actual, pred)
    return (
        np.sqrt(mse),
        mean_absolute_error(actual, pred),
        r2_score(actual, pred),
    )

In [47]:
# ElasticNet hyperparameters, passed to the estimator when it is constructed:
# `alpha` is the penalty strength, `l1_ratio` the L1 share of the L1/L2 mix.
alpha, l1_ratio = 0.5, 0.5

In [58]:
%%time
lr = ElasticNet(alpha=alpha, l1_ratio=l1_ratio, random_state=42)
lr.fit(X_train, y_train)

predicted_qualities = lr.predict(X_test)

(rmse, mae, r2) = eval_metrics(y_test, predicted_qualities)

print(f"Elasticnet model (alpha={alpha:f}, l1_ratio={l1_ratio:f}):")
print(f"  RMSE: {rmse}")
print(f"  MAE: {mae}")
print(f"  R2: {r2}")

Elasticnet model (alpha=0.500000, l1_ratio=0.500000):
  RMSE: 97.61267123552098
  MAE: 52.71150117353197
  R2: 0.06709294659355103
CPU times: user 4.48 s, sys: 16 s, total: 20.4 s
Wall time: 3.83 s


### Second model

In [60]:
%%time

forest = RandomForestRegressor(10, random_state=42)
forest.fit(X_train, y_train)

predicted_qualities2 = forest.predict(X_test)

(rmse, mae, r2) = eval_metrics(y_test, predicted_qualities2)

print(f"Random forest:")
print(f"  RMSE: {rmse}")
print(f"  MAE: {mae}")
print(f"  R2: {r2}")

Random forest:
  RMSE: 46.733928774680045
  MAE: 11.982345228060316
  R2: 0.7861592022463404
CPU times: user 31.3 s, sys: 57.8 ms, total: 31.3 s
Wall time: 31.3 s


In [63]:
# Let's look at our predictions, restricted to test rows whose actual value
# exceeds 20, with both models' outputs side by side.
# NOTE(review): the name `filter` shadows the Python builtin for the rest of
# the notebook session; it is kept here in case other cells refer to it.
filter = y_test.gt(20)
pd.DataFrame(
    {
        "fact": y_test.loc[filter],
        "predict lr": np.round(predicted_qualities[filter], 2),
        "predict_rf": np.round(predicted_qualities2[filter], 2),
    }
).head(20)

Unnamed: 0,fact,predict lr,predict_rf
213541,21,39.57,14.1
213558,98,41.86,110.2
213559,22,40.58,25.1
213562,101,42.75,120.5
213563,42,41.1,42.6
213565,22,41.22,18.9
213566,231,58.95,231.9
213567,72,45.13,92.8
213568,69,45.0,53.2
213570,67,41.14,80.4
