In [1]:
# install missing packages
# `%pip` (rather than `!pip`) installs into the environment of the running kernel.
# py-cpuinfo provides the `cpuinfo` module imported by the system-details cell.
# NOTE(review): plotly and pandas-datareader are not imported by any cell visible
# here; presumably they are needed by `nnet_regressor` - confirm.
# NOTE(review): versions are unpinned, so a fresh run may resolve newer releases
# than the ones recorded in the output below (plotly 5.17.0, py-cpuinfo 9.0.0).
%pip install plotly
%pip install py-cpuinfo
%pip install pandas-datareader

Collecting plotly
  Obtaining dependency information for plotly from https://files.pythonhosted.org/packages/df/79/c80174d711ee26ee5da55a9cc3e248f1ec7a0188b5e4d6bbbbcd09b974b0/plotly-5.17.0-py2.py3-none-any.whl.metadata
  Using cached plotly-5.17.0-py2.py3-none-any.whl.metadata (7.0 kB)
Collecting tenacity>=6.2.0 (from plotly)
  Obtaining dependency information for tenacity>=6.2.0 from https://files.pythonhosted.org/packages/f4/f1/990741d5bb2487d529d20a433210ffa136a367751e454214013b441c4575/tenacity-8.2.3-py3-none-any.whl.metadata
  Using cached tenacity-8.2.3-py3-none-any.whl.metadata (1.0 kB)
Using cached plotly-5.17.0-py2.py3-none-any.whl (15.6 MB)
Using cached tenacity-8.2.3-py3-none-any.whl (24 kB)
Installing collected packages: tenacity, plotly
Successfully installed plotly-5.17.0 tenacity-8.2.3
Note: you may need to restart the kernel to use updated packages.
Collecting py-cpuinfo
  Using cached py_cpuinfo-9.0.0-py3-none-any.whl (22 kB)
Installing collected packages: py-cpuinfo


In [2]:
# check system details
import os
import psutil
import cpuinfo

def _gib(n_bytes):
    """Scale a raw byte count down by 1024 three times (printed with a "GB" label)."""
    return n_bytes / 1024 / 1024 / 1024


# memory: total / available / used, then the usage percentage reported by psutil
ram_info = psutil.virtual_memory()
print(f"Total RAM: {_gib(ram_info.total):.2f} GB")
print(f"Available RAM: {_gib(ram_info.available):.2f} GB")
print(f"Used RAM: {_gib(ram_info.used):.2f} GB")
print(f"Percentage Usage Of RAM: {ram_info.percent}%")

# processor: logical core count and the clock speed string reported by py-cpuinfo
print(f"CPU Cores: {os.cpu_count()}")
print(f"CPU Speed: {cpuinfo.get_cpu_info()['hz_actual_friendly']}")

# disk: usage of the filesystem that holds the current working directory
disk_info = psutil.disk_usage(os.getcwd())
print(f"Total Disk: {_gib(disk_info.total):.2f} GB")
print(f"Available Disk: {_gib(disk_info.free):.2f} GB")
print(f"Used Disk: {_gib(disk_info.used):.2f} GB")
print(f"Percentage Usage Of Disk: {disk_info.percent}%")

Total RAM: 15.47 GB
Available RAM: 14.16 GB
Used RAM: 1.06 GB
Percentage Usage Of RAM: 8.5%
CPU Cores: 4
CPU Speed: 2.5000 GHz


In [3]:
# import requirements
import pandas as pd
from sklearn.metrics import mean_squared_error, r2_score
from nnet_regressor import Regression

2023-10-04 18:35:10.661490: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: SSE4.1 SSE4.2 AVX AVX2 AVX512F FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [4]:
# read the crime dataset; the path is relative to the kernel's working directory
crime = pd.read_csv(filepath_or_buffer="crime_rate.csv")

In [5]:
# split up the data into training and testing
# The first 80% of rows (in file order) train the model and all remaining rows
# test it. One cut point is shared by both sides so every row lands in exactly
# one partition: taking int(0.8 * n) head rows and int(0.2 * n) tail rows
# independently drops a row in the middle whenever n is not a multiple of 5.
y = crime[["Crimes"]]
X = crime.drop(columns="Crimes")
split_at = int(0.8 * X.shape[0])
trainX = X.iloc[:split_at]
trainy = y.iloc[:split_at]
# the test frames get a fresh 0-based index; the training frames keep theirs
testX = X.iloc[split_at:].reset_index(drop=True)
testy = y.iloc[split_at:].reset_index(drop=True)
# guard the invariant the split is meant to uphold
assert len(trainX) + len(testX) == len(X), "train/test split lost or duplicated rows"
assert len(trainy) + len(testy) == len(y), "train/test split lost or duplicated rows"

In [6]:
# build the model
# Configure the regression pipeline, then reuse a previously saved pipeline when
# one can be loaded and otherwise train from scratch on the training split.
# NOTE(review): the keyword flags presumably switch individual pipeline steps on
# or off (compare the step names in the training log) - confirm in nnet_regressor.
print("\n---- Crime Regression Analysis ----\n")
model = Regression(
    name="Tensorflow With Feature Engineering",
    frac=1,
    rename=False,
    deep=True,
    time=False,
    binary=True,
    imputation=False,
    variance=True,
    scale=True,
    atwood=True,
    binning=True,
    reciprocal=True,
    interaction=True,
    selection=True,
)
try:
    model.load()  # load the machine learning pipeline
    predictions = model.predict(testX)
except Exception:
    # Best-effort fallback: any ordinary failure to load or predict triggers a
    # full fit. Catching `Exception` instead of using a bare `except:` lets
    # KeyboardInterrupt / SystemExit propagate, so interrupting the cell stops it
    # rather than silently starting a long training run.
    model.fit(trainX, trainy)  # build the machine learning pipeline
    predictions = model.predict(testX)


---- Crime Regression Analysis ----

1/6) Model Training
> Transforming Categorical Features
> Removing Constant Features
> Scaling Features
> Selecting Features
> Computing Atwoood Numbers
> Binning Features
> Computing Reciprocals
> Computing Interactions
> Removing Constant Features
> Selecting Features
> Scaling Features
> Transforming The Training Data
> Training Neural Network
15.67 Minutes
2/6) Model Performance
> Transforming The Testing Data
3.43 Seconds
3/6) Model Deployment
> Transforming All The Data
> Training Neural Network
20.46 Minutes
4/6) Model Indicators
> Perturbing Features
13.74 Minutes
5/6) Model Prediction
> Transforming The New Data
1.23 Seconds
6/6) Model Monitoring
4.37 Seconds


In [7]:
# model diagnostics: headline metrics, then the leading indicators and drifting features
print("Model Diagnostics:")
print(f"> RMSE: {model.rmse}")
print(f"> R2: {model.r2}")
print(f"> In Control: {model.in_control}")

# list at most the first ten indicators, numbered from 1
print("> Model Indicators:")
top_indicators = model.indicators["Indicator"][:10]
for rank, indicator in enumerate(top_indicators, start=1):
    print(f"{rank}. {indicator}")

# features whose drift p-value is below 0.05; show at most ten, or "None" if there are none
print("> Feature Drift:")
drifted = model.drift.loc[model.drift["pvalue"] < 0.05]
for rank, feature in enumerate(drifted["Feature"][:10], start=1):
    print(f"{rank}. {feature}")
if len(drifted) == 0:
    print("None")

Model Diagnostics:
> RMSE: 34.24769574899675
> R2: 0.8617342844356017
> In Control: 97.64%
> Model Indicators:
1. GDI
2. CPI*Week_6
3. 1/NASDAQ
4. Crimes(t-3)*GDI
5. Crimes(t-2)*Year_2009
6. Crimes(t-2)*GDP(0.6-0.8)
7. Crimes(t-2)*Unemployment(0.2-0.4)
8. GDP*Dc_Dist_92
9. Crimes(t-1)*Month_1
10. GDI*1/PPI
> Feature Drift:
1. Crimes(t-2)*Dc_Dist_8
2. Dc_Dist_1*1/Unemployment
3. Dc_Dist_2*1/Unemployment
4. Dc_Dist_3*1/Unemployment
5. Unemployment*Dc_Dist_4
6. Unemployment*Dc_Dist_1
7. Unemployment*Dc_Dist_6
8. Dc_Dist_5*Unemployment
9. Unemployment*Dc_Dist_9
10. Unemployment*Dc_Dist_3


In [8]:
# score the model
# Compare the held-out targets with the predictions made on the test split.
# RMSE is computed as the square root of the MSE instead of passing
# `squared=False`: that keyword was deprecated in scikit-learn 1.4 and is
# scheduled for removal, while this form works on old and new releases alike.
actual = testy.iloc[:, 0].to_numpy()  # first (only) target column as a 1-D array
rmse = mean_squared_error(y_true=actual, y_pred=predictions) ** 0.5
r2 = r2_score(y_true=actual, y_pred=predictions)

print(f"RMSE: {rmse}")
print(f"R2: {r2}")

RMSE: 34.96662032193644
R2: 0.832598872461612


In [9]:
# save the machine learning pipeline
# NOTE(review): presumably this persists the fitted pipeline so that the
# `model.load()` call in the build cell can reuse it on a later run - confirm
# the storage location and format in nnet_regressor.
model.dump()

In [10]:
# refit the model to include the test data
# The recorded output shows this retraining taking roughly 25 minutes.
# NOTE(review): this runs after `model.dump()`, so unless `refit` persists the
# pipeline itself, the saved copy will not include the test data - confirm.
model.refit(testX, testy)

Model Retraining:
> Transforming The New Data
> Training Neural Network
24.8 Minutes
