### 1. Importing libraries and data

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import os
import gc
import operator

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures

In [None]:
%matplotlib inline

In [None]:
# Root folder of the project. This is an absolute, machine-specific path and
# must be adjusted before running the notebook on another computer.
path = r'C:\Users\nukis\Documents\Projects\08. Road Safety'

In [None]:
# Load the prepared dataset from the pickle file stored under the project folder.
# NOTE(review): unpickling can execute arbitrary code, so only load pickle files from a trusted source.
df = pd.read_pickle(os.path.join(path, '01. Data', 'Prepared data', 'road_safety_cleaned.pkl'))

In [None]:
# Lift the display limits so every row and every column of the dataframe is shown.

pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', df.shape[1])

In [None]:
# Preview the first two rows of the dataframe.
df.head(2)

In [None]:
# Print a summary of the dataframe (columns, dtypes and non-null counts).
df.info()

### 2. Data prep for regression analysis 

In [None]:
# Remove the columns that are not used in the analysis below.
unused_columns = [
    'Suburb', 'LGA Name', 'Postcode', 'Total Units', 'Total Cas',
    'Total Fats', 'Total SI', 'Total MI', 'Lat', 'Lon',
]
df = df.drop(columns=unused_columns)

In [None]:
# Count the rows recorded at each distinct distance to the city center and
# store the result as 'Number of Accidents', ordered by ascending distance.
rows_per_distance = df.groupby('Distance To CC').size()
df_new = (
    rows_per_distance
    .reset_index(name='Number of Accidents')
    .sort_values(by='Distance To CC')
)
df_new.head()

In [None]:
# Scatterplot of the two chosen variables to see how they relate to each other.

# style='o' draws markers only; without it pandas would connect the points with lines.
ax = df_new.plot(x='Distance To CC', y='Number of Accidents', style='o')
ax.set_title('Distance to City Center vs Number of Accidents')
ax.set_xlabel('Distance to City Center (km)')
ax.set_ylabel('Number of Accidents')
plt.show()

### 3. Linear Regression Analysis

In [None]:
# Extract the predictor and the target as 2-D NumPy column arrays of shape (n_rows, 1).
# Selecting with a one-element column list keeps the data two-dimensional.

X = df_new[['Distance To CC']].to_numpy()
y = df_new[['Number of Accidents']].to_numpy()

In [None]:
# Display the predictor array.
X

In [None]:
# Display the target array.
y

In [None]:
# Split data into a training set and a test set.
# test_size=0.3 holds out 30% of the rows for testing; random_state=0 fixes the
# shuffle seed so the split is reproducible.

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

In [None]:
# Print the shapes of the four arrays produced by the split.
print(*(part.shape for part in (X_train, X_test, y_train, y_test)))

In [None]:
# Create the linear regression object with its default settings.
regression = LinearRegression() 

In [None]:
# Fit the regression object onto the training set (X_train, y_train).

regression.fit(X_train, y_train)

In [None]:
# Predict the values of y for the test-set features (X_test).

y_predicted = regression.predict(X_test)

In [None]:
# Plot the test-set observations together with the regression line predicted for them.

fig, ax = plt.subplots()
# The scatter artist is kept in `linear_reg` because the figure is saved later through linear_reg.figure.
linear_reg = ax.scatter(X_test, y_test, color='blue', s=15)
ax.plot(X_test, y_predicted, color='red', linewidth=3)
ax.set_title('Linear Regression: Distance to City Center vs Number of Accidents', fontsize=10)
ax.set_xlabel('Distance to City Center (km)', fontsize=9)
ax.set_ylabel('Number of Accidents', fontsize=9)
plt.xticks(fontsize=8)
plt.yticks(fontsize=8)
plt.show()

In [None]:
# Create objects that contain the model summary statistics.

rmse = mean_squared_error(y_test, y_predicted) # This is the mean squared error
r2 = r2_score(y_test, y_predicted) # This is the R2 score. 

In [None]:
# Print the model summary statistics. This is where you evaluate the performance of the model.

print('Slope:' ,regression.coef_)
print('Mean squared error: ', rmse)
print('R2 score: ', r2)

In [None]:
# Put the actual and the predicted test-set values side by side for comparison.

actual_values = y_test.ravel()
predicted_values = y_predicted.ravel()
data = pd.DataFrame({'Actual': actual_values, 'Predicted': predicted_values})
data.head(10)

In [None]:
# Save the linear regression figure as a PNG in the visualizations folder.
linear_reg_png = os.path.join(path, '04. Visualizations', 'linear_reg.png')
linear_reg.figure.savefig(linear_reg_png)

#### Linear regression is clearly not suitable for this data: the straight line underfits it.

### 4. Polynomial Regression Analysis

In [None]:
polynomial_features= PolynomialFeatures(degree=2)
X_poly = polynomial_features.fit_transform(X)

In [None]:
regression_2 = LinearRegression() 

In [None]:
# Fit the regression object onto the training set.

regression_2.fit(X_poly, y)

In [None]:
# Predict the values of y using X_poly

y_predicted_poly = regression_2.predict(X_poly)

In [None]:
# Create a plot that shows the regression line from the model on the test set.

plot_test = plt
plot_test.scatter(X_test, y_test, color='gray', s = 15)
plot_test.plot(X, y_predicted_poly, color='red', linewidth =3)
plot_test.title('Distance to City Center vs Number of Accidents')
plt.xlabel('Distance to City Center (km)')  
plt.ylabel('Number of Accidents') 
plot_test.show()

In [None]:
# Create objects that contain the model summary statistics.

rmse_2 = mean_squared_error(y, y_predicted_poly) # This is the mean squared error
r2_2 = r2_score(y, y_predicted_poly) # This is the R2 score.

In [None]:
# Print the model summary statistics. This is where you evaluate the performance of the model.

print('Slope:' ,regression_2.coef_)
print('Mean squared error: ', rmse_2)
print('R2 score: ', r2_2)

#### Polynomial regression is also not a good fit for the data.

In [None]:
# Run a garbage-collection pass to free objects that are no longer referenced.
gc.collect()