In [1]:
# Libraries
import os
import numpy as np
import pandas as pd
import scipy.stats as stats
import statsmodels.api as sm
import matplotlib.pyplot as plt
from sklearn import linear_model
import seaborn as sns
from sklearn.model_selection import train_test_split
import mysql.connector

# Ignore warnings
import warnings
warnings.filterwarnings('ignore')

ModuleNotFoundError: No module named 'numpy'

In [None]:
def get_data(query):
    """Run ``query`` against the configured MySQL database and return a DataFrame.

    Connection settings are read from the environment variables MYSQL_HOST,
    MYSQL_PORT, MYSQL_DBNAME, MYSQL_USER and MYSQL_PASSWORD; each falls back
    to the default given below when unset.

    Parameters
    ----------
    query : str
        SQL statement to execute.

    Returns
    -------
    pandas.DataFrame or None
        One row per fetched record, with columns named after the cursor's
        ``column_names``. ``None`` is returned (after printing a message) when
        the connection is not established or a ``mysql.connector.Error`` occurs.
    """
    host = os.environ.get("MYSQL_HOST", "localhost")
    port = os.environ.get("MYSQL_PORT", "3306")
    dbname = os.environ.get("MYSQL_DBNAME", "test")
    user = os.environ.get("MYSQL_USER", "root")
    password = os.environ.get("MYSQL_PASSWORD", "")

    # Track both handles so the ``finally`` clause can release whatever was opened.
    conn = None
    cursor = None
    try:
        conn = mysql.connector.connect(
            host=host,
            port=port,
            user=user,
            password=password,
            database=dbname
        )
        if not conn.is_connected():
            print("Connection to MySQL database failed.")
            return None

        cursor = conn.cursor()
        cursor.execute(query)

        # Fetch the results into a Pandas DataFrame
        result = cursor.fetchall()
        return pd.DataFrame(result, columns=cursor.column_names)
    except mysql.connector.Error as err:
        print(f"Error: {err}")
        return None
    finally:
        # Previously neither the cursor nor the connection was closed, so every
        # call leaked a server connection.
        if cursor is not None:
            cursor.close()
        if conn is not None:
            conn.close()

In [None]:
# Number of chargers per canton, aggregated on the database side
query = "SELECT canton, COUNT(*) AS count FROM charger_data_with_canton GROUP BY canton ORDER BY canton ASC"
df_canton_count = get_data(query)

# Rename 'Fribourg' to 'Freiburg' in the charger counts before merging on 'canton'
df_canton_count['canton'] = df_canton_count['canton'].replace('Fribourg', 'Freiburg')

# Canton-level data table
query = "SELECT * FROM canton_data ORDER By canton ASC"
df_canton_data = get_data(query)

print(df_canton_count)
print(df_canton_data)

# Combine both tables on the canton name (default inner join)
df = df_canton_data.merge(df_canton_count, on='canton')

df

In [None]:
# Split into 80% train / 20% test; the fixed random_state keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    df['inhabitants'],
    df['count'],
    test_size=0.20,
    random_state=42,
)

# Preview the first rows of the predictor (followed by a blank line) ...
print('X_train:', f'{X_train.head()} \n', sep='\n')

# ... and of the target
print('y_train:', y_train.head(), sep='\n')

In [None]:
# Ordinary least-squares fit of charger count on inhabitants (training data only)
slope, intercept, r, p, std_err = stats.linregress(X_train, y_train)

# Collect the summary lines first, then emit them in one go
report = [
    'Linear regression result:',
    f'Intercept with y-axis (alpha):            {intercept:.2f}',
    f'Slope of regression line (beta):          {slope:.3f}',
    f'p-value:                                  {p:.4f}',
    f'R-squared (coefficient of determination): {r**2:.4f}',
]
print('\n'.join(report))

In [None]:
# Function to calculate model predictions
def myfunc(x):
    """Return the fitted line's prediction ``slope * x + intercept``.

    ``slope`` and ``intercept`` are the module-level values produced by the
    most recent ``stats.linregress`` call. ``x`` may be a scalar or a pandas
    Series; for a Series the result keeps the same index.
    """
    return slope * x + intercept

# Apply myfunc() to the whole training predictor at once. Unlike
# pd.Series(map(myfunc, X_train)), this keeps X_train's index, so the
# predictions stay aligned row-by-row with y_train in later arithmetic.
mymodel = myfunc(X_train)

# Scatterplot with regression line
plt.figure(figsize=(6,4))
plt.scatter(X_train, y_train, s=10, color='green')
plt.plot(X_train, mymodel, color='darkred', linestyle='dashed')
plt.title('Simple Linear Regression')
plt.xlabel('inhabitants')
plt.ylabel('count of charging stations')

plt.show()

# Interpretation
Looking at the regression results, the p-value is 0.0000 (i.e. smaller than 0.0001), which means that the relationship between the number of inhabitants and the number of charging stations is statistically significant. The R-squared value is 0.9194, which means that 91.94% of the variance in the number of charging stations can be explained by the number of inhabitants in a canton. Our beta is 0.001, which can be interpreted as follows: for each additional inhabitant, the model predicts 0.001 more charging stations (roughly one additional charging station per 1,000 inhabitants).


In [None]:
# Calculate model residuals for train data
print(mymodel)
print(y_train)

# Subtract positionally: y_train carries the shuffled index from
# train_test_split, while mymodel may carry a fresh 0..n-1 index, so a plain
# Series - Series would align on labels and pair up the wrong rows (or yield NaN).
residuals = y_train - mymodel.to_numpy()
print(residuals)

# Check the first residual value in our data set
# (labels corrected: the values are charger counts, not prices)
print(f'1st Predicted charger count in dataset: {mymodel.iloc[0]:.2f}')
print(f'1st Observed charger count in dataset: {y_train.iloc[0]:.2f}')
print(f'1st Residual charger count in dataset: {residuals.iloc[0]:.2f}')

In [None]:
# Distribution of the training residuals, drawn with the object-oriented Matplotlib API
fig, ax = plt.subplots(figsize=(7, 4))
n, bins, patches = ax.hist(residuals, bins=5, color='blue', alpha=0.5)

# Axis labels and title
ax.set_xlabel('residuals', fontsize=10, labelpad=10)
ax.set_ylabel('frequency', fontsize=10, labelpad=10)
ax.set_title('Histogram of model residuals', fontsize=12, pad=10)

# Render the figure
plt.show()

In [None]:
# Horizontal boxplot of the training residuals on a wide, flat canvas;
# 'plain' tick formatting disables scientific notation on the axes.
fig, ax = plt.subplots(figsize=(8, 1.2))
ax.ticklabel_format(style='plain')
sns.boxplot(x=residuals, color="greenyellow", ax=ax)

In [None]:
# Create model predictions for test data
predicted = myfunc(X_test)

# Compare the observed charger counts with the predicted ones. The former
# `predicted.round(1)` discarded its result and had no effect; the ':.1f'
# format below already rounds for display. zip() pairs the values by position
# and stops at the shorter sequence, replacing the manual min-length index loop.
for observed, estimate in zip(y_test, predicted):
    print(f'Observed count of chargers: {observed:.1f}, Predicted count of chargers: {estimate:.1f}')

# Now we want to check how the regression looks without the urban cantons
We define urban cantons as those with a population density (popDens) higher than 900.

In [None]:
# Restrict the canton table to rows flagged as rural
query = "SELECT * FROM canton_data WHERE rural = TRUE ORDER BY canton ASC"
df_canton_data = get_data(query)

# Attach the per-canton charger counts (default inner join on the canton name)
df_rural = df_canton_data.merge(df_canton_count, on='canton')

df_rural

In [None]:
# Split the rural subset into 80% train / 20% test with a fixed random_state
X_train, X_test, y_train, y_test = train_test_split(
    df_rural['inhabitants'],
    df_rural['count'],
    test_size=0.20,
    random_state=42,
)

# Preview the first rows of the predictor (followed by a blank line) ...
print('X_train:', f'{X_train.head()} \n', sep='\n')

# ... and of the target
print('y_train:', y_train.head(), sep='\n')

In [None]:
# Ordinary least-squares fit on the current training split
slope, intercept, r, p, std_err = stats.linregress(X_train, y_train)

# Collect the summary lines first, then emit them in one go
report = [
    'Linear regression result:',
    f'Intercept with y-axis (alpha):            {intercept:.2f}',
    f'Slope of regression line (beta):          {slope:.3f}',
    f'p-value:                                  {p:.4f}',
    f'R-squared (coefficient of determination): {r**2:.4f}',
]
print('\n'.join(report))

In [None]:
# Function to calculate model predictions
def myfunc(x):
    """Return the fitted line's prediction ``slope * x + intercept``.

    ``slope`` and ``intercept`` are the module-level values produced by the
    most recent ``stats.linregress`` call. ``x`` may be a scalar or a pandas
    Series; for a Series the result keeps the same index.
    """
    return slope * x + intercept

# Apply myfunc() to the whole training predictor at once. Unlike
# pd.Series(map(myfunc, X_train)), this keeps X_train's index, so the
# predictions stay aligned row-by-row with y_train in later arithmetic.
mymodel = myfunc(X_train)

# Scatterplot with regression line
plt.figure(figsize=(6,4))
plt.scatter(X_train, y_train, s=10, color='green')
plt.plot(X_train, mymodel, color='darkred', linestyle='dashed')
plt.title('Simple Linear Regression')
# Axis labels were swapped: inhabitants are on the x-axis, charger counts on the y-axis.
plt.xlabel('inhabitants')
plt.ylabel('count of charging stations')

plt.show()

In [None]:
# Calculate model residuals for train data
print(mymodel)
print(y_train)

# Subtract positionally: y_train carries the shuffled index from
# train_test_split, while mymodel may carry a fresh 0..n-1 index, so a plain
# Series - Series would align on labels and pair up the wrong rows (or yield NaN).
residuals_rural = y_train - mymodel.to_numpy()

# Report the residuals just computed; the earlier code printed and displayed
# the stale `residuals` variable left over from the first model.
print(residuals_rural)

# Check the first residual value in our data set
# (labels corrected: the values are charger counts, not prices)
print(f'1st Predicted charger count in dataset: {mymodel.iloc[0]:.2f}')
print(f'1st Observed charger count in dataset: {y_train.iloc[0]:.2f}')
print(f'1st Residual charger count in dataset: {residuals_rural.iloc[0]:.2f}')
residuals_rural

In [None]:
# Plot histogram of the residuals computed in the preceding cell. This used to
# plot `residuals`, i.e. the residuals of the first model, instead of
# `residuals_rural`.
fig = plt.figure( figsize=(7,4))
n, bins, patches = plt.hist(x=residuals_rural,
                            bins=5,
                            color='blue',
                            alpha=0.5
                   )

# Set title and labels
plt.xlabel('residuals', fontsize=10, labelpad=10)
plt.ylabel('frequency', fontsize=10, labelpad=10)
plt.title('Histogram of model residuals', fontsize=12, pad=10)

# Show plot
plt.show()

In [None]:
# Create model predictions for test data
predicted = myfunc(X_test)

# Compare the observed charger counts with the predicted ones. The former
# `predicted.round(1)` discarded its result and had no effect; the ':.1f'
# format below already rounds for display. zip() pairs the values by position
# and stops at the shorter sequence, replacing the manual min-length index loop.
for observed, estimate in zip(y_test, predicted):
    print(f'Observed count of chargers: {observed:.1f}, Predicted count of chargers: {estimate:.1f}')

In [None]:
# Horizontal boxplot of residuals_rural on a wide, flat canvas;
# 'plain' tick formatting disables scientific notation on the axes.
fig, ax = plt.subplots(figsize=(8, 1.2))
ax.ticklabel_format(style='plain')
sns.boxplot(x=residuals_rural, color="greenyellow", ax=ax)