Build a regression model.

In [1]:
import pandas as pd
import sqlite3
from sqlite3 import Error
import statsmodels.api as sm

In [2]:
def create_connection(path):
    """Open a SQLite database and report the outcome on stdout.

    Args:
        path: Location of the database, passed straight to ``sqlite3.connect``.

    Returns:
        The open ``sqlite3.Connection``, or ``None`` when ``sqlite3.connect``
        raised a ``sqlite3.Error`` (the error text is printed, not re-raised).
    """
    try:
        connection = sqlite3.connect(path)
    except Error as e:
        # Best-effort: surface the problem to the reader and hand back None
        # so the caller decides what to do next.
        print(f"The error '{e}' occurred")
        return None

    print("Connection to SQLite DB successful")
    return connection

In [3]:
# Open the project database. create_connection prints a status line and
# returns None if the connection attempt raised a sqlite3 error.
connection = create_connection("sm_app.sqlite")

Connection to SQLite DB successful


In [4]:
# Load every row of the `pois` table into a DataFrame.
df = pd.read_sql('SELECT * FROM pois', connection)
# Mean-impute the two review fields so rows with missing values are kept.
values = {column: df[column].mean() for column in ('rating', 'review count')}
df = df.fillna(values)
df

Unnamed: 0,index,station name,name,address,category,rating,review count,empty slots,available bikes,latitude,longitude
0,0,01. Curtatone,Pizzeria Alle Scalette,Viale Curtatone 18,Pizzeria,4.235426,20.744395,16,0,43.32160,11.327948
1,1,01. Curtatone,Piazza Giacomo Matteotti,Piazza Giacomo Matteotti,Plaza,4.235426,20.744395,16,0,43.32160,11.327948
2,2,01. Curtatone,Basilica di San Domenico,Piazza San Domenico,Church,4.235426,20.744395,16,0,43.32160,11.327948
3,3,01. Curtatone,Caffè La Piazzetta,Via Montanini 52,"Bar, Café, Italian Restaurant",4.235426,20.744395,16,0,43.32160,11.327948
4,4,01. Curtatone,Consorzio Agrario di Siena,Via Pianigiani Giuseppe 9,Grocery Store,4.235426,20.744395,16,0,43.32160,11.327948
...,...,...,...,...,...,...,...,...,...,...,...
408,408,19. Petriccio,la veranda,Piazza Calabria,Italian,4.500000,6.000000,9,1,43.33464,11.304803
409,409,19. Petriccio,Mc Donald's,Via Fiorentina 124,"Fast Food, Burgers",2.000000,1.000000,9,1,43.33464,11.304803
410,410,19. Petriccio,Fontebecci,Via Fiorentina 133,"Pizza, Italian",2.000000,2.000000,9,1,43.33464,11.304803
411,411,19. Petriccio,Bar Cecco,Viale Camillo Benso Conte Di Cavour,"Cafes, Fast Food, Pizza, Coffee & Tea",3.000000,3.000000,9,1,43.33464,11.304803


In [5]:
# Response variable: POI rating. Predictor: number of available bikes.
y = df['rating']
# add_constant adds the intercept column (`const`) alongside the predictor.
X = sm.add_constant(df['available bikes'])

lin_reg = sm.OLS(y, X)

Provide model output and an interpretation of the results. 

In [6]:
# Fit the OLS model specified in the previous cell (rating ~ available bikes).
model = lin_reg.fit()
# Build the summary table (R-squared, F-statistic, coefficient estimates).
print_model = model.summary()
# print() emits the plain-text rendering of the summary.
print(print_model)

                            OLS Regression Results                            
Dep. Variable:                 rating   R-squared:                       0.000
Model:                            OLS   Adj. R-squared:                 -0.002
Method:                 Least Squares   F-statistic:                    0.1582
Date:                Mon, 14 Aug 2023   Prob (F-statistic):              0.691
Time:                        20:35:51   Log-Likelihood:                -350.23
No. Observations:                 413   AIC:                             704.5
Df Residuals:                     411   BIC:                             712.5
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                      coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------------------------------------------------
const               4.2431      0.034    1

In [None]:
# R-squared is 0.000: the model explains none of the variation in rating around its mean.
# The F-test p-value is 0.691, so the number of available bikes is not a statistically
# significant predictor of rating.

In [7]:
# Response variable: POI review count. Predictor: number of available bikes.
y = df['review count']
# add_constant adds the intercept column (`const`) alongside the predictor.
X = sm.add_constant(df['available bikes'])

lin_reg = sm.OLS(y, X)

In [8]:
# Fit the OLS model specified in the previous cell (review count ~ available bikes).
model = lin_reg.fit()
# Build the summary table (R-squared, F-statistic, coefficient estimates).
print_model = model.summary()
# print() emits the plain-text rendering of the summary.
print(print_model)

                            OLS Regression Results                            
Dep. Variable:           review count   R-squared:                       0.007
Model:                            OLS   Adj. R-squared:                  0.004
Method:                 Least Squares   F-statistic:                     2.804
Date:                Mon, 14 Aug 2023   Prob (F-statistic):             0.0948
Time:                        20:37:21   Log-Likelihood:                -1822.4
No. Observations:                 413   AIC:                             3649.
Df Residuals:                     411   BIC:                             3657.
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                      coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------------------------------------------------
const              21.8897      1.199     

In [None]:
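# R-squared is 0.007, so the model explains less than 1% of the variation in review count;
# its predictive ability is practically zero and it should not be used for forecasting.
# The F-test p-value is 0.095 (> 0.05), so we cannot conclude that the number of available
# bikes has a linear effect on review count.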

# Stretch

How can you turn the regression model into a classification model?