Build a regression model.

In [10]:
import pandas as pd
import statsmodels.api as sm

# Load the cleaned, merged bike-station / Yelp data.
df = pd.read_csv('../data/bike_and_yelp.csv')

# Inspect the first few rows and the column summary.
print(df.head())
# DataFrame.info() writes its report to stdout itself and returns None,
# so wrapping it in print() would emit a stray "None" line.
df.info()

# Define predictors (independent variables) and target (dependent variable).
# Keep them as pandas objects (no .values) so the regression summary labels
# each coefficient with its column name instead of generic x1/x2 and "y".
X = df[['distance', 'rating']]  # Predictors
y = df['free_bikes']            # Target

# Add a constant column so the model estimates an intercept.
X = sm.add_constant(X)

# Fit the ordinary least squares regression model.
model = sm.OLS(y, X).fit()



                                 id          station_name   latitude  \
0  01f5011f76069b1e81a11d4d51dd9d1d  377 - Metro Abrantes  40.380918   
1  01f5011f76069b1e81a11d4d51dd9d1d  377 - Metro Abrantes  40.380918   
2  01f5011f76069b1e81a11d4d51dd9d1d  377 - Metro Abrantes  40.380918   
3  01f5011f76069b1e81a11d4d51dd9d1d  377 - Metro Abrantes  40.380918   
4  01f5011f76069b1e81a11d4d51dd9d1d  377 - Metro Abrantes  40.380918   

   longitude  free_bikes  empty_slots       restaurant_name     distance  \
0  -3.727836           9           14             IGo Pasta  1465.968046   
1  -3.727836           9           14           Botafumeiro   769.947456   
2  -3.727836           9           14                 Lurca    53.496441   
3  -3.727836           9           14             Canaletto   269.769162   
4  -3.727836           9           14  Cafetería Bar Dayton   209.521846   

      address                      categories  rating  review_count  
0  No address    Chinese, Asian Fusion, 

Provide model output and an interpretation of the results. 

In [11]:
# Show the fitted OLS results table (coefficients, p-values, fit statistics)
# as plain text.
print(model.summary().as_text())

                            OLS Regression Results                            
Dep. Variable:                      y   R-squared:                       0.011
Model:                            OLS   Adj. R-squared:                  0.009
Method:                 Least Squares   F-statistic:                     4.617
Date:                Fri, 10 Jan 2025   Prob (F-statistic):             0.0101
Time:                        17:00:22   Log-Likelihood:                -2485.1
No. Observations:                 800   AIC:                             4976.
Df Residuals:                     797   BIC:                             4990.
Df Model:                           2                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const         10.3506      0.610     16.971      0.0

**Interpretation of Results:**

This regression analysis was performed to determine whether restaurant distance and rating are associated with the number of free bikes available. The R-squared value of 0.011 indicates a poor fit: only 1.1% of the variation in free bikes is explained by the two variables. This suggests that other factors, not included in the model, would better explain free bike availability. Looking at the variables individually, restaurant distance has no statistically significant effect (coefficient = -0.0003, p = 0.338), so there is no evidence of a relationship between distance and bike availability. Restaurant rating, however, has a positive and statistically significant effect (coefficient = 0.4603, p = 0.002): each additional rating point is associated with roughly 0.46 more free bikes, holding distance constant. In conclusion, although the effect of restaurant rating is statistically significant, the model's overall explanatory power is very limited, and further exploration with additional predictors is recommended.

# Stretch

How can you turn the regression model into a classification model?

In [12]:
import pandas as pd
import statsmodels.api as sm
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix

# Settings a reader may want to tune, named instead of inlined as magic numbers.
TEST_SIZE = 0.2            # fraction of rows held out for evaluation
RANDOM_STATE = 42          # fixed seed so the train/test split is reproducible
DECISION_THRESHOLD = 0.5   # predicted probability at or above this is class 1

# Load the cleaned, merged bike-station / Yelp data.
df = pd.read_csv('../data/bike_and_yelp.csv')

# Inspect the first few rows and the column summary.
print(df.head())
# DataFrame.info() writes its report to stdout itself and returns None,
# so wrapping it in print() would emit a stray "None" line.
df.info()

# Define the classification target:
# convert 'free_bikes' into a binary variable, 1 if free_bikes >= median, 0 otherwise.
threshold = df['free_bikes'].median()
df['bike_availability'] = (df['free_bikes'] >= threshold).astype(int)

# Define predictors (independent variables) and target (dependent variable).
X = df[['distance', 'rating']]  # Predictors
y = df['bike_availability']     # Target (binary)

# Add a constant column so the model estimates an intercept.
X = sm.add_constant(X)

# Split the data into training and test sets.
# NOTE(review): the preview of the data shows the same station repeated on
# several rows (one per restaurant); a row-level random split may therefore put
# rows from one station in both sets. Consider a group-wise split - confirm.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=TEST_SIZE, random_state=RANDOM_STATE
)

# Fit a logistic regression model using Statsmodels.
logit_model = sm.Logit(y_train, X_train).fit()

# Print the logistic regression summary.
print(logit_model.summary())

# Predict class-1 probabilities on the test set.
y_pred_prob = logit_model.predict(X_test)

# Convert probabilities to binary predictions.
y_pred = (y_pred_prob >= DECISION_THRESHOLD).astype(int)

# Evaluate the model.
print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred))

print("\nClassification Report:")
# zero_division=0 reports 0 (without a warning) for any class the model never
# predicts, which can happen when the predictors carry little signal.
print(classification_report(y_test, y_pred, zero_division=0))


                                 id          station_name   latitude  \
0  01f5011f76069b1e81a11d4d51dd9d1d  377 - Metro Abrantes  40.380918   
1  01f5011f76069b1e81a11d4d51dd9d1d  377 - Metro Abrantes  40.380918   
2  01f5011f76069b1e81a11d4d51dd9d1d  377 - Metro Abrantes  40.380918   
3  01f5011f76069b1e81a11d4d51dd9d1d  377 - Metro Abrantes  40.380918   
4  01f5011f76069b1e81a11d4d51dd9d1d  377 - Metro Abrantes  40.380918   

   longitude  free_bikes  empty_slots       restaurant_name     distance  \
0  -3.727836           9           14             IGo Pasta  1465.968046   
1  -3.727836           9           14           Botafumeiro   769.947456   
2  -3.727836           9           14                 Lurca    53.496441   
3  -3.727836           9           14             Canaletto   269.769162   
4  -3.727836           9           14  Cafetería Bar Dayton   209.521846   

      address                      categories  rating  review_count  
0  No address    Chinese, Asian Fusion, 