Build a regression model.

In [113]:
import pandas as pd
import statsmodels.api as sm
import statsmodels.formula.api as smf

# Load the merged dataset (bike-station fields plus Foursquare and Yelp POI
# fields) into a DataFrame.
df = pd.read_csv('../data/final_df.csv')

# Preview the first few rows. Only the last bare expression of a cell is
# rendered, so head() has to be the final line; ending the cell with `df`
# discarded the preview and rendered the whole frame instead.
df.head()

Unnamed: 0,station,distance_foursq,name_foursq,location.address,rating_foursq,popularityfoursq,categories,latitude_foursq,longitude_foursq,station_yelp,...,renting,returning,last_updated,has_ebikes,ebikes,payment,payment-terminal,slots.1,rental_uris.android,rental_uris.ios
0,Fairfax Dr & Wilson Blvd,97,The Liberty Tavern,3195 Wilson Blvd,8.5,0.982103,bar,38.886352,-77.096879,Fairfax Dr & Wilson Blvd,...,1,1,1729555616,True,0,"['key', 'creditcard']",True,12,https://dc.lft.to/lastmile_qr_scan,https://dc.lft.to/lastmile_qr_scan
1,Fairfax Dr & Wilson Blvd,442,Screwtop Wine Bar,1025 N Fillmore St,8.5,0.965967,bar,38.885366,-77.092691,Fairfax Dr & Wilson Blvd,...,1,1,1729555616,True,0,"['key', 'creditcard']",True,12,https://dc.lft.to/lastmile_qr_scan,https://dc.lft.to/lastmile_qr_scan
2,Fairfax Dr & Wilson Blvd,157,Don Tito,3165 Wilson Blvd,7.3,0.988598,bar,38.886695,-77.096409,Fairfax Dr & Wilson Blvd,...,1,1,1729555616,True,0,"['key', 'creditcard']",True,12,https://dc.lft.to/lastmile_qr_scan,https://dc.lft.to/lastmile_qr_scan
3,Fairfax Dr & Wilson Blvd,311,East West Coffee Wine,3101 Wilson Blvd,7.5,0.973212,bar,38.887544,-77.09536,Fairfax Dr & Wilson Blvd,...,1,1,1729555616,True,0,"['key', 'creditcard']",True,12,https://dc.lft.to/lastmile_qr_scan,https://dc.lft.to/lastmile_qr_scan
4,Fairfax Dr & Wilson Blvd,460,Wilson Hardware Kitchen & Bar,2915 Wilson Blvd,7.3,0.967093,bar,38.888531,-77.093736,Fairfax Dr & Wilson Blvd,...,1,1,1729555616,True,0,"['key', 'creditcard']",True,12,https://dc.lft.to/lastmile_qr_scan,https://dc.lft.to/lastmile_qr_scan


Provide model output and an interpretation of the results. 

In [112]:
# Yelp ratings are doubled so they are comparable with the Foursquare ratings,
# which are on a 10-point scale.
YELP_RATING_SCALE = 2

# Coerce to numeric BEFORE scaling: multiplying an object (string) column by 2
# repeats the text ('4.5' -> '4.54.5') instead of doubling the value.
# The unscaled values are stored once in 'rating_yelp_raw' so that re-running
# this cell recomputes the same result instead of doubling the rating again.
if 'rating_yelp_raw' not in df.columns:
    df['rating_yelp_raw'] = pd.to_numeric(df['rating_yelp'], errors='coerce')
df['rating_yelp'] = df['rating_yelp_raw'] * YELP_RATING_SCALE

# Convert 'distance_yelp' and 'free_bikes' to numeric; values that cannot be
# parsed become NaN.
df['distance_yelp'] = pd.to_numeric(df['distance_yelp'], errors='coerce')
df['free_bikes'] = pd.to_numeric(df['free_bikes'], errors='coerce')

# Print the data types of the DataFrame columns to confirm the conversions
print(df.dtypes)


station                 object
distance_foursq          int64
name_foursq             object
location.address        object
rating_foursq          float64
popularityfoursq       float64
categories              object
latitude_foursq        float64
longitude_foursq       float64
station_yelp            object
distance_yelp          float64
name_yelp               object
rating_yelp            float64
latitude_yelp          float64
longitude_yelp         float64
latitude               float64
longitude              float64
free_bikes               int64
empty_slots              int64
slots                  float64
uid                     object
renting                  int64
returning                int64
last_updated             int64
has_ebikes                bool
ebikes                   int64
payment                 object
payment-terminal          bool
slots                    int64
rental_uris.android     object
rental_uris.ios         object
dtype: object


In [98]:
# Quick structural overview of the dataset before modelling.
print(f"Shape of the dataset: {df.shape}")

# First rows
print(df.head())

# Number of missing values in each column
print(df.isnull().sum())

# Summary statistics
print(df.describe())

# DataFrame.info() writes its report itself and returns None, so wrapping it
# in print() added a stray "None" line after the report.
df.info()


Shape of the dataset: (145, 32)
                    station  distance_foursq                    name_foursq  \
0  Fairfax Dr & Wilson Blvd               97             The Liberty Tavern   
1  Fairfax Dr & Wilson Blvd              442              Screwtop Wine Bar   
2  Fairfax Dr & Wilson Blvd              157                       Don Tito   
3  Fairfax Dr & Wilson Blvd              311          East West Coffee Wine   
4  Fairfax Dr & Wilson Blvd              460  Wilson Hardware Kitchen & Bar   

     location.address  rating_foursq  popularityfoursq categories  \
0    3195 Wilson Blvd            8.5          0.982103        bar   
1  1025 N Fillmore St            8.5          0.965967        bar   
2    3165 Wilson Blvd            7.3          0.988598        bar   
3    3101 Wilson Blvd            7.5          0.973212        bar   
4    2915 Wilson Blvd            7.3          0.967093        bar   

   latitude_foursq  longitude_foursq              station_yelp  ...  \
0      

In [117]:
# Select features and drop rows with missing values in any of them.
# NOTE(review): this cell uses 'slots' while the OLS cell further down uses
# 'slots.1' - confirm which column is intended.
X = df[['distance_yelp', 'rating_yelp', 'slots']].dropna()

# Numeric flag for the category: 1 when the category is 'bar', 0 for anything
# else (school, per the original note). A vectorized comparison replaces the
# row-by-row apply/lambda and yields the same 0/1 integers.
df['is_bar'] = df['categories'].eq('bar').astype('int64')

# Align the target variable y with the rows kept in X, using a single
# label-based .loc lookup instead of chained indexing.
y = df.loc[X.index, 'free_bikes']



In [132]:
# Characteristics of POIs (ratings, distance) plus station size, used as
# predictors of the number of free bikes.
X = df[['rating_foursq', 'rating_yelp', 'distance_yelp', 'slots.1']]
y = df['free_bikes']

# Add a constant (intercept) column to the independent variables
X = sm.add_constant(X)

# Fit the Ordinary Least Squares (OLS) regression model.
# missing='drop' excludes rows containing NaN in y or X. The numeric coercion
# earlier in the notebook (errors='coerce') can introduce NaN, which would
# otherwise make the fit fail or produce NaN estimates. With no missing
# values the fit is unchanged.
model = sm.OLS(y, X, missing='drop')
results = model.fit()

# Print the summary of the regression results
print(results.summary())

                            OLS Regression Results                            
Dep. Variable:             free_bikes   R-squared:                       0.488
Model:                            OLS   Adj. R-squared:                  0.473
Method:                 Least Squares   F-statistic:                     33.36
Date:                Mon, 21 Oct 2024   Prob (F-statistic):           1.57e-19
Time:                        18:09:21   Log-Likelihood:                -384.73
No. Observations:                 145   AIC:                             779.5
Df Residuals:                     140   BIC:                             794.3
Df Model:                           4                                         
Covariance Type:            nonrobust                                         
                    coef    std err          t      P>|t|      [0.025      0.975]
---------------------------------------------------------------------------------
const            -1.6389      3.406     -0.481

In [129]:
# Build a narrow working copy holding only the columns used by the
# classification model. 'slots.1' is renamed to 'slots1', which is the name
# the logit formula in the following cell refers to.
simpledf = (
    df[[
        'distance_foursq',
        'distance_yelp',
        'rating_yelp',
        'rating_foursq',
        'free_bikes',
        'categories',
        'is_bar',
        'slots.1',
    ]]
    .copy()
    .rename(columns={'slots.1': 'slots1'})
)
simpledf.head()

Unnamed: 0,distance_foursq,distance_yelp,rating_yelp,rating_foursq,free_bikes,categories,is_bar,slots1
0,97,99.811027,3.8,8.5,2,bar,1,12
1,442,434.646931,4.1,8.5,2,bar,1,12
2,157,154.34178,3.6,7.3,2,bar,1,12
3,311,297.930279,4.0,7.5,2,bar,1,12
4,460,463.276488,3.7,7.3,2,bar,1,12


In [131]:
# Logistic regression: model the 0/1 'is_bar' flag from the POI distances and
# ratings, the number of free bikes and the station slot count.
logit = smf.logit(
    formula="is_bar ~ distance_yelp + distance_foursq + rating_yelp + rating_foursq + free_bikes + slots1",
    data=simpledf,
)

# Note: `model` holds the fitted results here, not the unfitted model object.
# NOTE(review): the saved output shows an intercept standard error larger than
# the coefficient itself - check the balance of the two 'is_bar' classes.
model = logit.fit()

# Keep the summary table in its own variable, then print it
print_model = model.summary()
print(print_model)

Optimization terminated successfully.
         Current function value: 0.023077
         Iterations 14
                           Logit Regression Results                           
Dep. Variable:                 is_bar   No. Observations:                  145
Model:                          Logit   Df Residuals:                      138
Method:                           MLE   Df Model:                            6
Date:                Mon, 21 Oct 2024   Pseudo R-squ.:                  0.6829
Time:                        18:08:32   Log-Likelihood:                -3.3462
converged:                       True   LL-Null:                       -10.553
Covariance Type:            nonrobust   LLR p-value:                   0.02534
                      coef    std err          z      P>|z|      [0.025      0.975]
-----------------------------------------------------------------------------------
Intercept         -45.2530     48.160     -0.940      0.347    -139.644      49.138
distance_yelp

# Stretch

How can you turn the regression model into a classification model?