Build a regression model.

In [2]:
import pandas as pd
import statsmodels.api as sm

# Read the CSV file into a DataFrame
df = pd.read_csv('../data/final_df.csv')

# Keep only rows that have both a Yelp and a Foursquare rating. One dropna
# over both columns is equivalent to dropping on each column in turn.
df = df.dropna(subset=['ratingyelp', 'ratingfoursq'])

# Display the first few rows of the cleaned frame. This has to be the last
# expression in the cell: a bare `df.head()` in the middle of a cell is
# evaluated and then discarded, so nothing would be shown.
df.head()


Provide model output and an interpretation of the results. 

In [3]:
# Factor applied to 'ratingyelp' so it is comparable with 'ratingfoursq'
# (Foursquare is on a 1-10 scale).
YELP_RATING_SCALE = 2

# Convert 'distanceyelp', 'ratingyelp', and 'free_bikes' to numeric, coercing
# errors to NaN. This must happen BEFORE any arithmetic: multiplying an
# object (string) column by 2 repeats each string ('4.5' -> '4.54.5')
# instead of doubling the value.
for column in ('distanceyelp', 'ratingyelp', 'free_bikes'):
    df[column] = pd.to_numeric(df[column], errors='coerce')

# Rescale the Yelp rating.
# NOTE: this overwrites 'ratingyelp' in place, so re-running this cell
# without re-running the cell that loads `df` doubles the column again.
df['ratingyelp'] = df['ratingyelp'] * YELP_RATING_SCALE

# Print the data types of the DataFrame columns
print(df.dtypes)


station                    object
latitude                  float64
longitude                 float64
free_bikes                  int64
empty_slots                 int64
total_bike_cap              int64
uid                        object
renting                     int64
returning                   int64
last_updated                int64
has_ebikes                   bool
ebikes                      int64
payment                    object
payment-terminal             bool
slots                       int64
rental_uris.android        object
rental_uris.ios            object
distanceyelp              float64
poiname                    object
location.address           object
ratingyelp                float64
categories                 object
distancefoursq              int64
namefoursq                 object
location.addressfoursq     object
ratingfoursq              float64
popularityfoursq          float64
categoriesfoursq           object
dtype: object


In [4]:
# Quick structural overview of the frame: dimensions, a sample of rows,
# missing-value counts per column, and summary statistics.
print(f"Shape of the dataset: {df.shape}")

print(df.head())

print(df.isnull().sum())

print(df.describe())

# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() would emit an extra "None" line.
df.info()


Shape of the dataset: (2523, 28)
                    station   latitude  longitude  free_bikes  empty_slots  \
0  Fairfax Dr & Wilson Blvd  38.885801 -77.097745           5            6   
1  Fairfax Dr & Wilson Blvd  38.885801 -77.097745           5            6   
2  Fairfax Dr & Wilson Blvd  38.885801 -77.097745           5            6   
3  Fairfax Dr & Wilson Blvd  38.885801 -77.097745           5            6   
4  Fairfax Dr & Wilson Blvd  38.885801 -77.097745           5            6   

   total_bike_cap                                   uid  renting  returning  \
0              11  08251ded-1f3f-11e7-bf6b-3863bb334450        1          1   
1              11  08251ded-1f3f-11e7-bf6b-3863bb334450        1          1   
2              11  08251ded-1f3f-11e7-bf6b-3863bb334450        1          1   
3              11  08251ded-1f3f-11e7-bf6b-3863bb334450        1          1   
4              11  08251ded-1f3f-11e7-bf6b-3863bb334450        1          1   

   last_updated  ...   

In [11]:
# Binary indicator column: 1 where 'categories' is exactly the string 'bar',
# 0 for every other value (including missing ones).
df['is_bar'] = df['categories'].eq('bar').astype('int64')

# Candidate predictors, keeping only the rows where both are present.
X = df[['distanceyelp', 'ratingyelp']].dropna()

# Target variable restricted to the row labels that survived the dropna
# above, so that X and y stay aligned row for row.
y = df.loc[X.index, 'free_bikes']



In [12]:
# Characteristics of POIs used as predictors of the number of free bikes.
# 'is_bar' is expected to have been added to `df` by an earlier cell.
X = df[['ratingfoursq', 'ratingyelp', 'is_bar']]
y = df['free_bikes']

# Add a constant to the independent variables so the model has an intercept
X = sm.add_constant(X)

# Fit the Ordinary Least Squares (OLS) regression model.
# missing='drop' removes any row with a NaN in y or X before fitting. The
# numeric columns are coerced with errors='coerce' earlier in the notebook,
# which can introduce NaNs, and OLS does not check for them by default
# (missing='none'). With no NaNs present the fit is unchanged.
model = sm.OLS(y, X, missing='drop')
results = model.fit()

# Print the summary of the regression results
print(results.summary())

                            OLS Regression Results                            
Dep. Variable:             free_bikes   R-squared:                       0.041
Model:                            OLS   Adj. R-squared:                  0.040
Method:                 Least Squares   F-statistic:                     35.74
Date:                Sun, 20 Oct 2024   Prob (F-statistic):           1.28e-22
Time:                        13:53:34   Log-Likelihood:                -7481.3
No. Observations:                2523   AIC:                         1.497e+04
Df Residuals:                    2519   BIC:                         1.499e+04
Df Model:                           3                                         
Covariance Type:            nonrobust                                         
                   coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------------
const           -2.5858      0.923     -2.801   

# Stretch

How can you turn the regression model into a classification model?