Build a regression model.

In [None]:
import json
import os
import sqlite3

import pandas as pd
import requests
import statsmodels.api as sm


def get_bike_stations(city, timeout=10):
    """Look up a city's bike-share network on CityBikes and return its stations.

    Args:
        city: City name, compared for exact equality against each network's
            ``location.city`` value (e.g. ``"Boise, ID"``).
        timeout: Seconds to wait on each HTTP request before giving up.

    Returns:
        A ``(stations, latitude, longitude)`` tuple. ``stations`` is the list
        found under ``network.stations`` for the first matching network, and
        the coordinates are that network's ``location`` values. Returns
        ``(None, None, None)`` when no network matches ``city``.

    Raises:
        requests.HTTPError: If either API call answers with an error status.
        requests.Timeout: If either API call exceeds ``timeout``.
    """
    # Step 1: fetch the catalogue of all networks.
    networks_url = "http://api.citybik.es/v2/networks"
    response = requests.get(networks_url, timeout=timeout)
    # Fail loudly on an HTTP error instead of hitting a KeyError on the body.
    response.raise_for_status()
    networks = response.json()['networks']

    # Step 2: pick the first network whose city matches exactly.
    match = next(
        (network for network in networks if network['location']['city'] == city),
        None,
    )
    if match is None:
        return None, None, None

    location = match['location']
    latitude = location["latitude"]
    longitude = location["longitude"]

    # Step 3: fetch the network's detail record, which carries the stations.
    network_url = f"http://api.citybik.es/v2/networks/{match['id']}"
    response = requests.get(network_url, timeout=timeout)
    response.raise_for_status()

    # Step 4: extract the station list.
    bike_stations = response.json()['network']['stations']
    return bike_stations, latitude, longitude

# City whose bike-share network is analysed; it is matched exactly against
# the CityBikes ``location.city`` value.
city = "Boise, ID"

# Fetch the stations together with the network's latitude and longitude.
stations, latitude, longitude = get_bike_stations(city)

# Without stations there is nothing to analyse, and latitude/longitude are
# None, so stop here. (A bare ``exit`` is only a name lookup and does not
# stop anything; raising SystemExit actually halts the script/cell.)
if not stations:
    print(f"No bike stations found in {city}.")
    raise SystemExit(1)

# One row per station.
cityb_df = pd.DataFrame(stations)
# Constant key used by the later join on 'dummy'.
cityb_df['dummy'] = 1


# Foursquare Places request centred on the network's coordinates, asking only
# for the name, geocodes and rating fields.
fs_url = f"https://api.foursquare.com/v3/places/nearby?fields=name%2Cgeocodes%2Crating&ll={latitude}%2C{longitude}&query=bar%2Crestaurant"

# NOTE(review): the API key is committed in source. Set FOURSQUARE_API_KEY in
# the environment to override it; the literal is kept only as a fallback so
# existing runs keep working. It should be rotated and removed.
headers = {
    "accept": "application/json",
    "Authorization": os.environ.get(
        "FOURSQUARE_API_KEY",
        "fsq3U/xQ8qlJZQAi3EjjyTPks7IoOAF+oLcXuhLx0EIxCz8=",
    ),
}

fs_response = requests.get(fs_url, headers=headers, timeout=10)
# Surface auth/quota failures here rather than as a KeyError further down.
fs_response.raise_for_status()
fs_data = fs_response.json()

# Flatten the list under 'results' straight into columns (nested keys become
# dotted names such as 'geocodes.main.latitude'). Other top-level keys in the
# payload, if any, are ignored.
foursquare_df = pd.json_normalize(fs_data.get('results', []))

# These geocode variants are not needed; errors='ignore' because they only
# appear as columns when at least one returned place carries them.
fieldstodrop = [
    'geocodes.drop_off.longitude',
    'geocodes.drop_off.latitude',
    'geocodes.roof.longitude',
    'geocodes.roof.latitude',
]
foursquare_df = foursquare_df.drop(columns=fieldstodrop, errors='ignore')
# Constant key used by the later join on 'dummy'.
foursquare_df['dummy'] = 1

# Yelp business search around the network's coordinates (1000 m radius,
# at most 20 results, best-match ordering).
y_url = f"https://api.yelp.com/v3/businesses/search?latitude={latitude}&longitude={longitude}&term=restaurants%2Cbars&radius=1000&sort_by=best_match&limit=20"

# NOTE(review): the bearer token is committed in source. Set YELP_API_KEY in
# the environment to override it; the literal is kept only as a fallback so
# existing runs keep working. It should be rotated and removed.
headers = {
    "accept": "application/json",
    "Authorization": "Bearer " + os.environ.get(
        "YELP_API_KEY",
        "EbdeJbOFX9nyXhkUSSncvvspnB3i0Z1rf4cpvx2_tEwddXnY0iUOgY-QrpeZ9W9CRzpTjic4nQx-zhS269NNyxe9v844HXuuEzXy4h4_ciDoktu9YET_meHYYe-fZHYx",
    ),
}

y_response = requests.get(y_url, headers=headers, timeout=10)
# Surface auth/quota failures here rather than as a KeyError on 'businesses'.
y_response.raise_for_status()

y_data = y_response.json()
businesses = y_data['businesses']

# One [name, rating, latitude, longitude] row per business. Built in a
# comprehension so the module-level ``latitude``/``longitude`` (the city's
# coordinates) are not overwritten by per-business values.
data = [
    [
        business['name'],
        business['rating'],
        business['coordinates']['latitude'],
        business['coordinates']['longitude'],
    ]
    for business in businesses
]

y_df = pd.DataFrame(data, columns=['Name', 'y_Rating', 'y_latitude', 'y_longitude'])
# Constant key used by the later join on 'dummy'.
y_df['dummy'] = 1
# Join the CityBikes, Foursquare and Yelp tables and store the result in SQLite.


def _serialize_nested(frame):
    """JSON-encode dict and list cells of ``frame`` in place.

    Every cell is inspected (not just the first row), so empty frames and
    columns whose first value is missing are handled; other values are left
    untouched.
    """
    def is_nested(value):
        return isinstance(value, (dict, list))

    for col in frame.columns:
        if frame[col].map(is_nested).any():
            frame[col] = frame[col].map(
                lambda value: json.dumps(value) if is_nested(value) else value
            )


# Nested values are turned into JSON strings before the frames are joined
# and written to the database.
_serialize_nested(cityb_df)
_serialize_nested(foursquare_df)

# Outer-join all three frames on the constant 'dummy' key, then drop the key.
# NOTE(review): because every row carries the same key, this is a Cartesian
# product (each station is paired with every Foursquare place and every Yelp
# business), not a station-to-nearby-venue match.
joined_df = (
    cityb_df
    .merge(foursquare_df, on='dummy', how='outer')
    .merge(y_df, on='dummy', how='outer')
    .drop(columns='dummy')
)

# Store the joined DataFrame in an SQLite database; the connection is closed
# even if the write fails.
conn = sqlite3.connect('bike_stations.db')
try:
    joined_df.to_sql('bike_stations', conn, if_exists='replace', index=False)
finally:
    conn.close()

# Fill gaps in the two modelling columns with 0 before fitting
# (0 is a placeholder; another imputation value may be more appropriate).
joined_df = joined_df.assign(
    free_bikes=joined_df['free_bikes'].fillna(0),
    rating=joined_df['rating'].fillna(0),
)

# Response variable and the single explanatory variable.
target = 'free_bikes'
features = ['rating']

# Design matrix gets an explicit intercept column; y is the raw target series.
y = joined_df[target]
X = sm.add_constant(joined_df[features])

# Ordinary least squares: free_bikes ~ const + rating.
model = sm.OLS(y, X)
result = model.fit()

if result:
    print("OLS regression model built")


Provide model output and an interpretation of the results. 

In [None]:
# Evaluate the model: print the statsmodels summary table for the fitted OLS
# result (``result`` comes from the model-building cell above).
print(result.summary())

# Interpretation (author's reading of the summary printed above):
# the rating variable does not appear to have a significant impact on the
# number of free bikes. The model does not explain a meaningful amount of
# variation in the dependent variable, and the coefficient and its p-value
# suggest that rating is not statistically significant.
#
# NOTE(review): the joined table is built by merging on a constant 'dummy'
# key, which pairs every station with every venue. In such a table
# free_bikes varies only by station and rating only by venue, so the two are
# uncorrelated by construction. A near-zero slope and R-squared are therefore
# expected from this data layout and say nothing about whether nearby venue
# ratings relate to bike availability.

# Stretch

How can you turn the regression model into a classification model?

In [None]:
# To convert the above regression model into a classification model, I can:
# 1. First, classify the data into two main classes, "High Availability" and "Low Availability".
#    Based on this classification, I would also set a threshold parameter to define the classes.
#    E.g. I can take the median value of the dependent variable, i.e. free_bikes, in the training data.

# 2. I would assess the performance of the classification model using appropriate evaluation metrics such as 
#    accuracy, precision, recall, F1-score, or area under the ROC curve (AUC-ROC). These metrics will help me 
#    measure the model's ability to classify instances into the correct availability classes.

# 3. I could select additional features for data consistency, such as whether the restaurant or bar
#    is open or closed at a certain time; similarly, I could add their pricing as a feature in the model as well.

# 4. I would then choose a suitable classification algorithm such as logistic regression or decision trees. 
#    Train the chosen model using the labeled dataset, which includes the features and assigned class labels.

# 5. Evaluate the classification model's performance using appropriate validation techniques like cross-validation. 
#    Fine-tune the model by adjusting hyperparameters or trying different algorithms if necessary to improve 
#    classification accuracy.

# 6. Now once my model is trained and validated, I can use it to predict the availability class of new instances based 
#    on their features. Interpret the results and draw insights from the model's predictions to gain a better understanding 
#    of the factors influencing bike availability.