In [None]:
import datetime
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import aiohttp
import asyncio
from tqdm.auto import tqdm
from discourse_helpers import get_discourse_data


In [None]:
# Never truncate the column list when a frame is displayed.
pd.set_option("display.max_columns", None)

# Load the member CSV that every later cell works on.
df = pd.read_csv("all_neon_members.csv")

In [None]:
def _columns_matching(any_of, none_of=()):
    """Return the names of ``df`` columns selected by substring rules.

    A column is kept when its lower-cased name contains at least one of the
    ``any_of`` substrings and none of the ``none_of`` substrings.
    """
    selected = []
    for name in df.columns:
        lowered = name.lower()
        wanted = any(term in lowered for term in any_of)
        excluded = any(term in lowered for term in none_of)
        if wanted and not excluded:
            selected.append(name)
    return selected


# Each list groups the columns whose names match one topic's keywords.
woodshop_mentor_cols = _columns_matching(["woodshop mentor"])
orientation_cols = _columns_matching(["orientation", "facility"], ["_date"])
woodshop_cols = _columns_matching(["woodshop"], ["mentor", "specialty"])
cnc_cols = _columns_matching(["cnc"], ["tormach", "fusion", "topographic"])
private_and_checkout = _columns_matching(["private", "checkout"])
tormach_cols = _columns_matching(["tormach"], ["fusion"])
sanding_cols = _columns_matching(["sand"])
small_lasers_cols = _columns_matching(["small laser", "blue"])
big_lasers_cols = _columns_matching(
    ["laser"],
    ["engrave", "small", "blue", "cancelled", "mother", "cnc"],
)
wood_lathe = _columns_matching(["wood lathe"])
milling_cols = _columns_matching(["mill", "intro to machining"])
specialty_tools_cols = _columns_matching(["specialty", "domino"])
_3dp_cols = _columns_matching(["3d print"], ["resin"])
resin_3dp_cols = _columns_matching(["resin"], ["epoxy"])
fusion_cols = _columns_matching(["fusion"])
metal_lathe_cols = _columns_matching(["metal lathe"])
sublimation_cols = _columns_matching(["sublimation"])

In [None]:
# Maps each consolidated column name to the list of source columns that
# will be merged into it by the consolidation loop.
cols_to_update = dict(
    woodshop_mentor_series=woodshop_mentor_cols,
    orientation_and_facility_tour=orientation_cols,
    woodshop_safety=woodshop_cols,
    cnc_router=cnc_cols,
    tormach=tormach_cols,
    stationary_sanders=sanding_cols,
    small_lasers=small_lasers_cols,
    big_lasers=big_lasers_cols,
    intro_wood_lathe=wood_lathe,
    intro_milling=milling_cols,
    specialty_tools=specialty_tools_cols,
    filament_3dp=_3dp_cols,
    resin_3dp=resin_3dp_cols,
    fusion=fusion_cols,
    metal_lathe=metal_lathe_cols,
    sublimation=sublimation_cols,
)

In [None]:
# Collapse every group of source columns into one boolean column that is True
# for a row when any of the group's columns equals True, then drop the sources.
for key, value in cols_to_update.items():
    any_true = df[value].eq(True).any(axis=1)
    df.insert(len(df.columns), key, any_true)
    df.drop(columns=value, inplace=True)



In [None]:
# Remove a fixed set of columns by name.
df.drop(
    columns=[
        "Track Saw Update Class",
        "Cutting Board Build",
        "Basics of Hand Plane Use",
        "Aquaponics Discussion",
        "Low Volume Manufacturers Meetup",
        "Metal Shop Update",
    ],
    inplace=True,
)

# Among the columns located after position 22, collect those holding fewer
# than 10 True values.
# NOTE(review): 22 is a hard-coded column position; it presumably marks the end
# of the non-flag columns - confirm against the CSV layout.
cols_to_drop = []
for position, col in enumerate(df.columns):
    true_count = df[col].value_counts().get(True, 0)
    if position > 22 and true_count < 10:
        cols_to_drop.append(col)

df.drop(columns=cols_to_drop, inplace=True)

In [None]:
# Normalise every column name: lower-case it and turn spaces into underscores.
columns = df.columns
new_columns = [name.lower().replace(" ", "_") for name in columns]
column_dict = {old: new for old, new in zip(columns, new_columns)}

df.rename(columns=column_dict, inplace=True)

# Give the long topographic-maps column a short name (applied after the
# normalisation above, so the key is the already-normalised name).
df.rename(
    columns={"cnc_project_-_machining_3d_topographic_maps": "cnc_topo"},
    inplace=True,
)



In [None]:
# Every column to the right of "total_dollars_spent" gets its missing values
# replaced with False.
first_flag_position = df.columns.get_loc("total_dollars_spent") + 1
for col in df.columns[first_flag_position:]:
    df[col] = df[col].fillna(False)



In [None]:
# Add zero-initialised discourse columns immediately before "discourse_id";
# the final order is discourse_posts, discourse_read_time, discourse_id.
for new_col, default in (("discourse_posts", 0), ("discourse_read_time", 0.0)):
    df.insert(df.columns.get_loc("discourse_id"), new_col, default)


In [None]:
async with aiohttp.ClientSession() as aio_session:
    for row in tqdm(df.itertuples()):
        if pd.isna(row.discourse_id):
            continue

        discourse_data = await get_discourse_data(aio_session, row.discourse_id)
        if discourse_data is not None:
            df.loc[row.Index, "discourse_posts"] = discourse_data["posts"]
            df.loc[row.Index, "discourse_read_time"] = discourse_data["reading time"]

In [None]:
# Add an "age" column right after "birthdate"; rows without a birthdate stay NaN.
df.insert(df.columns.get_loc("birthdate") + 1, "age", np.nan)

today = datetime.datetime.today()

# Age in whole years is the calendar-year difference, minus one when this
# year's birthday has not happened yet. Dividing elapsed days by 365 ignores
# leap days and reports people a year too old shortly before their birthday.
birthdates = pd.to_datetime(df["birthdate"])
birthday_pending = (birthdates.dt.month > today.month) | (
    (birthdates.dt.month == today.month) & (birthdates.dt.day > today.day)
)
df.loc[birthdates.notna(), "age"] = (
    today.year - birthdates.dt.year - birthday_pending.astype(int)
)

In [None]:
# Replace the in-memory frame with the contents of "cleaned_neon_members.csv".
# NOTE(review): no cell visible in this notebook writes that file, and this
# assignment discards the frame built by the cells above - presumably a manual
# checkpoint saved elsewhere; confirm before relying on Restart & Run All.
df = pd.read_csv("cleaned_neon_members.csv")

In [None]:
# Add NaN-initialised distance and time columns immediately before "address";
# the final order is distance_from_asmbly, time_from_asmbly, address.
for new_col in ("distance_from_asmbly", "time_from_asmbly"):
    df.insert(df.columns.get_loc("address"), new_col, np.nan)

In [None]:
import googlemaps
import requests
from config import GOOGLE_MAPS_API_KEY

# Build the Google Maps client on top of one requests session so that every
# API call made through `gmaps` goes through that same session object.
session = requests.Session()
gmaps = googlemaps.Client(requests_session=session, key=GOOGLE_MAPS_API_KEY)

In [None]:
# Geocode the fixed street address once and keep the "location" entry of the
# first match; it is used as the destination for every directions request.
asmbly_matches = gmaps.geocode('9701 Dessau Rd Ste 304, Austin, TX 78754')
asmbly_geocode = asmbly_matches[0]["geometry"]["location"]

print(asmbly_geocode)

In [None]:

def get_distance_from_asmbly(
        gmaps: googlemaps.Client,
        user_address: str | type[np.nan],
        city: str | type[np.nan],
        state: str | type[np.nan],
        zip: str | type[np.nan],
        asmbly_address: list[float]
    ) -> dict[str, float] | dict[str, type[np.nan]]:
    if user_address is np.nan or city is np.nan or state is np.nan or zip is np.nan:
        print("missing part of address")
        return {"distance": np.nan, "time": np.nan}
    try:
        routes = gmaps.directions(
            origin=f"{user_address}, {city}, {state} {zip}",
            destination=asmbly_address,
            mode="driving",
            avoid="tolls",
            departure_time=datetime.datetime.now(),
            region="US",
            language="en",
            units="metric",
            traffic_model="best_guess",
        )
    except googlemaps.exceptions.ApiError as e:
        print(e)
        return {"distance": np.nan, "time": np.nan}
    
    try:
        routes = routes[0]
    except IndexError as e:
        return {"distance": np.nan, "time": np.nan}
    
    distance = routes["legs"][0]["distance"]["value"]
    try:
        time = routes["legs"][0]["duration_in_traffic"]["value"]
    except KeyError:
        time = routes["legs"][0]["duration"]["value"]

    return {"distance": distance, "time": time}

In [None]:
# Request a route for every row that has an address and store the returned
# distance and time in df; rows without an address are left untouched.
for row in tqdm(df.itertuples()):
    if not pd.isna(row.address):
        result = get_distance_from_asmbly(
            gmaps, row.address, row.city, row.state, row.zip, asmbly_geocode
        )
        for column, field in (
            ("distance_from_asmbly", "distance"),
            ("time_from_asmbly", "time"),
        ):
            df.loc[row.Index, column] = result[field]
    

In [None]:
# Per-membership-count averages of the discourse totals. Only rows with a
# positive total are computed; every other row ends up as 0.
for total_col, monthly_col in (
    ("discourse_read_time", "avg_monthly_discourse_read_time"),
    ("discourse_posts", "avg_monthly_discourse_posts"),
):
    df.loc[df[total_col] > 0.0, monthly_col] = df[total_col] / df["membership_count"]
    # Assign the filled column back instead of calling fillna(inplace=True) on
    # df[monthly_col]: that form mutates a temporary selection and leaves df
    # unchanged once Copy-on-Write is enabled (this notebook turns it on later).
    df[monthly_col] = df[monthly_col].fillna(0)

In [None]:
# waiver_signed: True where a waiver date is present, False otherwise.
df.loc[df["waiver_date"].notna(), "waiver_signed"] = True
# Assign the result back; fillna(inplace=True) on a column selection does not
# update df once Copy-on-Write is enabled.
df["waiver_signed"] = df["waiver_signed"].fillna(False)

# orientation_attended: True where an orientation date is present or the
# consolidated orientation column is True, False otherwise.
attended_orientation = df["orientation_date"].notna() | (df["orientation_and_facility_tour"] == True)
df.loc[attended_orientation, "orientation_attended"] = True
df["orientation_attended"] = df["orientation_attended"].fillna(False)

In [None]:
# Read all booking exports and stack them with a single concat call. Growing a
# frame with concat inside a loop re-copies the accumulated rows each time, and
# seeding it with an empty DataFrame relies on deprecated concat behaviour.
# Row labels are kept as read from each file, as before.
skedda_csv_paths = [
    "skedda_bookings_feb2020_to_feb2021.csv",
    "skedda_bookings_feb2021_to_feb2022.csv",
    "skedda_bookings_feb2022_to_feb2023.csv",
    "skedda_bookings_feb2023_to_feb2024.csv",
]
skedda_df = pd.concat([pd.read_csv(path) for path in skedda_csv_paths])

skedda_df.head()

In [None]:
# Keep only the booking duration and the holder's name and email.
skedda_df = skedda_df.loc[
    :,
    ["Duration (minutes)", "Holder first name", "Holder last name", "Holder email"],
]

In [None]:
# One row per holder (email + first name + last name) with the number of
# bookings and the summed booking minutes, largest total first.
holder_keys = ["Holder email", "Holder first name", "Holder last name"]
grouped_df = (
    skedda_df
    .groupby(holder_keys, as_index=False)["Duration (minutes)"]
    .agg(booking_count="count", total_booking_minutes="sum")
    .sort_values(by="total_booking_minutes", ascending=False)
)

In [None]:
# Remove the row whose index label is 234 from the per-holder summary.
# NOTE(review): 234 is a hard-coded label produced by the groupby above, so it
# points at a different holder (or raises KeyError) if the booking data changes.
# Presumably it targets one specific account - confirm, and prefer filtering on
# that holder's email instead.
grouped_df.drop(234, inplace=True)

# Show the ten holders with the most booked minutes.
grouped_df.head(10)

In [None]:
# Turn on pandas Copy-on-Write for the rest of the session.
pd.set_option("mode.copy_on_write", True)

# Booking totals keyed by "email" so they can be merged onto the member frame.
concat_df = grouped_df[
    ["Holder email", "total_booking_minutes", "booking_count"]
].rename(columns={"Holder email": "email"})

concat_df.head()

In [None]:
# Left-join the booking totals onto the members by email; members with no
# matching bookings get 0.0 for both totals.
df = df.merge(concat_df, on="email", how="left")
df = df.fillna({"total_booking_minutes": 0.0, "booking_count": 0.0})

In [None]:
# Per-membership-count averages of the booking totals, computed only for rows
# with a positive total; all remaining rows are filled with 0.0 afterwards.
for total_col, monthly_col in (
    ("total_booking_minutes", "avg_monthly_booking_minutes"),
    ("booking_count", "avg_monthly_booking_count"),
):
    df.loc[df[total_col] > 0.0, monthly_col] = df[total_col] / df["membership_count"]

df = df.fillna({"avg_monthly_booking_minutes": 0.0, "avg_monthly_booking_count": 0.0})

df.head()

In [None]:
# Positive infinity (e.g. from dividing by a membership_count of 0 in the
# averages above) is stored as 0.0.
df = df.replace(to_replace=np.inf, value=0.0)

In [None]:
# Replace the in-memory frame with the contents of "neon_members_with_distance.csv".
# NOTE(review): the only visible write of this file is in the next cell, so in
# top-to-bottom order this read discards the frame built above and loads
# whatever an earlier session saved - looks like a manual checkpoint; confirm
# the intended cell order.
df = pd.read_csv("neon_members_with_distance.csv")


In [None]:

# Write the current frame to "neon_members_with_distance.csv" without the index column.
# NOTE(review): the previous cell reads this same file, so running both in
# order writes back exactly what was just loaded.
df.to_csv("neon_members_with_distance.csv", index=False)

In [None]:
# Columns removed before export: identifiers, contact details, and raw dates.
columns_to_drop = [
    "neon_id",
    "first_name",
    "last_name",
    "email",
    "address",
    "city",
    "state",
    "zip",
    "phone",
    "birthdate",
    "openpath_id",
    "discourse_id",
    "waiver_date",
    "orientation_date",
    "first_membership_start",
    "last_membership_end",
]

# df itself is left intact; only the exported copy loses these columns.
cleaned_df = df.drop(columns_to_drop, axis=1)
cleaned_df.to_csv("asmbly_cleaned_member_dataset.csv", index=False)