In [19]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import altair as alt
from textwrap import wrap
from pandas.api.types import CategoricalDtype
pd.set_option('display.float_format', '{:.2f}'.format)

# Collapse the four agreement-scale answers (plus "Don't Know/NA") into
# three groups: Agree, Disagree, Unknown.
group_mapping = {
    **dict.fromkeys(('Strongly Agree', 'Somewhat Agree'), 'Agree'),
    **dict.fromkeys(('Strongly Disagree', 'Somewhat Disagree'), 'Disagree'),
    "Don't Know/NA": 'Unknown',
}

# Numeric coding of the three groups for dummy variables.
# 'Unknown' is coded as numpy's NaN so that it counts as a missing value.
nominal_mapping = dict(Agree=1, Disagree=0, Unknown=np.nan)

In [20]:
# Survey column headers -> short snake_case names used throughout the notebook.
column_renames = {
    "Technology (list all with commas between)": "tech_use",
    "Financial Stability": "financial_stability",
    "Family Owned": "family_owned",
    "Struggle without Technology": "tech_struggle",
    "Neighborhood Recovered": "neighborhood_recovered",
    "Person Surveyed": "respondent",
    "Business Type": "type",
    "Business Length Open": "length_open",
    "Business ID": "ID",
}

# Load the survey export and apply the renames in one step.
business_df = (
    pd.read_csv('CP201A Business Data.csv', delimiter=',')
    .rename(columns=column_renames)
)

# Financial Stability

Financial Stability is the y, or Dependent variable, in this analysis.

Original question: The financial stability of this business has fully recovered from the COVID-19 pandemic. \[Don’t ask of businesses that have opened since the pandemic.]

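`financial_group`: grouped category (Agree / Disagree / Unknown)

`financialrecovery_dv`: dummy variable (1 = Agree, 0 = Disagree, missing for Unknown)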


In [21]:
# Collapse the raw answers into Agree / Disagree / Unknown.
# Answers that are missing, or not listed in group_mapping, come out of
# .map() as NaN and are labelled "Unknown". The fill is applied to the mapped
# Series itself instead of DataFrame.fillna(..., inplace=True), so nothing
# else in the frame is mutated in place.
business_df['financial_group'] = (
    business_df['financial_stability'].map(group_mapping).fillna("Unknown")
)

# Dummy variable: 1 = Agree, 0 = Disagree, NaN = Unknown (per nominal_mapping)
business_df['financialrecovery_dv'] = business_df['financial_group'].map(nominal_mapping)

## Business Length Open
How long has the business been open at this location?

`open_ten_plus_dv`: dummy variable, open 10+ years

`open_four_ten_dv`: dummy variable, open 4-10 years

* Null hypothesis: business age will not affect financial recovery
* H1: the older the businesses, the less they feel financially recovered

In [22]:
# Business Length Open
# Bucket the survey answers into two age bands. "0-3 years" is deliberately
# mapped to NaN, so those businesses end up with no band at all.
length_mapping = {
    "4-5 years": "4-10",
    "6-10 years": "4-10",
    "More than 10 years": "More than 10",
    "0-3 years": np.nan,
}

# Complementary 0/1 codings of the two bands (NaN stays NaN through .map()).
ten_plus_codes = {"More than 10": 1, "4-10": 0}
four_ten_codes = {"More than 10": 0, "4-10": 1}

business_df['length_open_filtered'] = business_df['length_open'].map(length_mapping)
business_df['open_ten_plus_dv'] = business_df['length_open_filtered'].map(ten_plus_codes)
business_df['open_four_ten_dv'] = business_df['length_open_filtered'].map(four_ten_codes)

In [23]:
# The big change: drop every row that has no length_open_filtered value.
# Every cell below this point works on the reduced sample.
# .copy() makes the result an independent frame, so the column assignments in
# later cells do not raise SettingWithCopyWarning on pandas versions that run
# without copy-on-write.
business_df = business_df.dropna(subset=["length_open_filtered"]).copy()

## Business Type

Independent Variable

`type_group`: a category

TODO: no dummy variable set yet

* Null hypothesis: business type will not affect financial recovery
* H1: Groceries/Market will feel more recovered than the other types of businesses (since they are more “critical/required”)

In [24]:
# Clean up Business Type variable: fold the raw survey categories into four
# analysis groups. Listed group-first so each group's members are visible.
type_groups = {
    "Other": ["Activity Based", "Healthcare/Education", "Other"],
    "Retail and/or Services": ["Services", "Retail + Services", "Retail"],
    "Restaurant/Bar": ["Restaurant/Bar"],
    "Grocery/Market": ["Grocery/Market"],
}

# Invert to the raw-category -> group lookup that .map() needs.
business_type_mapping = {
    raw_type: group
    for group, raw_types in type_groups.items()
    for raw_type in raw_types
}

# Any raw type not listed above becomes NaN in type_group.
business_df["type_group"] = business_df["type"].map(business_type_mapping)

## Neighborhood

TODO: no dummy variable set yet

In [25]:
# Number of surveyed businesses per neighborhood (single "total" column).
pd.crosstab(
    index=business_df["Neighborhood"],
    columns="total",
    dropna=False,
)

col_0,total
Neighborhood,Unnamed: 1_level_1
Berkeley,40
Downtown Oakland,33
Fruitvale,20
Mission,22
SF Chinatown,23


## Neighborhood Recovery

* Null hypothesis: neighborhood recovery will not affect financial recovery
* H1: Businesses that feel the neighborhood has recovered are more likely to feel like their businesses have recovered
* H2: Businesses that do not feel the neighborhood has recovered are more likely to feel like their businesses have recovered

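`neighborhood_recovered_group`: grouped category (Agree / Disagree / Unknown)

`neighborhoodrecovery_dv`: dummy variable (1 = Agree, 0 = Disagree, missing for Unknown)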

In [28]:
# Collapse the raw answers into Agree / Disagree / Unknown.
# Answers that are missing, or not listed in group_mapping, come out of
# .map() as NaN and are labelled "Unknown". The fill is applied to the mapped
# Series itself instead of DataFrame.fillna(..., inplace=True): business_df is
# the result of an earlier dropna() here, and an in-place fill on such a
# filtered frame can raise SettingWithCopyWarning.
business_df['neighborhood_recovered_group'] = (
    business_df['neighborhood_recovered'].map(group_mapping).fillna("Unknown")
)

# Dummy variable: 1 = Agree, 0 = Disagree, NaN = Unknown (per nominal_mapping)
business_df['neighborhoodrecovery_dv'] = business_df['neighborhood_recovered_group'].map(nominal_mapping)

In [31]:
# Neighborhood-recovery perception (rows) against financial-recovery
# perception (columns), with row/column totals.
pd.crosstab(
    index=business_df["neighborhood_recovered_group"],
    columns=business_df["financial_group"],
    dropna=False,
    margins=True,
)

financial_group,Agree,Disagree,Unknown,All
neighborhood_recovered_group,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Agree,25,11,4,40
Disagree,19,55,9,83
Unknown,4,7,4,15
All,48,73,17,138


## Chain

* Null hypothesis: whether a business is a chain will not affect financial recovery
* H1: Businesses that are a chain will feel more recovered than businesses that are not a chain

In [36]:
# Dummy variable for chain membership: 1 only for "Yes"; "Unsure" is coded 0
# together with "No". Any other value (including missing) maps to NaN.
# NOTE(review): this column ends in `_df` while the other dummy columns in the
# notebook end in `_dv` -- confirm whether that is intentional before renaming.
chain_mapping = dict(No=0, Yes=1, Unsure=0)
is_chain = business_df["Chain"].map(chain_mapping)
business_df['ischain_df'] = is_chain

In [35]:
# Raw chain answer (rows) against financial-recovery perception (columns),
# with row/column totals.
pd.crosstab(
    index=business_df["Chain"],
    columns=business_df["financial_group"],
    dropna=False,
    margins=True,
)

financial_group,Agree,Disagree,Unknown,All
Chain,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
No,34,65,13,112
Unsure,1,3,1,5
Yes,13,5,3,21
All,48,73,17,138


## Struggle without Technology

"\[If they use technology] Would your business struggle financially without the use of these technologies? Yes/No"

* Null hypothesis: How businesses feel about their technology dependence will not affect how they feel about their financial recovery
* H1: Businesses dependent on technology will feel more financially recovered

`techstruggle_dv`: dummy variable, when answer is Yes (when businesses are dependent on technology for their financial confidence)

In [47]:
# Dummy coding: 1 only for "Yes"; "No" and unanswered ("Unknown") are both 0.
tech_struggle_mapping = {
    "No": 0,
    "Yes": 1,
    "Unknown": 0
}

# There are some NaNs, replace them with Unknown. The tech_struggle column
# itself is overwritten, so later cells that read it see the "Unknown" label.
# Assigning the filled Series back (instead of DataFrame.fillna(...,
# inplace=True)) avoids an in-place fill on a frame that came out of dropna(),
# which can raise SettingWithCopyWarning.
business_df['tech_struggle'] = business_df['tech_struggle'].fillna("Unknown")

# Set dummy variables
business_df['techstruggle_dv'] = business_df['tech_struggle'].map(tech_struggle_mapping)

In [46]:
# Technology-dependence answer (rows) against financial-recovery perception
# (columns), with row/column totals.
pd.crosstab(
    index=business_df["tech_struggle"],
    columns=business_df["financial_group"],
    dropna=False,
    margins=True,
)

financial_group,Agree,Disagree,Unknown,All
tech_struggle,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
No,21,26,7,54
Unknown,7,10,4,21
Yes,20,37,6,63
All,48,73,17,138


## Role of person interviewed

* Null hypothesis: role of person surveyed will not affect financial recovery
* H1: employee more likely to answer unsure/unknown to financial recovery than owner/manager
* H2: owner/manager more likely to feel business has not recovered than employee

TODO: Needs two dummy variables

In [50]:
# Fold the respondent's role into two groups: decision makers
# (Manager/Owner) versus everyone else (Employee/Other).
role_group = {
    **dict.fromkeys(("Employee", "Other"), "Employee/Other"),
    **dict.fromkeys(("Manager", "Owner"), "Manager/Owner"),
}

# Roles not listed in role_group become NaN in respondent_group.
respondent_roles = business_df["respondent"]
business_df["respondent_group"] = respondent_roles.map(role_group)

In [51]:
# Respondent role group (rows) against financial-recovery perception
# (columns), with row/column totals.
pd.crosstab(
    index=business_df["respondent_group"],
    columns=business_df["financial_group"],
    dropna=False,
    margins=True,
)

financial_group,Agree,Disagree,Unknown,All
respondent_group,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Employee/Other,22,22,8,52
Manager/Owner,26,51,9,86
All,48,73,17,138


# Charts

In [40]:
# Prep data for years open v. financial recovery.
# Chart-only frame: four columns pulled from business_df under shorter names,
# sharing business_df's index.
df_open = pd.DataFrame(
    {
        "length_open": business_df['length_open_filtered'],
        "financial_recovery": business_df['financial_group'],
        "type": business_df['type_group'],
        "neighborhood": business_df["Neighborhood"],
    }
)

In [41]:
# Business type group (rows) against financial-recovery perception (columns),
# with row/column totals.
pd.crosstab(
    index=df_open['type'],
    columns=df_open['financial_recovery'],
    margins=True,
)

financial_recovery,Agree,Disagree,Unknown,All
type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Grocery/Market,8,9,1,18
Other,1,7,4,12
Restaurant/Bar,10,16,6,32
Retail and/or Services,29,41,6,76
All,48,73,17,138


In [242]:
# Label cleanup for chart: swap the internal category codes for the
# reader-facing labels shown on the axes and legends.
length_labels = {
    "More than 10": "Before 2013",
    "4-10": "Between 2014-2019"
}
recovery_labels = {
    "Agree": "Fully Recovered",
    "Disagree": "Not Fully Recovered",
    "Unknown": "Unsure/Unknown"
}
# .replace() only touches values that appear as keys and leaves everything
# else (including already-relabelled values) as is, so re-running this cell is
# harmless. With .map() a second run would turn every label into NaN, because
# the new labels are not keys of these dicts.
df_open["length_open"] = df_open["length_open"].replace(length_labels)
df_open["financial_recovery"] = df_open["financial_recovery"].replace(recovery_labels)

## Financial Recovery x Business Open Date

In [85]:
# Legend / stacking order for the recovery categories (reused by later charts).
financial_order = ['Fully Recovered', 'Not Fully Recovered', 'Unsure/Unknown']

# One normalized stacked bar per opening period: each bar shows the share of
# respondents in every financial-recovery category.
open_date_encodings = dict(
    x=alt.X('length_open').title('Year Business Location Opened').axis(labelAngle=0),
    y=alt.Y('count(financial_recovery)').stack('normalize').title('Percent of Respondents (N=138)'),
    color=alt.Color('financial_recovery', sort=financial_order).title('Financial Recovery Perception'),
    # NOTE(review): presumably the sort-index field derived from the colour
    # encoding's custom sort, used so the stack order follows the legend --
    # confirm against the Altair docs.
    order=alt.Order('color_financial_recovery_sort_index:Q'),
)

alt.Chart(df_open).mark_bar(size=50).encode(**open_date_encodings).properties(width=200)

## Financial Recovery x Business Type

In [222]:
# Data prep for financial recovery x business type
# Grouped bar needs percentage data already calculated.
# One row per (financial_recovery, type) pair with its number of businesses.
df_type_pct = (
    df_open.groupby(["financial_recovery", "type"])
    .size()
    .reset_index(name='counts')
)
# Share of each recovery category within its business type: the pair's count
# divided by the total count for that type, so shares sum to 1 per type.
# transform("sum") broadcasts each type's total back onto its rows, replacing
# the per-type Python loop with index-aligned .loc writes.
df_type_pct["pct_type"] = (
    df_type_pct["counts"] / df_type_pct.groupby("type")["counts"].transform("sum")
)

In [228]:
# Left-to-right order of the business-type facets.
type_order = ['Grocery/Market', 'Retail and/or Services', 'Restaurant/Bar', 'Other']

# One small bar chart per business type; within each facet there is one bar
# per recovery category, with height equal to the precomputed pct_type share.
type_facet = alt.Column(
    "type:N",
    sort=type_order,
    spacing=32,
    header=alt.Header(titleOrient='bottom', labelOrient='bottom'),
).title('')

type_encodings = dict(
    # Category is already conveyed by colour, so x labels and ticks are hidden.
    x=alt.X('financial_recovery').title('').axis(labels=False,tickSize=0),
    y=alt.Y('pct_type').axis(format='%').scale(domain=(0,1)).title('Percent of Businesses (N=138)'),
    color=alt.Color('financial_recovery', sort=financial_order).title('Financial Recovery Perception'),
    order=alt.Order('color_financial_recovery_sort_index:Q'),
    column=type_facet,
)

alt.Chart(df_type_pct).mark_bar().encode(**type_encodings).properties()

## Financial Recovery x Neighborhood

In [244]:
# Prep Data for Neighborhood
# Grouped bar needs percentage data already calculated.
# One row per (financial_recovery, neighborhood) pair with its number of businesses.
df_hood_pct = (
    df_open.groupby(["financial_recovery", "neighborhood"])
    .size()
    .reset_index(name='counts')
)
# Share of each recovery category within its neighborhood: the pair's count
# divided by the total count for that neighborhood, so shares sum to 1 per
# neighborhood. transform("sum") broadcasts each neighborhood's total back
# onto its rows, replacing the per-neighborhood Python loop.
df_hood_pct["pct_hood"] = (
    df_hood_pct["counts"] / df_hood_pct.groupby("neighborhood")["counts"].transform("sum")
)

In [246]:
# Left-to-right order of the neighborhood facets.
neighborhood_order = ['Berkeley', 'Fruitvale', 'SF Chinatown', 'Downtown Oakland', 'Mission']

# One small bar chart per neighborhood; within each facet there is one bar per
# recovery category, with height equal to the precomputed pct_hood share.
hood_facet = alt.Column(
    "neighborhood:N",
    sort=neighborhood_order,
    spacing=30,
    header=alt.Header(titleOrient='bottom', labelOrient='bottom'),
).title('')

hood_encodings = dict(
    # Category is already conveyed by colour, so x labels and ticks are hidden.
    x=alt.X('financial_recovery').title('').axis(labels=False,tickSize=0),
    y=alt.Y('pct_hood').axis(format='%').scale(domain=(0,1)).title('Percent of Businesses (N=138)'),
    color=alt.Color('financial_recovery', sort=financial_order).title('Financial Recovery Perception'),
    order=alt.Order('color_financial_recovery_sort_index:Q'),
    column=hood_facet,
)

alt.Chart(df_hood_pct).mark_bar().encode(**hood_encodings).properties()