In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import altair as alt
from textwrap import wrap
from pandas.api.types import CategoricalDtype
pd.set_option('display.float_format', '{:.2f}'.format)

from IPython.display import Image
import researchpy as rp
import math
import scipy 
from scipy import stats
from scipy.stats import ttest_ind, chi2_contingency
import seaborn as sns
import pingouin as pg
import statsmodels.api as sm
from statsmodels.stats.outliers_influence import variance_inflation_factor ### VIF package
from statsmodels.discrete.discrete_model import Logit
from scipy.stats import norm

# Map 'Agree' and 'Disagree' groupings
group_mapping = {
    'Strongly Agree': 'Agree',
    'Somewhat Agree': 'Agree',
    'Strongly Disagree': 'Disagree',
    'Somewhat Disagree': 'Disagree',
    "Don't Know/NA" : 'Unknown'
}
nominal_mapping = {
    'Agree': 1,
    'Disagree': 0,
    "Unknown" : np.nan # Use numpy's NaN to represent missing values
}
nominal_mapping_neg = {
    'Agree': 0,
    'Disagree': 1,
    "Unknown" : np.nan # Use numpy's NaN to represent missing values
}

In [2]:
business_df=pd.read_csv('CP201A Business Data.csv', delimiter = ',')
business_df.rename(columns={"Technology (list all with commas between)":"tech_use",
                        "Financial Stability": "financial_stability",
                        "Family Owned": "family_owned",
                        "Struggle without Technology": "tech_struggle",
                        "Neighborhood Recovered": "neighborhood_recovered",
                        "Person Surveyed": "respondent",
                        "Business Type": "type",
                        "Business Length Open": "length_open",
                        "Own or Rent": "own_rent",
                        "Business ID" : "ID"}, inplace=True)

In [3]:
def ztest_proportions(df_in, x, y):
    # type is either ‘two-sided’, ‘smaller’, ‘larger’ right now just two-sided

    # Calculate counts for Z-test
    counts = df_in.groupby(x)[y].sum()
    totals = df_in[x].value_counts()
    
    count = counts.values
    nobs = totals.values

    count1 = counts[0]
    count2 = counts[1]
    total1 = totals[0]
    total2 = totals[1]

    p1 = count1 / total1
    p2 = count2 / total2

    p_pool = (count1 + count2) / (total1 + total2)

    se = np.sqrt(p_pool * (1 - p_pool) * ((1/total1) + (1/total2)))

    z_stat = (p1 - p2) / se

    # Calculate the p-value (two-tailed test)
    p_value = 2 * (1 - norm.cdf(abs(z_stat)))  # Two-tailed p-value

    return z_stat, p_value

# Financial Stability

Financial Stability is the y, or Dependent variable, in this analysis.

Original question: The financial stability of this business has fully recovered from the COVID-19 pandemic. \[Don’t ask of businesses that have opened since the pandemic.]

`financialrecovery_dv`: positive recovery dummy
`financialrecoveryneg_dv`: negative recovery dummy


In [4]:
# Create a new column for the grouped categories
business_df['financial_group'] = business_df['financial_stability'].map(group_mapping)
# There are some NaNs, replace them with Unknown
business_df.fillna(value={"financial_group": "Unknown"}, inplace = True)

# Set dummy variables 
business_df['financialrecovery_dv'] = business_df['financial_group'].map(nominal_mapping)
business_df['financialrecoveryneg_dv'] = business_df['financial_group'].map(nominal_mapping_neg)
# TODO: do we need financialrecoveryunknown_dv for tests of unknown (respondent H1)?

## Business Length Open
How long as the business been open at this location?

`open_ten_plus_dv`: dummy variable, open 10+ years

`open_four_ten_dv`: dummy variable, open 4-10 years

* Null hypothesis: business age will not affect financial recovery
* H1: If a business opened 10+ years ago, they will feel less financially recovered

In [5]:
# Business Length Open
# Create a new column for filtered length open that excludes 0-3 years
length_mapping = {
    "More than 10 years": "More than 10",
    "0-3 years": np.nan,
    "6-10 years": "4-10",
    "4-5 years": "4-10"
}
business_df['length_open_filtered'] = business_df['length_open'].map(length_mapping)
#The big change: drop the length_open NaNs rows
business_df = business_df.dropna(subset=["length_open_filtered"]) 

business_df['open_ten_plus_dv'] = business_df['length_open_filtered'].map({"More than 10":1,"4-10":0})
business_df['open_four_ten_dv'] = business_df['length_open_filtered'].map({"More than 10":0,"4-10":1}) 

In [6]:
pd.crosstab(index=business_df["open_ten_plus_dv"], columns="Total", dropna=False, margins=True)

col_0,Total,All
open_ten_plus_dv,Unnamed: 1_level_1,Unnamed: 2_level_1
0,49,49
1,89,89
All,138,138


In [7]:
# Test for Statistical Significance
open_sdf = business_df.copy()

stat, pval = ztest_proportions(open_sdf, 'open_ten_plus_dv', 'financialrecoveryneg_dv')
# Output results
print(f"\nZ-statistic: {stat}")
print(f"P-value: {pval}")


Z-statistic: -1.397094093503129
P-value: 0.16238527775178113


## Business Type

Independent Variable

`type_group`: a category

TODO: no dummy variable set yet

* Null hypothesis: business type will not affect financial recovery
* H1: Groceries/Market will feel more recovered than the other types of businesses (since they are more “critical/required”)

In [60]:
# Clean up Business Type variable
business_type_mapping = {
    "Activity Based": "Other",
    "Healthcare/Education": "Other",
    "Services": "Retail and/or Services",
    "Retail + Services": "Retail and/or Services",
    "Retail": "Retail and/or Services",
    "Restaurant/Bar": "Restaurant/Bar",
    "Grocery/Market": "Grocery/Market",
    "Other":"Other"
}
grocery_dv = {
    "Grocery/Market": 1,
    "Restaurant/Bar": 0,
    "Retail and/or Services": 0,
    "Other": 0
}
other_type_dv = {
    "Grocery/Market": 0,
    "Restaurant/Bar": 0,
    "Retail and/or Services": 0,
    "Other": 1
}
retail_dv = {
    "Grocery/Market": 0,
    "Restaurant/Bar": 0,
    "Retail and/or Services": 1,
    "Other": 0
}
rest_dv = {
    "Grocery/Market": 0,
    "Restaurant/Bar": 1,
    "Retail and/or Services": 0,
    "Other": 0
}
business_df["type_group"] = business_df["type"].map(business_type_mapping)

business_df["grocery_dv"] = business_df["type_group"].map(grocery_dv)
business_df["other_type_dv"] = business_df["type_group"].map(other_type_dv)
business_df["retail_dv"] = business_df["type_group"].map(retail_dv)
business_df["rest_dv"] = business_df["type_group"].map(rest_dv)
pd.crosstab(index=business_df["grocery_dv"], columns=business_df["financial_group"], dropna=False, margins=True, normalize=True)

financial_group,Agree,Disagree,Unknown,All
grocery_dv,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,0.29,0.46,0.12,0.87
1,0.06,0.07,0.01,0.13
All,0.35,0.53,0.12,1.0


In [9]:
# Test for Statistical Significance
berkeley_sdf = business_df.copy()

stat, pval = ztest_proportions(berkeley_sdf, 'grocery_dv', 'financialrecoveryneg_dv')
# Output results
print(f"\nZ-statistic: {stat}")
print(f"P-value: {pval}")


Z-statistic: 0.2641964974253898
P-value: 0.7916285098650591


## Neighborhood

* Null hypothesis: neighborhood will not affect financial recovery
* H1: Businesses in Berkeley feel more financially recovered

`berkeley_dv`: If the business is in Berkeley

In [45]:
berkeley_map = {
    "Berkeley":1,
    "Downtown Oakland":0,
    "Fruitvale":0,
    "Mission":0,
    "SF Chinatown":0,
}
mission_map = {
    "Berkeley":0,
    "Downtown Oakland":0,
    "Fruitvale":0,
    "Mission":1,
    "SF Chinatown":0,
}
oakland_map = {
    "Berkeley":0,
    "Downtown Oakland":1,
    "Fruitvale":0,
    "Mission":0,
    "SF Chinatown":0,
}
fruitvale_map = {
    "Berkeley":0,
    "Downtown Oakland":0,
    "Fruitvale":1,
    "Mission":0,
    "SF Chinatown":0,
}
chinatown_map = {
    "Berkeley":0,
    "Downtown Oakland":0,
    "Fruitvale":0,
    "Mission":0,
    "SF Chinatown":1,
}

business_df['berkeley_dv'] = business_df['Neighborhood'].map(berkeley_map)
business_df['mission_dv'] = business_df['Neighborhood'].map(mission_map)
business_df['oakland_dv'] = business_df['Neighborhood'].map(oakland_map)
business_df['fruitvale_dv'] = business_df['Neighborhood'].map(fruitvale_map)
business_df['chinatown_dv'] = business_df['Neighborhood'].map(chinatown_map)

# Neighborhood
pd.crosstab(business_df['Neighborhood'], columns=business_df['financialrecoveryneg_dv'], dropna=False, margins=True, normalize=True)

financialrecoveryneg_dv,0.00,1.00,NaN,All
Neighborhood,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Berkeley,0.13,0.13,0.03,0.29
Downtown Oakland,0.06,0.13,0.05,0.24
Fruitvale,0.06,0.07,0.01,0.14
Mission,0.04,0.1,0.02,0.16
SF Chinatown,0.07,0.09,0.01,0.17
All,0.4,0.6,0.0,1.0


In [11]:
# Test for Statistical Significance
berkeley_sdf = business_df.copy()

stat, pval = ztest_proportions(berkeley_sdf, 'berkeley_dv', 'financialrecovery_dv')
# Output results
print(f"\nZ-statistic: {stat}")
print(f"P-value: {pval}")


Z-statistic: -1.6100291542549845
P-value: 0.10739149179385565


In [12]:
# Test for Statistical Significance
mission_sdf = business_df.copy()

stat, pval = ztest_proportions(mission_sdf, 'mission_dv', 'financialrecovery_dv')
# Output results
print(f"\nZ-statistic: {stat}")
print(f"P-value: {pval}")


Z-statistic: 1.2949065276538312
P-value: 0.19535247639496123


In [46]:
# Test for Statistical Significance
oakland_sdf = business_df.copy()

stat, pval = ztest_proportions(oakland_sdf, 'oakland_dv', 'financialrecovery_dv')
# Output results
print(f"\nZ-statistic: {stat}")
print(f"P-value: {pval}")


Z-statistic: 1.4574286000000558
P-value: 0.1449981071636124


In [53]:
# Test for Statistical Significance
fruitvale_sdf = business_df.copy()

stat, pval = ztest_proportions(fruitvale_sdf, 'fruitvale_dv', 'financialrecovery_dv')
# Output results
print(f"\nZ-statistic: {stat}")
print(f"P-value: {pval}")


Z-statistic: -0.5297904926494013
P-value: 0.5962571977253512


In [54]:
# Test for Statistical Significance
chinatown_sdf = business_df.copy()

stat, pval = ztest_proportions(chinatown_sdf, 'chinatown_dv', 'financialrecovery_dv')
# Output results
print(f"\nZ-statistic: {stat}")
print(f"P-value: {pval}")


Z-statistic: -0.47958315233127224
P-value: 0.6315238284906242


## Neighborhood Recovery

* Null hypothesis: neighborhood recovery will not affect financial recovery
* H1: Businesses that feel the neighborhood has recovered are more likely to feel like their businesses have recovered
* H2: Businesses that do not feel the neighborhood has recovered are more likely to feel like their businesses has not recovered

`neighborhoodrecoveryneg_dv`: negative recovery dummy
`neighborhoodrecovery_dv`: positive recovery dummy

In [13]:
# Create a new column for the grouped categories
business_df['neighborhood_recovered_group'] = business_df['neighborhood_recovered'].map(group_mapping)
# There are some NaNs, replace them with Unknown
business_df.fillna(value={"neighborhood_recovered_group": "Unknown"}, inplace = True)

# Set dummy variables 
business_df['neighborhoodrecovery_dv'] = business_df['neighborhood_recovered_group'].map(nominal_mapping)
business_df['neighborhoodrecoveryneg_dv'] = business_df['neighborhood_recovered_group'].map(nominal_mapping_neg)

In [39]:
pd.crosstab(index=business_df['neighborhood_recovered_group'], columns="Total", normalize=True)

col_0,Total
neighborhood_recovered_group,Unnamed: 1_level_1
Agree,0.29
Disagree,0.6
Unknown,0.11


In [15]:
pd.crosstab(index=business_df["neighborhoodrecoveryneg_dv"], columns=business_df["financialrecoveryneg_dv"], dropna=False, margins=True)

financialrecoveryneg_dv,0.00,1.00,NaN,All
neighborhoodrecoveryneg_dv,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0.00,25,11,4,40.0
1.00,19,55,9,83.0
,4,7,4,
All,48,73,0,138.0


In [16]:
# Test for Statistical Significance
nrecovery_sdf = business_df.copy()
nrecovery_sdf.dropna(subset=['neighborhoodrecoveryneg_dv'], inplace=True)

stat, pval = ztest_proportions(nrecovery_sdf, 'neighborhoodrecoveryneg_dv', 'financialrecoveryneg_dv')
# Output results
print(f"\nZ-statistic: {stat}")
print(f"P-value: {pval}")


Z-statistic: -4.038802953367729
P-value: 5.3724672377519767e-05


## Chain

* Null hypothesis: if business is a chain will not affect financial recovery
* H1: Businesses that are a chain will feel more recovered than businesses that are not a chain

`ischain_dv` dummy variable

In [17]:
# Set dummy variable
chain_mapping = {
    "No":0,
    "Yes":1,
    "Unsure":0
}
chainneg_mapping = {
    "No":1,
    "Yes":0,
    "Unsure":0
}
business_df['ischain_dv'] = business_df["Chain"].map(chain_mapping)
business_df['isnotchain_dv'] = business_df["Chain"].map(chainneg_mapping)

In [18]:
pd.crosstab(index=business_df["Chain"], columns="Total", dropna=False, margins=True)

col_0,Total,All
Chain,Unnamed: 1_level_1,Unnamed: 2_level_1
No,112,112
Unsure,5,5
Yes,21,21
All,138,138


In [19]:
# Test for Statistical Significance
chain_sdf = business_df.copy()
#chain_sdf.dropna(subset=['ischain_dv'], inplace=True)

stat, pval = ztest_proportions(chain_sdf, 'isnotchain_dv', 'financialrecoveryneg_dv')
# Output results
print(f"\nZ-statistic: {stat}")
print(f"P-value: {pval}")


Z-statistic: -2.5092633992643196
P-value: 0.01209832334741523


## Struggle without Technology

"\[If they use technology] Would your business struggle financially without the use of these technologies? Yes/No"

* Null hypothesis: How businesses feel about their technology dependence will not affect how they feel about their financial recovery
* H1: Businesses dependent on technology will feel more financially recovered

`techstruggle_dv`: dummy variable, when answer is Yes (when businesses are dependent on technology for their financial confidence)

In [20]:
tech_struggle_mapping = {
    "No": 0,
    "Yes": 1,
    "Unknown": 0
}
# There are some NaNs, replace them with Unknown
business_df.fillna(value={"tech_struggle": "Unknown"}, inplace = True)

# Set dummy variables 
business_df['techstruggle_dv'] = business_df['tech_struggle'].map(tech_struggle_mapping)

In [21]:
pd.crosstab(index=business_df["tech_struggle"], columns=business_df["financial_group"], dropna=False, margins=True, normalize=True)

financial_group,Agree,Disagree,Unknown,All
tech_struggle,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
No,0.15,0.19,0.05,0.39
Unknown,0.05,0.07,0.03,0.15
Yes,0.14,0.27,0.04,0.46
All,0.35,0.53,0.12,1.0


In [22]:
# Test for Statistical Significance
tech_sdf = business_df.copy()

stat, pval = ztest_proportions(tech_sdf, 'techstruggle_dv', 'financialrecovery_dv')
# Output results
print(f"\nZ-statistic: {stat}")
print(f"P-value: {pval}")


Z-statistic: 0.6864370089059649
P-value: 0.49243757350424544


## Role of person interviewed

* Null hypothesis: role of person surveyed will not affect financial recovery
* H1: employee more likely to answer unsure/unknown to financial recovery than owner/manager
* H2: owner/manager more likely to feel business has not recovered than employee

Dummy variables:
`respondent_emp_dv`: respondent was an employee or other
`respondent_own_dv`: respondent was a manager or owner

In [23]:
role_group = {
    "Employee": "Employee/Other",
    "Manager": "Manager/Owner",
    "Owner": "Manager/Owner",
    "Other": "Employee/Other"
}
roleemp_mapping = {
    "Employee/Other": 1,
    "Manager/Owner": 0
}
roleown_mapping = {
    "Employee/Other": 0,
    "Manager/Owner": 1
}

business_df["respondent_group"] = business_df["respondent"].map(role_group)

business_df["respondent_emp_dv"] = business_df["respondent_group"].map(roleemp_mapping)
business_df["respondent_own_dv"] = business_df["respondent_group"].map(roleown_mapping)

In [24]:
pd.crosstab(index=business_df["respondent_group"], columns="Total", dropna=False, margins=True, normalize=True)

col_0,Total,All
respondent_group,Unnamed: 1_level_1,Unnamed: 2_level_1
Employee/Other,0.38,0.38
Manager/Owner,0.62,0.62
All,1.0,1.0


In [25]:
# Test for Statistical Significance
role_sdf = business_df.copy()

stat, pval = ztest_proportions(role_sdf, 'respondent_own_dv', 'financialrecoveryneg_dv')
# Output results
print(f"\nZ-statistic: {stat}")
print(f"P-value: {pval}")


Z-statistic: -1.938135469870469
P-value: 0.052606697042504535


## Rent or Own

* Null hypothesis: if business owns or rents will not affect financial recovery
* H1: businesses that rent are more likely to feel like their businesses have not recovered

In [26]:
own_mapping = {
    "Own": 1,
    "Rent": 0,
    "Unknown": 0
}
rent_mapping = {
    "Own": 0,
    "Rent": 1,
    "Unknown": 0
}
# There are some NaNs, replace them with Unknown
business_df.fillna(value={"own_rent": "Unknown"}, inplace = True)

business_df['rent_dv'] = business_df["own_rent"].map(rent_mapping)
business_df['own_dv'] = business_df["own_rent"].map(own_mapping)

In [27]:
pd.crosstab(index=business_df["own_rent"], columns="Total", dropna=False, margins=True, normalize=True)

col_0,Total,All
own_rent,Unnamed: 1_level_1,Unnamed: 2_level_1
Own,0.1,0.1
Rent,0.83,0.83
Unknown,0.07,0.07
All,1.0,1.0


In [28]:
# Test for Statistical Significance
rent_sdf = business_df.copy()

stat, pval = ztest_proportions(rent_sdf, 'rent_dv', 'financialrecoveryneg_dv')
# Output results
print(f"\nZ-statistic: {stat}")
print(f"P-value: {pval}")


Z-statistic: -1.4490722296212986
P-value: 0.14731741238221208


# Charts

In [29]:
# Prep data for years open v. financial recovery
df_open = pd.DataFrame()
df_open["length_open"] = business_df['length_open_filtered']
df_open["financial_recovery"] = business_df['financial_group']
df_open["type"] = business_df['type_group']
df_open["neighborhood"] = business_df["Neighborhood"]
df_open["Chain"] = business_df["Chain"]

In [30]:
pd.crosstab(index=df_open['type'], columns=df_open['financial_recovery'], margins=True)

financial_recovery,Agree,Disagree,Unknown,All
type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Grocery/Market,8,9,1,18
Other,1,7,4,12
Restaurant/Bar,10,16,6,32
Retail and/or Services,29,41,6,76
All,48,73,17,138


In [31]:
#label cleanup for chart
length_labels = {
    "More than 10": "Before 2013",
    "4-10": "Between 2014-2019"
}
recovery_labels = {
    "Agree": "Fully Recovered",
    "Disagree": "Not Fully Recovered",
    "Unknown": "Unsure/Unknown"
}
chain_labels = {
    "Yes": "Chain",
    "No": "Not a Chain"
}
df_open["length_open"] = df_open["length_open"].map(length_labels)
df_open["financial_recovery"] = df_open["financial_recovery"].map(recovery_labels)
df_open["Chain"] = df_open["Chain"].map(chain_labels)

## Financial Recovery x Business Open Date

In [32]:
# Prep Data
# Grouped bar needs percentage data already calculated
df_open_pct = df_open.copy()
df_open_pct = df_open_pct.groupby(["financial_recovery","length_open"]).size().reset_index(name='counts')
for t in df_open_pct["length_open"].unique():
    df_open_pct.loc[df_open_pct["length_open"] == t, "pct_group"] = df_open_pct["counts"] / df_open_pct.loc[df_open_pct["length_open"] == t, "counts"].sum()


In [33]:
df_open_pct

Unnamed: 0,financial_recovery,length_open,counts,pct_group
0,Fully Recovered,Before 2013,26,0.29
1,Fully Recovered,Between 2014-2019,22,0.45
2,Not Fully Recovered,Before 2013,51,0.57
3,Not Fully Recovered,Between 2014-2019,22,0.45
4,Unsure/Unknown,Before 2013,12,0.13
5,Unsure/Unknown,Between 2014-2019,5,0.1


In [64]:
financial_order = ['Fully Recovered', 'Not Fully Recovered', 'Unsure/Unknown']
alt.Chart(df_open_pct).mark_bar().encode(
    x=alt.X('financial_recovery').title('').axis(labels=False,tickSize=0),
    y=alt.Y('pct_group').axis(format='%').scale(domain=(0,1)).title('Percent of Businesses (N=138)'),
    color=alt.Color('financial_recovery', sort=financial_order).title('Financial Recovery Perception'),
    order = alt.Order('color_financial_recovery_sort_index:Q'),
    column=alt.Column("length_open:N",spacing=32, header=alt.Header(titleOrient='bottom', labelOrient='bottom')).title('Year Business Location Opened')
)
# alt.Chart(df_open).mark_bar(size=50).encode(
#     x=alt.X('length_open').title('Year Business Location Opened').axis(labelAngle=0),
#     y=alt.Y('count(financial_recovery)').stack('normalize').title('Percent of Respondents (N=138)'),
#     color=alt.Color('financial_recovery', sort=financial_order).title('Financial Recovery Perception'),
#     order = alt.Order('color_financial_recovery_sort_index:Q')
# ).properties(width=200)

## Financial Recovery x Business Type

In [35]:
# Data prep for financial recovery x business type
# Grouped bar needs percentage data already calculated
df_type_pct = df_open.copy()
df_type_pct = df_type_pct.groupby(["financial_recovery","type"]).size().reset_index(name='counts')
for t in df_type_pct["type"].unique():
    df_type_pct.loc[df_type_pct["type"] == t, "pct_type"] = df_type_pct["counts"] / df_type_pct.loc[df_type_pct["type"] == t, "counts"].sum()

In [36]:
type_order = ['Grocery/Market', 'Retail and/or Services', 'Restaurant/Bar', 'Other']
alt.Chart(df_type_pct).mark_bar().encode(
    x=alt.X('financial_recovery').title('').axis(labels=False,tickSize=0),
    y=alt.Y('pct_type').axis(format='%').scale(domain=(0,1)).title('Percent of Businesses (N=138)'),
    color=alt.Color('financial_recovery', sort=financial_order).title('Financial Recovery Perception'),
    order = alt.Order('color_financial_recovery_sort_index:Q'),
    column=alt.Column("type:N",sort=type_order, spacing=32, header=alt.Header(titleOrient='bottom', labelOrient='bottom')).title('')
)

## Financial Recovery x Neighborhood

In [37]:
# Prep Data for Neighborhood
# Grouped bar needs percentage data already calculated
df_hood_pct = df_open.copy()
df_hood_pct = df_hood_pct.groupby(["financial_recovery","neighborhood"]).size().reset_index(name='counts')
for t in df_hood_pct["neighborhood"].unique():
    df_hood_pct.loc[df_hood_pct["neighborhood"] == t, "pct_hood"] = df_hood_pct["counts"] / df_hood_pct.loc[df_hood_pct["neighborhood"] == t, "counts"].sum()

In [38]:
neighborhood_order = ['Berkeley', 'Fruitvale', 'SF Chinatown', 'Downtown Oakland', 'Mission']
alt.Chart(df_hood_pct).mark_bar().encode(
    x=alt.X('financial_recovery').title('').axis(labels=False,tickSize=0),
    y=alt.Y('pct_hood').axis(format='%').scale(domain=(0,1)).title('Percent of Businesses (N=138)'),
    color=alt.Color('financial_recovery', sort=financial_order).title('Financial Recovery Perception'),
    order = alt.Order('color_financial_recovery_sort_index:Q'),
    column=alt.Column("neighborhood:N",sort=neighborhood_order, spacing=30, header=alt.Header(titleOrient='bottom', labelOrient='bottom')).title('')
).properties()

## Financial Recovery x Chain

In [49]:
# Prep Data for Chain
# Grouped bar needs percentage data already calculated
df_chain_pct = df_open.copy()
df_chain_pct = df_chain_pct.groupby(["financial_recovery","Chain"]).size().reset_index(name='counts')
for t in df_chain_pct["Chain"].unique():
    df_chain_pct.loc[df_chain_pct["Chain"] == t, "pct_group"] = df_chain_pct["counts"] / df_chain_pct.loc[df_chain_pct["Chain"] == t, "counts"].sum()

df_chain_pct = df_chain_pct[df_chain_pct["Chain"] != "Unsure"]

In [None]:
df_chain_pct

In [50]:
chain_order = ['Chain', 'Not a Chain']
alt.Chart(df_chain_pct).mark_bar().encode(
    x=alt.X('financial_recovery').title('').axis(labels=False,tickSize=0),
    y=alt.Y('pct_group').axis(format='%').scale(domain=(0,1)).title('Percent of Businesses (N=138)'),
    color=alt.Color('financial_recovery', sort=financial_order).title('Financial Recovery Perception'),
    order = alt.Order('color_financial_recovery_sort_index:Q'),
    column=alt.Column("Chain:N",spacing=20, sort=chain_order, header=alt.Header(titleOrient='bottom', labelOrient='bottom')).title('')
)

# Logit Regression

In [51]:
y = business_df['financialrecoveryneg_dv'] #testing negative sentiment
ind_var = ['open_ten_plus_dv', 'isnotchain_dv', 'respondent_own_dv', 'rent_dv', 'neighborhoodrecoveryneg_dv']
#todo: possibly add neighborhood recovery impression, or some neighborhood choice

x = business_df[ind_var].assign(Intercept = 1) #Independent Variables
y.value_counts()

logit_model = sm.Logit(y, x, missing='drop').fit()
logit_model.summary2()

Optimization terminated successfully.
         Current function value: 0.545603
         Iterations 6


0,1,2,3
Model:,Logit,Method:,MLE
Dependent Variable:,financialrecoveryneg_dv,Pseudo R-squared:,0.189
Date:,2024-12-04 13:18,AIC:,132.0327
No. Observations:,110,BIC:,148.2356
Df Model:,5,Log-Likelihood:,-60.016
Df Residuals:,104,LL-Null:,-74.031
Converged:,1.0000,LLR p-value:,3.5913e-05
No. Iterations:,6.0000,Scale:,1.0000

0,1,2,3,4,5,6
,Coef.,Std.Err.,z,P>|z|,[0.025,0.975]
open_ten_plus_dv,0.9424,0.5019,1.8777,0.0604,-0.0413,1.9260
isnotchain_dv,1.2861,0.6180,2.0809,0.0374,0.0747,2.4974
respondent_own_dv,0.3390,0.5039,0.6728,0.5011,-0.6486,1.3267
rent_dv,0.2868,0.6766,0.4239,0.6717,-1.0393,1.6128
neighborhoodrecoveryneg_dv,1.8283,0.4820,3.7934,0.0001,0.8836,2.7729
Intercept,-2.8707,0.9843,-2.9163,0.0035,-4.7999,-0.9414


In [52]:
# Odds Ratios
or_table = np.exp(logit_model.conf_int()) #Exponentiate Confidence Intervals
or_table['Odds Ratio'] = np.exp(logit_model.params) #Exponentiate Coefficients

or_table.columns = ['5%', '95%', 'Odds Ratio'] #Name Columns
or_table

Unnamed: 0,5%,95%,Odds Ratio
open_ten_plus_dv,0.96,6.86,2.57
isnotchain_dv,1.08,12.15,3.62
respondent_own_dv,0.52,3.77,1.4
rent_dv,0.35,5.02,1.33
neighborhoodrecoveryneg_dv,2.42,16.0,6.22
Intercept,0.01,0.39,0.06


In [61]:
y = business_df['financialrecoveryneg_dv'] #testing negative sentiment
ind_var = ['open_ten_plus_dv', 'isnotchain_dv', 'respondent_own_dv', 'rent_dv', 
           'berkeley_dv', 'mission_dv', 'oakland_dv', 'fruitvale_dv']

x = business_df[ind_var].assign(Intercept = 1) #Independent Variables
y.value_counts()

logit_model_n = sm.Logit(y, x, missing='drop').fit()
logit_model_n.summary2()

Optimization terminated successfully.
         Current function value: 0.591358
         Iterations 5


0,1,2,3
Model:,Logit,Method:,MLE
Dependent Variable:,financialrecoveryneg_dv,Pseudo R-squared:,0.120
Date:,2024-12-04 14:02,AIC:,161.1085
No. Observations:,121,BIC:,186.2707
Df Model:,8,Log-Likelihood:,-71.554
Df Residuals:,112,LL-Null:,-81.269
Converged:,1.0000,LLR p-value:,0.012720
No. Iterations:,5.0000,Scale:,1.0000

0,1,2,3,4,5,6
,Coef.,Std.Err.,z,P>|z|,[0.025,0.975]
open_ten_plus_dv,1.2737,0.4639,2.7455,0.0060,0.3644,2.1831
isnotchain_dv,1.2498,0.5527,2.2614,0.0237,0.1666,2.3331
respondent_own_dv,0.4984,0.4511,1.1050,0.2692,-0.3857,1.3825
rent_dv,0.6364,0.5985,1.0634,0.2876,-0.5366,1.8094
berkeley_dv,-0.2022,0.6214,-0.3255,0.7448,-1.4202,1.0157
mission_dv,0.9318,0.7469,1.2475,0.2122,-0.5321,2.3958
oakland_dv,0.6647,0.6796,0.9781,0.3280,-0.6673,1.9967
fruitvale_dv,0.0081,0.7136,0.0113,0.9910,-1.3906,1.4068
Intercept,-2.4314,1.0035,-2.4228,0.0154,-4.3982,-0.4645


In [57]:
# Odds Ratios
or_table = np.exp(logit_model_n.conf_int()) #Exponentiate Confidence Intervals
or_table['Odds Ratio'] = np.exp(logit_model_n.params) #Exponentiate Coefficients

or_table.columns = ['5%', '95%', 'Odds Ratio'] #Name Columns
or_table

Unnamed: 0,5%,95%,Odds Ratio
open_ten_plus_dv,1.44,8.87,3.57
isnotchain_dv,1.18,10.31,3.49
respondent_own_dv,0.68,3.98,1.65
rent_dv,0.58,6.11,1.89
berkeley_dv,0.24,2.76,0.82
mission_dv,0.59,10.98,2.54
oakland_dv,0.51,7.36,1.94
fruitvale_dv,0.25,4.08,1.01
Intercept,0.01,0.63,0.09


In [65]:
y = business_df['financialrecoveryneg_dv'] #testing negative sentiment
ind_var = ['open_ten_plus_dv', 'isnotchain_dv', 'respondent_own_dv', 'rent_dv', 
           'other_type_dv', 'retail_dv', 'rest_dv']

x = business_df[ind_var].assign(Intercept = 1) #Independent Variables
y.value_counts()

logit_model_t = sm.Logit(y, x, missing='drop').fit()
logit_model_t.summary2()

Optimization terminated successfully.
         Current function value: 0.592072
         Iterations 6


0,1,2,3
Model:,Logit,Method:,MLE
Dependent Variable:,financialrecoveryneg_dv,Pseudo R-squared:,0.118
Date:,2024-12-04 14:04,AIC:,159.2814
No. Observations:,121,BIC:,181.6477
Df Model:,7,Log-Likelihood:,-71.641
Df Residuals:,113,LL-Null:,-81.269
Converged:,1.0000,LLR p-value:,0.0074178
No. Iterations:,6.0000,Scale:,1.0000

0,1,2,3,4,5,6
,Coef.,Std.Err.,z,P>|z|,[0.025,0.975]
open_ten_plus_dv,1.1769,0.4527,2.5997,0.0093,0.2896,2.0641
isnotchain_dv,1.2602,0.5548,2.2713,0.0231,0.1727,2.3476
respondent_own_dv,0.5910,0.4509,1.3108,0.1899,-0.2927,1.4747
rent_dv,0.7769,0.5931,1.3098,0.1902,-0.3856,1.9394
other_type_dv,2.1656,1.2424,1.7431,0.0813,-0.2694,4.6007
retail_dv,0.4665,0.6018,0.7752,0.4382,-0.7130,1.6461
rest_dv,0.6938,0.6917,1.0029,0.3159,-0.6620,2.0495
Intercept,-2.8741,0.9686,-2.9672,0.0030,-4.7726,-0.9756


In [66]:
# Odds Ratios
or_table = np.exp(logit_model_t.conf_int()) #Exponentiate Confidence Intervals
or_table['Odds Ratio'] = np.exp(logit_model_t.params) #Exponentiate Coefficients

or_table.columns = ['5%', '95%', 'Odds Ratio'] #Name Columns
or_table

Unnamed: 0,5%,95%,Odds Ratio
open_ten_plus_dv,1.34,7.88,3.24
isnotchain_dv,1.19,10.46,3.53
respondent_own_dv,0.75,4.37,1.81
rent_dv,0.68,6.95,2.17
other_type_dv,0.76,99.55,8.72
retail_dv,0.49,5.19,1.59
rest_dv,0.52,7.76,2.0
Intercept,0.01,0.38,0.06
