In [1]:
import pandas as pd
from scipy import stats
import statsmodels.api as sm

In [2]:
file_path = "cancerdeaths.csv"

# Candidate encodings, strictest first. UTF-8 and cp1252 both reject some byte
# sequences, so a failure there moves on to the next candidate. latin1 (an
# alias of ISO-8859-1) maps every possible byte to a character and therefore
# always decodes; it has to stay last or the other candidates are never tried.
encodings = ["utf-8", "cp1252", "latin1"]
for encoding in encodings:
    try:
        df = pd.read_csv(file_path, encoding=encoding)
    except UnicodeDecodeError:
        # Wrong guess for this file; try the next candidate.
        continue
    print(f"Successfully read the file using {encoding} encoding.")
    break
else:
    # Only reachable if the list above is edited to hold strict codecs only.
    # Fail here rather than leave `df` undefined for the cells that follow.
    raise ValueError(f"Could not decode {file_path!r} with any of {encodings}")

Successfully read the file using latin1 encoding.


In [3]:
# Preview the first five rows to sanity-check the parsed columns.
df.head(n=5)

Unnamed: 0,zipCode,countyCode,studyCount,State,PovertyEst,povertyPercent,medIncome,Name,popEst2015,County,incidenceRate,avgAnnCount,recentTrend,fiveYearTrend,countyName,deathRate,avgDeathsPerYear,recTrend
0,1001,25013,0,MA,80178,17.7,49072,Hampden County,470690,"Hampden County, Massachusetts(6,10)",442.9,2371.0,stable,-2.4,"Hampden County, Massachusetts",180.0,1004,falling
1,1008,25013,0,MA,80178,17.7,49072,Hampden County,470690,"Hampden County, Massachusetts(6,10)",442.9,2371.0,stable,-2.4,"Hampden County, Massachusetts",180.0,1004,falling
2,1009,25013,3,MA,80178,17.7,49072,Hampden County,470690,"Hampden County, Massachusetts(6,10)",442.9,2371.0,stable,-2.4,"Hampden County, Massachusetts",180.0,1004,falling
3,1010,25013,6,MA,80178,17.7,49072,Hampden County,470690,"Hampden County, Massachusetts(6,10)",442.9,2371.0,stable,-2.4,"Hampden County, Massachusetts",180.0,1004,falling
4,1011,25013,0,MA,80178,17.7,49072,Hampden County,470690,"Hampden County, Massachusetts(6,10)",442.9,2371.0,stable,-2.4,"Hampden County, Massachusetts",180.0,1004,falling


In [4]:
# Split rows at the mean study count: at or above the mean is "High",
# everything else is "Low". The comparison is done on the whole column at once
# and the resulting booleans are translated to labels.
study_count_mean = df["studyCount"].mean()
df["studyCountCategory"] = (df["studyCount"] >= study_count_mean).map(
    {True: "High", False: "Low"}
)
df["studyCountCategory"]

0         Low
1         Low
2        High
3        High
4         Low
         ... 
32546     Low
32547     Low
32548     Low
32549     Low
32550     Low
Name: studyCountCategory, Length: 32551, dtype: object

In [5]:
# 25th / 50th / 75th percentile cut points of medIncome, as an array indexed
# 0..2; categorize_income reads these as its band boundaries.
income_quartiles = df.loc[:, "medIncome"].quantile(q=[0.25, 0.5, 0.75]).values
def categorize_income(income, quartiles=None):
    """Return the income band label for a single median-income value.

    Parameters
    ----------
    income : number
        The value to classify.
    quartiles : sequence of three ascending numbers, optional
        Upper bounds (inclusive) of the "Very Low", "Low" and "High" bands.
        When omitted, the notebook-level ``income_quartiles`` is used.

    Returns
    -------
    str or None
        "Very Low", "Low", "High" or "Very High". ``None`` when ``income`` is
        missing: a NaN fails every ``<=`` comparison and would otherwise fall
        through to "Very High".
    """
    if pd.isna(income):
        return None

    cut_points = income_quartiles if quartiles is None else quartiles
    # Walk the bands from lowest to highest; the first bound that contains the
    # value decides the label.
    for upper_bound, label in zip(cut_points, ("Very Low", "Low", "High")):
        if income <= upper_bound:
            return label
    return "Very High"
# Label every row with its income band, then display the new column.
df["incomeCategory"] = df["medIncome"].map(categorize_income)
df["incomeCategory"]

0             High
1             High
2             High
3             High
4             High
           ...    
32546    Very High
32547    Very High
32548    Very High
32549    Very High
32550    Very High
Name: incomeCategory, Length: 32551, dtype: object

In [6]:
# Count the missing cells in each column and print the totals as plain text.
missing_values = df.isna().sum(axis=0)
print(missing_values)

zipCode               0
countyCode            0
studyCount            0
State                 0
PovertyEst            0
povertyPercent        0
medIncome             0
Name                  0
popEst2015            0
County                0
incidenceRate         0
avgAnnCount           0
recentTrend           0
fiveYearTrend         0
countyName            0
deathRate             0
avgDeathsPerYear      0
recTrend              0
studyCountCategory    0
incomeCategory        0
dtype: int64


In [7]:
# Two-sample t-test without the equal-variance assumption (equal_var=False),
# comparing incidenceRate between rows whose recentTrend is "stable" and rows
# whose recentTrend is "falling". Rows with any other recentTrend value are
# not part of this comparison.
# NOTE(review): the preview rows show one county's values repeated across
# several zip codes; if that holds for the whole file the rows are not
# independent samples and the p-value is overstated — confirm before
# interpreting.
stable_group = df.loc[df["recentTrend"] == "stable", "incidenceRate"]
falling_group = df.loc[df["recentTrend"] == "falling", "incidenceRate"]
t_stat_trend, p_value_trend = stats.ttest_ind(
    stable_group,
    falling_group,
    equal_var=False,
)
print("Recent Trend Test: t =", t_stat_trend, ", p =", p_value_trend)

Recent Trend Test: t = 11.57445905827237 , p = 7.8641394163962865e-31


In [8]:
# Two-sample t-test without the equal-variance assumption (equal_var=False),
# comparing incidenceRate between the "High" and "Low" studyCountCategory rows.
high_study_group = df.loc[df["studyCountCategory"] == "High", "incidenceRate"]
low_study_group = df.loc[df["studyCountCategory"] == "Low", "incidenceRate"]
t_stat_study, p_value_study = stats.ttest_ind(
    high_study_group,
    low_study_group,
    equal_var=False,
)
print("Study Count Test: t =", t_stat_study, ", p =", p_value_study)

Study Count Test: t = 5.594284049628392 , p = 2.4232514253449797e-08


In [9]:
# One-way ANOVA of incidenceRate across the four incomeCategory bands. The
# groups are passed in the fixed order Very Low, Low, High, Very High.
anova_stat_income, p_value_income = stats.f_oneway(
    *(
        df.loc[df["incomeCategory"] == label, "incidenceRate"]
        for label in ("Very Low", "Low", "High", "Very High")
    )
)
print("Income ANOVA Test: F =", anova_stat_income, ", p =", p_value_income)

Income ANOVA Test: F = 23.68649563799639 , p = 2.6285925112194223e-15


In [10]:
# Ordinary least squares of incidenceRate on four predictors plus an
# intercept column added by sm.add_constant; the fitted summary is printed.
y = df["incidenceRate"]
X = sm.add_constant(
    df[["medIncome", "povertyPercent", "studyCount", "popEst2015"]]
)
model = sm.OLS(y, X).fit()
print(model.summary())

                            OLS Regression Results                            
Dep. Variable:          incidenceRate   R-squared:                       0.013
Model:                            OLS   Adj. R-squared:                  0.013
Method:                 Least Squares   F-statistic:                     107.5
Date:                Thu, 27 Feb 2025   Prob (F-statistic):           3.46e-91
Time:                        09:07:08   Log-Likelihood:            -1.7122e+05
No. Observations:               32551   AIC:                         3.425e+05
Df Residuals:                   32546   BIC:                         3.425e+05
Df Model:                           4                                         
Covariance Type:            nonrobust                                         
                     coef    std err          t      P>|t|      [0.025      0.975]
----------------------------------------------------------------------------------
const            454.9549      2.548    178.