In [None]:
%matplotlib notebook
%matplotlib inline

In [None]:
# Import dependencies
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from scipy.stats import linregress
from scipy.stats import shapiro
import seaborn as sns #Visualization
from matplotlib.lines import Line2D # for the legend

In [None]:
# Load the census extract; the "datapoint" column becomes the row index and
# "Zipcode" is stored as a categorical column.
csvpath = "output_data/census.csv"
census_df = pd.read_csv(csvpath, index_col="datapoint").astype({"Zipcode": "category"})
# Show the loaded frame as the cell output
census_df

In [None]:
# Add percentage columns: each count column below is expressed as a percent of
# the same row's "Total Population" (note that the poverty-by-race columns are
# also divided by the total population, not by the race-specific population).
# Whole-column arithmetic replaces the former row-by-row `.at` loop; the new
# columns are created in the same order as before.
total_population = census_df["Total Population"]
percent_columns = {
    "% Black Population": "Black Population",
    "% White Population": "White Population",
    "% Unemployed": "Unemployed Count",
    "% Black Poverty": "Poverty Count - Black",
    "% White Poverty": "Poverty Count - White",
    "% Poverty": "Poverty Count",
}
for percent_name, count_name in percent_columns.items():
    census_df[percent_name] = 100 * (census_df[count_name] / total_population)

# Drop the plain Median Home Value / Income / Rent columns.
# NOTE: re-running this cell without reloading census_df raises a KeyError,
# because the columns are already gone.
census_df = census_df.drop(columns=["Median Home Value","Median Income","Median Rent"])
census_df

In [None]:
# Create separate DFs for each county. `.copy()` makes each one an independent
# frame, so the column assignments below never touch census_df.
oakland_data = census_df.loc[census_df["County"] == "Oakland"].copy()
kent_data = census_df.loc[census_df["County"] == "Kent"].copy()
saginaw_data = census_df.loc[census_df["County"] == "Saginaw"].copy()

# Cast Zipcode from "category" back to plain "object" in the per-county frames.
# NOTE(review): presumably so that later groupby("Zipcode") calls only produce
# groups for zipcodes present in that county - confirm.
oakland_data["Zipcode"] = oakland_data["Zipcode"].astype("object")
kent_data["Zipcode"] = kent_data["Zipcode"].astype("object")
saginaw_data["Zipcode"] = saginaw_data["Zipcode"].astype("object")

# Export separate dataframes for use in Map Plots ipynb
oakland_data.to_csv("output_data/oakland_data.csv",index_label="datapoint")
kent_data.to_csv("output_data/kent_data.csv",index_label="datapoint")
saginaw_data.to_csv("output_data/saginaw_data.csv",index_label="datapoint")

In [None]:
# Shapiro-Wilk normality test for each Oakland County column of interest.
# Each cell of the one-row result frame holds the value returned by shapiro().
shapiro_columns = [
    "Total Population",
    "Adjusted Median Income",
    "Adjusted Median Rent",
    "Adjusted Median Home Value",
    "% Black Population",
    "% White Population",
    "% Unemployed",
    "% Black Poverty",
    "% White Poverty",
]
oakland_shapiro_df = pd.DataFrame(
    {column: [shapiro(oakland_data[column])] for column in shapiro_columns}
)
oakland_shapiro_df

## None of the p-values (the second number in each result) are above 0.05, so we reject the hypothesis that the data come from a normal distribution

In [None]:
# Shapiro-Wilk normality test for each Saginaw County column of interest.
# Each cell of the one-row result frame holds the value returned by shapiro().
# Author's reading of the output: none of the p-values (second number) are
# over .05, so the data does not come from a normal distribution.
shapiro_columns = [
    "Total Population",
    "Adjusted Median Income",
    "Adjusted Median Rent",
    "Adjusted Median Home Value",
    "% Black Population",
    "% White Population",
    "% Unemployed",
    "% Black Poverty",
    "% White Poverty",
]
saginaw_shapiro_df = pd.DataFrame(
    {column: [shapiro(saginaw_data[column])] for column in shapiro_columns}
)
saginaw_shapiro_df

In [None]:
# Shapiro-Wilk normality test for each Kent County column of interest.
# Each cell of the one-row result frame holds the value returned by shapiro().
# Author's reading of the output: none of the p-values (second number) are
# over .05, so the data does not come from a normal distribution.
shapiro_columns = [
    "Total Population",
    "Adjusted Median Income",
    "Adjusted Median Rent",
    "Adjusted Median Home Value",
    "% Black Population",
    "% White Population",
    "% Unemployed",
    "% Black Poverty",
    "% White Poverty",
]
kent_shapiro_df = pd.DataFrame(
    {column: [shapiro(kent_data[column])] for column in shapiro_columns}
)
kent_shapiro_df

In [None]:
# Summary statistics (describe()) over all rows of census_df, i.e. all counties
# together; the "Year" column is removed from the resulting table.
summary_stats = census_df.describe().drop(columns=['Year'])
summary_stats

In [None]:
# Summary statistics for the Saginaw County rows only (without the "Year" column)
Saginaw_df = census_df.loc[census_df['County'] == 'Saginaw']
saginaw_summary = Saginaw_df.describe().drop(columns=['Year'])
# (not displayed; evaluate `saginaw_summary` to inspect it)

In [None]:
# Summary statistics for the Kent County rows only (without the "Year" column)
kent_df = census_df.loc[census_df['County'] == 'Kent']
kent_summary = kent_df.describe().drop(columns=['Year'])
# (not displayed; evaluate `kent_summary` to inspect it)

In [None]:
# Summary statistics for the Oakland County rows only (without the "Year" column)
Oakland_df = census_df.loc[census_df['County'] == 'Oakland']
oakland_summary = Oakland_df.describe().drop(columns=['Year'])
# (not displayed; evaluate `oakland_summary` to inspect it)

In [None]:
# Create dataframes grouped by zipcode for zipcode analysis: one row per
# zipcode holding the median of every numeric column.
# numeric_only=True is required because the frames still carry the text
# "County" column, which median() cannot aggregate (TypeError on pandas >= 2.0).
oakland_grouped = oakland_data.groupby("Zipcode").median(numeric_only=True)
kent_grouped = kent_data.groupby("Zipcode").median(numeric_only=True)
saginaw_grouped = saginaw_data.groupby("Zipcode").median(numeric_only=True)

# Income in Each County, by Zipcode **Tamica**

# Analysis for Oakland County Adjusted Median Income

In [None]:
# Scatter of each Oakland zipcode's value in oakland_grouped['Adjusted Median Income'];
# zipcodes are placed at evenly spaced x positions and used as tick labels.
fig = plt.figure(figsize=(18, 10))
label = oakland_grouped.index.tolist()
xaxis = np.arange(0, len(label))
plt.scatter(xaxis, oakland_grouped['Adjusted Median Income'], label='')
plt.xticks(ticks=xaxis, labels=label, rotation=90)

plt.title('Adjusted Median Income in Oakland County (from 2011-2020)', size = 20)
plt.xlabel('Zipcode', size = 10)
plt.ylabel('Adjusted Median Income', size = 12)
plt.grid(axis='y')

# Save the figure, then display it
plt.savefig('output_data/oakland_cty_adj_income_scatter.png')
plt.show()


In [None]:
# Adjusted Median Income by Zipcode (line plot of the per-zipcode values in
# oakland_grouped; zipcodes are used as x tick labels)

fig = plt.figure(figsize = (15,8));
xaxis = np.arange(0,len(oakland_grouped.index));
label = (oakland_grouped.index.tolist());
line = plt.plot(xaxis,oakland_grouped['Adjusted Median Income'],label="");
plt.xticks(ticks=xaxis,labels=label,rotation=90);

plt.xlabel("Zipcode");
# The plotted quantity is income; the axis was previously mislabelled "Population"
plt.ylabel("Adjusted Median Income");
plt.title("Oakland County Adjusted Median Income by Zipcode (2011-2020)");


#Save fig
plt.savefig('output_data/oakland_adj_med_income_line.png')
plt.show()

In [None]:
# Per-zipcode linear regression of Adjusted Median Income on Year for Oakland
# County. Only zipcodes whose fit has |R| > .9 and |slope| > 300 are plotted,
# each annotated with its regression equation.
fig = plt.figure(figsize = (15,8))
# Blank lists for different values
r_values=[]
zipcodes=[]
slopes=[]
y_ints= []
# Calculate regression equations for all zipcodes
for zipcode in oakland_data["Zipcode"].unique():
    filtered = oakland_data.loc[oakland_data["Zipcode"] == zipcode]
    filtered = filtered.sort_values(by=["Zipcode","Year"])
    x = filtered["Year"]
    y = filtered["Adjusted Median Income"]

    plt_slope,plt_int,plt_r,plt_p,plt_std_err = linregress(x,y)
    r_values.append(plt_r)
    zipcodes.append(zipcode)
    slopes.append(plt_slope)
    y_ints.append(plt_int)
# Limit the number of zipcodes to those with the most change and plot those
equations_df = pd.DataFrame({"Zipcode":zipcodes,"R":r_values,"Slope":slopes,"Y-intercept":y_ints})
equations_df = equations_df.loc[(abs(equations_df["R"]) > .9) & (abs(equations_df["Slope"])> 300)]
# Left-merge the yearly rows back in, so each kept zipcode carries its
# (constant) R / Slope / Y-intercept next to every yearly observation
equations_df = pd.merge(equations_df,oakland_data,on="Zipcode",how="left")
equations_df = equations_df.sort_values(by=["Zipcode","Year"])
plotted_zipcodes = equations_df["Zipcode"].unique()
# y data-coordinate of the first annotation; each later one is placed 5500 lower
ann = 150000
for zipcode in plotted_zipcodes:
    filtered_list = equations_df.loc[equations_df["Zipcode"]== zipcode]
    x = filtered_list["Year"]
    y = filtered_list["Adjusted Median Income"]
    plt.plot(x,y)
    # The fit values repeat on every row of a zipcode, so mean() just recovers them
    plt.annotate(f"{zipcode}: y={round((filtered_list['Slope'].mean()),2)}x + {round((filtered_list['Y-intercept'].mean()),2)} R = {round((filtered_list['R'].mean()),2)}", xy=(2020,ann))
    ann -= 5500
# Build the legend once, after every line exists (previously rebuilt per iteration)
if len(plotted_zipcodes) > 0:
    plt.legend(plotted_zipcodes,loc=1)
# The plotted column is the adjusted income, so the title says so
plt.title("Oakland County Adjusted Median Income by Zipcode (2011-2020)")
plt.xlabel("Year")
plt.ylabel("Adjusted Median Income")


#Save Fig
plt.savefig("output_data/oakland_linregress_adj_median_income.png")
plt.show()

In [None]:
# Oakland County: one Adjusted Median Income line per zipcode, plus a single
# regression line fitted over all Oakland rows (Year vs. Adjusted Median Income).
oakland = census_df.loc[census_df['County'] == 'Oakland']
oakland_df = oakland.sort_values('Year', ascending=True).set_index('Year')
oakland_df.groupby('Zipcode')['Adjusted Median Income'].plot()
plt.legend(loc='upper left', fontsize='medium', bbox_to_anchor=(-1.4,1.0), ncol= 3)
plt.title('Oakland Adjusted Median Income by Zipcode (2011-2020)', size = 16)
plt.xlabel('Year', size = 14)
plt.ylabel('Adjusted Median Income', size = 14)
plt.xlim(2011, 2020)

# Fit on the Year index and draw the fitted line in red
slope, intercept, rvalue, pvalue, stderr = linregress(oakland_df.index, oakland_df['Adjusted Median Income'])
regress_values = oakland_df.index * slope + intercept
line_eq = 'y = ' + str(round(slope,2)) + 'x + ' + str(round(intercept,2))
plt.plot(oakland_df.index, regress_values, 'r-')
plt.annotate(line_eq, (2020, 180000), fontsize= 13, color= 'black')
print(f'The r-value is: {rvalue}')

# Save the figure (no extension is given in the path)
plt.savefig("output_data/oakland_adj_med_income_value_zipcodes_with_regress")

# Analysis for Kent County Adjusted Median Income

In [None]:
# Scatter of each Kent zipcode's value in kent_grouped['Adjusted Median Income'];
# zipcodes are placed at evenly spaced x positions and used as tick labels.
fig = plt.figure(figsize=(18, 10))
label = kent_grouped.index.tolist()
xaxis = np.arange(0, len(label))
plt.scatter(xaxis, kent_grouped['Adjusted Median Income'], label='')
plt.xticks(ticks=xaxis, labels=label, rotation=90)

plt.title('Adjusted Median Income in Kent County (2011-2020)', size = 20)
plt.xlabel('Zipcode', size = 10)
plt.ylabel('Adjusted Median Income', size = 12)
plt.grid(axis='y')

# Save the figure, then display it
plt.savefig('output_data/kent_cty_adj_income_scatter.png')
plt.show()

In [None]:
# Adjusted Median Income by Zipcode (line plot of the per-zipcode values in
# kent_grouped; zipcodes are used as x tick labels)

fig = plt.figure(figsize = (15,8));
xaxis = np.arange(0,len(kent_grouped.index));
label = (kent_grouped.index.tolist());
line = plt.plot(xaxis,kent_grouped['Adjusted Median Income'],label="");
plt.xticks(ticks=xaxis,labels=label,rotation=90);

plt.xlabel("Zipcode");
# The plotted quantity is income; the axis was previously mislabelled "Population"
plt.ylabel("Adjusted Median Income");
plt.title("Kent County Adjusted Median Income by Zipcode (2011-2020)");



#Save fig
plt.savefig('output_data/kent_adj_med_income_line.png')
plt.show()

In [None]:
# Per-zipcode linear regression of Adjusted Median Income on Year for Kent
# County; zipcodes with |R| > .6 and |slope| > 300 are plotted and annotated.
fig = plt.figure(figsize = (13,8))
# Accumulators for the per-zipcode fit results
r_values, zipcodes, slopes, y_ints = [], [], [], []
# Fit one regression per zipcode
for zipcode in kent_data["Zipcode"].unique():
    in_zipcode = kent_data["Zipcode"] == zipcode
    filtered = kent_data.loc[in_zipcode].sort_values(by=["Zipcode","Year"])
    x = filtered["Year"]
    y = filtered["Adjusted Median Income"]

    plt_slope, plt_int, plt_r, plt_p, plt_std_err = linregress(x, y)
    zipcodes.append(zipcode)
    slopes.append(plt_slope)
    y_ints.append(plt_int)
    r_values.append(plt_r)

# Keep only the zipcodes passing both thresholds, then attach their yearly rows
equations_df = pd.DataFrame({"Zipcode":zipcodes,"R":r_values,"Slope":slopes,"Y-intercept":y_ints})
strong_fit = abs(equations_df["R"]) > .6
steep_trend = abs(equations_df["Slope"]) > 300
equations_df = equations_df.loc[strong_fit & steep_trend]
equations_df = pd.merge(equations_df, kent_data, on="Zipcode", how="left").sort_values(by=["Zipcode","Year"])

# y data-coordinate of the first annotation; each later one is placed 5500 lower
ann = 70000
for zipcode in equations_df["Zipcode"].unique():
    filtered_list = equations_df.loc[equations_df["Zipcode"] == zipcode]
    x = filtered_list["Year"]
    y = filtered_list["Adjusted Median Income"]
    plt.plot(x, y)
    plt.legend(equations_df["Zipcode"].unique(), loc=1)
    slope_text = round((filtered_list['Slope'].mean()),2)
    intercept_text = round((filtered_list['Y-intercept'].mean()),2)
    r_text = round((filtered_list['R'].mean()),2)
    plt.annotate(f"{zipcode}: y={slope_text}x + {intercept_text} R = {r_text}", xy=(2020,ann))
    ann -= 5500

plt.title("Kent County Adjusted Median Income by Zipcode (2011-2020)")
plt.xlabel("Year")
plt.ylabel("Adjusted Median Income")

# Save the figure, then display it
plt.savefig("output_data/kent_linregress_income.png")
plt.show()

# Analysis for Saginaw County Adjusted Median Income

In [None]:
# Scatter of each Saginaw zipcode's value in saginaw_grouped['Adjusted Median Income'];
# zipcodes are placed at evenly spaced x positions and used as tick labels.
fig = plt.figure(figsize=(18, 10))
label = saginaw_grouped.index.tolist()
xaxis = np.arange(0, len(label))
plt.scatter(xaxis, saginaw_grouped['Adjusted Median Income'], label='')
plt.xticks(ticks=xaxis, labels=label, rotation=90)

plt.title('Adjusted Median Income in Saginaw County (2011-2020)', size = 20)
plt.xlabel('Zipcode', size = 10)
plt.ylabel('Adjusted Median Income', size = 12)
plt.grid(axis='y')

# Save the figure, then display it
plt.savefig('output_data/saginaw_cty_adj_income_scatter.png')
plt.show()

In [None]:
# Per-zipcode linear regression of Adjusted Median Income on Year for Saginaw
# County; zipcodes with |R| > .5 and |slope| > 300 are plotted and annotated.
fig = plt.figure(figsize = (15,8))
# Accumulators for the per-zipcode fit results
r_values, zipcodes, slopes, y_ints = [], [], [], []
# Fit one regression per zipcode
for zipcode in saginaw_data["Zipcode"].unique():
    in_zipcode = saginaw_data["Zipcode"] == zipcode
    filtered = saginaw_data.loc[in_zipcode].sort_values(by=["Zipcode","Year"])
    x = filtered["Year"]
    y = filtered["Adjusted Median Income"]

    plt_slope, plt_int, plt_r, plt_p, plt_std_err = linregress(x, y)
    zipcodes.append(zipcode)
    slopes.append(plt_slope)
    y_ints.append(plt_int)
    r_values.append(plt_r)

# Keep only the zipcodes passing both thresholds, then attach their yearly rows
equations_df = pd.DataFrame({"Zipcode":zipcodes,"R":r_values,"Slope":slopes,"Y-intercept":y_ints})
strong_fit = abs(equations_df["R"]) > .5
steep_trend = abs(equations_df["Slope"]) > 300
equations_df = equations_df.loc[strong_fit & steep_trend]
equations_df = pd.merge(equations_df, saginaw_data, on="Zipcode", how="left").sort_values(by=["Zipcode","Year"])

# y data-coordinate of the first annotation; each later one is placed 3400 lower
ann = 50000
for zipcode in equations_df["Zipcode"].unique():
    filtered_list = equations_df.loc[equations_df["Zipcode"] == zipcode]
    x = filtered_list["Year"]
    y = filtered_list["Adjusted Median Income"]
    plt.plot(x, y)
    plt.legend(equations_df["Zipcode"].unique(), loc=1)
    slope_text = round((filtered_list['Slope'].mean()),2)
    intercept_text = round((filtered_list['Y-intercept'].mean()),2)
    r_text = round((filtered_list['R'].mean()),2)
    plt.annotate(f"{zipcode}: y={slope_text}x + {intercept_text} R = {r_text}", xy=(2020,ann))
    ann -= 3400

plt.title("Saginaw County Adjusted Median Income by Zipcode (2011-2020)")
plt.xlabel("Year")
plt.ylabel("Adjusted Median Income")

# Save the figure, then display it
plt.savefig("output_data/saginaw_linregress_income_line.png")
plt.show()

In [None]:
# Regression section: re-read the CSV at `csvpath`.
# NOTE: this rebinds census_df as well as df, so from here on census_df is the
# freshly loaded frame (without the derived "%" columns added earlier).
census_df = pd.read_csv(csvpath, index_col="datapoint")
df = census_df
print('\nNumber of rows and columns in the data set: ', df.shape)
print('')
df.head()

In [None]:
# Scatter of Adjusted Median Income against Year for every row of census_df.
# Columns are selected by name (matching the axis labels) instead of by the
# positional indices 0 and 12, which silently break if the CSV column order changes.
# NOTE(review): assumes positions 0 / 12 were "Year" / "Adjusted Median Income" - confirm.
plt.scatter(census_df['Year'], census_df['Adjusted Median Income']);
plt.xlabel('Year');
plt.ylabel('Adjusted Median Income');

#Save fig
plt.savefig('output_data/all_cty_adj_med_income_scatter.png')
plt.show()

In [None]:
# Seaborn regression plot of Adjusted Median Income against Year over all rows of df
sns.lmplot(data=df, x='Year', y='Adjusted Median Income', height=6, aspect=2)
plt.title('Change in Income Over Time (All Counties)')
plt.xlabel('Total Census Period')
plt.ylabel('County Adjusted Median Income')

# Save the figure
plt.savefig('output_data/income_change_over_time_all_cty.png')

In [None]:
# Box plot of Adjusted Median Income per Year, split by County
plt.figure(figsize=(14,6))
sns.boxplot(data=df, x='Year', y='Adjusted Median Income', hue='County', palette='rainbow')
plt.title('Box plot of Adjusted County Median Income')

# Save the figure
plt.savefig('output_data/all_cty_median_income_box.png')

In [None]:
# correlation plot
# Correlate only the numeric columns: df also holds the text "County" column,
# which DataFrame.corr() refuses on pandas >= 2.0, and the tick labels below
# are built from the numeric columns, so the matrix and labels now always agree.
numeric_df = df.select_dtypes(['number'])
tick_positions = range(numeric_df.shape[1])
f = plt.figure(figsize=(19, 15));
plt.matshow(numeric_df.corr(), fignum=f.number);
plt.xticks(tick_positions, numeric_df.columns, fontsize=14, rotation=90);
plt.yticks(tick_positions, numeric_df.columns, fontsize=14);
cb = plt.colorbar();
cb.ax.tick_params(labelsize=14);
plt.title('Correlation Matrix (all data)', fontsize=16);

#Save fig
plt.savefig('output_data/correlation_cmatrix.png')
plt.show()

This graph shows the correlation between each pair of variables in the dataframe. Yellow corresponds to a correlation score of 1.0; the diagonal line in the center shows that every variable is perfectly correlated with itself. The deep teal cells show no correlation, and the purple cells show pairs that have a negative correlation, such as "Adjusted Median Rent" and "Total Population".

# Home Values in Each County, by Zipcode **Tamica**

# Analysis for Oakland County Adjusted Median Home Value

In [None]:
# Scatter of each Oakland zipcode's value in
# oakland_grouped['Adjusted Median Home Value']; zipcodes are the x tick labels.
fig = plt.figure(figsize=(18, 10));
xaxis = np.arange(0, len(oakland_grouped.index));
label = oakland_grouped.index.tolist();
plt.scatter(xaxis, oakland_grouped['Adjusted Median Home Value'],
                    label='');
plt.xticks(ticks=xaxis, labels=label, rotation=90);
plt.xlabel('Zipcode', size = 10);
plt.ylabel('Adjusted Median Home Value', size = 12);
plt.title('Adjusted Median Home Value in Oakland County (2011-2020)', size = 20);
plt.grid(axis='y');


# Save the figure under its own name; it previously reused the income
# scatter's filename and overwrote that image.
plt.savefig('output_data/oakland_cty_adj_home_value_scatter.png')
# Show plot
plt.show()

In [None]:
# Adjusted Median Home Value by Zipcode (line plot of the per-zipcode values in
# oakland_grouped; zipcodes are used as x tick labels)

fig = plt.figure(figsize = (15,8));
xaxis = np.arange(0,len(oakland_grouped.index));
label = (oakland_grouped.index.tolist());
line = plt.plot(xaxis,oakland_grouped['Adjusted Median Home Value'],label="");
plt.xticks(ticks=xaxis,labels=label,rotation=90);

plt.xlabel("Zipcode");
# The plotted quantity is home value; the axis was previously mislabelled "Population"
plt.ylabel("Adjusted Median Home Value");
plt.title("Oakland County Median Home Value by Zipcode (2011-2020)");


#Save fig
plt.savefig('output_data/oakland_cty_adj_home_value_line.png')
plt.show()

In [None]:
# Per-zipcode linear regression of Adjusted Median Home Value on Year for
# Oakland County. Only zipcodes whose fit has |R| > .8 and |slope| > 300 are
# plotted, each annotated with its regression equation.
# (`fig` was previously misspelled `ig`.)
fig = plt.figure(figsize = (15,8))
# Blank lists for different values
r_values=[]
zipcodes=[]
slopes=[]
y_ints= []
# Calculate regression equations for all zipcodes
for zipcode in oakland_data["Zipcode"].unique():
    filtered = oakland_data.loc[oakland_data["Zipcode"] == zipcode]
    filtered = filtered.sort_values(by=["Zipcode","Year"])
    x = filtered["Year"]
    y = filtered["Adjusted Median Home Value"]

    plt_slope,plt_int,plt_r,plt_p,plt_std_err = linregress(x,y)
    r_values.append(plt_r)
    zipcodes.append(zipcode)
    slopes.append(plt_slope)
    y_ints.append(plt_int)
# Limit the number of zipcodes to those with the most change and plot those
equations_df = pd.DataFrame({"Zipcode":zipcodes,"R":r_values,"Slope":slopes,"Y-intercept":y_ints})
equations_df = equations_df.loc[(abs(equations_df["R"]) > .8) & (abs(equations_df["Slope"])> 300)]
# Left-merge the yearly rows back in, so each kept zipcode carries its
# (constant) R / Slope / Y-intercept next to every yearly observation
equations_df = pd.merge(equations_df,oakland_data,on="Zipcode",how="left")
equations_df = equations_df.sort_values(by=["Zipcode","Year"])
plotted_zipcodes = equations_df["Zipcode"].unique()
# y data-coordinate of the first annotation; each later one is placed 20200 lower
ann = 400000
for zipcode in plotted_zipcodes:
    filtered_list = equations_df.loc[equations_df["Zipcode"]== zipcode]
    x = filtered_list["Year"]
    y = filtered_list["Adjusted Median Home Value"]
    plt.plot(x,y)
    # The fit values repeat on every row of a zipcode, so mean() just recovers them
    plt.annotate(f"{zipcode}: y={round((filtered_list['Slope'].mean()),2)}x + {round((filtered_list['Y-intercept'].mean()),2)} R = {round((filtered_list['R'].mean()),2)}", xy=(2020,ann))
    ann -= 20200
# Build the legend once, after every line exists (previously rebuilt per iteration)
if len(plotted_zipcodes) > 0:
    plt.legend(plotted_zipcodes,loc=1)
plt.title("Oakland County Adjusted Median Home Value by Zipcode (2011-2020)")
plt.xlabel("Year")
plt.ylabel("Adjusted Median Home Value")


#Save Fig
plt.savefig("output_data/oakland_linregress_adj_median_home_value.png")
plt.show()

In [None]:
# Oakland County: one Adjusted Median Home Value line per zipcode, plus a single
# regression line fitted over all Oakland rows (Year vs. Adjusted Median Home Value).
oakland = census_df.loc[census_df['County'] == 'Oakland']
oakland_df = oakland.sort_values('Year', ascending=True).set_index('Year')
oakland_df.groupby('Zipcode')['Adjusted Median Home Value'].plot()
plt.legend(loc='upper left', fontsize='medium', bbox_to_anchor=(-1.4,1.0), ncol= 3)
plt.title('Oakland Adjusted Median Home Value by Zipcode (2011-2020)', size = 16)
plt.xlabel('Year', size = 14)
plt.ylabel('Adjusted Median Home Value', size = 14)
plt.xlim(2011, 2020)

# Fit on the Year index and draw the fitted line in red
slope, intercept, rvalue, pvalue, stderr = linregress(oakland_df.index, oakland_df['Adjusted Median Home Value'])
regress_values = oakland_df.index * slope + intercept
line_eq = 'y = ' + str(round(slope,2)) + 'x + ' + str(round(intercept,2))
plt.plot(oakland_df.index, regress_values, 'r-')
plt.annotate(line_eq, (2020, 180000), fontsize= 13, color= 'black')
print(f'The r-value is: {rvalue}')

# Save the figure (no extension is given in the path)
plt.savefig("output_data/oakland_adj_med_home_value_zipcodes_with_regress")

# Analysis for Kent County Adjusted Median Home Value

In [None]:
# Scatter of each Kent zipcode's value in kent_grouped['Adjusted Median Home Value'];
# zipcodes are placed at evenly spaced x positions and used as tick labels.
fig = plt.figure(figsize=(18, 10))
label = kent_grouped.index.tolist()
xaxis = np.arange(0, len(label))
plt.scatter(xaxis, kent_grouped['Adjusted Median Home Value'], label='')
plt.xticks(ticks=xaxis, labels=label, rotation=90)

plt.title('Adjusted Median Home Value in Kent County (2011-2020)', size = 20)
plt.xlabel('Zipcode', size = 10)
plt.ylabel('Adjusted Median Home Value', size = 12)
plt.grid(axis='y')

# Save the figure, then display it
plt.savefig('output_data/kent_cty_adj_med_home_value_scatter.png')
plt.show()

In [None]:
# Line plot of kent_grouped['Adjusted Median Home Value'] across zipcodes
# (the x axis is the zipcode, shown as tick labels)
fig = plt.figure(figsize = (15,8))
label = (kent_grouped.index.tolist())
xaxis = np.arange(0, len(label))
line = plt.plot(xaxis, kent_grouped['Adjusted Median Home Value'], label="")
plt.xticks(ticks=xaxis, labels=label, rotation=90)

plt.title("Kent County Median Home Value by Zipcode (2011-2020)")
plt.xlabel("Zipcode")
plt.ylabel("Adjusted Median Home Value")

# Save the figure, then display it
plt.savefig('output_data/kent_cty_adj_home_value_line.png')
plt.show()

In [None]:
# Per-zipcode linear regression of Adjusted Median Home Value on Year for Kent
# County; zipcodes with |R| > .7 and |slope| > 300 are plotted and annotated.
fig = plt.figure(figsize = (15,8))
# Accumulators for the per-zipcode fit results
r_values, zipcodes, slopes, y_ints = [], [], [], []
# Fit one regression per zipcode
for zipcode in kent_data["Zipcode"].unique():
    in_zipcode = kent_data["Zipcode"] == zipcode
    filtered = kent_data.loc[in_zipcode].sort_values(by=["Zipcode","Year"])
    x = filtered["Year"]
    y = filtered["Adjusted Median Home Value"]

    plt_slope, plt_int, plt_r, plt_p, plt_std_err = linregress(x, y)
    zipcodes.append(zipcode)
    slopes.append(plt_slope)
    y_ints.append(plt_int)
    r_values.append(plt_r)

# Keep only the zipcodes passing both thresholds, then attach their yearly rows
equations_df = pd.DataFrame({"Zipcode":zipcodes,"R":r_values,"Slope":slopes,"Y-intercept":y_ints})
strong_fit = abs(equations_df["R"]) > .7
steep_trend = abs(equations_df["Slope"]) > 300
equations_df = equations_df.loc[strong_fit & steep_trend]
equations_df = pd.merge(equations_df, kent_data, on="Zipcode", how="left").sort_values(by=["Zipcode","Year"])

# y data-coordinate of the first annotation; each later one is placed 7200 lower
ann = 200000
for zipcode in equations_df["Zipcode"].unique():
    filtered_list = equations_df.loc[equations_df["Zipcode"] == zipcode]
    x = filtered_list["Year"]
    y = filtered_list["Adjusted Median Home Value"]
    plt.plot(x, y)
    plt.legend(equations_df["Zipcode"].unique(), loc=1)
    slope_text = round((filtered_list['Slope'].mean()),2)
    intercept_text = round((filtered_list['Y-intercept'].mean()),2)
    r_text = round((filtered_list['R'].mean()),2)
    plt.annotate(f"{zipcode}: y={slope_text}x + {intercept_text} R = {r_text}", xy=(2020,ann))
    ann -= 7200

plt.title("Kent County Median Home Value by Zipcode (2011-2020)")
plt.xlabel("Year")
plt.ylabel("Adjusted Median Home Value")

# Save the figure, then display it
plt.savefig('output_data/kent_linregress_adj_median_home_value.png')
plt.show()

# Analysis for Saginaw County Adjusted Median Home Value

In [None]:
# Saginaw County: scatter of each zipcode's value in
# saginaw_grouped['Adjusted Median Home Value']; zipcodes are the x tick labels.
fig = plt.figure(figsize=(18, 10))
label = saginaw_grouped.index.tolist()
xaxis = np.arange(0, len(label))
plt.scatter(xaxis, saginaw_grouped['Adjusted Median Home Value'], label='')
plt.xticks(ticks=xaxis, labels=label, rotation=90)

plt.title('Adjusted Median Home Value in Saginaw County from 2011-2020', size = 20)
plt.xlabel('Zipcode', size = 10)
plt.ylabel('Adjusted Median Home Value', size = 12)
plt.grid(axis='y')

# Save the figure, then display it
plt.savefig('output_data/saginaw_cty_adj_home_values.png')
plt.show()

In [None]:
# Line plot of saginaw_grouped['Adjusted Median Home Value'] across zipcodes
# (the x axis is the zipcode, shown as tick labels)
fig = plt.figure(figsize = (15,8))
label = (saginaw_grouped.index.tolist())
xaxis = np.arange(0, len(label))
line = plt.plot(xaxis, saginaw_grouped['Adjusted Median Home Value'], label="")
plt.xticks(ticks=xaxis, labels=label, rotation=90)

plt.title("Saginaw County Median Home Value by Zipcode (2011-2020)")
plt.xlabel("Zipcode")
plt.ylabel("Adjusted Median Home Value")

# Save the figure, then display it
plt.savefig('output_data/saginaw_cty_adj_home_value_line.png')
plt.show()


In [None]:
# Per-zipcode linear regression of Adjusted Median Home Value on Year for
# Saginaw County; zipcodes with |R| > .5 and |slope| > 300 are plotted and annotated.
fig = plt.figure(figsize = (15,8))
# Accumulators for the per-zipcode fit results
r_values, zipcodes, slopes, y_ints = [], [], [], []
# Fit one regression per zipcode
for zipcode in saginaw_data["Zipcode"].unique():
    in_zipcode = saginaw_data["Zipcode"] == zipcode
    filtered = saginaw_data.loc[in_zipcode].sort_values(by=["Zipcode","Year"])
    x = filtered["Year"]
    y = filtered["Adjusted Median Home Value"]

    plt_slope, plt_int, plt_r, plt_p, plt_std_err = linregress(x, y)
    zipcodes.append(zipcode)
    slopes.append(plt_slope)
    y_ints.append(plt_int)
    r_values.append(plt_r)

# Keep only the zipcodes passing both thresholds, then attach their yearly rows
equations_df = pd.DataFrame({"Zipcode":zipcodes,"R":r_values,"Slope":slopes,"Y-intercept":y_ints})
strong_fit = abs(equations_df["R"]) > .5
steep_trend = abs(equations_df["Slope"]) > 300
equations_df = equations_df.loc[strong_fit & steep_trend]
equations_df = pd.merge(equations_df, saginaw_data, on="Zipcode", how="left").sort_values(by=["Zipcode","Year"])

# y data-coordinate of the first annotation; each later one is placed 3000 lower
ann = 150000
for zipcode in equations_df["Zipcode"].unique():
    filtered_list = equations_df.loc[equations_df["Zipcode"] == zipcode]
    x = filtered_list["Year"]
    y = filtered_list["Adjusted Median Home Value"]
    plt.plot(x, y)
    plt.legend(equations_df["Zipcode"].unique(), loc=1)
    slope_text = round((filtered_list['Slope'].mean()),2)
    intercept_text = round((filtered_list['Y-intercept'].mean()),2)
    r_text = round((filtered_list['R'].mean()),2)
    plt.annotate(f"{zipcode}: y={slope_text}x + {intercept_text} R = {r_text}", xy=(2020,ann))
    ann -= 3000

plt.title("Saginaw County Median Home Value by Zipcode (2011-2020)")
plt.xlabel("Year")
plt.ylabel("Adjusted Median Home Value")

# Save the figure, then display it
plt.savefig('output_data/saginaw_linregress_adj_home_value.png')
plt.show()

In [None]:
# Same Saginaw home-value regression with a stricter filter: only zipcodes
# with |R| > .8 and |slope| > 300 are plotted and annotated.
fig = plt.figure(figsize = (15,8))
# Accumulators for the per-zipcode fit results
r_values, zipcodes, slopes, y_ints = [], [], [], []
# Fit one regression per zipcode
for zipcode in saginaw_data["Zipcode"].unique():
    in_zipcode = saginaw_data["Zipcode"] == zipcode
    filtered = saginaw_data.loc[in_zipcode].sort_values(by=["Zipcode","Year"])
    x = filtered["Year"]
    y = filtered["Adjusted Median Home Value"]

    plt_slope, plt_int, plt_r, plt_p, plt_std_err = linregress(x, y)
    zipcodes.append(zipcode)
    slopes.append(plt_slope)
    y_ints.append(plt_int)
    r_values.append(plt_r)

# Keep only the zipcodes passing both thresholds, then attach their yearly rows
equations_df = pd.DataFrame({"Zipcode":zipcodes,"R":r_values,"Slope":slopes,"Y-intercept":y_ints})
strong_fit = abs(equations_df["R"]) > .8
steep_trend = abs(equations_df["Slope"]) > 300
equations_df = equations_df.loc[strong_fit & steep_trend]
equations_df = pd.merge(equations_df, saginaw_data, on="Zipcode", how="left").sort_values(by=["Zipcode","Year"])

# y data-coordinate of the first annotation; each later one is placed 10000 lower
ann = 120000
for zipcode in equations_df["Zipcode"].unique():
    filtered_list = equations_df.loc[equations_df["Zipcode"] == zipcode]
    x = filtered_list["Year"]
    y = filtered_list["Adjusted Median Home Value"]
    plt.plot(x, y)
    plt.legend(equations_df["Zipcode"].unique(), loc=1)
    slope_text = round((filtered_list['Slope'].mean()),2)
    intercept_text = round((filtered_list['Y-intercept'].mean()),2)
    r_text = round((filtered_list['R'].mean()),2)
    plt.annotate(f"{zipcode}: y={slope_text}x + {intercept_text} R = {r_text}", xy=(2020,ann))
    ann -= 10000

plt.title("Saginaw County Adjusted Median Home Value by Zipcode (2011-2020)")
plt.xlabel("Year")
plt.ylabel("Adjusted Median Home Value")

# Save the figure, then display it
plt.savefig("output_data/saginaw_linregress_adj_median_home_value.png")
plt.show()

# Variable Comparisons **Tamica**

In [None]:
# Box plot of Adjusted Median Home Value per Year, split by County
plt.figure(figsize=(14,6))
sns.boxplot(data=df, x='Year', y='Adjusted Median Home Value', hue='County', palette='rainbow')
plt.title('Box plot of Adjusted Median Home Value')

# Save the figure (the filename contains a space)
plt.savefig('output_data/box_median home_value_counties.png')

In [None]:
# Two side-by-side scatter plots against Year, coloured by County:
# left = Adjusted Median Income, right = Adjusted Median Home Value.
f = plt.figure(figsize=(14,6))

ax = f.add_subplot(1, 2, 1)
sns.scatterplot(data=df, x='Year', y='Adjusted Median Income', hue='County', palette='magma', ax=ax)
ax.set_title('Scatter plot of County Adjusted Median Income')

# The right-hand panel is the current axes once created; it is passed explicitly too
ax = f.add_subplot(1, 2, 2)
sns.scatterplot(data=df, x='Year', y='Adjusted Median Home Value', hue='County', palette='viridis', ax=ax)
ax.set_title('Scatter plot of County Adjusted Median Home Value')

# Save the figure
plt.savefig('output_data/income_home_value_comparitive_scatters_all_counties.png')

In [None]:
# Two side-by-side scatter plots against Year, coloured by County:
# left = Adjusted Median Rent, right = Adjusted Median Home Value.
f = plt.figure(figsize=(14,6))

ax = f.add_subplot(1, 2, 1)
sns.scatterplot(data=df, x='Year', y='Adjusted Median Rent', hue='County', palette='plasma', ax=ax)
ax.set_title('Scatter plot of County Adjusted Median Rent')

# The right-hand panel is the current axes once created; it is passed explicitly too
ax = f.add_subplot(1, 2, 2)
sns.scatterplot(data=df, x='Year', y='Adjusted Median Home Value', hue='County', palette='coolwarm', ax=ax)
ax.set_title('Scatter plot of County Adjusted Median Home Value')

# Save the figure
plt.savefig('output_data/income_rent_comparitive_scatters_all_counties.png')

In [None]:
# Scatter of Adjusted Median Home Value against Adjusted Median Rent for all
# rows of df, coloured by County (drawn in the left half of the figure).
f = plt.figure(figsize=(14,6));
ax = f.add_subplot(121);
sns.scatterplot(x='Adjusted Median Rent',y='Adjusted Median Home Value',data=df,palette='magma',hue='County',ax=ax);
ax.set_title('Scatter plot of County Adjusted Median Home Value vs. Adjusted Median Rent');

#Save plot
# Own filename: this cell previously wrote to
# 'income_rent_comparitive_scatters_all_counties.png', overwriting the figure
# saved by the rent / home-value-by-year cell.
plt.savefig('output_data/home_value_vs_rent_scatter_all_counties.png')

In [None]:
# correlation plot for Oakland County only
# The matrix is computed from the Oakland rows of df, matching the title and
# output filename (it was previously computed over every county). A leftover
# loop over Oakland zipcodes whose results were never used has been removed.
# Only numeric columns are correlated: DataFrame.corr() refuses the text
# "County" column on pandas >= 2.0, and the tick labels use the same columns.
oakland_numeric = df.loc[df["County"] == "Oakland"].select_dtypes(['number'])
tick_positions = range(oakland_numeric.shape[1])
f = plt.figure(figsize=(19, 15));
plt.matshow(oakland_numeric.corr(), fignum=f.number);
plt.xticks(tick_positions, oakland_numeric.columns, fontsize=14, rotation=90);
plt.yticks(tick_positions, oakland_numeric.columns, fontsize=14);
cb = plt.colorbar();
cb.ax.tick_params(labelsize=14);
plt.title('Correlation Matrix (Oakland County)', fontsize=16,y=1)


#Save fig
plt.savefig('output_data/oakland_correlation_cmatrix.png')

In [None]:
# Scatter of Poverty Count - White against Unemployed Count for all rows of df,
# coloured by County (drawn in the left half of the figure).
f = plt.figure(figsize=(14,6))
ax = f.add_subplot(1, 2, 1)
sns.scatterplot(data=df, x='Unemployed Count', y='Poverty Count - White', hue='County', palette='Paired', ax=ax)
ax.set_title('Scatter plot of County Unemployed Count vs. Poverty Count - White')

# Save the figure, then display it
plt.savefig('output_data/unemployment_pcw_comparitive_scatter_all_counties.png')
plt.show()

In [None]:
# County-colored scatter of unemployed count against the black poverty count.
f = plt.figure(figsize=(14, 6))
ax = f.add_subplot(1, 2, 1)  # left half of a 1x2 grid; the right half stays empty
sns.scatterplot(data=df, x='Unemployed Count', y='Poverty Count - Black', hue='County', palette='cividis', ax=ax)
ax.set_title('Scatter plot of County Unemployed Count vs. Poverty Count - Black')

# Write the figure to disk before it is shown
plt.savefig('output_data/unemployment_pcb_comparitive_scatter_all_counties.png')

plt.show()

In [None]:
# County-colored scatter of adjusted median income against the black poverty count.
f = plt.figure(figsize=(14, 6))
ax = f.add_subplot(1, 2, 1)  # left half of a 1x2 grid; the right half stays empty
sns.scatterplot(data=df, x='Adjusted Median Income', y='Poverty Count - Black', hue='County', palette='inferno', ax=ax)
ax.set_title('Scatter plot of County Adjusted Median Income vs. Poverty Count - Black')

# Write the figure to disk before it is shown
plt.savefig('output_data/adj_median_inc_pcb_comparitive_scatter_all_counties.png')

plt.show()


In [None]:
# Load the second census export, indexed by its "datapoint" column, and store the
# zipcode as a categorical column rather than a number.
csvpath = "output_data/census2.csv"
census2_df = pd.read_csv(csvpath, index_col="datapoint").astype({"Zipcode": "category"})
census2_df

In [None]:
# Create dataframes grouped by zipcode for zipcode analysis: one row per zipcode holding
# the median of every numeric column across that zipcode's rows.
# numeric_only=True is explicit because the county frames also carry text columns
# (e.g. "County"); recent pandas versions raise on those instead of silently
# dropping them from the result.
oakland_grouped = oakland_data.groupby("Zipcode").median(numeric_only=True)
kent_grouped = kent_data.groupby("Zipcode").median(numeric_only=True)
saginaw_grouped = saginaw_data.groupby("Zipcode").median(numeric_only=True)

In [None]:
# Annotated correlation heatmap for the census2 data (all counties).
f = plt.figure(figsize=(14,6));
# Correlate the numeric columns only: the categorical Zipcode column and any text
# columns are not meaningful here, and corr() raises on text columns in recent pandas.
correlation=census2_df.select_dtypes(['number']).corr();
heatmap=sns.heatmap(correlation, annot=True);
plt.title('Correlation Matrix(all counties)', fontsize=16);

#Save fig
plt.savefig('output_data/heatmap_correlation_cmatrix.png')

plt.show()

In [None]:
# Read the Oakland-only census file, indexed by its "datapoint" column
csvpath = "output_data/census_oakland.csv";
census_oakland_df = pd.read_csv(csvpath,index_col="datapoint");
census_oakland_df["Zipcode"] = census_oakland_df["Zipcode"].astype("category");

# Annotated correlation heatmap for Oakland.
f = plt.figure(figsize=(14,6));
# Correlate the numeric columns only: the categorical Zipcode column and any text
# columns are not meaningful here, and corr() raises on text columns in recent pandas.
correlation=census_oakland_df.select_dtypes(['number']).corr();
heatmap=sns.heatmap(correlation, annot=True);
plt.title('Correlation Matrix (Oakland)', fontsize=16);

#Save fig
plt.savefig('output_data/oakland_heatmap_correlation_cmatrix.png')

plt.show()

In [None]:
# Adjusted median income against the white poverty count expressed as a percentage
# of the total population.
adj_median_income = df['Adjusted Median Income']
pct_white_poverty = 100 * df['Poverty Count - White'] / df["Total Population"]

plt.title('Adjusted Median Income vs % Poverty - White')
plt.xlabel('Adjusted Median Income')
plt.ylabel('% Poverty - White')
plt.xticks(rotation=45)
plt.ylim(0, 40)
plt.scatter(adj_median_income, pct_white_poverty, color='c')

# Write the figure to disk
plt.savefig('output_data/income_vs_pcw.png')

In [None]:
# Adjusted median income against the black poverty count expressed as a percentage
# of the total population.
adj_median_income = df['Adjusted Median Income']
pct_black_poverty = 100 * df['Poverty Count - Black'] / df["Total Population"]

plt.title('Adjusted Median Income vs % Poverty - Black')
plt.xlabel('Adjusted Median Income')
plt.ylabel('% Poverty - Black')
plt.xticks(rotation=45)
plt.scatter(adj_median_income, pct_black_poverty)

# Write the figure to disk
plt.savefig('output_data/income_vs_pcb.png')

In [None]:
# RHI ADJUSTED
# Total population against the employed count expressed as a percentage of the
# total population.
total_population = df['Total Population']
pct_employed = 100 * df['Employed Count'] / total_population

plt.title('Total Population vs % Employed')
plt.xlabel('Total Population')
plt.ylabel('% Employed')
plt.scatter(total_population, pct_employed)

# Write the figure to disk
plt.savefig('output_data/employed_count_vs_total_pop.png')

In [None]:
# Adjusted median income against adjusted median home value.
adj_median_income = df['Adjusted Median Income']
adj_median_home_value = df['Adjusted Median Home Value']

plt.title('Adjusted Median Income vs Adjusted Median Home Value')
plt.xlabel('Adjusted Median Income')
plt.ylabel('Adjusted Median Home Value')
plt.xticks(rotation=45)
plt.scatter(adj_median_income, adj_median_home_value, label='Adjusted Median Income')

# Write the figure to disk
plt.savefig('output_data/adj_med_income_vs_adj_med_home_value.png')

In [None]:
# Adjusted median income against adjusted median rent.
adj_median_income = df['Adjusted Median Income']
adj_median_rent = df['Adjusted Median Rent']

plt.title('Adjusted Median Income vs Adjusted Median Rent')
plt.xlabel('Adjusted Median Income')
plt.ylabel('Adjusted Median Rent')
plt.xticks(rotation=45)
plt.scatter(adj_median_income, adj_median_rent, color='c')

# Write the figure to disk
plt.savefig('output_data/adj_med_income_vs_adj_med_rent.png')

# Rent Prices in Each County, by Zipcode **Stephanie**

# Saginaw County Median Rent vs Year by Zipcode

In [None]:
# Saginaw County rows, ordered by year and indexed by year so that each zipcode's
# rent is drawn as its own line over time.
Saginaw_df = (
    census_df.loc[census_df['County'] == 'Saginaw']
    .sort_values('Year', ascending=True)
    .set_index('Year')
)
Saginaw_df.groupby('Zipcode')['Adjusted Median Rent'].plot()
plt.legend(loc=(1.01,0.01), ncol= 2)
plt.xlabel('Year')
plt.ylabel('Adjusted Median Rent')
plt.title('Saginaw Median Rent by Zipcode (2011-2020)')
plt.xlim(2011, 2020)
# The legend is placed to the right of the axes; bbox_inches="tight" grows the saved
# canvas to include it, otherwise the legend is cut off in the image file.
plt.savefig("output_data/saginaw_rent_zipcodes", bbox_inches="tight")

In [None]:
# Same per-zipcode rent lines as the plain Saginaw plot, plus one linear regression
# fitted over all Saginaw rows pooled together (rent vs. year).
Saginaw = census_df.loc[census_df['County'] == 'Saginaw']
Saginaw_df = Saginaw.sort_values('Year', ascending=True).set_index('Year')
Saginaw_df.groupby('Zipcode')['Adjusted Median Rent'].plot()
plt.legend(loc=(1.01,0.01), ncol= 2)
plt.xlabel('Year')
plt.ylabel('Adjusted Median Rent')
plt.title('Saginaw Median Rent by Zipcode (2011-2020)')
plt.xlim(2011, 2020)

# Pooled regression: x is the Year index, y is the adjusted median rent
(slope, intercept, rvalue, pvalue, stderr) = linregress(Saginaw_df.index, Saginaw_df['Adjusted Median Rent'])
regress_values = Saginaw_df.index * slope + intercept
line_eq = 'y = ' + str(round(slope,2)) + 'x + ' + str(round(intercept,2))
plt.plot(Saginaw_df.index, regress_values, 'r-')
plt.annotate(line_eq, (2016, 400), fontsize= 12, color= 'red')  # position is in data coordinates
print(f'The r-value is: {rvalue}')
# The legend is placed to the right of the axes; bbox_inches="tight" grows the saved
# canvas to include it, otherwise the legend is cut off in the image file.
plt.savefig("output_data/saginaw_rent_zipcodes_with_regression", bbox_inches="tight")

# Oakland County Median Rent vs Year by Zipcode

In [None]:
# Oakland County rows, ordered by year and indexed by year so that each zipcode's
# rent is drawn as its own line over time.
Oakland_df = (
    census_df.loc[census_df['County'] == 'Oakland']
    .sort_values('Year', ascending=True)
    .set_index('Year')
)
Oakland_df.groupby('Zipcode')['Adjusted Median Rent'].plot()
plt.legend(loc=(1.01,0.01), ncol= 4)
plt.xlabel('Year')
plt.ylabel('Adjusted Median Rent')
plt.title('Oakland Median Rent by Zipcode (2011-2020)')
plt.xlim(2011,2020)
# The legend is placed to the right of the axes; bbox_inches="tight" grows the saved
# canvas to include it, otherwise the legend is cut off in the image file.
plt.savefig("output_data/oakland_rent_zipcodes", bbox_inches="tight")

In [None]:
# Same per-zipcode rent lines as the plain Oakland plot, plus one linear regression
# fitted over all Oakland rows pooled together (rent vs. year).
Oakland_df = (
    census_df.loc[census_df['County'] == 'Oakland']
    .sort_values('Year', ascending=True)
    .set_index('Year')
)
Oakland_df.groupby('Zipcode')['Adjusted Median Rent'].plot()
plt.legend(loc=(1.01,0.01), ncol= 4)
plt.xlabel('Year')
plt.ylabel('Adjusted Median Rent')
plt.title('Oakland Median Rent by Zipcode (2011-2020)')
plt.xlim(2011,2020)

# Pooled regression: x is the Year index, y is the adjusted median rent
(slope, intercept, rvalue, pvalue, stderr) = linregress(Oakland_df.index, Oakland_df['Adjusted Median Rent'])
regress_values = Oakland_df.index * slope + intercept
line_eq = 'y = ' + str(round(slope,2)) + 'x + ' + str(round(intercept,2))
plt.plot(Oakland_df.index, regress_values, 'r-')
plt.annotate(line_eq, (2011, 500), fontsize= 12, color= 'red')  # position is in data coordinates
print(f'The r-value is: {rvalue}')
# The legend is placed to the right of the axes; bbox_inches="tight" grows the saved
# canvas to include it, otherwise the legend is cut off in the image file.
plt.savefig("output_data/oakland_rent_zipcodes_with_regression", bbox_inches="tight")

# Kent County Median Rent vs Year by Zipcode


In [None]:
# Kent County rows, ordered by year and indexed by year so that each zipcode's
# rent is drawn as its own line over time.
Kent_df = (
    census_df.loc[census_df['County'] == 'Kent']
    .sort_values('Year', ascending=True)
    .set_index('Year')
)
Kent_df.groupby('Zipcode')['Adjusted Median Rent'].plot()
plt.legend(loc=(1.01,0.01), ncol= 2)
plt.xlabel('Year')
plt.ylabel('Adjusted Median Rent')
plt.title('Kent Median Rent by Zipcode (2011-2020)')
plt.xlim(2011,2020)
# The legend is placed to the right of the axes; bbox_inches="tight" grows the saved
# canvas to include it, otherwise the legend is cut off in the image file.
plt.savefig("output_data/kent_rent_zipcodes", bbox_inches="tight")

In [None]:
# Same per-zipcode rent lines as the plain Kent plot, plus one linear regression
# fitted over all Kent rows pooled together (rent vs. year).
Kent_df = (
    census_df.loc[census_df['County'] == 'Kent']
    .sort_values('Year', ascending=True)
    .set_index('Year')
)
Kent_df.groupby('Zipcode')['Adjusted Median Rent'].plot()
plt.legend(loc=(1.01,0.01), ncol= 2)
plt.xlabel('Year')
plt.ylabel('Adjusted Median Rent')
plt.title('Kent Median Rent by Zipcode (2011-2020)')
plt.xlim(2011,2020)

# Pooled regression: x is the Year index, y is the adjusted median rent
(slope, intercept, rvalue, pvalue, stderr) = linregress(Kent_df.index, Kent_df['Adjusted Median Rent'])
regress_values = Kent_df.index * slope + intercept
line_eq = 'y = ' + str(round(slope,2)) + 'x + ' + str(round(intercept,2))
plt.plot(Kent_df.index, regress_values, 'r-')
plt.annotate(line_eq, (2011, 500), fontsize= 12, color= 'red')  # position is in data coordinates
print(f'The r-value is: {rvalue}')
# The legend is placed to the right of the axes; bbox_inches="tight" grows the saved
# canvas to include it, otherwise the legend is cut off in the image file.
plt.savefig("output_data/kent_rent_zipcodes_with_regression", bbox_inches="tight")

In [None]:
# Calculate and plot average rent for each county: the mean, per year, of the zipcode-level
# adjusted median rents. Relies on Saginaw_df / Oakland_df / Kent_df being indexed by Year.
saginaw_county_mean_rent = Saginaw_df.groupby('Year')['Adjusted Median Rent'].mean()
oakland_county_mean_rent = Oakland_df.groupby('Year')['Adjusted Median Rent'].mean()
kent_county_mean_rent = Kent_df.groupby('Year')['Adjusted Median Rent'].mean()

plt.plot(saginaw_county_mean_rent, label="Saginaw County")
plt.plot(oakland_county_mean_rent, label="Oakland County")
plt.plot(kent_county_mean_rent, label="Kent County")
plt.legend(loc=(1.01,0.01))
plt.xlabel('Year')
plt.ylabel('Adjusted Median Rent')
plt.title('Average County Rent (2011-2020)')
plt.xlim(2011,2020)

# Turn each per-year Series into a (Year, Adjusted Median Rent) frame so they can be merged
saginaw_county_mean_rent = pd.DataFrame(saginaw_county_mean_rent).reset_index()
oakland_county_mean_rent = pd.DataFrame(oakland_county_mean_rent).reset_index()
kent_county_mean_rent = pd.DataFrame(kent_county_mean_rent).reset_index()

# The first merge suffixes the clashing rent columns with _x (Saginaw) and _y (Oakland);
# the second merge adds Kent's column under its original, un-suffixed name.
county_mean_rent = pd.merge(saginaw_county_mean_rent, oakland_county_mean_rent, how='outer', on='Year')
county_mean_rent = pd.merge(county_mean_rent, kent_county_mean_rent, how='outer', on='Year')
# rename() returns a new frame rather than modifying in place, so the result must be
# assigned back; previously the renamed columns were displayed but never kept.
county_mean_rent = county_mean_rent.rename(columns={'Adjusted Median Rent_x': 'Saginaw Average Rent', 'Adjusted Median Rent_y': 'Oakland Average Rent', 'Adjusted Median Rent': 'Kent Average Rent'})
county_mean_rent

# TODO: Should this also have a line of regression? Or I guess, is it possible to add one?

#(slope, intercept, rvalue, pvalue, stderr) = linregress(county_mean_rent['Year'], **y values**)
#regress_values = county_mean_rent['Year'] * slope + intercept
#line_eq = 'y = ' + str(round(slope,2)) + 'x + ' + str(round(intercept,2))
#plt.plot(county_mean_rent['Year'], regress_values, 'r-')
#plt.annotate(line_eq, (2011, 500), fontsize= 12, color= 'red')
#print(f'The r-value is: {rvalue}')
#plt.savefig("output_data/county_average_rent")

# Demographic Composition in Each County, by Zipcode **Rhi**

In [None]:
# Three stacked histograms of "Total Population" (one row per county); the lower two
# panels share both axes with the top one so the distributions are directly comparable.
fig = plt.figure(figsize = (8,8))

# Oakland (top panel, defines the shared axes)
ax1 = fig.add_subplot(3, 1, 1)
ax1.hist(oakland_data["Total Population"], color="green")
ax1.text(.5, 140, "Oakland County Population")

# Saginaw (middle panel)
ax2 = fig.add_subplot(3, 1, 2, sharex=ax1, sharey=ax1)
ax2.hist(saginaw_data["Total Population"], color="blue")
ax2.text(.5, 140, "Saginaw County Population")

# Kent (bottom panel)
ax3 = fig.add_subplot(3, 1, 3, sharex=ax1, sharey=ax1)
ax3.hist(kent_data["Total Population"], color="red")
ax3.text(.5, 140, "Kent County Population")

# Label every panel, then hide the labels that are not on the outer edge of the grid
for ax in fig.get_axes():
    ax.set(xlabel="Population", ylabel="Count")
    ax.label_outer()

fig.suptitle("Population counts across zipcodes")
plt.savefig("output_data/overall_populations_across_counties")
plt.show()

##  Oakland County Analyses

In [None]:
# Stacked bar chart of the median population per Oakland zipcode, split into
# White / Black / Other (Other = Total - Black - White).

fig = plt.figure(figsize = (15,8))
xaxis = np.arange(0,len(oakland_grouped.index))
label = (oakland_grouped.index.tolist())
white = oakland_grouped["White Population"]
black = oakland_grouped["Black Population"]
other = oakland_grouped["Total Population"] - black - white
# Each segment starts where the previous one ends. Previously both the "Black" and the
# "Other" segment started at the top of "White", so they overlapped instead of stacking.
line1 = plt.bar(xaxis, white, color='c')
line2 = plt.bar(xaxis, black, bottom=white, color='b')
line3 = plt.bar(xaxis, other, bottom=white + black, color='r')
plt.xticks(ticks=xaxis,labels=label,rotation=90)
plt.legend([line1, line2,line3],["White","Black","Other"])
plt.xlabel("Zipcode")
plt.ylabel("Population")
plt.title("Median Populations in Oakland County from 2011-2020")
plt.savefig("output_data/populations_Oakland.png")
plt.show()


In [None]:
# Stacked bar chart of the median poverty count per Oakland zipcode, split into
# White / Black / Other (Other = Poverty Count - Black - White).
fig = plt.figure(figsize = (15,8))
xaxis = np.arange(0,len(oakland_grouped.index))
label = (oakland_grouped.index.tolist())
white = oakland_grouped["Poverty Count - White"]
black = oakland_grouped["Poverty Count - Black"]
other = oakland_grouped["Poverty Count"] - black - white
# Each segment starts where the previous one ends. Previously both the "Black" and the
# "Other" segment started at the top of "White", so they overlapped instead of stacking.
line1 = plt.bar(xaxis, white, color='c')
line2 = plt.bar(xaxis, black, bottom=white, color='b')
line3 = plt.bar(xaxis, other, bottom=white + black, color='r')
plt.xticks(ticks=xaxis,labels=label,rotation=90)
plt.legend([line1, line2,line3],["White","Black","Other"])
plt.xlabel("Zipcode")
plt.ylabel("Individuals in Poverty")
plt.title("Median Poverty Counts in Oakland County from 2011-2020")
plt.savefig("output_data/poverty_Oakland.png")
plt.show()

In [None]:
# Stacked bar chart of the median poverty percentage per Oakland zipcode, split into
# White / Black / Other (Other = % Poverty - % Black Poverty - % White Poverty).
fig = plt.figure(figsize = (15,8))
xaxis = np.arange(0,len(oakland_grouped.index))
label = (oakland_grouped.index.tolist())
white = oakland_grouped["% White Poverty"]
black = oakland_grouped["% Black Poverty"]
other = oakland_grouped["% Poverty"] - black - white
# Each segment starts where the previous one ends. Previously both the "Black" and the
# "Other" segment started at the top of "White", so they overlapped instead of stacking.
line1 = plt.bar(xaxis, white, color='c')
line2 = plt.bar(xaxis, black, bottom=white, color='b')
line3 = plt.bar(xaxis, other, bottom=white + black, color='r')
plt.xticks(ticks=xaxis,labels=label,rotation=90)
plt.legend([line1, line2,line3],["White","Black","Other"])
plt.xlabel("Zipcode")
plt.ylabel("% of Population in Poverty")
plt.title("Median Poverty Rates in Oakland County from 2011-2020")
plt.savefig("output_data/poverty_perc_Oakland.png")
plt.show()

In [None]:
def time_analyses_stacked(df,col1,col2,col_total,labels,county):
    """Draw, save and show one stacked yearly bar chart per zipcode in ``df``.

    For every distinct value of ``df["Zipcode"]`` a figure is created with one bar per
    year, stacked bottom-to-top as ``col1``, ``col2`` and "Other", where
    "Other" = ``col_total`` - ``col1`` - ``col2``. Each figure is written to
    ``output_data/zipcode_graphs/{county}_{col_total}_{zipcode}`` (the directory must
    already exist) and then shown.

    Parameters
    ----------
    df : DataFrame with "Zipcode" and "Year" columns plus ``col1``, ``col2``, ``col_total``.
    col1, col2 : names of the two component columns.
    col_total : name of the column holding the total; also used as the y-axis label,
        in the title and in the file name.
    labels : three legend labels, in the order (col1, col2, other).
    county : county name used in the title and in the file name.
    """
    # Same y-range for every zipcode so the charts can be compared with each other;
    # it depends only on df, so it is computed once instead of once per zipcode.
    max_y = 1.1*(df[col_total].max())
    for zipcode in df["Zipcode"].unique():
        filtered = df.loc[df["Zipcode"] == zipcode]
        filtered = filtered.sort_values(by=["Zipcode","Year"])
        other = filtered[col_total]-filtered[col1]-filtered[col2]

        # Work on this figure's own axes. The y-limit used to be set through a global
        # `ax` left over from another cell, so it never reached the chart being drawn.
        fig, ax = plt.subplots(figsize = (15,8))
        # Stack in the same order as `labels`, each segment starting where the previous
        # one ends. (Previously col2 was drawn first but labelled with labels[0], and the
        # second and third segments both started at the top of the first, so they overlapped.)
        line1 = ax.bar(filtered["Year"],filtered[col1],color='c')
        line2 = ax.bar(filtered["Year"],filtered[col2],bottom=filtered[col1],color='b')
        line3 = ax.bar(filtered["Year"],other,bottom=filtered[col1]+filtered[col2],color='r')
        ax.legend([line1, line2,line3],labels)
        ax.set(xlabel="Year", ylabel=col_total, ylim=[0, max_y])
        ax.set_title(col_total + " in "  + str(zipcode) + ", " + county + " County from 2011-2020")
        fig.savefig(f"output_data/zipcode_graphs/{county}_{col_total}_{str(zipcode)}")
        plt.show()
        # One figure per zipcode adds up quickly; release it once it has been shown.
        plt.close(fig)

In [None]:
# Time analyses for poverty: one stacked yearly chart per Oakland zipcode.
# (No figure is created here: time_analyses_stacked creates a figure per zipcode, so
# the extra plt.figure call that used to precede it only produced a blank figure.)
time_analyses_stacked(oakland_data,"Poverty Count - White","Poverty Count - Black", 
                      "Poverty Count",["White","Black","Other"],"Oakland")

In [None]:
# Time analyses for population: one stacked yearly chart per Oakland zipcode.
time_analyses_stacked(
    df=oakland_data,
    col1="White Population",
    col2="Black Population",
    col_total="Total Population",
    labels=["White", "Black", "Other"],
    county="Oakland",
)

In [None]:
# Population trends in Oakland County: fit Total Population vs. Year for every zipcode,
# then plot only the zipcodes whose trend is strong (|R| > .7) and steep (|slope| > 300).
fig = plt.figure(figsize = (15,8))
# Blank lists for the per-zipcode regression results
r_values=[]
zipcodes=[]
slopes=[]
y_ints= []
# Calculate regression equations for all zipcodes
for zipcode in oakland_data["Zipcode"].unique():
    filtered = oakland_data.loc[oakland_data["Zipcode"] == zipcode]
    filtered = filtered.sort_values(by=["Zipcode","Year"])
    x = filtered["Year"]
    y = filtered["Total Population"]
    
    plt_slope,plt_int,plt_r,plt_p,plt_std_err = linregress(x,y)
    r_values.append(plt_r)
    zipcodes.append(zipcode)
    slopes.append(plt_slope)
    y_ints.append(plt_int)
# Limit the number of zipcodes to those with the most change and plot those
equations_df = pd.DataFrame({"Zipcode":zipcodes,"R":r_values,"Slope":slopes,"Y-intercept":y_ints})
equations_df = equations_df.loc[(abs(equations_df["R"]) > .7) & (abs(equations_df["Slope"])> 300)]
# Attach the yearly rows of each selected zipcode to its regression results
equations_df = pd.merge(equations_df,oakland_data,on="Zipcode",how="left")
equations_df = equations_df.sort_values(by=["Zipcode","Year"])
plotted_zipcodes = equations_df["Zipcode"].unique()
ann = 34000  # y position (data coordinates) of the first annotation; later ones go lower
for zipcode in plotted_zipcodes:
    filtered_list = equations_df.loc[equations_df["Zipcode"]== zipcode]
    x = filtered_list["Year"]
    y = filtered_list["Total Population"]
    plt.plot(x,y)
    # Slope / Y-intercept / R are repeated on every row of a zipcode, so mean() returns that value
    plt.annotate(f"{zipcode}: y={round((filtered_list['Slope'].mean()),2)}x + {round((filtered_list['Y-intercept'].mean()),2)} R = {round((filtered_list['R'].mean()),2)}", xy=(2019,ann))
    ann -= 1200
# Build the legend once, after every line exists, instead of rebuilding it on each pass
if len(plotted_zipcodes) > 0:
    plt.legend(plotted_zipcodes,loc=1)
plt.title("Population in Oakland County")
plt.xlabel("Year")
plt.ylabel("Individuals")
plt.savefig("output_data/linregress_population_Oakland.png")
plt.show()

In [None]:
# Rent trends in Oakland County: fit Adjusted Median Rent vs. Year for every zipcode,
# then plot only the zipcodes whose trend is strong (|R| > .9) and steep (|slope| > 38).
fig = plt.figure(figsize = (15,8))
# Blank lists for the per-zipcode regression results
r_values=[]
zipcodes=[]
slopes=[]
y_ints= []
# Calculate regression equations for all zipcodes
for zipcode in oakland_data["Zipcode"].unique():
    filtered = oakland_data.loc[oakland_data["Zipcode"] == zipcode]
    filtered = filtered.sort_values(by=["Zipcode","Year"])
    x = filtered["Year"]
    y = filtered["Adjusted Median Rent"]
    
    plt_slope,plt_int,plt_r,plt_p,plt_std_err = linregress(x,y)
    r_values.append(plt_r)
    zipcodes.append(zipcode)
    slopes.append(plt_slope)
    y_ints.append(plt_int)
# Limit the number of zipcodes to those with the most change and plot those
equations_df = pd.DataFrame({"Zipcode":zipcodes,"R":r_values,"Slope":slopes,"Y-intercept":y_ints})
equations_df = equations_df.loc[(abs(equations_df["R"]) > .9) & (abs(equations_df["Slope"])> 38)]
# Attach the yearly rows of each selected zipcode to its regression results
equations_df = pd.merge(equations_df,oakland_data,on="Zipcode",how="left")
equations_df = equations_df.sort_values(by=["Zipcode","Year"])
plotted_zipcodes = equations_df["Zipcode"].unique()
ann = 1000  # y position (data coordinates) of the first annotation; later ones go lower
for zipcode in plotted_zipcodes:
    filtered_list = equations_df.loc[equations_df["Zipcode"]== zipcode]
    x = filtered_list["Year"]
    y = filtered_list["Adjusted Median Rent"]
    plt.plot(x,y)
    # Slope / Y-intercept / R are repeated on every row of a zipcode, so mean() returns that value
    plt.annotate(f"{zipcode}: y={round((filtered_list['Slope'].mean()),2)}x + {round((filtered_list['Y-intercept'].mean()),2)} R = {round((filtered_list['R'].mean()),2)}", xy=(2019,ann))
    ann -= 50
# Build the legend once, after every line exists, instead of rebuilding it on each pass
if len(plotted_zipcodes) > 0:
    plt.legend(plotted_zipcodes,loc=1)
plt.title("Adjusted Median Rent in Oakland County")
plt.xlabel("Year")
plt.ylabel("Rent")
plt.savefig("output_data/restricted_rent_Oakland.png")
plt.show()

In [None]:
# Poverty-count trends in Oakland County: fit Poverty Count vs. Year for every zipcode,
# then plot only the zipcodes whose trend is strong (|R| > .9) and steep (|slope| > 50).
fig = plt.figure(figsize = (15,8))
# Blank lists for the per-zipcode regression results
r_values2=[]
zipcodes2=[]
slopes2=[]
y_ints2= []
# Calculate regression equations for all zipcodes
for zipcode in oakland_data["Zipcode"].unique():
    filtered = oakland_data.loc[oakland_data["Zipcode"] == zipcode]
    filtered = filtered.sort_values(by=["Zipcode","Year"])
    x = filtered["Year"]
    y = filtered["Poverty Count"]
    
    plt_slope,plt_int,plt_r,plt_p,plt_std_err = linregress(x,y)
    r_values2.append(plt_r)
    zipcodes2.append(zipcode)
    slopes2.append(plt_slope)
    y_ints2.append(plt_int)

# Limit the number of zipcodes to those with the most change and plot those
equations2_df = pd.DataFrame({"Zipcode":zipcodes2,"R":r_values2,"Slope":slopes2,"Y-intercept":y_ints2})
equations2_df = equations2_df.loc[(abs(equations2_df["R"]) > .9) & (abs(equations2_df["Slope"])> 50)]
# Attach the yearly rows of each selected zipcode to its regression results
equations2_df = pd.merge(equations2_df,oakland_data,on="Zipcode",how="left")
equations2_df = equations2_df.sort_values(by=["Zipcode","Year"])
plotted_zipcodes = equations2_df["Zipcode"].unique()
ann = 4000  # y position (data coordinates) of the first annotation; later ones go lower
for zipcode in plotted_zipcodes:
    filtered_list = equations2_df.loc[equations2_df["Zipcode"]== zipcode]
    x = filtered_list["Year"]
    y = filtered_list["Poverty Count"]
    plt.plot(x,y)
    # Slope / Y-intercept / R are repeated on every row of a zipcode, so mean() returns that value
    plt.annotate(f"{zipcode}: y={round((filtered_list['Slope'].mean()),2)}x + {round((filtered_list['Y-intercept'].mean()),2)} R = {round((filtered_list['R'].mean()),2)}", xy=(2019,ann))
    ann -= 200
# Build the legend once, after every line exists, instead of rebuilding it on each pass
if len(plotted_zipcodes) > 0:
    plt.legend(plotted_zipcodes,loc=1)
plt.title("Poverty Counts in Oakland County")
plt.xlabel("Year")
plt.ylabel("Individuals in Poverty")
plt.savefig("output_data/linregress_poverty_Oakland.png")
plt.show()

In [None]:
# Poverty-rate trends in Oakland County: fit % Poverty vs. Year for every zipcode,
# then plot only the zipcodes whose trend is strong (|R| > .9) and steep (|slope| > .4).
fig = plt.figure(figsize = (15,8))
# Blank lists for the per-zipcode regression results
r_values2=[]
zipcodes2=[]
slopes2=[]
y_ints2= []
# Calculate regression equations for all zipcodes
for zipcode in oakland_data["Zipcode"].unique():
    filtered = oakland_data.loc[oakland_data["Zipcode"] == zipcode]
    filtered = filtered.sort_values(by=["Zipcode","Year"])
    x = filtered["Year"]
    y = filtered["% Poverty"]
    
    plt_slope,plt_int,plt_r,plt_p,plt_std_err = linregress(x,y)
    r_values2.append(plt_r)
    zipcodes2.append(zipcode)
    slopes2.append(plt_slope)
    y_ints2.append(plt_int)

# Limit the number of zipcodes to those with the most change and plot those
equations2_df = pd.DataFrame({"Zipcode":zipcodes2,"R":r_values2,"Slope":slopes2,"Y-intercept":y_ints2})
equations2_df = equations2_df.loc[(abs(equations2_df["R"]) > .9) & (abs(equations2_df["Slope"])> .4)]
# Attach the yearly rows of each selected zipcode to its regression results
equations2_df = pd.merge(equations2_df,oakland_data,on="Zipcode",how="left")
equations2_df = equations2_df.sort_values(by=["Zipcode","Year"])
plotted_zipcodes = equations2_df["Zipcode"].unique()
ann = 22  # y position (data coordinates) of the first annotation; later ones go lower
for zipcode in plotted_zipcodes:
    filtered_list = equations2_df.loc[equations2_df["Zipcode"]== zipcode]
    x = filtered_list["Year"]
    y = filtered_list["% Poverty"]
    plt.plot(x,y)
    # Slope / Y-intercept / R are repeated on every row of a zipcode, so mean() returns that value
    plt.annotate(f"{zipcode}: y={round((filtered_list['Slope'].mean()),2)}x + {round((filtered_list['Y-intercept'].mean()),2)} R = {round((filtered_list['R'].mean()),2)}", xy=(2019,ann))
    ann -= 1.5
# Build the legend once, after every line exists, instead of rebuilding it on each pass
if len(plotted_zipcodes) > 0:
    plt.legend(plotted_zipcodes,loc=1)
plt.xlim(2011,2020)
plt.ylim(0,30)
plt.title("Poverty Rates in Oakland County")
plt.xlabel("Year")
plt.ylabel("% Population in Poverty")
plt.savefig("output_data/linregress_poverty_percent_Oakland.png")
plt.show()

In [None]:
# Unemployment trends in Oakland County: fit % Unemployed vs. Year for every zipcode,
# then plot only the zipcodes whose trend is strong (|R| > .9) and steep (|slope| > .75).
fig = plt.figure(figsize = (15,8))
# Blank lists for the per-zipcode regression results
r_values=[]
zipcodes=[]
slopes=[]
y_ints= []
# Calculate regression equations for all zipcodes
for zipcode in oakland_data["Zipcode"].unique():
    filtered = oakland_data.loc[oakland_data["Zipcode"] == zipcode]
    filtered = filtered.sort_values(by=["Zipcode","Year"])
    x = filtered["Year"]
    y = filtered["% Unemployed"]
    
    plt_slope,plt_int,plt_r,plt_p,plt_std_err = linregress(x,y)
    r_values.append(plt_r)
    zipcodes.append(zipcode)
    slopes.append(plt_slope)
    y_ints.append(plt_int)
# Limit the number of zipcodes to those with the most change and plot those
equations_df = pd.DataFrame({"Zipcode":zipcodes,"R":r_values,"Slope":slopes,"Y-intercept":y_ints})
equations_df = equations_df.loc[(abs(equations_df["R"]) > .9) & (abs(equations_df["Slope"])> .75)]
# Attach the yearly rows of each selected zipcode to its regression results
equations_df = pd.merge(equations_df,oakland_data,on="Zipcode",how="left")
equations_df = equations_df.sort_values(by=["Zipcode","Year"])
plotted_zipcodes = equations_df["Zipcode"].unique()
ann = 8  # y position (data coordinates) of the first annotation; later ones go lower
for zipcode in plotted_zipcodes:
    filtered_list = equations_df.loc[equations_df["Zipcode"]== zipcode]
    x = filtered_list["Year"]
    y = filtered_list["% Unemployed"]
    plt.plot(x,y)
    # Slope / Y-intercept / R are repeated on every row of a zipcode, so mean() returns that value
    plt.annotate(f"{zipcode}: y={round((filtered_list['Slope'].mean()),2)}x + {round((filtered_list['Y-intercept'].mean()),2)} R = {round((filtered_list['R'].mean()),2)}", xy=(2019,ann))
    ann -= 1
# Build the legend once, after every line exists, instead of rebuilding it on each pass
if len(plotted_zipcodes) > 0:
    plt.legend(plotted_zipcodes,loc=1)
plt.title("Unemployment Rate in Oakland County")
plt.xlabel("Year")
plt.ylabel("% Unemployed")
plt.savefig("output_data/linregress_unemployment_Oakland.png")
plt.show()

## Kent County Analyses

In [None]:
# Stacked bar chart of the median population per Kent zipcode, split into
# White / Black / Other (Other = Total - Black - White).

fig = plt.figure(figsize = (15,8))
xaxis = np.arange(0,len(kent_grouped.index))
label = (kent_grouped.index.tolist())
white = kent_grouped["White Population"]
black = kent_grouped["Black Population"]
other = kent_grouped["Total Population"] - black - white
# Each segment starts where the previous one ends. Previously both the "Black" and the
# "Other" segment started at the top of "White", so they overlapped instead of stacking.
line1 = plt.bar(xaxis, white, color='c')
line2 = plt.bar(xaxis, black, bottom=white, color='b')
line3 = plt.bar(xaxis, other, bottom=white + black, color='r')
plt.xticks(ticks=xaxis,labels=label,rotation=90)
plt.legend([line1, line2,line3],["White","Black","Other"])
plt.xlabel("Zipcode")
plt.ylabel("Population")
plt.title("Median Populations in Kent County from 2011-2020")
plt.savefig("output_data/populations_Kent.png")
plt.show()


In [None]:
# Stacked bar chart of the median poverty count per Kent zipcode, split into
# White / Black / Other (Other = Poverty Count - Black - White).
fig = plt.figure(figsize = (15,8))
xaxis = np.arange(0,len(kent_grouped.index))
label = (kent_grouped.index.tolist())
white = kent_grouped["Poverty Count - White"]
black = kent_grouped["Poverty Count - Black"]
other = kent_grouped["Poverty Count"] - black - white
# Each segment starts where the previous one ends. Previously both the "Black" and the
# "Other" segment started at the top of "White", so they overlapped instead of stacking.
line1 = plt.bar(xaxis, white, color='c')
line2 = plt.bar(xaxis, black, bottom=white, color='b')
line3 = plt.bar(xaxis, other, bottom=white + black, color='r')
plt.xticks(ticks=xaxis,labels=label,rotation=90)
plt.legend([line1, line2,line3],["White","Black","Other"])
plt.xlabel("Zipcode")
plt.ylabel("Individuals in Poverty")
plt.title("Median Poverty Counts in Kent County from 2011-2020")
plt.savefig("output_data/poverty_Kent.png")
plt.show()

In [None]:
# Time analyses for poverty: one stacked yearly chart per Kent zipcode.
# (No figure is created here: time_analyses_stacked creates a figure per zipcode, so
# the extra plt.figure call that used to precede it only produced a blank figure.)
time_analyses_stacked(kent_data,"Poverty Count - White","Poverty Count - Black", 
                      "Poverty Count",["White","Black","Other"],"Kent")

In [None]:
# Time analyses for population: one stacked yearly chart per Kent zipcode.
time_analyses_stacked(
    df=kent_data,
    col1="White Population",
    col2="Black Population",
    col_total="Total Population",
    labels=["White", "Black", "Other"],
    county="Kent",
)

In [None]:
# Population trends in Kent County: fit Total Population vs. Year for every zipcode,
# then plot only the zipcodes whose trend is strong (|R| > .7) and steep (|slope| > 400).
fig = plt.figure(figsize = (15,8))
# Blank lists for the per-zipcode regression results
r_values=[]
zipcodes=[]
slopes=[]
y_ints= []
# Calculate regression equations for all zipcodes
for zipcode in kent_data["Zipcode"].unique():
    filtered = kent_data.loc[kent_data["Zipcode"] == zipcode]
    filtered = filtered.sort_values(by=["Zipcode","Year"])
    x = filtered["Year"]
    y = filtered["Total Population"]
    
    plt_slope,plt_int,plt_r,plt_p,plt_std_err = linregress(x,y)
    r_values.append(plt_r)
    zipcodes.append(zipcode)
    slopes.append(plt_slope)
    y_ints.append(plt_int)
# Limit the number of zipcodes to those with the most change and plot those
equations_df = pd.DataFrame({"Zipcode":zipcodes,"R":r_values,"Slope":slopes,"Y-intercept":y_ints})
equations_df = equations_df.loc[(abs(equations_df["R"]) > .7) & (abs(equations_df["Slope"])> 400)]
# Attach the yearly rows of each selected zipcode to its regression results
equations_df = pd.merge(equations_df,kent_data,on="Zipcode",how="left")
equations_df = equations_df.sort_values(by=["Zipcode","Year"])
plotted_zipcodes = equations_df["Zipcode"].unique()
ann = 27500  # y position (data coordinates) of the first annotation; later ones go lower
for zipcode in plotted_zipcodes:
    filtered_list = equations_df.loc[equations_df["Zipcode"]== zipcode]
    x = filtered_list["Year"]
    y = filtered_list["Total Population"]
    plt.plot(x,y)
    # Slope / Y-intercept / R are repeated on every row of a zipcode, so mean() returns that value
    plt.annotate(f"{zipcode}: y={round((filtered_list['Slope'].mean()),2)}x + {round((filtered_list['Y-intercept'].mean()),2)} R = {round((filtered_list['R'].mean()),2)}", xy=(2019,ann))
    ann -= 1200
# Build the legend once, after every line exists, instead of rebuilding it on each pass
if len(plotted_zipcodes) > 0:
    plt.legend(plotted_zipcodes,loc=1)
plt.title("Population in Kent County")
plt.xlabel("Year")
plt.ylabel("Individuals")
plt.savefig("output_data/linregress_population_Kent.png")
plt.show()

In [None]:
# Poverty-count trends in Kent County: fit Poverty Count vs. Year for every zipcode,
# then plot only the zipcodes whose trend is strong (|R| > .85) and steep (|slope| > 40).
fig = plt.figure(figsize = (15,8))
# Blank lists for the per-zipcode regression results
r_values2=[]
zipcodes2=[]
slopes2=[]
y_ints2= []
# Calculate regression equations for all zipcodes
for zipcode in kent_data["Zipcode"].unique():
    filtered = kent_data.loc[kent_data["Zipcode"] == zipcode]
    filtered = filtered.sort_values(by=["Zipcode","Year"])
    x = filtered["Year"]
    y = filtered["Poverty Count"]
    
    plt_slope,plt_int,plt_r,plt_p,plt_std_err = linregress(x,y)
    r_values2.append(plt_r)
    zipcodes2.append(zipcode)
    slopes2.append(plt_slope)
    y_ints2.append(plt_int)

# Limit the number of zipcodes to those with the most change and plot those
equations2_df = pd.DataFrame({"Zipcode":zipcodes2,"R":r_values2,"Slope":slopes2,"Y-intercept":y_ints2})
equations2_df = equations2_df.loc[(abs(equations2_df["R"]) > .85) & (abs(equations2_df["Slope"])> 40)]
# Attach the yearly rows of each selected zipcode to its regression results
equations2_df = pd.merge(equations2_df,kent_data,on="Zipcode",how="left")
equations2_df = equations2_df.sort_values(by=["Zipcode","Year"])
plotted_zipcodes = equations2_df["Zipcode"].unique()
ann = 6000  # y position (data coordinates) of the first annotation; later ones go lower
for zipcode in plotted_zipcodes:
    filtered_list = equations2_df.loc[equations2_df["Zipcode"]== zipcode]
    x = filtered_list["Year"]
    y = filtered_list["Poverty Count"]
    plt.plot(x,y)
    # Slope / Y-intercept / R are repeated on every row of a zipcode, so mean() returns that value
    plt.annotate(f"{zipcode}: y={round((filtered_list['Slope'].mean()),2)}x + {round((filtered_list['Y-intercept'].mean()),2)} R = {round((filtered_list['R'].mean()),2)}", xy=(2019,ann))
    ann -= 350
# Build the legend once, after every line exists, instead of rebuilding it on each pass
if len(plotted_zipcodes) > 0:
    plt.legend(plotted_zipcodes,loc=1)
plt.title("Poverty Counts in Kent County")
plt.xlabel("Year")
plt.ylabel("Individuals in Poverty")
plt.savefig("output_data/linregress_poverty_Kent.png")
plt.show()

In [None]:
# Poverty-rate trends in Kent County: fit % Poverty vs. Year for every zipcode,
# then plot only the zipcodes whose trend is strong (|R| > .9) and steep (|slope| > .4).
fig = plt.figure(figsize = (15,8))
# Blank lists for the per-zipcode regression results
r_values2=[]
zipcodes2=[]
slopes2=[]
y_ints2= []
# Calculate regression equations for all zipcodes
for zipcode in kent_data["Zipcode"].unique():
    filtered = kent_data.loc[kent_data["Zipcode"] == zipcode]
    filtered = filtered.sort_values(by=["Zipcode","Year"])
    x = filtered["Year"]
    y = filtered["% Poverty"]
    
    plt_slope,plt_int,plt_r,plt_p,plt_std_err = linregress(x,y)
    r_values2.append(plt_r)
    zipcodes2.append(zipcode)
    slopes2.append(plt_slope)
    y_ints2.append(plt_int)

# Limit the number of zipcodes to those with the most change and plot those
equations2_df = pd.DataFrame({"Zipcode":zipcodes2,"R":r_values2,"Slope":slopes2,"Y-intercept":y_ints2})
equations2_df = equations2_df.loc[(abs(equations2_df["R"]) > .9) & (abs(equations2_df["Slope"])> .4)]
# Attach the yearly rows of each selected zipcode to its regression results
equations2_df = pd.merge(equations2_df,kent_data,on="Zipcode",how="left")
equations2_df = equations2_df.sort_values(by=["Zipcode","Year"])
plotted_zipcodes = equations2_df["Zipcode"].unique()
ann = 22  # y position (data coordinates) of the first annotation; later ones go lower
for zipcode in plotted_zipcodes:
    filtered_list = equations2_df.loc[equations2_df["Zipcode"]== zipcode]
    x = filtered_list["Year"]
    y = filtered_list["% Poverty"]
    plt.plot(x,y)
    # Slope / Y-intercept / R are repeated on every row of a zipcode, so mean() returns that value
    plt.annotate(f"{zipcode}: y={round((filtered_list['Slope'].mean()),2)}x + {round((filtered_list['Y-intercept'].mean()),2)} R = {round((filtered_list['R'].mean()),2)}", xy=(2019,ann))
    ann -= 1.5
# Build the legend once, after every line exists, instead of rebuilding it on each pass
if len(plotted_zipcodes) > 0:
    plt.legend(plotted_zipcodes,loc=1)
plt.xlim(2011,2020)
plt.ylim(0,30)
plt.title("Poverty Rates in Kent County")
plt.xlabel("Year")
plt.ylabel("% Population in Poverty")
plt.savefig("output_data/linregress_poverty_percent_Kent.png")
plt.show()

In [None]:
# Unemployment trends in Kent County: fit % Unemployed vs. Year for every zipcode,
# then plot only the zipcodes whose trend is strong (|R| > .7) and steep (|slope| > .58).
fig = plt.figure(figsize = (15,8))
# Blank lists for the per-zipcode regression results
r_values=[]
zipcodes=[]
slopes=[]
y_ints= []
# Calculate regression equations for all zipcodes
for zipcode in kent_data["Zipcode"].unique():
    filtered = kent_data.loc[kent_data["Zipcode"] == zipcode]
    filtered = filtered.sort_values(by=["Zipcode","Year"])
    x = filtered["Year"]
    y = filtered["% Unemployed"]
    
    plt_slope,plt_int,plt_r,plt_p,plt_std_err = linregress(x,y)
    r_values.append(plt_r)
    zipcodes.append(zipcode)
    slopes.append(plt_slope)
    y_ints.append(plt_int)
# Limit the number of zipcodes to those with the most change and plot those
equations_df = pd.DataFrame({"Zipcode":zipcodes,"R":r_values,"Slope":slopes,"Y-intercept":y_ints})
equations_df = equations_df.loc[(abs(equations_df["R"]) > .7) & (abs(equations_df["Slope"])> .58)]
# Attach the yearly rows of each selected zipcode to its regression results
equations_df = pd.merge(equations_df,kent_data,on="Zipcode",how="left")
equations_df = equations_df.sort_values(by=["Zipcode","Year"])
plotted_zipcodes = equations_df["Zipcode"].unique()
ann = 8  # y position (data coordinates) of the first annotation; later ones go lower
for zipcode in plotted_zipcodes:
    filtered_list = equations_df.loc[equations_df["Zipcode"]== zipcode]
    x = filtered_list["Year"]
    y = filtered_list["% Unemployed"]
    plt.plot(x,y)
    # Slope / Y-intercept / R are repeated on every row of a zipcode, so mean() returns that value
    plt.annotate(f"{zipcode}: y={round((filtered_list['Slope'].mean()),2)}x + {round((filtered_list['Y-intercept'].mean()),2)} R = {round((filtered_list['R'].mean()),2)}", xy=(2019,ann))
    ann -= 1
# Build the legend once, after every line exists, instead of rebuilding it on each pass
if len(plotted_zipcodes) > 0:
    plt.legend(plotted_zipcodes,loc=1)
plt.title("Unemployment Rate in Kent County")
plt.xlabel("Year")
plt.ylabel("% Unemployed")
plt.savefig("output_data/linregress_unemployment_Kent.png")
plt.show()

## Saginaw County Analyses

In [None]:
# Stacked bar chart of population by zipcode (White / Black / Other)

fig = plt.figure(figsize = (15,8))
xaxis = np.arange(0,len(saginaw_grouped.index))
label = (saginaw_grouped.index.tolist())
white_population = saginaw_grouped["White Population"]
black_population = saginaw_grouped["Black Population"]
# "Other" is whatever remains of the total after removing the White and Black counts
other_population = saginaw_grouped["Total Population"] - black_population - white_population
line1 = plt.bar(xaxis,white_population,color='c')
plt.xticks(ticks=xaxis,labels=label,rotation=90)
# Every segment starts where the previous one ends; giving "Black" and "Other" the
# same bottom would draw one on top of the other and hide part of the data
line2 = plt.bar(xaxis,black_population,bottom=white_population,color='b')
line3 = plt.bar(xaxis,other_population,bottom=white_population + black_population,color='r')
plt.legend([line1, line2,line3],["White","Black","Other"])
plt.xlabel("Zipcode")
plt.ylabel("Population")
plt.title("Median Populations in Saginaw County from 2011-2020")
plt.savefig("output_data/populations_Saginaw.png")
plt.show()

In [None]:
# Stacked bar chart of poverty counts by zipcode (White / Black / Other)
fig = plt.figure(figsize = (15,8))
xaxis = np.arange(0,len(saginaw_grouped.index))
label = (saginaw_grouped.index.tolist())
white_poverty = saginaw_grouped["Poverty Count - White"]
black_poverty = saginaw_grouped["Poverty Count - Black"]
# "Other" is whatever remains of the overall poverty count after removing the White and Black counts
other_poverty = saginaw_grouped["Poverty Count"] - black_poverty - white_poverty
line1 = plt.bar(xaxis,white_poverty,color='c')
plt.xticks(ticks=xaxis,labels=label,rotation=90)
# Every segment starts where the previous one ends; giving "Black" and "Other" the
# same bottom would draw one on top of the other and hide part of the data
line2 = plt.bar(xaxis,black_poverty,bottom=white_poverty,color='b')
line3 = plt.bar(xaxis,other_poverty,bottom=white_poverty + black_poverty,color='r')
plt.legend([line1, line2,line3],["White","Black","Other"])
plt.xlabel("Zipcode")
plt.ylabel("Individuals in Poverty")
plt.title("Median Poverty Counts in Saginaw County from 2011-2020")
plt.savefig("output_data/poverty_Saginaw.png")
plt.show()

In [None]:
# Poverty counts over time for Saginaw County, labelled White / Black / Other
# NOTE(review): time_analyses_stacked is defined elsewhere in this notebook; confirm it
# draws on the figure opened here rather than creating its own.
fig = plt.figure(figsize=(7, 4))
time_analyses_stacked(
    saginaw_data,
    "Poverty Count - White",
    "Poverty Count - Black",
    "Poverty Count",
    ["White", "Black", "Other"],
    "Saginaw",
)

In [None]:
# Population over time for Saginaw County, labelled White / Black / Other
time_analyses_stacked(
    saginaw_data,
    "White Population",
    "Black Population",
    "Total Population",
    ["White", "Black", "Other"],
    "Saginaw",
)

In [None]:
# Total-population trends by zipcode in Saginaw County
fig = plt.figure(figsize = (15,8))
# Regression results, kept as parallel lists with one entry per zipcode
r_values=[]
zipcodes=[]
slopes=[]
y_ints= []
# Fit "Total Population" against "Year" separately for every zipcode.
# A least-squares fit does not depend on row order, so no sorting is needed here.
for zipcode in saginaw_data["Zipcode"].unique():
    filtered = saginaw_data.loc[saginaw_data["Zipcode"] == zipcode]
    x = filtered["Year"]
    y = filtered["Total Population"]

    trend = linregress(x,y)
    r_values.append(trend.rvalue)
    zipcodes.append(zipcode)
    slopes.append(trend.slope)
    y_ints.append(trend.intercept)
# Keep only the zipcodes whose trend is both strongly linear (|R| > .8) and steep
# (|slope| > 80 individuals per year), then attach their yearly rows so they can be plotted
equations_df = pd.DataFrame({"Zipcode":zipcodes,"R":r_values,"Slope":slopes,"Y-intercept":y_ints})
equations_df = equations_df.loc[(equations_df["R"].abs() > .8) & (equations_df["Slope"].abs() > 80)]
equations_df = pd.merge(equations_df,saginaw_data,on="Zipcode",how="left")
equations_df = equations_df.sort_values(by=["Zipcode","Year"])
# y-coordinate (data units) of the first equation label; each later label sits 1200 lower
ann = 27500
for zipcode in equations_df["Zipcode"].unique():
    filtered_list = equations_df.loc[equations_df["Zipcode"]== zipcode]
    x = filtered_list["Year"]
    y = filtered_list["Total Population"]
    # Label the line itself so the legend can be built once, after the loop
    plt.plot(x,y,label=str(zipcode))
    # Slope / Y-intercept / R are constant within a zipcode after the merge, so mean() just reads the value
    plt.annotate(f"{zipcode}: y={round((filtered_list['Slope'].mean()),2)}x + {round((filtered_list['Y-intercept'].mean()),2)} R = {round((filtered_list['R'].mean()),2)}", xy=(2019,ann))
    ann -= 1200
# Only draw a legend when at least one zipcode passed the filter
if not equations_df.empty:
    plt.legend(loc=1)
plt.title("Population in Saginaw County")
plt.xlabel("Year")
plt.ylabel("Individuals")
plt.savefig("output_data/linregress_population_Saginaw.png")
plt.show()

In [None]:
# Poverty-count trends by zipcode in Saginaw County
fig = plt.figure(figsize = (15,8))
# Regression results, kept as parallel lists with one entry per zipcode
r_values2=[]
zipcodes2=[]
slopes2=[]
y_ints2= []
# Fit "Poverty Count" against "Year" separately for every zipcode.
# A least-squares fit does not depend on row order, so no sorting is needed here.
for zipcode in saginaw_data["Zipcode"].unique():
    filtered = saginaw_data.loc[saginaw_data["Zipcode"] == zipcode]
    x = filtered["Year"]
    y = filtered["Poverty Count"]

    trend = linregress(x,y)
    r_values2.append(trend.rvalue)
    zipcodes2.append(zipcode)
    slopes2.append(trend.slope)
    y_ints2.append(trend.intercept)

# Keep only the zipcodes whose trend is both strongly linear (|R| > .7) and steep
# (|slope| > 25 individuals per year), then attach their yearly rows so they can be plotted
equations2_df = pd.DataFrame({"Zipcode":zipcodes2,"R":r_values2,"Slope":slopes2,"Y-intercept":y_ints2})
equations2_df = equations2_df.loc[(equations2_df["R"].abs() > .7) & (equations2_df["Slope"].abs() > 25)]
equations2_df = pd.merge(equations2_df,saginaw_data,on="Zipcode",how="left")
equations2_df = equations2_df.sort_values(by=["Zipcode","Year"])
# y-coordinate (data units) of the first equation label; each later label sits 400 lower
ann = 6000
for zipcode in equations2_df["Zipcode"].unique():
    filtered_list = equations2_df.loc[equations2_df["Zipcode"]== zipcode]
    x = filtered_list["Year"]
    y = filtered_list["Poverty Count"]
    # Label the line itself so the legend can be built once, after the loop
    plt.plot(x,y,label=str(zipcode))
    # Slope / Y-intercept / R are constant within a zipcode after the merge, so mean() just reads the value
    plt.annotate(f"{zipcode}: y={round((filtered_list['Slope'].mean()),2)}x + {round((filtered_list['Y-intercept'].mean()),2)} R = {round((filtered_list['R'].mean()),2)}", xy=(2019,ann))
    ann -= 400
# Only draw a legend when at least one zipcode passed the filter
if not equations2_df.empty:
    plt.legend(loc=1)
# This chart shows head counts, not percentages, so the title says "Counts"
plt.title("Poverty Counts in Saginaw County")
plt.xlabel("Year")
plt.ylabel("Individuals in Poverty")
plt.savefig("output_data/linregress_poverty_Saginaw.png")
plt.show()

In [None]:
# Poverty-rate (% of population) trends by zipcode in Saginaw County
fig = plt.figure(figsize = (15,8))
# Regression results, kept as parallel lists with one entry per zipcode
r_values2=[]
zipcodes2=[]
slopes2=[]
y_ints2= []
# Fit "% Poverty" against "Year" separately for every zipcode.
# A least-squares fit does not depend on row order, so no sorting is needed here.
for zipcode in saginaw_data["Zipcode"].unique():
    filtered = saginaw_data.loc[saginaw_data["Zipcode"] == zipcode]
    x = filtered["Year"]
    y = filtered["% Poverty"]

    trend = linregress(x,y)
    r_values2.append(trend.rvalue)
    zipcodes2.append(zipcode)
    slopes2.append(trend.slope)
    y_ints2.append(trend.intercept)

# Keep only the zipcodes whose trend is both strongly linear (|R| > .8) and steep
# (|slope| > .3 percentage points per year), then attach their yearly rows so they can be plotted
equations2_df = pd.DataFrame({"Zipcode":zipcodes2,"R":r_values2,"Slope":slopes2,"Y-intercept":y_ints2})
equations2_df = equations2_df.loc[(equations2_df["R"].abs() > .8) & (equations2_df["Slope"].abs() > .3)]
equations2_df = pd.merge(equations2_df,saginaw_data,on="Zipcode",how="left")
equations2_df = equations2_df.sort_values(by=["Zipcode","Year"])
# y-coordinate (data units) of the first equation label; each later label sits 1.5 lower
ann = 22
for zipcode in equations2_df["Zipcode"].unique():
    filtered_list = equations2_df.loc[equations2_df["Zipcode"]== zipcode]
    x = filtered_list["Year"]
    y = filtered_list["% Poverty"]
    # Label the line itself so the legend can be built once, after the loop
    plt.plot(x,y,label=str(zipcode))
    # Slope / Y-intercept / R are constant within a zipcode after the merge, so mean() just reads the value
    plt.annotate(f"{zipcode}: y={round((filtered_list['Slope'].mean()),2)}x + {round((filtered_list['Y-intercept'].mean()),2)} R = {round((filtered_list['R'].mean()),2)}", xy=(2019,ann))
    ann -= 1.5
# Only draw a legend when at least one zipcode passed the filter
if not equations2_df.empty:
    plt.legend(loc=1)
plt.xlim(2011,2020)
plt.title("Poverty Rates in Saginaw County")
plt.xlabel("Year")
plt.ylabel("% Population in Poverty")
plt.savefig("output_data/linregress_poverty_percent_saginaw.png")
plt.show()

In [None]:
# Unemployment-rate trends by zipcode in Saginaw County
fig = plt.figure(figsize = (15,8))
# Regression results, kept as parallel lists with one entry per zipcode
r_values=[]
zipcodes=[]
slopes=[]
y_ints= []
# Fit "% Unemployed" against "Year" separately for every zipcode.
# A least-squares fit does not depend on row order, so no sorting is needed here.
for zipcode in saginaw_data["Zipcode"].unique():
    filtered = saginaw_data.loc[saginaw_data["Zipcode"] == zipcode]
    x = filtered["Year"]
    y = filtered["% Unemployed"]

    trend = linregress(x,y)
    r_values.append(trend.rvalue)
    zipcodes.append(zipcode)
    slopes.append(trend.slope)
    y_ints.append(trend.intercept)
# Keep only the zipcodes whose trend is both strongly linear (|R| > .7) and steep
# (|slope| > .55), then attach their yearly rows so they can be plotted
equations_df = pd.DataFrame({"Zipcode":zipcodes,"R":r_values,"Slope":slopes,"Y-intercept":y_ints})
equations_df = equations_df.loc[(equations_df["R"].abs() > .7) & (equations_df["Slope"].abs() > .55)]
equations_df = pd.merge(equations_df,saginaw_data,on="Zipcode",how="left")
equations_df = equations_df.sort_values(by=["Zipcode","Year"])
# y-coordinate (data units) of the first equation label; each later label sits 1 lower
ann = 8
for zipcode in equations_df["Zipcode"].unique():
    filtered_list = equations_df.loc[equations_df["Zipcode"]== zipcode]
    x = filtered_list["Year"]
    y = filtered_list["% Unemployed"]
    # Label the line itself so the legend can be built once, after the loop
    plt.plot(x,y,label=str(zipcode))
    # Slope / Y-intercept / R are constant within a zipcode after the merge, so mean() just reads the value
    plt.annotate(f"{zipcode}: y={round((filtered_list['Slope'].mean()),2)}x + {round((filtered_list['Y-intercept'].mean()),2)} R = {round((filtered_list['R'].mean()),2)}", xy=(2019,ann))
    ann -= 1
# Only draw a legend when at least one zipcode passed the filter
if not equations_df.empty:
    plt.legend(loc=1)
plt.title("Unemployment Rate in Saginaw County")
plt.xlabel("Year")
plt.ylabel("% Unemployed")
plt.savefig("output_data/linregress_unemployment_Saginaw.png")
plt.show()

In [None]:
# Year-ordered frames for zipcodes 48348 and 48360, both taken from oakland_data.
# The identifier columns and the raw count columns listed here are removed from each frame.
zip_detail_dropped_columns = [
    "Zipcode",
    "White Population",
    "Black Population",
    "Employed Count",
    "Unemployed Count",
    "Poverty Count",
    "Poverty Count - White",
    "Poverty Count - Black",
    "County",
]

# 48348 Analysis
zip48348_df = (
    oakland_data.loc[oakland_data["Zipcode"] == 48348]
    .sort_values(by="Year")
    .drop(columns=zip_detail_dropped_columns)
    .reset_index(drop=True)
)

# 48360 Analysis
zip48360_df = (
    oakland_data.loc[oakland_data["Zipcode"] == 48360]
    .sort_values(by="Year")
    .drop(columns=zip_detail_dropped_columns)
    .reset_index(drop=True)
)

In [None]:
#Formula for Zipcode Regressions
def lin_regress(x,y,x_label,y_label,loc):
    """Plot ``y`` against ``x`` together with its least-squares regression line.

    Opens a new 10x6 figure, draws the observed values in red and the fitted
    line in gray, and shows the fitted equation in the legend.

    Parameters
    ----------
    x, y : array-like
        Paired observations. ``x`` must support element-wise arithmetic
        (``slope * x + intercept``), e.g. a pandas Series or numpy array.
    x_label, y_label : str
        Axis labels; ``y_label`` is also the legend entry for the observed line.
    loc : tuple
        Currently unused. It is the position argument of the commented-out
        equation annotation and is kept so existing calls keep working.
    """
    plt_slope,plt_int,plt_r,plt_p,plt_std_err = linregress(x,y)
    fit_values = plt_slope * x + plt_int
    fig,ax = plt.subplots(figsize = (10,6))
    ax.plot(x,y,color="red",label=str(y_label))
    # The equation goes in the legend so it is visible whatever the data range is
    ax.plot(x,fit_values,color="gray",label=f"y = {round(plt_slope,2)}x + {round(plt_int,2)}")
    ax.set_ylabel(y_label)
    ax.set_xlabel(x_label)
    ax.legend()
    plt.show()

In [None]:
# Plot every column of zip48348_df against Year together with its regression line
x = zip48348_df["Year"]
x_label = "Year"
loc = (2020,0)
for column in zip48348_df:
    # Regressing Year against itself carries no information, so skip that column
    if column == "Year":
        continue
    y = zip48348_df[column]
    y_label = column
    lin_regress(x,y,x_label,y_label,loc)

In [None]:
# Plot every column of zip48360_df against Year together with its regression line
x = zip48360_df["Year"]
x_label = "Year"
loc = (2020,0)
for column in zip48360_df:
    # Regressing Year against itself carries no information, so skip that column
    if column == "Year":
        continue
    y = zip48360_df[column]
    y_label = column
    lin_regress(x,y,x_label,y_label,loc)