In [2]:
import fuzzy_pandas as fp
import pandas as pd
import os
import xlrd
import numpy as np
from sklearn.impute import SimpleImputer

In [3]:
# Load the three raw inputs: the county-level census figures, the per-state
# representation sheet, and the coder spreadsheet that is filled in below.
census, black_rep = (
    pd.read_csv(csv_name)
    for csv_name in (
        "census_data_final - census_data_final.csv",
        "Black_representation - Sheet 1.csv",
    )
)
variable_coder = pd.read_excel("VARIABLE for coder X.xlsx")


In [4]:
# Preview the coder spreadsheet; head() defaults to the first five rows.
variable_coder.head()

Unnamed: 0,STATE,GISJOIN,STATEICP,STATEFIPS,COUNTYFIPS,ALLCOUNTIES,VARIABLE NAME
0,Alabama,G0100010,41.0,1,10,Autauga,
1,Alabama,G0100030,41.0,1,30,Baldwin,
2,Alabama,G0100050,41.0,1,50,Barbour,
3,Alabama,G0100070,41.0,1,70,Bibb,
4,Alabama,G0100090,41.0,1,90,Blount,


In [5]:
# DC is not in the representation sheet, so add it by hand: every count column
# is left as NaN and only the manually derived share (1/7) is filled in.
DC_STATE = 'District Of Columbia'
DC_PERCENT_BLACK_REP = 1/7

# Guard the append so that re-running this cell cannot add a second DC row.
if not (black_rep['State'] == DC_STATE).any():
    dc_values = {'State': DC_STATE, 'percent_black_rep': DC_PERCENT_BLACK_REP}
    # Build the row by column name rather than by position, so a reordered or
    # extended sheet cannot shift a value into the wrong column.
    black_rep.loc[len(black_rep.index)] = [
        dc_values.get(column, np.nan) for column in black_rep.columns
    ]

black_rep = black_rep.sort_values(by="State")
black_rep.tail(5)

Unnamed: 0,State,total_reps,senate,representatives,*Number of justices*,attorney_general,governor,count,percent_black_rep
45,Virginia,22.0,2.0,11.0,7.0,1.0,1.0,1.0,0.045455
46,Washington,23.0,2.0,10.0,9.0,1.0,1.0,0.0,0.0
47,West Virginia,11.0,2.0,2.0,5.0,1.0,1.0,0.0,0.0
48,Wisconsin,19.0,2.0,8.0,7.0,1.0,1.0,0.0,0.0
49,Wyoming,4.0,2.0,1.0,0.0,0.0,1.0,0.0,0.0


In [7]:
# A notebook cell renders only its final expression, so the `census.head(5)`
# preview that used to precede this lookup was computed and then discarded.
# Keep just the DC lookup, which is what this cell actually displays.
census.loc[census['STNAME'] == 'District of Columbia']

Unnamed: 0.1,Unnamed: 0,STNAME,CTYNAME,black_population,total_population,cty_percentage
320,321,District of Columbia,District of Columbia,7759064,16151082,0.480405


This is actually much easier than I imagined at first: if the variable is just 0 or 1, I can create a column of 0s and 1s with one value per state, and then join it to the variable coder spreadsheet on the state name.

In [6]:
# Keep only the state name and its representation share for the comparison.
rep_columns = ['State', 'percent_black_rep']
percentage_rep = black_rep[rep_columns]
percentage_rep.head()

Unnamed: 0,State,percent_black_rep
0,Alabama,0.05
1,Alaska,0.0
2,Arizona,0.0
3,Arkansas,0.0
4,California,0.053571


In [7]:
# Average the county-level percentages within each state (one row per STNAME).
# NOTE(review): this is an unweighted mean over counties, which is not the same
# as the statewide share (sum of black_population / sum of total_population) --
# confirm which of the two the metric is meant to use.
census_agg = census.groupby('STNAME')[['cty_percentage']].mean()
census_agg.head()

Unnamed: 0_level_0,cty_percentage
STNAME,Unnamed: 1_level_1
Alabama,0.208823
Alaska,0.017298
Arizona,0.022668
Arkansas,0.143197
California,0.044039


In [8]:
# Attach each state's census share to its representation share. The two sources
# do not spell every name identically ("District Of Columbia" was added to the
# representation data, while the census uses "District of Columbia"), so match
# on a lower-cased key; a plain join leaves such rows with a NaN census share.
census_by_key = census_agg.copy()
census_by_key.index = census_by_key.index.str.lower()

final_df = (
    percentage_rep
    .assign(state_key=percentage_rep['State'].str.lower())
    .join(census_by_key, how="left", on="state_key")
    .drop(columns="state_key")
)

# A comparison against NaN is always False, so a state without a census share
# would quietly be coded 1. Keep that fallback, but make it visible.
unmatched_states = final_df.loc[final_df['cty_percentage'].isna(), 'State'].tolist()
if unmatched_states:
    print(f"WARNING: no census share found for {unmatched_states}; these rows default to GR.LRA3 = 1")

# 0 = representation share is strictly greater than the population share,
# 1 = otherwise (ties included). Computed column-wise, so the result does not
# depend on the index labels happening to be 0..n-1.
final_df['GR.LRA3'] = np.where(
    final_df['percent_black_rep'] > final_df['cty_percentage'], 0, 1
)


print(f"There are {final_df['GR.LRA3'].sum()} total states where the percentage of the Black population is greater than the percentage of Black representation at the state level. In other words, this is the total number of States where Black representation is insufficient by our metrics")

There are 38 total states where the percentage of the Black population is greater than the percentage of Black representation at the state level. In other words, this is the total number of States where Black representation is insufficient by our metrics


In [9]:
# Rename the placeholder column to the variable being coded, then copy each
# state's flag onto every row of the coder spreadsheet that belongs to it.
variable_coder = variable_coder.rename(columns={'VARIABLE NAME': 'GR.LRA3'})

for state, rep in zip(final_df['State'], final_df['GR.LRA3']):
    print(f"The current state is {state} and the GR value is {rep}")
    in_state = variable_coder['STATE'] == state
    variable_coder.loc[in_state, 'GR.LRA3'] = rep
    

The current state is Alabama and the GR value is 1
The current state is Alaska and the GR value is 1
The current state is Arizona and the GR value is 1
The current state is Arkansas and the GR value is 1
The current state is California and the GR value is 0
The current state is Colorado and the GR value is 0
The current state is Connecticut and the GR value is 0
The current state is Delaware and the GR value is 0
The current state is District Of Columbia and the GR value is 1
The current state is Florida and the GR value is 1
The current state is Georgia and the GR value is 1
The current state is Hawaii and the GR value is 1
The current state is Idaho and the GR value is 1
The current state is Illinois and the GR value is 0
The current state is Indiana and the GR value is 1
The current state is Iowa and the GR value is 1
The current state is Kansas and the GR value is 1
The current state is Kentucky and the GR value is 1
The current state is Louisiana and the GR value is 1
The current 

<p> The extra states/territories are Alaska Territory, Puerto Rico, and Washington DC. I'm not sure what to do with them yet, so I'll set them aside for now and raise it with Dr. AJ later.</p>

In [11]:
# States that are absent from final_df never received a value. Check that the
# missing rows are exactly the Alaska Territory and Puerto Rico rows. This is
# compared row by row, because matching counts alone would also pass if a
# different set of rows happened to be missing.
expected_missing = variable_coder['STATE'].isin(["Alaska Territory", "Puerto Rico"])
actual_missing = variable_coder['GR.LRA3'].isna()
assert (actual_missing == expected_missing).all(), (
    "Unexpected rows are missing GR.LRA3: "
    f"{variable_coder.loc[actual_missing != expected_missing, 'STATE'].astype(str).unique().tolist()}"
)

^^ Confirming that all of our NAs come from Puerto Rico or Alaska Territory (DC was already handled above).

Next steps: 

* Impute missing data for rows where our state is PR, Alaska and DC 
* Find a way to join the percentage of black population by county to the variable coder spreadsheet 
  * Idea: Take the state and county column from the variable coder spreadsheet and do a left join (with it on the inside) with the cty_percentage dataframe. Any rows with missing info were not in the cty_percentage dataframe so I can just impute those with the average from the state itself. 
  * I'm not too sure what to do with the entire states/territories that weren't in our percentage dataframe; 
    * Maybe impute with the national avg? 
    

For Alaska Territory, I imputed the GR.LRA3 variable using the value for "Alaska". To keep things consistent,
I think I also need to impute the percentage of the Black population using the mean for all counties in Alaska.

In [12]:
# Alaska Territory has no flag of its own, so reuse the one computed for Alaska.
alaska_flag = variable_coder.loc[variable_coder['STATE'] == "Alaska", 'GR.LRA3'].iloc[0]
is_alaska_territory = variable_coder['STATE'] == "Alaska Territory"
variable_coder.loc[is_alaska_territory, 'GR.LRA3'] = alaska_flag

<p> I actually have no idea what to do for Puerto Rico so I will ask in the meeting tomorrow; PR does not have the same state elected positions that all of the other states and territories have so it probably doesn't make sense to just assign it one way or another. I could just impute it with the mode of the column, but I want to double check. 
</p>


<p>
    I also don't really know what to do about the Washington DC stuff; I could theoretically look through each of the eligible elected positions and then see the percentage of Black folks for DC. It probably won't be too bad so I'll just do this. 
</p>

* Congress (house and senate);  5 congress and 0 senators 
  * 0 Black
* State governor; Questionable because they only have a mayor, will not give it to them based on this *https://statehood.dc.gov/page/dc-governance#:~:text=Like%20a%20governor%2C%20the%20Mayor,before%20a%20law%20takes%20effect.* Lack of autonomy section
* State supreme court justices (only those that are elected); appointed by the president 
* State attorney general; they have an attorney general 
  * 1 Black, 2 possible during the time span 
  
This means DC has $1/7 \approx 14.3\%$ Black representation (1 Black official out of 7 eligible positions). I will manually add this to the dataframe (done above).

In [13]:
# Give the Puerto Rico rows the same flag as Alaska.
# NOTE(review): the next cell overwrites these rows with the column mode, so
# this looks like a leftover placeholder -- confirm whether it can be dropped.
alaska_value = variable_coder.loc[variable_coder['STATE'] == "Alaska", 'GR.LRA3'].iloc[0]
puerto_rico_rows = variable_coder['STATE'] == "Puerto Rico"
variable_coder.loc[puerto_rico_rows, 'GR.LRA3'] = alaska_value

In [14]:
# Impute Puerto Rico with the most common flag among all *other* rows.
# Excluding the Puerto Rico rows keeps the placeholder value assigned to them
# in the previous cell from counting toward the mode it is about to be
# replaced with.
is_puerto_rico = variable_coder['STATE'] == "Puerto Rico"
most_common_flag = variable_coder.loc[~is_puerto_rico, 'GR.LRA3'].mode().iloc[0]
variable_coder.loc[is_puerto_rico, 'GR.LRA3'] = most_common_flag



In [15]:
# List any rows that still lack a GR.LRA3 value; an empty frame means none remain.
still_missing = variable_coder['GR.LRA3'].isna()
variable_coder[still_missing]

Unnamed: 0,STATE,GISJOIN,STATEICP,STATEFIPS,COUNTYFIPS,ALLCOUNTIES,GR.LRA3


In [17]:
# Write the completed coder sheet. index=False keeps pandas' row index out of
# the file; otherwise it is saved as an extra unnamed first column that comes
# back as "Unnamed: 0" the next time the CSV is read.
variable_coder.to_csv("final_variable_coder.csv", index=False)