In [1]:
import pandas as pd
import numpy as np
import camelot # Used to read pdfs and vonvert them to tables. documentation: https://camelot-py.readthedocs.io/en/master/

# About original data

The legislative composition data for the years 2017-2023 comes from the National Conference of State Legislatures (NCSL) [website](https://www.ncsl.org/about-state-legislatures/state-partisan-composition). The data gives the party composition of the state legislature for each state, along with the party of each governor. Because the data is stored in PDF files, it needed some preprocessing to convert it into the CSV format we want. This notebook converts the PDF data into CSV data that will be usable for our purposes. As this data comes directly from the NCSL, it seems to be pretty reliable.

# Reading in pdfs and converting to csv

In [3]:
# Extract the tables from each yearly PDF (2017 through 2023) and export them as CSV.
# Later cells read files named Legis_Control_<year>-page-1-table-1.csv, so camelot
# evidently adds a page/table suffix to the export path given here.
for year in range(2017,2024):
    pdf_path = f'Data/Legislative_Control_Data/Control_pdfs/Legis_Control_{year}.pdf'
    csv_path = f'Data/Legislative_Control_Data/Control_readin_csvs/Legis_Control_{year}.csv'
    table = camelot.read_pdf(pdf_path)
    table.export(csv_path, f='csv')

# Data wrangling for legislative control years 2017 to 2023

I originally thought I would have data up to 2023, which is why I prepared the state legislative control data for these extra years. I am leaving them in this repository because they may be of use for future study.

## 2017 Data

In [4]:
# Load the page-1/table-1 CSV that was exported from the 2017 PDF
legis_control_2017 = pd.read_csv(
    'Data/Legislative_Control_Data/Control_readin_csvs/Legis_Control_2017-page-1-table-1.csv'
)
# Preview the first five rows to see how well the PDF was parsed
legis_control_2017.head(n=5)

Unnamed: 0,STATE,Total \nSeats,Total \nSenate \nSenate \nSenate \nSenate\nDem.\nRep.\nother,Unnamed: 3,Unnamed: 4,Unnamed: 5,Unnamed: 6,Total \nHouse \nHouse \nHouse \nHouse\nDem.\nRep.\nother,Unnamed: 8,Unnamed: 9,Unnamed: 10,Unnamed: 11,Legis. \nControl,Gov. \nParty,State \nControl
0,Alabama\nAlaska*\nArizona\nArkansas\nCalifornia,140\n60\n90\n135\n120\n100\n187\n62\n160\n236\...,35\n20\n30\n35\n40\n35\n36\n21\n40\n56\n25\n35...,8,26,1.0,,105,32,72,1v,,Rep,Rep,Rep
1,,,,6,14,,,40,17,21,2,,Rep*,Ind,Divided
2,,,,13,17,,,60,25,35,,,Rep,Rep,Rep
3,,,,9,26,,,100,24,76,,,Rep,Rep,Rep
4,,,,27\n17,13\n18,,,80\n65,55\n37,25\n28,,,Dem\nSplit,Dem\nDem,Dem\nDivided


As we can see, the PDF reader did not do a great job of reading the data into a dataframe. We will need to clean it up to make it usable.

### Data Wrangling

In [5]:
# Rebuild clean per-state lists from the 2017 camelot export.
# A single cell may hold several table rows separated by '\n', so for each column we:
#   1) drop the null cells,
#   2) join the remaining cells into one string with '\n',
#   3) split that string back into one entry per state.
# The Series is passed straight to str.join: Series.ravel() is deprecated
# (pandas 2.2+) and is unnecessary for an already one-dimensional Series.

# State names: additionally strip the asterisks (footnote markers) and keep the first 50 entries
states = '\n'.join(legis_control_2017['STATE'].dropna()).replace('*','').split('\n')[0:50]

states.remove('Nebraska') #Removes Nebraska as we will not be studying it due to its unique state legislature

# Seat counts, converted from strings to integers as they are extracted.
# Only 49 entries are kept: Nebraska's cells in these columns are already null,
# so they disappear with dropna() and need no separate removal.
senate_dem = [int(num) for num in '\n'.join(legis_control_2017['Unnamed: 3'].dropna()).split('\n')[0:49]] #Senate Democrats
senate_rep = [int(num) for num in '\n'.join(legis_control_2017['Unnamed: 4'].dropna()).split('\n')[0:49]] #Senate Republicans
house_dem = [int(num) for num in '\n'.join(legis_control_2017['Unnamed: 8'].dropna()).split('\n')[0:49]] #House Democrats
house_rep = [int(num) for num in '\n'.join(legis_control_2017['Unnamed: 9'].dropna()).split('\n')[0:49]] #House Republicans

# The governor column does include Nebraska, so it is read for all 50 states first
gov_party = '\n'.join(legis_control_2017['Gov. \nParty'].dropna()).split('\n')[0:50] #Party of the Governor
# Index 26 is Nebraska's position when the 50 states are in alphabetical order
# (assumes the governor column follows the same order as the STATE column)
del gov_party[26] #Get rid of Nebraska's Governor Party

In [6]:
# Assemble the cleaned 2017 lists into a single frame, one row per state
legis_2017 = pd.DataFrame({
    'states': states,
    'senate_dem': senate_dem,
    'senate_rep': senate_rep,
    'house_dem': house_dem,
    'house_rep': house_rep,
    'gov_party': gov_party,
})

legis_2017

Unnamed: 0,states,senate_dem,senate_rep,house_dem,house_rep,gov_party
0,Alabama,8,26,32,72,Rep
1,Alaska,6,14,17,21,Ind
2,Arizona,13,17,25,35,Rep
3,Arkansas,9,26,24,76,Rep
4,California,27,13,55,25,Dem
5,Colorado,17,18,37,28,Dem
6,Connecticut,18,18,79,72,Dem
7,Delaware,11,10,25,16,Dem
8,Florida,15,25,41,79,Rep
9,Georgia,18,38,62,118,Rep


All looks well, except that Louisiana's governor party was read in as "Rep Dem" when it should be "Dem".

In [7]:
# The extracted table gives Louisiana's governor party as 'Rep Dem'; the correct value is 'Dem'.
# Only the gov_party column is rewritten (so no other column can be touched by accident),
# and the result is assigned back instead of relying on inplace=True.
legis_2017['gov_party'] = legis_2017['gov_party'].replace('Rep Dem', 'Dem')

In [8]:
# Print the row count, column dtypes and non-null counts as a completeness check on the cleaned 2017 frame
legis_2017.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 49 entries, 0 to 48
Data columns (total 6 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   states      49 non-null     object
 1   senate_dem  49 non-null     int64 
 2   senate_rep  49 non-null     int64 
 3   house_dem   49 non-null     int64 
 4   house_rep   49 non-null     int64 
 5   gov_party   49 non-null     object
dtypes: int64(4), object(2)
memory usage: 2.4+ KB


## 2018

In [9]:
# Load the page-1/table-1 CSV that was exported from the 2018 PDF
legis_control_2018 = pd.read_csv(
    'Data/Legislative_Control_Data/Control_readin_csvs/Legis_Control_2018-page-1-table-1.csv'
)
# Preview the first five rows to see how well the PDF was parsed
legis_control_2018.head(n=5)

Unnamed: 0,STATE,Total \nSeats,Unnamed: 2,Total \nSenate \nSenate \nSenate \nSenate\nDem.\nRep.\nother,Unnamed: 4,Unnamed: 5,Unnamed: 6,Unnamed: 7,Total \nHouse \nHouse \nHouse \nHouse\nDem.\nRep.\nother,Unnamed: 9,Unnamed: 10,Unnamed: 11,Legis. \nControl,Gov. \nParty,State \nControl
0,Alabama\nAlaska*\nArizona\nArkansas\nCalifornia,140\n60\n90\n135\n120\n100\n187\n62\n160\n236\...,,35\n20\n30\n35\n40,7,26,"1, 1v",,105,33,70,2v,Rep,Rep,Rep
1,,,,,6,14,,,40,16,21,"2, 1v",Rep*,Ind,Divided
2,,,,,13,16,1v,,60,25,35,,Rep,Rep,Rep
3,,,,,9,24,2v,,100,24,75,1v,Rep,Rep,Rep
4,,,,,27,13,,,80,52,25,3v,Dem,Dem,Dem


This is slightly different from 2017, but much of the same process will be used.

### Data Wrangling

In [10]:
# Rebuild clean per-state lists from the 2018 camelot export.
# A cell may hold several table rows separated by '\n', so each column's non-null
# cells are joined with '\n' and split again to give one entry per state.
# The Series is passed straight to str.join: Series.ravel() is deprecated
# (pandas 2.2+) and is unnecessary for an already one-dimensional Series.

# Seat counts, converted from strings to integers as they are extracted.
# Only 49 entries are kept because Nebraska is excluded
# (assumes Nebraska's cells in these columns are null -- TODO confirm for this year)
senate_dem = [int(num) for num in '\n'.join(legis_control_2018['Unnamed: 4'].dropna()).split('\n')[0:49]] #Senate Democrats
senate_rep = [int(num) for num in '\n'.join(legis_control_2018['Unnamed: 5'].dropna()).split('\n')[0:49]] #Senate Republicans
house_dem = [int(num) for num in '\n'.join(legis_control_2018['Unnamed: 9'].dropna()).split('\n')[0:49]] #House Democrats
house_rep = [int(num) for num in '\n'.join(legis_control_2018['Unnamed: 10'].dropna()).split('\n')[0:49]] #House Republicans

# The governor column is read for all 50 states, Nebraska included
gov_party = '\n'.join(legis_control_2018['Gov. \nParty'].dropna()).split('\n')[0:50] #Party of the Governor
# Index 26 is Nebraska's position when the 50 states are in alphabetical order
# (assumes the governor column is in alphabetical state order)
del gov_party[26] #Get rid of Nebraska's Governor Party

In [11]:
# Assemble the cleaned 2018 lists into a single frame, one row per state
legis_2018 = pd.DataFrame({
    'states': states,
    'senate_dem': senate_dem,
    'senate_rep': senate_rep,
    'house_dem': house_dem,
    'house_rep': house_rep,
    'gov_party': gov_party,
})

legis_2018

Unnamed: 0,states,senate_dem,senate_rep,house_dem,house_rep,gov_party
0,Alabama,7,26,33,70,Rep
1,Alaska,6,14,16,21,Ind
2,Arizona,13,16,25,35,Rep
3,Arkansas,9,24,24,75,Rep
4,California,27,13,52,25,Dem
5,Colorado,16,18,37,27,Dem
6,Connecticut,18,18,78,71,Dem
7,Delaware,11,10,25,16,Dem
8,Florida,15,24,40,76,Rep
9,Georgia,19,36,64,114,Rep


Interestingly enough, this is the exact same issue as in the 2017 data: Louisiana shows 'Rep Dem' for the governor when it should just be 'Dem'.

In [12]:
# The extracted table gives Louisiana's governor party as 'Rep Dem'; the correct value is 'Dem'.
# Only the gov_party column is rewritten (so no other column can be touched by accident),
# and the result is assigned back instead of relying on inplace=True.
legis_2018['gov_party'] = legis_2018['gov_party'].replace('Rep Dem', 'Dem')

In [13]:
# Print the row count, column dtypes and non-null counts as a completeness check on the cleaned 2018 frame
legis_2018.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 49 entries, 0 to 48
Data columns (total 6 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   states      49 non-null     object
 1   senate_dem  49 non-null     int64 
 2   senate_rep  49 non-null     int64 
 3   house_dem   49 non-null     int64 
 4   house_rep   49 non-null     int64 
 5   gov_party   49 non-null     object
dtypes: int64(4), object(2)
memory usage: 2.4+ KB


## 2019

In [14]:
# Load the page-1/table-1 CSV that was exported from the 2019 PDF
legis_control_2019 = pd.read_csv(
    'Data/Legislative_Control_Data/Control_readin_csvs/Legis_Control_2019-page-1-table-1.csv'
)
# Preview the first five rows to see how well the PDF was parsed
legis_control_2019.head(n=5)

Unnamed: 0,STATE,Total \nSeats,Unnamed: 2,Total \nSenate \nSenate \nSenate \nSenate\nDem.\nRep.\nother,Unnamed: 4,Unnamed: 5,Unnamed: 6,Unnamed: 7,Total \nHouse \nHouse \nHouse \nHouse\nDem.\nRep.\nother,Unnamed: 9,Unnamed: 10,Unnamed: 11,Legis. \nControl,Gov. \nParty,State \nControl
0,Alabama\nAlaska\nArizona\nArkansas\nCalifornia,140\n60\n90\n135\n120\n100\n187\n62\n160\n236\...,,35\n20\n30\n35\n40,8,27,,,105,28,77,,Rep,Rep,Rep
1,,,,,7,13,,,40,19,21,,Rep,Rep,Rep
2,,,,,13,17,,,60,29,31,,Rep,Rep,Rep
3,,,,,9,26,,,100,24,76,,Rep,Rep,Rep
4,,,,,29,11,,,80,61,19,,Dem,Dem,Dem


This looks very similar to 2018, and we will follow a very similar process again.

### Data Wrangling

In [15]:
# Rebuild clean per-state lists from the 2019 camelot export.
# A cell may hold several table rows separated by '\n', so each column's non-null
# cells are joined with '\n' and split again to give one entry per state.
# The Series is passed straight to str.join: Series.ravel() is deprecated
# (pandas 2.2+) and is unnecessary for an already one-dimensional Series.

# Seat counts, converted from strings to integers as they are extracted.
# Only 49 entries are kept because Nebraska is excluded
# (assumes Nebraska's cells in these columns are null -- TODO confirm for this year)
senate_dem = [int(num) for num in '\n'.join(legis_control_2019['Unnamed: 4'].dropna()).split('\n')[0:49]] #Senate Democrats
senate_rep = [int(num) for num in '\n'.join(legis_control_2019['Unnamed: 5'].dropna()).split('\n')[0:49]] #Senate Republicans
house_dem = [int(num) for num in '\n'.join(legis_control_2019['Unnamed: 9'].dropna()).split('\n')[0:49]] #House Democrats
house_rep = [int(num) for num in '\n'.join(legis_control_2019['Unnamed: 10'].dropna()).split('\n')[0:49]] #House Republicans

# The governor column is read for all 50 states, Nebraska included
gov_party = '\n'.join(legis_control_2019['Gov. \nParty'].dropna()).split('\n')[0:50] #Party of the Governor
# Index 26 is Nebraska's position when the 50 states are in alphabetical order
# (assumes the governor column is in alphabetical state order)
del gov_party[26] #Get rid of Nebraska's Governor Party

In [16]:
# Assemble the cleaned 2019 lists into a single frame, one row per state
legis_2019 = pd.DataFrame({
    'states': states,
    'senate_dem': senate_dem,
    'senate_rep': senate_rep,
    'house_dem': house_dem,
    'house_rep': house_rep,
    'gov_party': gov_party,
})

legis_2019

Unnamed: 0,states,senate_dem,senate_rep,house_dem,house_rep,gov_party
0,Alabama,8,27,28,77,Rep
1,Alaska,7,13,19,21,Rep
2,Arizona,13,17,29,31,Rep
3,Arkansas,9,26,24,76,Rep
4,California,29,11,61,19,Dem
5,Colorado,19,16,41,24,Dem
6,Connecticut,24,12,92,59,Dem
7,Delaware,12,9,26,15,Dem
8,Florida,17,23,47,73,Rep
9,Georgia,21,35,74,106,Rep


Louisiana does not need to be fixed this time!

In [17]:
# Print the row count, column dtypes and non-null counts as a completeness check on the cleaned 2019 frame
legis_2019.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 49 entries, 0 to 48
Data columns (total 6 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   states      49 non-null     object
 1   senate_dem  49 non-null     int64 
 2   senate_rep  49 non-null     int64 
 3   house_dem   49 non-null     int64 
 4   house_rep   49 non-null     int64 
 5   gov_party   49 non-null     object
dtypes: int64(4), object(2)
memory usage: 2.4+ KB


## 2020

In [18]:
# Load the page-1/table-1 CSV that was exported from the 2020 PDF
legis_control_2020 = pd.read_csv(
    'Data/Legislative_Control_Data/Control_readin_csvs/Legis_Control_2020-page-1-table-1.csv'
)
# Preview the first five rows to see how well the PDF was parsed
legis_control_2020.head(n=5)

Unnamed: 0,STATE,Total \nSeats,Total \nSenate \nSenate \nSenate \nSenate\nDem.\nRep.\nother,Unnamed: 3,Unnamed: 4,Unnamed: 5,Total \nHouse \nHouse \nHouse \nHouse\nDem.\nRep.\nother,Unnamed: 7,Unnamed: 8,Unnamed: 9,Legis. \nControl,Gov. \nParty,State \nControl
0,Alabama\nAlaska\nArizona\nArkansas\nCalifornia,140\n60\n90\n135\n120\n100\n187\n62\n160\n236\...,35\n20\n30\n35\n40,8,27,,105,28,77,,Rep,Rep,Rep
1,,,,7,13,,40,15,23,2.0,Rep,Rep,Rep
2,,,,13,17,,60,29,31,,Rep,Rep,Rep
3,,,,9,26,,100,24,76,,Rep,Rep,Rep
4,,,,29,10,1v,80,61,18,1.0,Dem,Dem,Dem


This one looks very similar to 2017, and we will take an almost identical approach.

### Data Wrangling

In [19]:
# Rebuild clean per-state lists from the 2020 camelot export.
# A cell may hold several table rows separated by '\n', so each column's non-null
# cells are joined with '\n' and split again to give one entry per state.
# The Series is passed straight to str.join: Series.ravel() is deprecated
# (pandas 2.2+) and is unnecessary for an already one-dimensional Series.

# Seat counts, converted from strings to integers as they are extracted.
# Only 49 entries are kept: Nebraska's cells in these columns are already null,
# so they disappear with dropna() and need no separate removal.
senate_dem = [int(num) for num in '\n'.join(legis_control_2020['Unnamed: 3'].dropna()).split('\n')[0:49]] #Senate Democrats
senate_rep = [int(num) for num in '\n'.join(legis_control_2020['Unnamed: 4'].dropna()).split('\n')[0:49]] #Senate Republicans
house_dem = [int(num) for num in '\n'.join(legis_control_2020['Unnamed: 7'].dropna()).split('\n')[0:49]] #House Democrats
house_rep = [int(num) for num in '\n'.join(legis_control_2020['Unnamed: 8'].dropna()).split('\n')[0:49]] #House Republicans

# The governor column does include Nebraska, so it is read for all 50 states first
gov_party = '\n'.join(legis_control_2020['Gov. \nParty'].dropna()).split('\n')[0:50] #Party of the Governor
# Index 26 is Nebraska's position when the 50 states are in alphabetical order
# (assumes the governor column is in alphabetical state order)
del gov_party[26] #Get rid of Nebraska's Governor Party

In [20]:
# Assemble the cleaned 2020 lists into a single frame, one row per state
legis_2020 = pd.DataFrame({
    'states': states,
    'senate_dem': senate_dem,
    'senate_rep': senate_rep,
    'house_dem': house_dem,
    'house_rep': house_rep,
    'gov_party': gov_party,
})

legis_2020

Unnamed: 0,states,senate_dem,senate_rep,house_dem,house_rep,gov_party
0,Alabama,8,27,28,77,Rep
1,Alaska,7,13,15,23,Rep
2,Arizona,13,17,29,31,Rep
3,Arkansas,9,26,24,76,Rep
4,California,29,10,61,18,Dem
5,Colorado,19,16,41,24,Dem
6,Connecticut,22,14,91,60,Dem
7,Delaware,12,9,26,15,Dem
8,Florida,17,23,47,73,Rep
9,Georgia,21,35,75,105,Rep


Looks good.

In [21]:
# Print the row count, column dtypes and non-null counts as a completeness check on the cleaned 2020 frame
legis_2020.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 49 entries, 0 to 48
Data columns (total 6 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   states      49 non-null     object
 1   senate_dem  49 non-null     int64 
 2   senate_rep  49 non-null     int64 
 3   house_dem   49 non-null     int64 
 4   house_rep   49 non-null     int64 
 5   gov_party   49 non-null     object
dtypes: int64(4), object(2)
memory usage: 2.4+ KB


## 2021

In [22]:
# Load the page-1/table-1 CSV that was exported from the 2021 PDF
legis_control_2021 = pd.read_csv(
    'Data/Legislative_Control_Data/Control_readin_csvs/Legis_Control_2021-page-1-table-1.csv'
)
# Preview the first five rows to see how well the PDF was parsed
legis_control_2021.head(n=5)

Unnamed: 0,STATE,Total \nSeats,Total \nSenate \nSenate \nSenate \nSenate\nDem.\nRep.\nother,Unnamed: 3,Unnamed: 4,Unnamed: 5,Total \nHouse \nHouse \nHouse \nLeg. \nHouse\nDem.\nRep.\nother\nControl,Unnamed: 7,Unnamed: 8,Unnamed: 9,Unnamed: 10,Gov. \nParty,State \nControl
0,Alabama\nAlaska\nArizona\nArkansas\nCalifornia,140\n60\n90\n135\n120\n100\n187\n62\n160\n236\...,35\n20\n30\n35\n40\n35\n36\n21\n40\n56\n25\n35...,7,26,2v,105\n40\n60\n100\n80\n65\n151\n41\n120\n180\n5...,28,77,,Rep,Rep,Rep
1,,,,7,13,,,15,21,4,Rep,Rep,Rep
2,,,,14,16,,,28,31,1v,Rep,Rep,Rep
3,,,,7,27,1,,22,78,,Rep,Rep,Rep
4,,,,30,9,1v,,60,19,1,Dem,Dem,Dem


### Data Wrangling

In [23]:
# Rebuild clean per-state lists from the 2021 camelot export.
# A cell may hold several table rows separated by '\n', so each column's non-null
# cells are joined with '\n' and split again to give one entry per state.
# The Series is passed straight to str.join: Series.ravel() is deprecated
# (pandas 2.2+) and is unnecessary for an already one-dimensional Series.

# Seat counts, converted from strings to integers as they are extracted.
# Only 49 entries are kept: Nebraska's cells in these columns are already null,
# so they disappear with dropna() and need no separate removal.
senate_dem = [int(num) for num in '\n'.join(legis_control_2021['Unnamed: 3'].dropna()).split('\n')[0:49]] #Senate Democrats
senate_rep = [int(num) for num in '\n'.join(legis_control_2021['Unnamed: 4'].dropna()).split('\n')[0:49]] #Senate Republicans
house_dem = [int(num) for num in '\n'.join(legis_control_2021['Unnamed: 7'].dropna()).split('\n')[0:49]] #House Democrats
house_rep = [int(num) for num in '\n'.join(legis_control_2021['Unnamed: 8'].dropna()).split('\n')[0:49]] #House Republicans

# The governor column does include Nebraska, so it is read for all 50 states first
gov_party = '\n'.join(legis_control_2021['Gov. \nParty'].dropna()).split('\n')[0:50] #Party of the Governor
# Index 26 is Nebraska's position when the 50 states are in alphabetical order
# (assumes the governor column is in alphabetical state order)
del gov_party[26] #Get rid of Nebraska's Governor Party

In [24]:
# Assemble the cleaned 2021 lists into a single frame, one row per state
legis_2021 = pd.DataFrame({
    'states': states,
    'senate_dem': senate_dem,
    'senate_rep': senate_rep,
    'house_dem': house_dem,
    'house_rep': house_rep,
    'gov_party': gov_party,
})

legis_2021

Unnamed: 0,states,senate_dem,senate_rep,house_dem,house_rep,gov_party
0,Alabama,7,26,28,77,Rep
1,Alaska,7,13,15,21,Rep
2,Arizona,14,16,28,31,Rep
3,Arkansas,7,27,22,78,Rep
4,California,30,9,60,19,Dem
5,Colorado,20,15,41,24,Dem
6,Connecticut,23,12,97,54,Dem
7,Delaware,14,7,26,15,Dem
8,Florida,16,24,42,78,Rep
9,Georgia,22,34,76,103,Rep


Looks good.

## 2022

In [25]:
# Load the page-1/table-1 CSV that was exported from the 2022 PDF
legis_control_2022 = pd.read_csv(
    'Data/Legislative_Control_Data/Control_readin_csvs/Legis_Control_2022-page-1-table-1.csv'
)
# Preview the first five rows to see how well the PDF was parsed
legis_control_2022.head(n=5)

Unnamed: 0,STATE,Total \nSeats,Total \nSenate \nSenate \nSenate \nSenate\nDem.\nRep.\nother,Unnamed: 3,Unnamed: 4,Unnamed: 5,Total \nHouse \nHouse \nHouse \nLeg. \nGov. \nState \nHouse\nDem.\nRep.\nother\nControl\nParty\nControl,Unnamed: 7,Unnamed: 8,Unnamed: 9,Unnamed: 10,Unnamed: 11,Unnamed: 12
0,Alabama\nAlaska\nArizona\nArkansas\nCalifornia,140\n60\n90\n135\n120\n100\n187\n62\n160\n236\...,35\n20\n30\n35\n40\n35\n36\n21\n40\n56\n25\n35...,8,27,,105\n40\n60\n100\n80\n65\n151\n41\n120\n180\n5...,28,77,,Rep,Rep,Rep
1,,,,9,11,,,13,21,6.0,Rep,Rep,Rep
2,,,,14,16,,,29,31,,Rep,Dem,Divided
3,,,,6,29,,,18,82,,Rep,Rep,Rep
4,,,,32,8,,,62,18,,Dem,Dem,Dem


### Data Wrangling

In [26]:
# Rebuild clean per-state lists from the 2022 camelot export.
# A cell may hold several table rows separated by '\n', so each column's non-null
# cells are joined with '\n' and split again to give one entry per state.
# The Series is passed straight to str.join: Series.ravel() is deprecated
# (pandas 2.2+) and is unnecessary for an already one-dimensional Series.

# Seat counts, converted from strings to integers as they are extracted.
# Only 49 entries are kept: Nebraska's cells in these columns are already null,
# so they disappear with dropna() and need no separate removal.
senate_dem = [int(num) for num in '\n'.join(legis_control_2022['Unnamed: 3'].dropna()).split('\n')[0:49]] #Senate Democrats
senate_rep = [int(num) for num in '\n'.join(legis_control_2022['Unnamed: 4'].dropna()).split('\n')[0:49]] #Senate Republicans
house_dem = [int(num) for num in '\n'.join(legis_control_2022['Unnamed: 7'].dropna()).split('\n')[0:49]] #House Democrats
house_rep = [int(num) for num in '\n'.join(legis_control_2022['Unnamed: 8'].dropna()).split('\n')[0:49]] #House Republicans

# In this year's export the governor party is read from the 'Unnamed: 11' column.
# It does include Nebraska, so it is read for all 50 states first.
gov_party = '\n'.join(legis_control_2022['Unnamed: 11'].dropna()).split('\n')[0:50] #Party of the Governor
# Index 26 is Nebraska's position when the 50 states are in alphabetical order
# (assumes the governor column is in alphabetical state order)
del gov_party[26] #Get rid of Nebraska's Governor Party

In [27]:
# Assemble the cleaned 2022 lists into a single frame, one row per state
legis_2022 = pd.DataFrame({
    'states': states,
    'senate_dem': senate_dem,
    'senate_rep': senate_rep,
    'house_dem': house_dem,
    'house_rep': house_rep,
    'gov_party': gov_party,
})

legis_2022

Unnamed: 0,states,senate_dem,senate_rep,house_dem,house_rep,gov_party
0,Alabama,8,27,28,77,Rep
1,Alaska,9,11,13,21,Rep
2,Arizona,14,16,29,31,Dem
3,Arkansas,6,29,18,82,Rep
4,California,32,8,62,18,Dem
5,Colorado,23,12,46,19,Dem
6,Connecticut,24,12,98,53,Dem
7,Delaware,15,6,26,15,Dem
8,Florida,12,28,35,85,Rep
9,Georgia,23,33,79,101,Rep


Looks good, except that the Massachusetts governor should be just "Dem" and not "Dem Dem Dem".

In [28]:
# The extracted table gives the Massachusetts governor party as 'Dem Dem Dem'; the correct value is 'Dem'.
# Only the gov_party column is rewritten (so no other column can be touched by accident),
# and the result is assigned back instead of relying on inplace=True.
legis_2022['gov_party'] = legis_2022['gov_party'].replace('Dem Dem Dem', 'Dem')

In [29]:
# Print the row count, column dtypes and non-null counts as a completeness check on the cleaned 2022 frame
legis_2022.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 49 entries, 0 to 48
Data columns (total 6 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   states      49 non-null     object
 1   senate_dem  49 non-null     int64 
 2   senate_rep  49 non-null     int64 
 3   house_dem   49 non-null     int64 
 4   house_rep   49 non-null     int64 
 5   gov_party   49 non-null     object
dtypes: int64(4), object(2)
memory usage: 2.4+ KB


## 2023

In [30]:
# Load the page-1/table-1 CSV that was exported from the 2023 PDF
legis_control_2023 = pd.read_csv(
    'Data/Legislative_Control_Data/Control_readin_csvs/Legis_Control_2023-page-1-table-1.csv'
)
# Preview the first five rows to see how well the PDF was parsed
legis_control_2023.head(n=5)

Unnamed: 0,STATE,Total \nSeats,Total \nSenate,Senate \nDem.,Senate \nRep.,Senate \nother,Unnamed: 6,Total \nHouse,House \nDem.,House \nRep.,House \nother,Leg. \nControl,Gov. \nParty,State \nControl
0,Alabama\nAlaska\nArizona\nArkansas\nCalifornia,140\n60\n90\n135\n120\n100\n187\n62\n160\n236\...,35,8,27,,,105,28,77,,Rep,Rep,Rep
1,,,20,9,11,,,40,13,21,6,Rep,Rep,Rep
2,,,30,13,16,1v,,60,29,30,1v,Rep,Dem,Divided
3,,,35,6,29,,,100,18,82,,Rep,Rep,Rep
4,,,40,32,8,,,80,62,18,,Dem,Dem,Dem


### Data Wrangling

In [31]:
# Rebuild clean per-state lists from the 2023 camelot export.
# A cell may hold several table rows separated by '\n', so each column's non-null
# cells are joined with '\n' and split again to give one entry per state.
# The Series is passed straight to str.join: Series.ravel() is deprecated
# (pandas 2.2+) and is unnecessary for an already one-dimensional Series.

# Seat counts, converted from strings to integers as they are extracted.
# Only 49 entries are kept: Nebraska's cells in these columns are already null,
# so they disappear with dropna() and need no separate removal.
senate_dem = [int(num) for num in '\n'.join(legis_control_2023['Senate \nDem.'].dropna()).split('\n')[0:49]] #Senate Democrats
senate_rep = [int(num) for num in '\n'.join(legis_control_2023['Senate \nRep.'].dropna()).split('\n')[0:49]] #Senate Republicans
house_dem = [int(num) for num in '\n'.join(legis_control_2023['House \nDem.'].dropna()).split('\n')[0:49]] #House Democrats
house_rep = [int(num) for num in '\n'.join(legis_control_2023['House \nRep.'].dropna()).split('\n')[0:49]] #House Republicans

# The governor column does include Nebraska, so it is read for all 50 states first
gov_party = '\n'.join(legis_control_2023['Gov. \nParty'].dropna()).split('\n')[0:50] #Party of the Governor
# Index 26 is Nebraska's position when the 50 states are in alphabetical order
# (assumes the governor column is in alphabetical state order)
del gov_party[26] #Get rid of Nebraska's Governor Party

In [32]:
# Assemble the cleaned 2023 lists into a single frame, one row per state
legis_2023 = pd.DataFrame({
    'states': states,
    'senate_dem': senate_dem,
    'senate_rep': senate_rep,
    'house_dem': house_dem,
    'house_rep': house_rep,
    'gov_party': gov_party,
})

legis_2023

Unnamed: 0,states,senate_dem,senate_rep,house_dem,house_rep,gov_party
0,Alabama,8,27,28,77,Rep
1,Alaska,9,11,13,21,Rep
2,Arizona,13,16,29,30,Dem
3,Arkansas,6,29,18,82,Rep
4,California,32,8,62,18,Dem
5,Colorado,23,12,46,19,Dem
6,Connecticut,24,12,98,53,Dem
7,Delaware,15,6,26,15,Dem
8,Florida,12,28,35,85,Rep
9,Georgia,23,33,80,100,Rep


Looks good.

In [152]:
# Print the row count, column dtypes and non-null counts as a completeness check on the cleaned 2023 frame
legis_2023.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 49 entries, 0 to 48
Data columns (total 6 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   states      49 non-null     object
 1   senate_dem  49 non-null     int64 
 2   senate_rep  49 non-null     int64 
 3   house_dem   49 non-null     int64 
 4   house_rep   49 non-null     int64 
 5   gov_party   49 non-null     object
dtypes: int64(4), object(2)
memory usage: 2.4+ KB


## Saving Usable Legislative control Data to CSVs

In [33]:
# Write every cleaned yearly frame to its own CSV (without the index column) for later use
cleaned_outputs = {
    'Data/Legislative_Control_Data/legis_control_2017.csv': legis_2017,
    'Data/Legislative_Control_Data/legis_control_2018.csv': legis_2018,
    'Data/Legislative_Control_Data/legis_control_2019.csv': legis_2019,
    'Data/Legislative_Control_Data/legis_control_2020.csv': legis_2020,
    'Data/Legislative_Control_Data/legis_control_2021.csv': legis_2021,
    'Data/Legislative_Control_Data/legis_control_2022.csv': legis_2022,
    'Data/Legislative_Control_Data/legis_control_2023.csv': legis_2023,
}
for csv_path, cleaned_frame in cleaned_outputs.items():
    cleaned_frame.to_csv(csv_path, index= False)