In [1]:
import pandas as pd
import os

In [2]:
#Set the working directory
#The folder can be overridden with the RDH_PROJECT_FOLDER environment variable so the
#notebook can run on a machine other than the one it was written on; when the variable
#is not set, the original hard-coded location is used.
project_folder = os.environ.get(
    'RDH_PROJECT_FOLDER',
    '/Users/grantschwab/Desktop/RDH/Projects/mississippi_general2022/update',
)
os.chdir(project_folder)

In [3]:
#Load the csv that pairs each column name with its raw contest description
ms_names = pd.read_csv("./field_names.csv")


def _tidy_contest_label(raw_label):
    """Turn a raw "-:-"-delimited description into "<part 1> <part 2> (<part 3>)".

    The first two "-:-"-separated pieces are joined with a space, and the third
    piece has the text "PARTY:" removed before being wrapped in parentheses.
    """
    pieces = raw_label.split("-:-")
    leading_text = " ".join(pieces[:2])
    party_text = pieces[2].replace("PARTY:","")
    return leading_text+" ("+party_text+")"


#Clean up how the contest descriptions read
ms_names["0"] = ms_names["0"].apply(_tidy_contest_label)

#Preview
ms_names

Unnamed: 0.1,Unnamed: 0,0
0,GCON04LJOH,US HOUSE ALDEN PATRICK JOHNSON (LIB)
1,GCON02DTHO,US HOUSE BENNIE G THOMPSON (DEM)
2,GCON02RFLO,US HOUSE BRIAN FLOWERS (REP)
3,GCON01DBLA,US HOUSE DIANNE BLACK (DEM)
4,GCON04DDUP,US HOUSE JOHNNY L DUPREE (DEM)
5,GCON03RGUE,US HOUSE MICHAEL GUEST (REP)
6,GCON04REZE,US HOUSE MIKE EZELL (REP)
7,GCON03DYOU,US HOUSE SHUWASKI A YOUNG (DEM)
8,GCON01RKEL,US HOUSE TRENT KELLY (REP)


In [4]:
#Map each column name to its contest description, then re-order the mapping
#alphabetically by column name
contest_dict = dict(zip(ms_names["Unnamed: 0"], ms_names["0"]))
myKeys = sorted(contest_dict)
contest_dict = {column_name: contest_dict[column_name] for column_name in myKeys}

In [5]:
#Preview the mapping of column names to contest descriptions
contest_dict

{'GCON01DBLA': 'US HOUSE DIANNE BLACK (DEM)',
 'GCON01RKEL': 'US HOUSE TRENT KELLY (REP)',
 'GCON02DTHO': 'US HOUSE BENNIE G THOMPSON (DEM)',
 'GCON02RFLO': 'US HOUSE BRIAN FLOWERS (REP)',
 'GCON03DYOU': 'US HOUSE SHUWASKI A YOUNG (DEM)',
 'GCON03RGUE': 'US HOUSE MICHAEL GUEST (REP)',
 'GCON04DDUP': 'US HOUSE JOHNNY L DUPREE (DEM)',
 'GCON04LJOH': 'US HOUSE ALDEN PATRICK JOHNSON (LIB)',
 'GCON04REZE': 'US HOUSE MIKE EZELL (REP)'}

In [6]:
###Create README

#Descriptions of the identifier columns listed ahead of the contest columns
base_dict = {
    'UNIQUE_ID': 'Unique ID for each precinct',
    'COUNTYFP': 'County FIPS identifier',
    'CNTY_CODE': 'County Code (Three-character abbreviation)',
    'CNTY_NAME': 'County Name',
    'POLL_LOC': 'Precinct Polling Location',
}

#Identifier columns first, followed by the contest columns in their existing order
fields_dict = dict(base_dict)
fields_dict.update(contest_dict)

#Remaining inputs used when generating and saving the README
title = "Mississippi 2022 General Election Precinct-Level Results"
retrieval_date = "09/18/23"
source = "MIT Election Data and Science Lab (MEDSL)"
github_link = "https://github.com/nonpartisan-redistricting-datahub/pber_collection"
file_folder = "./output/ms_2022_gen_prec/"

In [7]:
#Preview the full mapping of field names to descriptions
fields_dict

{'UNIQUE_ID': 'Unique ID for each precinct',
 'COUNTYFP': 'County FIPS identifier',
 'CNTY_CODE': 'County Code (Three-character abbreviation)',
 'CNTY_NAME': 'County Name',
 'POLL_LOC': 'Precinct Polling Location',
 'GCON01DBLA': 'US HOUSE DIANNE BLACK (DEM)',
 'GCON01RKEL': 'US HOUSE TRENT KELLY (REP)',
 'GCON02DTHO': 'US HOUSE BENNIE G THOMPSON (DEM)',
 'GCON02RFLO': 'US HOUSE BRIAN FLOWERS (REP)',
 'GCON03DYOU': 'US HOUSE SHUWASKI A YOUNG (DEM)',
 'GCON03RGUE': 'US HOUSE MICHAEL GUEST (REP)',
 'GCON04DDUP': 'US HOUSE JOHNNY L DUPREE (DEM)',
 'GCON04LJOH': 'US HOUSE ALDEN PATRICK JOHNSON (LIB)',
 'GCON04REZE': 'US HOUSE MIKE EZELL (REP)'}

In [8]:
def full_readme_text(title, retrieval_date, source, fields_dict, github_link):
    """Assemble the complete README text for a precinct-level results file.

    The README is the concatenation of five sections, in order: the title with
    retrieval date and sources, notes on the field-naming convention, a table of
    fields, additional notes, and processing steps.

    Parameters
    ----------
    title : str
        Text placed on the first line of the README.
    retrieval_date : str
        Text placed under the "RDH Date Retrieval" heading.
    source : str
        Text placed under the "Sources" heading.
    fields_dict : dict
        Maps each field name to its description. Entries are rendered in
        iteration order as a two-column, left-aligned table. Names and
        descriptions must be strings (they are padded with the 's' format code).
        An empty mapping produces a table with only the column headers.
    github_link : str
        URL substituted into the link in the "Processing Steps" section.

    Returns
    -------
    str
        The full README text.
    """

    #First section of README
    readme_p1 = '''{title}\n
## RDH Date Retrieval
{retrieval_date}

## Sources
{source}'''.format(title = title, source = source, retrieval_date = retrieval_date)

    #Second section of README
    readme_p2 = '''\n
## Notes on Field Names (adapted from VEST):
Columns reporting votes generally follow the pattern:
One example is:
G16PRERTRU
The first character is G for a general election, P for a primary, S for a special, and R for a runoff.
Characters 2 and 3 are the year of the election.*
Characters 4-6 represent the office type (see list below).
Character 7 represents the party of the candidate.
Characters 8-10 are the first three letters of the candidate's last name.

*To fit within the GIS 10 character limit for field names, the naming convention is slightly different for the State Legislature and US House of Representatives. All fields are listed below with definitions.

Office Codes Used:
CON## - U.S. Congress

Party Codes Used:
D - Democratic
R - Republican
L - Libertarian
'''

    #Third section of README
    #Build the frame straight from the (name, description) pairs with explicit column
    #names, so an empty fields_dict still yields both columns instead of failing when
    #column names are assigned afterwards.
    fields_table = pd.DataFrame(list(fields_dict.items()), columns=["Field Name", "Description"])
    if fields_table.empty:
        #to_string() on an empty frame returns pandas' "Empty DataFrame" placeholder
        #text, so write only the column headers in that case.
        fields_text = " ".join(fields_table.columns)
    else:
        #Left-align each column by padding every value to the longest entry in it
        formatters = {
            column: '{{:<{}s}}'.format(fields_table[column].str.len().max()).format
            for column in ("Description", "Field Name")
        }
        fields_text = fields_table.to_string(formatters=formatters, index=False, justify = "left")
    readme_p3 = '''\n## Fields:\n''' + fields_text

    #Fourth section
    readme_p4 = '''\n
## Additional Notes
MEDSL precinct-level data was checked against county-level election results. Those results were compiled by RDH staff using PDF files from the Mississippi Secretary of State with precinct-level data for each county, retrieved 9/13/23. (https://www.sos.ms.gov/elections-voting/2022-general-election-results)

All results matched.

However, a previous version of the MEDSL data (retrieved 9/7/23) was checked against those same county files from the Mississippi Secretary of State.

Totals matched in every county except for the following: Sharkey, Tallahatchie, Lee, Pontotoc, Jackson, Pearl River, Clarke, Oktibbeha, Neshoba, Rankin

There were discrepancies in 11 of 1757 precincts. In cases where precinct results did not match, MEDSL data was replaced with state data. That resolved all discrepancies.

The election results file on the RDH website was correct and remains unchanged, but we note the use of updated data from MEDSL. Full details on processing of the most recent MEDSL data can be found in the notebook linked below.
'''

    #Fifth section of README
    readme_p5 = '''\n
## Processing Steps
Visit the RDH GitHub and the processing script for this code [here]({github_link})

Please direct questions related to processing this dataset to info@redistrictingdatahub.org.
'''.format(github_link=github_link)

    #Every section is already a string, so they can be concatenated directly
    full_readme = readme_p1 + readme_p2 + readme_p3 + readme_p4 + readme_p5
    return full_readme

In [9]:
#Create the output folder together with any missing parent folders (os.mkdir fails
#when a parent folder does not exist yet); exist_ok keeps re-runs from raising
os.makedirs(file_folder, exist_ok=True)

#Write the README into the output folder as UTF-8; os.path.join supplies the path
#separator even when file_folder has no trailing slash
with open(os.path.join(file_folder, "README.txt"), 'w', encoding="utf-8") as tf:
    tf.write(full_readme_text(title, retrieval_date, source, fields_dict, github_link))