The following process will be followed when ingesting voter data

1. Parse the raw data and place it into the MySQL 'voters_raw' table
2. Read the data from the 'voters_raw' table, process the data, and place it in the 'voters_processed' table 

# Preprocessing Voter Data

In [75]:
# Importing libraries
import numpy as np
import pandas as pd
import re
import itertools
import sys
import mysql.connector
import sqlalchemy
import geopandas as gpd
import geopy
from geopy.geocoders import Nominatim
from geopy.extra.rate_limiter import RateLimiter
import matplotlib.pyplot as plt
import folium
from folium.plugins import FastMarkerCluster
pd.set_option('display.max_rows', 1000)
pd.set_option('display.max_columns', 500)

In [11]:
# Check how many lines are in the files
root = '/Users/mareksalamon/Desktop/Politiker/Code/App Demo/Politiker/public/data'
files = [f'{root}/QN_All_20191231.txt', f'{root}/BK-VoterData-By street 2.txt']

for file in files:

    i = 0

    # Open and read the file
    with open(file, "r") as f:
        for line in itertools.islice(f, sys.maxsize):

            i += 1
    print(f"Number of pages in {file}: ", i)

Number of pages in /Users/mareksalamon/Desktop/Politiker/Code/App Demo/Politiker/public/data/QN_All_20191231.txt:  1213369
Number of pages in /Users/mareksalamon/Desktop/Politiker/Code/App Demo/Politiker/public/data/BK-VoterData-By street 2.txt:  1499712


## Processing QN_All_20191231

In [12]:
%%time

# Specify the name of the file to be processed
filename = f"{root}/QN_All_20191231.txt"

# Initialize an empty dataframe
qn_df = pd.DataFrame(columns=['County EMSID', 'Last Name', 'First Name', 'Middle Initial', 'Name Suffix', 
                              'House Number', 'Apartment Number', 'Street Name', 'City', 'Zip Code', 'Date of Birth', 
                              'Gender', 'Political Party', 'Election District', 'Assembly District',
                              'Congress District', 'Council District', 'Senate District', 'Civil Court District',
                              'Judicial District', 'Registration Date', 'Status Code', 'Voter Type', 
                              'Eff Status Change Date', 'Year Last Voted', 'Telephone Number'])
num_cols = len(qn_df.columns)

# Open and read the file
with open(filename, "r") as f:
    for line in itertools.islice(f, 0, 1000):
        # sys.maxsize 0, 6)
        
        try:
        
            # Seperate the line on where 2 or more spaces are present
            new_line = re.split(r'\s{2,}', line)

            # Seperate first element into two
            if len(new_line[0].split()) > 1:
                new_el = new_line[0].split(" ")
                new_line.remove(new_line[0])
                new_line.insert(0, new_el[1])
                new_line.insert(0, new_el[0])
            else:
                new_el = re.split('([0-9]{9})', new_line[0])
                new_line.remove(new_line[0])
                new_line.insert(0, new_el[2])
                new_line.insert(0, new_el[1])

            # Checking for 'Middle Initial' and 'Name Suffix' columns
            lst = ['sr','jr','ii','iii','iv']

            if new_line[3] in lst:
                new_line.insert(3, None)
            elif (new_line[3] not in lst) and (len(new_line[3]) == 1) and (new_line[3].isalpha()) and (new_line[4] not in lst):
                new_line.insert(4, None)
            elif new_line[3].isdigit():
                new_line.insert(3, None)
                new_line.insert(3, None)
            elif (new_line[3] == None) and (new_line[4].isdigit()):
                new_line.insert(3, None)
            elif (new_line[3] == None) and (new_line[4] == None):
                pass

            # Checking for House #, House Suffix, and Apartment number
            if new_line[5].isdigit() and any(item in [x.lower() for x in new_line[6].split(' ')]  for item in ['street','avenue','road']):
                new_line.insert(6, None)
                
            # Limiting zip codes to the first 5 numbers
            new_line[9] = new_line[9][:5]
            
            # Seperating 10th column into DOB, Gender, and Political Party
            if len(new_line[10]) >= 10:
                new_el = re.split('([0-9]{8})', new_line[10])[1:]
                dob = new_el[0]
                gender = new_el[1][0:1]
                pp = new_el[1][1:]

                new_line.remove(new_line[10])
                new_line.insert(10, dob)
                new_line.insert(11, gender)
                new_line.insert(12, pp)

            # If the last element is an empty string, drop it
            if new_line[-1] == '':
                new_line.pop()

            # Seperate the last element into it's respective columns
            if (len(new_line[-1]) > 25):

                split1 = new_line[-1].split(" ", 1)[0]
                split2 = new_line[-1].split(" ", 1)[1]

                # Districts
                elec_dis = split1[0:3] 
                assem_dis = split1[3:5]
                cong_dis = split1[5:7]
                coun_dis = split1[7:9]
                sen_dis = split1[9:11]
                civ_dis = split1[11:13]
                jud_dis = split1[13:15]

                # Registration Date
                reg_date = split1[15:23]

                # Status Code
                stat_code = split1[23:25]
                stat_code = stat_code.strip()

                # Voter Type    
                voter_type = split2[0]

                # Eff Status Change Date
                try:
                    # Checking if index 1 is out of range
                    esc_date = split2[1] 
                    # Continue if not
                    esc_date = split2[1:9]
                except:
                    esc_date = None

                # Year Last Voted
                try:
                    # Checking if index 9 is out of range
                    yr_last_vote = split2[9]
                    # Continue if not
                    yr_last_vote = split2[9:13]
                except:
                    yr_last_vote = None

                # Telephone
                try:
                    # Check if index 13 is out of range
                    tele_num = split2[13]
                    # Continue if not
                    tele_num = split2[13:]

                    if '-' in tele_num:
                        pass
                    else:
                        tele_num = tele_num.replace(" ", "")
                        tele_num = tele_num[0:3] + '-' + tele_num[3:6] + '-' + tele_num[6:]
                except:
                    tele_num = None

                # Remove last element and insert list of elements in its place
                new_line.pop()
                new_line.extend([elec_dis, assem_dis, cong_dis, coun_dis, sen_dis, civ_dis, jud_dis, reg_date, stat_code, 
                                 voter_type, esc_date, yr_last_vote, tele_num])

    #         print(new_line)
    #         print('\n')

            # Add the data to the dataframe
            try:
                qn_df.loc[len(qn_df)] = new_line
            except:
                qn_df.loc[len(qn_df)] = [None] * num_cols
                
        except:
            qn_df.loc[len(qn_df)] = [None] * num_cols
            continue

f.close()  

print("Processing Complete!")


Processing Complete!
CPU times: user 5.21 s, sys: 73.1 ms, total: 5.29 s
Wall time: 5.44 s


In [13]:
qn_df

Unnamed: 0,County EMSID,Last Name,First Name,Middle Initial,Name Suffix,House Number,Apartment Number,Street Name,City,Zip Code,Date of Birth,Gender,Political Party,Election District,Assembly District,Congress District,Council District,Senate District,Civil Court District,Judicial District,Registration Date,Status Code,Voter Type,Eff Status Change Date,Year Last Voted,Telephone Number
0,Q1205093,ABBRACCIAMENTO,FRANCES,,,102,,BEACH 221 STREET,BREEZY POINT,11697,19130509.0,F,DEM,1.0,23.0,5.0,32.0,15.0,5.0,11.0,19840101.0,A,P,19840101.0,2019.0,
1,303652678,ADESSO,FRANK,,,113,,BEACH 215 STREET,BREEZY POINT,11697,19380514.0,M,BLK,1.0,23.0,5.0,32.0,15.0,5.0,11.0,19921009.0,A,R,19921009.0,2011.0,
2,303651745,ADESSO,MARY,,,113,,BEACH 215 STREET,BREEZY POINT,11697,19470521.0,F,BLK,1.0,23.0,5.0,32.0,15.0,5.0,11.0,19890327.0,A,R,19890327.0,2016.0,
3,02610005,AHERN,ELIZABETH,,,51,,BEACH 217 STREET,BREEZY POINT,11697,19420312.0,F,REP,1.0,23.0,5.0,32.0,15.0,5.0,11.0,19730101.0,A,R,19730101.0,2018.0,
4,Q1197516,AHERN,MARGARET,M,,107,,BEACH 222 STREET,BREEZY POINT,11697,19650510.0,F,BLK,1.0,23.0,5.0,32.0,15.0,5.0,11.0,19840101.0,A,R,19840101.0,2018.0,
5,02610006,AHERN,ROBERT,S,,51,,BEACH 217 STREET,BREEZY POINT,11697,19370531.0,M,REP,1.0,23.0,5.0,32.0,15.0,5.0,11.0,19730101.0,A,R,19730101.0,2019.0,
6,Q1197525,AHERN,ROSEMARY,M,,107,,BEACH 222 STREET,BREEZY POINT,11697,19410510.0,F,BLK,1.0,23.0,5.0,32.0,15.0,5.0,11.0,19840101.0,A,R,19840101.0,2019.0,
7,02312614,ALLEVA,MARIE,B,,12,,BEACH 220 STREET,BREEZY POINT,11697,19260414.0,F,REP,1.0,23.0,5.0,32.0,15.0,5.0,11.0,19750101.0,A,R,19750101.0,2018.0,
8,303533895,AMARI,JACQUELINE,A,,115,,BEACH 215 STREET,BREEZY POINT,11697,19600413.0,F,DEM,1.0,23.0,5.0,32.0,15.0,5.0,11.0,19840101.0,A,R,20041215.0,2019.0,
9,,,,,,,,,,,,,,,,,,,,,,,,,,


In [11]:
qn_df.dropna(how='all',axis=0) 

Unnamed: 0,County EMSID,Last Name,First Name,Middle Initial,Name Suffix,House Number,Apartment Number,Street Name,City,Zip Code,Date of Birth,Gender,Political Party,Election District,Assembly District,Congress District,Council District,Senate District,Civil Court District,Judicial District,Registration Date,Status Code,Voter Type,Eff Status Change Date,Year Last Voted,Telephone Number
0,Q1205093,ABBRACCIAMENTO,FRANCES,,,102,,BEACH 221 STREET,BREEZY POINT,11697,19130509,F,DEM,1,23,5,32,15,5,11,19840101,A,P,19840101.0,2019.0,
1,303652678,ADESSO,FRANK,,,113,,BEACH 215 STREET,BREEZY POINT,11697,19380514,M,BLK,1,23,5,32,15,5,11,19921009,A,R,19921009.0,2011.0,
2,303651745,ADESSO,MARY,,,113,,BEACH 215 STREET,BREEZY POINT,11697,19470521,F,BLK,1,23,5,32,15,5,11,19890327,A,R,19890327.0,2016.0,
3,02610005,AHERN,ELIZABETH,,,51,,BEACH 217 STREET,BREEZY POINT,11697,19420312,F,REP,1,23,5,32,15,5,11,19730101,A,R,19730101.0,2018.0,
4,Q1197516,AHERN,MARGARET,M,,107,,BEACH 222 STREET,BREEZY POINT,11697,19650510,F,BLK,1,23,5,32,15,5,11,19840101,A,R,19840101.0,2018.0,
5,02610006,AHERN,ROBERT,S,,51,,BEACH 217 STREET,BREEZY POINT,11697,19370531,M,REP,1,23,5,32,15,5,11,19730101,A,R,19730101.0,2019.0,
6,Q1197525,AHERN,ROSEMARY,M,,107,,BEACH 222 STREET,BREEZY POINT,11697,19410510,F,BLK,1,23,5,32,15,5,11,19840101,A,R,19840101.0,2019.0,
7,02312614,ALLEVA,MARIE,B,,12,,BEACH 220 STREET,BREEZY POINT,11697,19260414,F,REP,1,23,5,32,15,5,11,19750101,A,R,19750101.0,2018.0,
8,303533895,AMARI,JACQUELINE,A,,115,,BEACH 215 STREET,BREEZY POINT,11697,19600413,F,DEM,1,23,5,32,15,5,11,19840101,A,R,20041215.0,2019.0,
11,301602249,AMBERY,MARY,R,,108,,BEACH 214 STREET,BREEZY POINT,11697,19590207,F,DEM,1,23,5,32,15,5,11,19920930,A,R,20160607.0,2018.0,


In [151]:
# Open and read the file
with open(filename, "r") as f:
    for line in itertools.islice(f, 1058900, 1058904):
        # sys.maxsize
        
        # Seperate the line on where 2 or more spaces are present
        new_line = re.split(r'\s{2,}', line)
        
        print(new_line)
        print('\n')

['305966366MAURIELLO', 'ROBERT', 'E', '1630', 'STEPHEN STREET', 'RIDGEWOOD', '11385', '19670117MBLK', '00738073412031120041008A R200410082018', '']


['411249559MAXWELL', 'ELS', 'N', '1631', '2', 'SUMMERFIELD STREET', 'RIDGEWOOD', '11385', '19770408FIND', '00738073412031120121013A R20121013', '']


['410632976MC BEAN', 'OLIVE', 'E', '1652', '1', 'NORMAN STREET', 'RIDGEWOOD', '11385', '1652', 'NORMAN', 'STREET', 'NUM 1', 'RIDGEWOOD', 'QUEENS', 'NY', '11385', '19620505FDEM', '00738073412031120090624A H200906242012718 795 7590', '']


['301857242MCBEAN', 'CLEMON', '1652', '1', 'NORMAN STREET', 'RIDGEWOOD', '11385', '19310807MDEM', '00738073412031119921009A R199210092009', '']




In [89]:
# Finding the line with the most information on a voter

len_list = []
i = 0

# Open and read the file
with open(filename, "r") as f:
    for line in itertools.islice(f,999999999):
        
        # Seperate the line on where 2 or more spaces are present
        new_line = re.split(r'\s{2,}', line)
        length = len(new_line)
        value = (i,length)
        len_list.append(value)
        
        i += 1
        
from operator import itemgetter

max(len_list,key=itemgetter(1))#[0]

(1058902, 19)

In [93]:
# Open and read the file
with open(filename, "r") as f:
    for line in itertools.islice(f,1058900, 1058904):
        
        new_line = re.split(r'\s{2,}', line)
        
        print(new_line)
        print('\n')

['305966366MAURIELLO', 'ROBERT', 'E', '1630', 'STEPHEN STREET', 'RIDGEWOOD', '11385', '19670117MBLK', '00738073412031120041008A R200410082018', '']


['411249559MAXWELL', 'ELS', 'N', '1631', '2', 'SUMMERFIELD STREET', 'RIDGEWOOD', '11385', '19770408FIND', '00738073412031120121013A R20121013', '']


['410632976MC BEAN', 'OLIVE', 'E', '1652', '1', 'NORMAN STREET', 'RIDGEWOOD', '11385', '1652', 'NORMAN', 'STREET', 'NUM 1', 'RIDGEWOOD', 'QUEENS', 'NY', '11385', '19620505FDEM', '00738073412031120090624A H200906242012718 795 7590', '']


['301857242MCBEAN', 'CLEMON', '1652', '1', 'NORMAN STREET', 'RIDGEWOOD', '11385', '19310807MDEM', '00738073412031119921009A R199210092009', '']




## Processing BK-VoterData-By street 2

In [14]:
%%time

root = '/Users/mareksalamon/Desktop/Politiker/Code/App Demo/Politiker/public/data'
# Specify the name of the file to be processed
filename = f"{root}/BK-VoterData-By street 2.txt"

# Initialize an empty dataframe
bk_df = pd.DataFrame(columns=['County EMSID', 'Last Name', 'First Name', 'Middle Initial', 'Name Suffix', 
                              'House Number', 'Apartment Number', 'Street Name', 'City', 'Zip Code', 'Date of Birth', 
                              'Gender', 'Political Party', 'Election District', 'Assembly District',
                              'Congress District', 'Council District', 'Senate District', 'Civil Court District',
                              'Judicial District', 'Registration Date', 'Status Code', 'Voter Type', 
                              'Eff Status Change Date', 'Year Last Voted', 'Telephone Number'])

num_cols = len(bk_df.columns)


# Open and read the file
with open(filename, "r") as f:
    for line in itertools.islice(f, 0, 1000):
        
        try:
        
            # Seperate the line on where 2 or more spaces are present
            new_line = re.split(r'\s{2,}', line)

            # Seperate first element into two
            if len(new_line[0].split()) > 1:
                new_el = new_line[0].split(" ")
                new_line.remove(new_line[0])
                new_line.insert(0, new_el[1])
                new_line.insert(0, new_el[0])
            else:
                new_el = re.split('([0-9]{9})', new_line[0])
                new_line.remove(new_line[0])
                new_line.insert(0, new_el[2])
                new_line.insert(0, new_el[1])

            # Checking for 'Middle Initial' and 'Name Suffix' columns
            lst = ['sr','jr','ii','iii','iv']

            if new_line[3] in lst:
                new_line.insert(3, None)
            elif (new_line[3] not in lst) and (len(new_line[3]) == 1) and (new_line[3].isalpha()) and (new_line[4] not in lst):
                new_line.insert(4, None)
            elif new_line[3].isdigit():
                new_line.insert(3, None)
                new_line.insert(3, None)
            elif (new_line[3] == None) and (new_line[4].isdigit()):
                new_line.insert(3, None)
            elif (new_line[3] == None) and (new_line[4] == None):
                pass

            # Checking for House #, House Suffix, and Apartment number
            if new_line[5].isdigit() and any(item in [x.lower() for x in new_line[6].split(' ')]  for item in ['street','avenue','road']):
                new_line.insert(6, None)
                
            # Limiting zip codes to the first 5 numbers
            new_line[9] = new_line[9][:5]

            # Seperating 10th column into DOB, Gender, and Political Party
            if len(new_line[10]) >= 10:
                new_el = re.split('([0-9]{8})', new_line[10])[1:]
                dob = new_el[0]
                gender = new_el[1][0:1]
                pp = new_el[1][1:]

                new_line.remove(new_line[10])
                new_line.insert(10, dob)
                new_line.insert(11, gender)
                new_line.insert(12, pp)

            # If the last element is an empty string, drop it
            if new_line[-1] == '':
                new_line.pop()

            # Seperate the last element into it's respective columns
            if (len(new_line[-1]) > 25):

                split1 = new_line[-1].split(" ", 1)[0]
                split2 = new_line[-1].split(" ", 1)[1]

                # Districts
                elec_dis = split1[0:3] 
                assem_dis = split1[3:5]
                cong_dis = split1[5:7]
                coun_dis = split1[7:9]
                sen_dis = split1[9:11]
                civ_dis = split1[11:13]
                jud_dis = split1[13:15]

                # Registration Date
                reg_date = split1[15:23]

                # Status Code
                stat_code = split1[23:25]
                stat_code = stat_code.strip()

                # Voter Type    
                voter_type = split2[0]

                # Eff Status Change Date
                try:
                    # Checking if index 1 is out of range
                    esc_date = split2[1] 
                    # Continue if not
                    esc_date = split2[1:9]
                except:
                    esc_date = None

                # Year Last Voted
                try:
                    # Checking if index 9 is out of range
                    yr_last_vote = split2[9]
                    # Continue if not
                    yr_last_vote = split2[9:13]
                except:
                    yr_last_vote = None

                # Telephone
                try:
                    # Check if index 13 is out of range
                    tele_num = split2[13]
                    # Continue if not
                    tele_num = split2[13:]

                    if '-' in tele_num:
                        pass
                    else:
                        tele_num = tele_num.replace(" ", "")
                        tele_num = tele_num[0:3] + '-' + tele_num[3:6] + '-' + tele_num[6:]
                except:
                    tele_num = None

                # Remove last element and insert list of elements in its place
                new_line.pop()
                new_line.extend([elec_dis, assem_dis, cong_dis, coun_dis, sen_dis, civ_dis, jud_dis, reg_date, stat_code, 
                                 voter_type, esc_date, yr_last_vote, tele_num])

                # Add the data to the dataframe
                try:
                    bk_df.loc[len(bk_df)] = new_line
                except:
                    bk_df.loc[len(bk_df)] = [None] * num_cols
                
        except:
            bk_df.loc[len(bk_df)] = [None] * num_cols
            continue
        
#         print(new_line)
#         print('\n')
        
f.close()

print("Processing Complete!")


Processing Complete!
CPU times: user 4.14 s, sys: 31 ms, total: 4.17 s
Wall time: 4.21 s


In [15]:
bk_df

Unnamed: 0,County EMSID,Last Name,First Name,Middle Initial,Name Suffix,House Number,Apartment Number,Street Name,City,Zip Code,Date of Birth,Gender,Political Party,Election District,Assembly District,Congress District,Council District,Senate District,Civil Court District,Judicial District,Registration Date,Status Code,Voter Type,Eff Status Change Date,Year Last Voted,Telephone Number
0,,,,,,,,,,,,,,,,,,,,,,,,,,
1,,,,,,,,,,,,,,,,,,,,,,,,,,
2,,,,,,,,,,,,,,,,,,,,,,,,,,
3,,,,,,,,,,,,,,,,,,,,,,,,,,
4,410673045,ZELINGER,JAY,I,,1904.0,,AVENUE N,Brooklyn,11230.0,19460913.0,F,DEM,1.0,41.0,9.0,48.0,17.0,8.0,2.0,20090911.0,A,R,20090911,,
5,411509548,GLUSTEIN,CHARA,G,,1912.0,,AVENUE N,Brooklyn,11230.0,19730629.0,F,BLK,1.0,41.0,9.0,48.0,17.0,8.0,2.0,20130822.0,A,R,20130822,,
6,900130441,GLUSTEIN,AHARON,M,,1912.0,2FLR,AVENUE N,BROOKLYN,11230.0,19700208.0,M,DEM,1.0,41.0,9.0,48.0,17.0,8.0,2.0,19880620.0,A,R,19880620,2019.0,
7,304688775,GLUSTEIN,CHAVA,G,,1912.0,,AVENUE N,BROOKLYN,11230.0,19730629.0,F,DEM,1.0,41.0,9.0,48.0,17.0,8.0,2.0,20000101.0,A,R,20080711,2018.0,718-645-1452
8,305289680,DAHAN,AVRAHAM,,,1912.0,2 FL,AVENUE N,BROOKLYN,11230.0,19840517.0,M,REP,1.0,41.0,9.0,48.0,17.0,8.0,2.0,20020708.0,A,R,20020708,2011.0,917-200-5499
9,301698215,HASS,MICKEY,,,1914.0,,AVENUE N,Brooklyn,11230.0,19281221.0,M,BLK,1.0,41.0,9.0,48.0,17.0,8.0,2.0,19921009.0,A,R,20160606,2002.0,


In [14]:
bk_df.dropna(how='all',axis=0) 

Unnamed: 0,County EMSID,Last Name,First Name,Middle Initial,Name Suffix,House Number,Apartment Number,Street Name,City,Zip Code,Date of Birth,Gender,Political Party,Election District,Assembly District,Congress District,Council District,Senate District,Civil Court District,Judicial District,Registration Date,Status Code,Voter Type,Eff Status Change Date,Year Last Voted,Telephone Number
4,410673045,ZELINGER,JAY,I,,1904,,AVENUE N,Brooklyn,11230,19460913,F,DEM,1,41,9,48,17,8,2,20090911,A,R,20090911,,
5,411509548,GLUSTEIN,CHARA,G,,1912,,AVENUE N,Brooklyn,11230,19730629,F,BLK,1,41,9,48,17,8,2,20130822,A,R,20130822,,
6,900130441,GLUSTEIN,AHARON,M,,1912,2FLR,AVENUE N,BROOKLYN,11230,19700208,M,DEM,1,41,9,48,17,8,2,19880620,A,R,19880620,2019.0,
7,304688775,GLUSTEIN,CHAVA,G,,1912,,AVENUE N,BROOKLYN,11230,19730629,F,DEM,1,41,9,48,17,8,2,20000101,A,R,20080711,2018.0,718-645-1452
8,305289680,DAHAN,AVRAHAM,,,1912,2 FL,AVENUE N,BROOKLYN,11230,19840517,M,REP,1,41,9,48,17,8,2,20020708,A,R,20020708,2011.0,917-200-5499
9,301698215,HASS,MICKEY,,,1914,,AVENUE N,Brooklyn,11230,19281221,M,BLK,1,41,9,48,17,8,2,19921009,A,R,20160606,2002.0,
10,412769874,YANG,STEVE,Y,,1918,6A,AVENUE N,BROOKLYN,11230,19780315,M,DEM,1,41,9,48,17,8,2,20170316,A,R,20170320,,
11,303633425,NG,YUK KWAI,P,,1918,Num 4A,AVENUE N,BROOKLYN,11230,19480707,F,DEM,1,41,9,48,17,8,2,19970717,A,R,19970717,2018.0,
12,306003120,LANTSBERG,YAKOV,,,1918,4B,AVENUE N,BROOKLYN,11230,19500801,M,REP,1,41,9,48,17,8,2,20041008,A,R,20041008,2019.0,
13,304196331,NG,SHEUNG,M,,1918,Num 4A,AVENUE N,BROOKLYN,11230,19400210,M,DEM,1,41,9,48,17,8,2,19920925,A,R,19920925,2018.0,718-769-1511


In [None]:
County EMSID
Last Name
First Name
Middle Initial
Name Suffix
House Number 
Apartment Number
House Number Suffix # EXCLUDED
Street Name
City
Zip Code
Zip Code4 # EXCLUDED
Mailing Address 1 # EXCLUDED
Mailing Address 2 # EXCLUDED
Mailing Address 3 # EXCLUDED
Mailing Address 4 # EXCLUDED
Birth Date
Gender
Political Party 
Other Party # EXCLUDED
Election District 
Assembly District
Congress District
Council District
Senate District
Civil Court District
Judicial District                      
Registration Date
Status Code
Voter Type
Eff Status Change Date
Year Last Voted
Telephone (optional)
Future Party
Future Other Party
Future Party Effective Date


In [73]:
# Finding the line with the most information on a voter

len_list = []
i = 0

# Open and read the file
with open(filename, "r") as f:
    for line in itertools.islice(f,999999999):
        
        # Seperate the line on where 2 or more spaces are present
        new_line = re.split(r'\s{2,}', line)
        length = len(new_line)
        value = (i,length)
        len_list.append(value)
        
        i += 1
        
from operator import itemgetter

max(len_list,key=itemgetter(1))#[0]

(77075, 21)

In [83]:
i = 0

# Open and read the file
with open(filename, "r") as f:
    for line in itertools.islice(f,999999999):
        
        # Seperate the line on where 2 or more spaces are present
        new_line = re.split(r'\s{2,}', line)
        new_line = [x.lower() for x in new_line]
        # new_line = [x.replace('.', '') for x in new_line]
        
        lst = ['sr','jr','ii','iii','iv']
        
        if any(item in new_line for item in lst):
            print(i)
            print("\n")
            print(new_line)
            
        i += 1

19718


['306278803perkins', 'mckinley', 'iv', '1074', 'east 38 street', 'brooklyn', '11210', '19800616mdem', '03241094521080220061011a r200610112010414-737-1054', '']
35553


['305531661simon', 'roland', 'ii', '1342', 'east 35 street', 'brooklyn', '11210', '19850418mwor', '05041094517060220030922a r2003092220166462467288', '']
38530


['410644285wright', 'kevin', 'ii', '1655', 'c802', 'flatbush avenue', 'brooklyn', '11210', '19890915mdem', '05541094521060220090720a r200907202012', '']
45662


['303266259evangelista', 'pasquale', 'ii', '3105', '1c', 'avenue v', 'brooklyn', '11229', '19670111mblk', '06441094619080219961008a r199610082016', '']
53377


['303379296hofman', 'jack', 'iv', '2525', '3g', 'batchelder street', 'brooklyn', '112351123', '19500502mdem', '07341094819080219950407a r199504072018917-449-5615', '']
64103


['411994004molina', 'george', 'iv', '3080', '3e', 'voorhies avenue', 'brooklyn', '112351123', '19930225mdem', '08541094823080220150624a r', '']
78041


['301902125pi

627644


['411323249piro', 'christopher', 'ii', '500', '412', 'driggs avenue', 'brooklyn', '112111121', '19840305mdem', '04050123326030220121129a r201211292019', '']
663323


['410554324hasan', 'samuel', 'ii', '140', '3', 'washington avenue', 'brooklyn', '11205', '19780520fgre', '08950083525030220081020a r', '2012', '']
678791


['410706783laureano', 'daniel', 'ii', '629', '2fl', '48 street', 'brooklyn', '112201122', '19900103mdem', '00651073817050220091217a r20091217', '']
683270


['412279263otto', 'driscoll', 'iv', '346', '1f', '22 street', 'brooklyn', '112151121', '19771005mdem', '01351073821050220160328a r', '2016917-653-4490', '']
683287


['411390742heard', 'stanley', 'ii', '350', '1 f', '22 street', 'brooklyn', '11215', '19771004fdem', '01351073821050220121106a r20160606', '']
684675


['304966929risitano', 'richard', 'ii', '1038', '1', '41 street', 'brooklyn', '11219', '19800308mblk', '01551103917050220001027a r200010272008', '']
716578


['306147519leal', 'joseph', 'iv', '359

1003302


['413308918harper', 'aprendeakove', 'ii', '292', '5e', 'ralph avenue', 'brooklyn', '112331123', '19980312mblk', '02255084125040220181015a r20181016', '']
1008752


['413464387elwin', 'carlton', 'ii', '13', '1', 'radde place', 'brooklyn', '112331123', '19830611mblk', '03055084125040220190614a r20190617', '347-958-4374', '']
1014139


['412205467harrison', 'keith', 'ii', '92', 'apt 3', 'williams avenue', 'brooklyn', '112071123', '19850924mrep', '03855083719040220160307a r201603102016917-364-7302', '']
1015395


['412202645mcnair', 'gabriel', 'ii', '177', '2', 'jerome street', 'brooklyn', '112071120', '19950609mdem', '04555073718070220160307a r201810022018917-213-9804', '']
1049161


['305105933colvin', 'reginald', 'ii', '602', '3 a', 'howard avenue', 'brooklyn', '11212', '19821222mdem', '09355094120040220010809a r20160606', '']
1057075


['411582895doss', 'emma', 'ii', '1586', '1f', 'prospect place', 'brooklyn', '112331123', '19770618fdem', '10355094125040220140211a r2014021120

In [85]:
# Open and read the file
with open(filename, "r") as f:
    for line in itertools.islice(f, 0, 100):
        
        new_line = re.split(r'\s{2,}', line)
        
        print(new_line)

['413451129MICKENS', 'RAGIN', 'G', '1918', '3B', 'AVENUE', 'N', 'BROOKLYN', '112301123', '19920512MDEM', '00141094817080220190520A R201905212010', '']
['413594143SMITH', 'LIAM', 'D', '2016', '1H', 'AVENUE', 'N', 'BROOKLYN', '112101121', '20010712MBLK', '00141094817080220191126A R20191127', '']
['413494662BELSKY', 'RICHARD', 'D', '2016', '4E', 'AVENUE', 'N', 'BROOKLYN', '112101121', '19611104MBLK', '00141094817080220190814A R', '']
['413420512HIRSCH', 'DEBRA', '2122', 'AVENUE', 'N', 'BROOKLYN', '112101121', '19980410FREP', '00141094817080220190415A R20190417', '']
['410673045ZELINGER', 'JAY', 'I', '1904', 'AVENUE N', 'Brooklyn', '11230', '19460913FDEM', '00141094817080220090911A R20090911', '']
['411509548GLUSTEIN', 'CHARA', 'G', '1912', 'AVENUE N', 'Brooklyn', '11230', '19730629FBLK', '00141094817080220130822A R20130822', '']
['900130441GLUSTEIN', 'AHARON', 'M', '1912', '2FLR', 'AVENUE N', 'BROOKLYN', '11230', '19700208MDEM', '00141094817080219880620A R198806202019', '']
['304688775GLU

In [None]:
JR SR II III IV

In [42]:
qn_df.columns

Index(['County EMSID', 'Last Name', 'First Name', 'Middle Initial',
       'Name Suffix', 'House Number', 'Apartment Number', 'Street Name',
       'City', 'Zip Code', 'Date of Birth', 'Gender', 'Political Party',
       'Election District', 'Assembly District', 'Congress District',
       'Council District', 'Senate District', 'Civil Court District',
       'Judicial District', 'Registration Date', 'Status Code', 'Voter Type',
       'Eff Status Change Date', 'Year Last Voted', 'Telephone Number'],
      dtype='object')

## Final processing of dataframes

In [20]:
# Combining records to a single dataframe
qn_df_new = qn_df[qn_df['County EMSID'].notnull()]
bk_df_new = bk_df[bk_df['County EMSID'].notnull()]
nyc_df = pd.concat([qn_df_new, bk_df_new], ignore_index=True)

In [21]:
# Remove whitespace from column headers
nyc_df.columns = [str(x).replace(' ','') for x in list(nyc_df.columns)]
# replace 'Date of Birth' with 'DOB'
nyc_df.columns = ['DOB' if x=='DateofBirth' else x for x in list(nyc_df.columns)] 


In [22]:
# changing format of date columns to 'YYY-MM-DD'
nyc_df['DOB'] = nyc_df['DOB'].apply(lambda x: x[0:4] + '-' + x[4:6] + '-' + x[6:8] if(pd.notnull(x)) else x)
nyc_df['RegistrationDate'] = nyc_df['RegistrationDate'].apply(lambda x: x[0:4] + '-' + x[4:6] + '-' + x[6:8] if(pd.notnull(x)) else x)
nyc_df['EffStatusChangeDate'] = nyc_df['EffStatusChangeDate'].apply(lambda x: x[0:4] + '-' + x[4:6] + '-' + x[6:8] if(pd.notnull(x)) else x)

## Pushing data to MySQL 'voters' table in 'nyc' database

In [24]:
# Create a connection engine to the MySQL database
host="192.168.4.38"
user="monty"
password="Oldfreinds64!"
db="nyc"
port=3306

# conn = mysql.connector.connect(host="localhost",user="root",passwd="password",)
# conn = mysql.connector.connect(host=host,user=user,passwd=password)

conn = sqlalchemy.create_engine(f'mysql+mysqlconnector://{user}:{password}@{host}/{db}').connect()

nyc_df.to_sql(con=conn, name='voters', if_exists='append', index=False)
                                
conn.close()


In [25]:
nyc_df.head()

Unnamed: 0,CountyEMSID,LastName,FirstName,MiddleInitial,NameSuffix,HouseNumber,ApartmentNumber,StreetName,City,ZipCode,DOB,Gender,PoliticalParty,ElectionDistrict,AssemblyDistrict,CongressDistrict,CouncilDistrict,SenateDistrict,CivilCourtDistrict,JudicialDistrict,RegistrationDate,StatusCode,VoterType,EffStatusChangeDate,YearLastVoted,TelephoneNumber
0,Q1205093,ABBRACCIAMENTO,FRANCES,,,102,,BEACH 221 STREET,BREEZY POINT,11697,1913-05-09,F,DEM,1,23,5,32,15,5,11,1984-01-01,A,P,1984-01-01,2019,
1,303652678,ADESSO,FRANK,,,113,,BEACH 215 STREET,BREEZY POINT,11697,1938-05-14,M,BLK,1,23,5,32,15,5,11,1992-10-09,A,R,1992-10-09,2011,
2,303651745,ADESSO,MARY,,,113,,BEACH 215 STREET,BREEZY POINT,11697,1947-05-21,F,BLK,1,23,5,32,15,5,11,1989-03-27,A,R,1989-03-27,2016,
3,02610005,AHERN,ELIZABETH,,,51,,BEACH 217 STREET,BREEZY POINT,11697,1942-03-12,F,REP,1,23,5,32,15,5,11,1973-01-01,A,R,1973-01-01,2018,
4,Q1197516,AHERN,MARGARET,M,,107,,BEACH 222 STREET,BREEZY POINT,11697,1965-05-10,F,BLK,1,23,5,32,15,5,11,1984-01-01,A,R,1984-01-01,2018,


## Adding coordinates to records

In [133]:
# Adding a row of addresses to dataframe

def add_st_suffix(x):
    '''
    Takes in a street name in string form and appends
    'st', 'nd', or 'rd' to the end of the digit.
    Returns:
        The original string with 'st', 'nd', or 'rd' appended to the digit.
    '''
    pattern = re.findall(r'\d+ [Ss][Tt][Rr][Ee][Ee][Tt]', x)
    if(len(pattern) > 0):
        street_num = pattern[-1].split(' ')[0]
        digit = list(street_num)[-1]
        if(digit in ['0','4','5','6','7','8','9']):
            y = x.replace(street_num, street_num + 'th')
        elif(digit == '1'):
            y = x.replace(street_num, street_num + 'st')
        elif(digit == '2'):
            y = x.replace(street_num, street_num + 'nd')
        return y
    else:
        return x

nyc_df['geocode_col'] = nyc_df['HouseNumber'].astype(str) + ',' + \
                nyc_df['StreetName'].apply(lambda x: add_st_suffix(x)) + ',' + \
                nyc_df['ZipCode'] + ',' + \
                nyc_df['City'] + ',' + 'United States'   

In [138]:
# Commencing geocoding
locator = Nominatim(user_agent="myGeocoder")
geocode = RateLimiter(locator.geocode, min_delay_seconds=1)
nyc_df['geo_location'] = nyc_df['geocode_col'].apply(geocode)
nyc_df['geo_point'] = nyc_df['geo_location'].apply(lambda loc: tuple(loc.point) if loc else None)

In [None]:
# Checking for records that were not geocoded
# nyc_df[nyc_df['geocode_col'].isnull()]

# Remove rows where the geocode_col is null
# nyc_df = nyc_df[~nyc_df['geocode_col'].isnull()]

In [148]:
# Processing the geocoded records
nyc_df[['latitude', 'longitude', 'altitude']] = pd.DataFrame(nyc_df['geo_point'].tolist(), index=nyc_df.index)

In [150]:
nyc_df = nyc_df.drop(['geocode_col','geo_location','geo_point','altitude'], axis=1)

In [157]:
# Adding the ID column 
nyc_df.insert(loc=0, column='ID', value=np.arange(len(nyc_df))+1)

In [45]:
# # Writing the addresses dataframe to a csv file
# save_path = '/Users/mareksalamon/Desktop/Politiker/Code/App Demo/Politiker/public/data/'
# addresses.to_csv(save_path + 'nyc_addresses.csv', header=True, index=False)

# def map_coors(filepath, df):

#     '''
#     Takes in a file with coordinates and attaches them to a specified dataframe. The two are joined on the row index.
#     The file with the coordinates must be obtained from https://geocode.localfocus.nl/
#     '''

#     coors_df = pd.read_csv(filepath, header=None, sep='\t')
#     coors_df = coors_df.drop(0, 1).drop(3, 1)
#     coors_df.columns = ['Latitude','Longitude']
#     merged_df = pd.merge(left=df, right=coors_df, how='left', left_index=True, right_index=True)
#     return(merged_df)


# nyc_df = map_coors('data/nyc_coors.csv', nyc_df)

## Turning the data into a JSON file

In [167]:
print(len(nyc_df))
print(len(nyc_df[~(nyc_df['latitude'].isnull() | nyc_df['longitude'].isnull())]))

1197
1195


In [168]:
# Checking where latitude or longitude are null
# nyc_df[nyc_df['latitude'].isnull() | nyc_df['longitude'].isnull()]

# Removing rows where latitude or longitude are null
# nyc_df = nyc_df[~(nyc_df['latitude'].isnull() | nyc_df['longitude'].isnull())].reset_index(drop=True)

In [170]:
nyc_df.columns

Index(['ID', 'CountyEMSID', 'LastName', 'FirstName', 'MiddleInitial',
       'NameSuffix', 'HouseNumber', 'ApartmentNumber', 'StreetName', 'City',
       'ZipCode', 'DOB', 'Gender', 'PoliticalParty', 'ElectionDistrict',
       'AssemblyDistrict', 'CongressDistrict', 'CouncilDistrict',
       'SenateDistrict', 'CivilCourtDistrict', 'JudicialDistrict',
       'RegistrationDate', 'StatusCode', 'VoterType', 'EffStatusChangeDate',
       'YearLastVoted', 'TelephoneNumber', 'latitude', 'longitude'],
      dtype='object')

In [171]:
# Change the column names back so that they have spaces
nyc_df.columns = ['ID', 'County EMSID', 'Last Name', 'First Name', 'Middle Initial',
       'Name Suffix', 'House Number', 'Apartment Number', 'Street Name', 'City',
       'Zip Code', 'DOB', 'Gender', 'Political Party', 'Election District',
       'Assembly District', 'Congress District', 'Council District',
       'Senate District', 'Civil Court District', 'Judicial District',
       'Registration Date', 'Status Code', 'Voter Type', 'Eff Status Change Date',
       'Year Last Voted', 'Telephone Number', 'Latitude', 'Longitude']

In [179]:
nyc_df.to_json(root + '/qn_bk_data.json', orient='index')