In [1]:
import pandas as pd
import numpy as np
import warnings
import time
import os

# Install a process-wide "ignore" filter so no warning is displayed in the notebook output.
# NOTE(review): this also hides pandas deprecation / chained-assignment warnings.
warnings.filterwarnings('ignore')

## Initialize <a id='initialize'></a>
This function reads and extracts all the information needed from the processed/static/ folder.
It extracts the candidates running for **SENATOR**, **GOVERNOR**, **VICE-GOVERNOR**, **MAYOR**, **VICE-MAYOR**, or **PARTY LIST**. It also outputs a dataframe used to map each precinct to its **region**, **province**, and **municipality**.

**Inputs:**
* unprocessed_dir (string) > directory of the unprocessed results.csv
* processed_dir (string) > directory of the processed data

**Outputs:**
* codes (dataframe) > dataframe of the contest_codes of interest
* candidates (dataframe) > dataframe of the candidates with contest_codes of interest
* precincts (dataframe) > dataframe of the precincts

In [10]:
def initialize(unprocessed_dir, processed_dir):
    """Load the static lookup tables and keep only the contests of interest.

    Parameters
    ----------
    unprocessed_dir : str
        Accepted for interface compatibility; not read by this function.
    processed_dir : str
        Directory containing ``static/contests.csv``, ``static/candidates.csv``
        and ``static/precincts.csv``.

    Returns
    -------
    codes : DataFrame
        Rows of contests.csv whose CONTEST_NAME contains one of the positions
        of interest, each contest appearing exactly once.
    candidates : DataFrame
        Rows of candidates.csv whose CONTEST_CODE is present in ``codes``.
    precincts : DataFrame
        Contents of precincts.csv, with VCM_ID read as a string.
    """
    contest = pd.read_csv(os.path.join(processed_dir, 'static/contests.csv'), encoding = 'utf-8')
    positions = ['SENATOR', 'GOVERNOR', 'VICE-GOVERNOR', 'MAYOR', 'VICE-MAYOR','PARTY LIST']

    contest_names = contest['CONTEST_NAME']
    # Literal substring match; na=False keeps rows with a missing name out of the
    # mask instead of producing NaN entries that cannot be used for indexing.
    matches = [
        contest.loc[contest_names.str.contains(pos, regex = False, na = False)]
        for pos in positions
    ]
    codes = pd.concat(matches)
    # 'GOVERNOR' also matches 'VICE-GOVERNOR' (and 'MAYOR' matches 'VICE-MAYOR'),
    # so the same contest row can be selected twice; keep only its first occurrence.
    codes = codes.loc[~codes.index.duplicated(keep = 'first')]

    candidates = pd.read_csv(os.path.join(processed_dir, 'static/candidates.csv'), encoding = 'utf-8')
    candidates = candidates.loc[candidates['CONTEST_CODE'].isin(codes['CONTEST_CODE'].values)]
    precincts = pd.read_csv(os.path.join(processed_dir, 'static/precincts.csv'), encoding = 'utf-8', dtype = {'VCM_ID':str})

    return codes, candidates, precincts

## Preparation of Results <a id = 'prep'></a>
This function prepares the results by mapping each precinct to its region, province, and municipality, and removes the overseas absentee voters (rows whose region is `OAV`).

**Inputs**:
* unprocessed_dir (string) > directory of the unprocessed results.csv
* precincts (dataframe) > dataframe of the precincts

**Outputs**:
* df (dataframe) > dataframe of the results.csv

In [4]:
def prep_results(unprocessed_dir, precincts):
    """Read results.csv, attach the precinct columns and drop the 'OAV' region.

    Parameters
    ----------
    unprocessed_dir : str
        Directory that contains ``results.csv``.
    precincts : DataFrame
        Precinct table; its ``VCM_ID`` column is matched against the
        ``PRECINCT_CODE`` column of the results.

    Returns
    -------
    DataFrame
        The results left-joined with ``precincts``, without the rows whose
        ``REG_NAME`` equals 'OAV'. Results with no matching precinct are kept.
    """
    results_path = os.path.join(unprocessed_dir, 'results.csv')
    # PRECINCT_CODE is read as text so it can be compared with the string VCM_ID.
    raw_results = pd.read_csv(results_path, dtype = {'PRECINCT_CODE': str}, encoding = 'utf-8')
    located = pd.merge(
        raw_results,
        precincts,
        how = 'left',
        left_on = 'PRECINCT_CODE',
        right_on = 'VCM_ID',
    )
    is_not_oav = located['REG_NAME'] != 'OAV'
    return located.loc[is_not_oav]

## Candidate Vote Count <a id = 'vote_count'></a>
Summarizes the vote counts per candidate depending on the level (regional, provincial, municipal).

**Inputs:**
* df (dataframe) > dataframe of the results
* col (string) > column name representing the level of summarizing ('REG_NAME':regional, 'PRV_NAME':provincial, 'MUN_NAME':municipal)
* candidates (dataframe) > dataframe of the candidates with contest_codes of interest

**Outputs:**
* summarized_candidate (dataframe) > dataframe of the summarized vote count

In [11]:
def summarize_candidate(df, col, candidates):
    """Total the votes of every candidate for each distinct value of ``col``.

    Parameters
    ----------
    df : DataFrame
        Results containing ``col``, 'CANDIDATE_NAME' and 'VOTES_AMOUNT'.
    col : str
        Column to aggregate on (e.g. 'REG_NAME', 'PRV_NAME', 'MUN_NAME').
    candidates : list-like
        Labels used as the columns of the output table (one per candidate).

    Returns
    -------
    DataFrame
        One row per distinct value of ``col`` and one column per entry of
        ``candidates``, preceded by a ``col`` column. Cells with no votes
        recorded stay NaN; candidate names found in ``df`` but absent from
        ``candidates`` are ignored.
    """
    summarized_candidate = pd.DataFrame(index = df[col].unique().tolist(), columns = candidates)
    # One aggregation pass instead of materialising every group with get_group().
    vote_totals = df.groupby([col, 'CANDIDATE_NAME'])['VOTES_AMOUNT'].sum()
    for (level_value, candidate_name), votes in vote_totals.items():
        if candidate_name not in summarized_candidate.columns:
            # Not a candidate of interest: do not let .loc create a new column.
            continue
        # Single-step .loc write: chained `.loc[row][col] = v` may write to a
        # temporary copy and leave the table untouched.
        summarized_candidate.loc[level_value, candidate_name] = votes
    summarized_candidate.index.name = col

    return summarized_candidate.reset_index()

## Column Summarizer <a id = 'summarize'></a>
Summarizes the following columns: NUMBER_VOTERS, UNDERVOTE, OVERVOTE

**Inputs:**
* df (dataframe) > dataframe of the results
* col (string) > column name representing the level of summarizing ('REG_NAME':regional, 'PRV_NAME':provincial, 'MUN_NAME':municipal)
* cols (list) > list of column names to summarize (stated above)

**Outputs:**
* summarized (dataframe) > dataframe of the summarized columns

In [6]:
def summarize(df, col, cols):
    """Sum per-precinct columns for each distinct value of ``col``.

    Each precinct is counted once per value of ``col``: only the first row
    of every PRECINCT_CODE within a group contributes to the totals.

    Parameters
    ----------
    df : DataFrame
        Results containing ``col``, 'PRECINCT_CODE' and every column in ``cols``.
    col : str
        Column to aggregate on (e.g. 'REG_NAME', 'PRV_NAME', 'MUN_NAME').
    cols : list
        Names of the columns to total.

    Returns
    -------
    DataFrame
        One row per distinct value of ``col`` with a ``col`` column followed
        by one total per entry of ``cols``.
    """
    summarized = pd.DataFrame(index = df[col].unique().tolist(), columns = cols)
    # Equivalent to de-duplicating PRECINCT_CODE inside each group, but done
    # once on a new frame rather than in place on a group slice.
    one_row_per_precinct = df.drop_duplicates(subset = [col, 'PRECINCT_CODE'], keep = 'first')
    grouped = one_row_per_precinct.groupby(col)
    for column_name in cols:
        for key, total in grouped[column_name].sum().items():
            # Single-step .loc write: chained `.loc[key][name] = v` may write
            # to a temporary copy and leave the table untouched.
            summarized.loc[key, column_name] = total
    summarized.index.name = col
    return summarized.reset_index()

In [7]:
def save_file(df, outfile):
    """Announce the save on stdout, then write ``df`` to ``outfile`` as CSV.

    The file is UTF-8 encoded and the dataframe index is not written.
    """
    print('Saving file')
    csv_options = {'index': False, 'encoding': 'utf-8'}
    df.to_csv(outfile, **csv_options)

In [8]:
def add_info(df, level, candidates, codes, outfile = False):
    """Tag a melted vote table with its level and attach candidate/contest data.

    Parameters
    ----------
    df : DataFrame
        Table with a 'variable' column holding candidate names. A 'LEVEL'
        column is added to this object in place.
    level : object
        Value stored in the 'LEVEL' column of every row.
    candidates : DataFrame
        Joined on ``df['variable'] == candidates['CANDIDATE_NAME']`` (left join).
    codes : DataFrame
        Joined on 'CONTEST_CODE' (left join).
    outfile : str or False, optional
        When truthy, the merged table is also written there via ``save_file``.

    Returns
    -------
    DataFrame
        The merged table. Previously nothing was returned, so the result was
        lost unless ``outfile`` was given.
    """
    df['LEVEL'] = level
    enriched = df.merge(candidates, left_on = 'variable', right_on = 'CANDIDATE_NAME', how = 'left')
    enriched = enriched.merge(codes, on = 'CONTEST_CODE', how = 'left')
    if outfile:
        save_file(enriched, outfile)
    return enriched

In [9]:
def transform(df, id_vars, value_vars, outfile = False):
    """Melt ``df`` from wide to long form and optionally save the result.

    Parameters
    ----------
    df : DataFrame
        Wide table to unpivot.
    id_vars : list
        Columns kept as identifiers.
    value_vars : list
        Columns unpivoted into the 'variable' / 'value' pair.
    outfile : str or False, optional
        When truthy, the melted table is also written there via ``save_file``.

    Returns
    -------
    DataFrame
        The melted table.
    """
    long_table = pd.melt(df, id_vars = id_vars, value_vars = value_vars)
    if not outfile:
        return long_table
    save_file(long_table, outfile)
    return long_table