# Clean and correct the extracted results table (tabula-results2.csv)

In [1]:
# Load the Pandas libraries with alias 'pd' 
import pandas as pd
import numpy as np
#import math


# Column names for the ten data columns. They are supplied explicitly because
# the file's own header is spread over two rows (visible in the preview).
# Naming: `one_*` / `two_*` = issue 1 / issue 2, `aye` / `nay` = vote counts,
# the `100` suffix = the matching percentage column.
predefined_head = [
    'province', 'district',
    'one_aye', 'one_aye100', 'one_nay', 'one_nay100',
    'two_aye', 'two_aye100', 'two_nay', 'two_nay100',
]

# The CSV is looked up relative to the notebook's working directory.
table = pd.read_csv("tabula-results2.csv", names=predefined_head)

# Preview the first 5 rows of the loaded data
table.head()

Unnamed: 0,province,district,one_aye,one_aye100,one_nay,one_nay100,two_aye,two_aye100,two_nay,two_nay100
0,จงหวด,อําเภอ/เขต,ประเดนท ่1 รํางรฐธรรมนูญ,ประเดนท ่2 คําถํามเพ่ิมเตม,,,,,,
1,,,เหนชอบ,รอยละ,ไมเหนชอบ,รอยละ,เหนชอบ,รอยละ,ไมเหนชอบ,รอยละ
2,กรุงเทพมหานคร,คลองเตย,23475,70.28,9928,29.72,22171,67.13,10858,32.87
3,คลองสาน,20990,72.54,7947,27.46,19660,68.63,8987,31.37,
4,คลองสามวา,50073,68.6,22921,31.4,46745,65.17,24983,34.83,


Process the data in the following steps:
1. Realign misplaced cells so every row follows the correct table structure

In [2]:
# Discard rows that cannot be data rows. A row is dropped when `one_nay` is
# missing, when `district` is missing, or when both `province` and
# `two_nay100` are missing.
lacks_one_nay = table['one_nay'].isna()
lacks_district = table['district'].isna()
lacks_both_ends = table['province'].isna() & table['two_nay100'].isna()
rows_to_drop = lacks_one_nay | lacks_district | lacks_both_ends
table2 = table.loc[~rows_to_drop].copy()

# Also discard the last remaining row.
# NOTE(review): presumably a trailing total/footer line -- confirm against the file.
table2 = table2.iloc[:-1]

# Rows whose last column is empty hold their values one cell too far to the
# left (the district name sits under `province`); move them one column right.
table3 = table2.copy()
left_shifted = table3['two_nay100'].isna()
table3.loc[left_shifted] = table2.loc[left_shifted].shift(periods=1, axis='columns')



2. Mark whether each row is a summation row or an out-of-province row

In [3]:
# Flag summation rows: the district text contains the marker 'ผลรวม'.
# `str.contains` is used rather than `str.find` because `find` returns 0 when
# the marker is at the very start of the text, and 0 becomes False once cast
# to bool. `regex=False` makes it a plain substring test, and `na=False`
# keeps missing cells from being flagged.
sum_row = table3['district'].str.contains('ผลรวม', regex=False, na=False)
table3['sum_row'] = sum_row.astype('bool')

# Flag out-of-province rows: the district text contains 'อกเขตจงหว'.
ext_row = table3['district'].str.contains('อกเขตจงหว', regex=False, na=False)
table3['ext_row'] = ext_row.astype('bool')



3. Relabel the summation rows and the out-of-province rows

In [4]:
# Overwrite the raw district text of flagged rows with a fixed label.
# Summation rows are labelled last, so a row carrying both flags ends up
# as 'ผลรวม'. Selecting by boolean mask picks the same rows as selecting by
# the labels of the filtered frame, as long as row labels are unique.
is_ext = table3['ext_row'] == 1
is_sum = table3['sum_row'] == 1
table3.loc[is_ext, 'district'] = 'นอกเขต'
table3.loc[is_sum, 'district'] = 'ผลรวม'

4. Repeat each province name down the rows that belong to it

In [5]:
# Forward-fill missing cells from the row above so that every row carries a
# province name. `DataFrame.ffill` is the direct replacement for
# `fillna(method='ffill')`, whose `method` argument is deprecated in recent
# pandas releases.
# NOTE(review): this fills gaps in every column, not only `province` --
# confirm that no other column is expected to contain missing values here.
table3.ffill(inplace=True)

# correct the typos
## correct provinces

In [6]:
from fuzzywuzzy import process


def match_names(wrong, correct):
    """Map each possibly misspelled name to its best fuzzy match.

    Parameters
    ----------
    wrong : iterable
        Names to correct, in row order. Entries must be hashable (strings
        are expected) because results are cached per distinct value.
    correct : sequence
        Candidate spellings passed to ``process.extractOne`` as the choices.

    Returns
    -------
    (names_array, ratio_array) : tuple of two lists
        Parallel lists, one entry per element of ``wrong``: element 0 and
        element 1 of the ``process.extractOne`` result (the chosen candidate
        and its score).
    """
    # The same spelling usually recurs on many rows (a province is repeated
    # for each of its districts), so each distinct input is matched only once.
    best_by_name = {}
    names_array = []
    ratio_array = []
    for row in wrong:
        if row not in best_by_name:
            best_by_name[row] = process.extractOne(row, correct)
        best = best_by_name[row]
        names_array.append(best[0])
        ratio_array.append(best[1])
    return names_array, ratio_array

## correct the provinces
# wrong data set: province names as extracted, one entry per row.
# Corrections are written to table4, a copy, so table3 keeps the extracted text.

table4 = table3.copy()
wrong_names = table4['province'].values



In [7]:
# Correct data set: the `PRV_NAME_TH` column of the province list file,
# used as the candidate spellings for province matching.
choices_df = pd.read_csv('../district/prov_list.csv')
correct_names = choices_df['PRV_NAME_TH'].values

In [8]:
# Replace every extracted province name with its best match from the list.
# `ratio_match` holds the match scores and is not used further in this cell.
name_match, ratio_match = match_names(wrong_names, correct_names)

# table4 is a copy of table3, so both frames carry the same row labels;
# aligning on table4's own index writes the names back row by row.
table4['province'] = pd.Series(name_match, index=table4.index)


## correct districts

In [None]:
## correct the districts
# Only ordinary district rows are corrected; summation rows and
# out-of-province rows already carry a fixed label.
plain_rows = ~table3['sum_row'] & ~table3['ext_row']

# wrong data set: district names as extracted
wrong_names = table3.loc[plain_rows, 'district'].values

# correct data set: the `AMP_NAME_TH` column of the district list file
choices_df = pd.read_csv('../district/dist_list.csv')
correct_names = choices_df['AMP_NAME_TH'].values

name_match, ratio_match = match_names(wrong_names, correct_names)

# Write the matched names back onto the same rows of table4.
ind = table3.index[plain_rows]
table4.loc[ind, 'district'] = pd.Series(name_match, index=ind)

## show data
table4.sample(10)

# Export CSV

In [48]:
# Write the corrected table to two files, keeping only the ten original
# columns and omitting the row index:
#   - all non-summation rows (out-of-province rows included)
#   - the summation rows only
is_total = table3['sum_row']
for wanted, csv_path in ((~is_total, r'results_by_districts.csv'),
                         (is_total, r'results_by_prov.csv')):
    ind2 = table3.index[wanted]
    table4.loc[ind2, predefined_head].to_csv(csv_path, index=False, header=True)