# Preprocess Images

In [1]:
# Mount Google Drive inside the Colab runtime; the LOCAL_DATA_PATH_* settings
# used by this notebook all live under /content/gdrive.
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


params.py

In [2]:
# Target frame size in pixels; used as the default bounds of resize_face/pad_face.
AUTOENCODER_WIDTH = 160
AUTOENCODER_HEIGHT = 192

# Column names are kept as one-element lists so they can be concatenated
# directly into the column selections below.
STATE_COLUMN_NAME = ['SG_UF']
YEAR_COLUMN_NAME = ['ANO_ELEICAO']
ID_COLUMN_NAME = ['SQ_CANDIDATO']
ELLECTED_COLUMN_NAME = ['DS_SIT_TOT_TURNO']

# Columns read from the source CSV files.
COLUMN_NAMES = [
    'NM_CANDIDATO',
    'DS_CARGO',
    *ID_COLUMN_NAME,
    *ELLECTED_COLUMN_NAME,
    'CD_SIT_TOT_TURNO',
    *YEAR_COLUMN_NAME,
    *STATE_COLUMN_NAME,
]

# Columns added by the pipeline itself.
FILENAME_COLUMN_NAME = ['filename']
FACE_COLUMN_NAME = ['face']
COLUMN_NAMES_FULL = [*COLUMN_NAMES, *FILENAME_COLUMN_NAME]

# State codes recognised in extracted CSV file names.
states = [
    'AC', 'AL', 'AP', 'AM', 'BA', 'CE', 'ES', 'GO', 'MA',
    'MT', 'MS', 'MG', 'PA', 'PB', 'PR', 'PE', 'PI', 'RJ',
    'RN', 'RS', 'RO', 'RR', 'SC', 'SP', 'SE', 'TO', 'DF',
]

# Values of the election-result column that are kept; rows with any other value are skipped.
ELEITO = ['ELEITO', 'ELEITO POR MÉDIA']
NAO_ELEITO = ['NÃO ELEITO']

# Number of CSV rows requested per chunk.
CHUNK_SIZE = 10_000

# Source archives, extracted CSVs, extracted photos and processed output.
LOCAL_DATA_PATH_SRC = '/content/gdrive/MyDrive/Bootcamp_ENAP_2022/new_input'
LOCAL_DATA_PATH_CSV = '/content/gdrive/MyDrive/Bootcamp_ENAP_2022/new_output/csv'
LOCAL_DATA_PATH_INPUT_IMG = '/content/gdrive/MyDrive/Bootcamp_ENAP_2022/new_output/raw_img'
LOCAL_DATA_PATH_OUTPUT_IMG = '/content/gdrive/MyDrive/Bootcamp_ENAP_2022/new_output/processed_img'


face_detection.py

In [3]:
import cv2
import numpy as np
import os

def _load_face_cascade():
  """Return the frontal-face Haar cascade, building it on first use only."""
  cascade = getattr(_load_face_cascade, '_cascade', None)
  if cascade is None:
    cascade = cv2.CascadeClassifier(cv2.data.haarcascades + 'haarcascade_frontalface_alt.xml')
    # Cached on the function object so the XML model is not re-read per image.
    _load_face_cascade._cascade = cascade
  return cascade

def crop_face(face):
  """Crop an image around the first detected frontal face.

  Args:
    face: image array, either 2-D (single channel) or 3-D (height, width,
      channels). Colour input is treated as BGR, the channel order
      cv2.imread produces.

  Returns:
    A 3-D array. When a face is detected, the detection box enlarged by half
    its width/height on every side and clipped to the image bounds. Otherwise
    a centred square whose side is about the shorter image dimension.
  """
  if len(face.shape) == 3 and face.shape[2] != 1:
    gray = cv2.cvtColor(face, cv2.COLOR_BGR2GRAY)
  elif len(face.shape) == 3:
    # Already single channel: nothing to convert.
    gray = face[:, :, 0]
  else:
    # Give 2-D input a channel axis so the slicing below works for both cases.
    gray = face
    face = np.expand_dims(face, axis=-1)

  # scaleFactor=1.3, minNeighbors=5
  faces = _load_face_cascade().detectMultiScale(gray, 1.3, 5)
  rows, cols = face.shape[0], face.shape[1]

  if len(faces):
    (x, y, w, h) = faces[0]
    w_slack, h_slack = w//2, h//2
    # Slice ends are exclusive, so clip to rows/cols (not rows-1/cols-1) to
    # keep the last row and column when the box reaches the border.
    top, bottom = max(0, y-h_slack), min(rows, y+h+h_slack)
    left, right = max(0, x-w_slack), min(cols, x+w+w_slack)
    return face[top:bottom, left:right, :]

  # No detection: fall back to a centred square crop.
  square = min(rows, cols)//2
  mid_height = rows//2
  mid_width = cols//2
  return face[mid_height-square:mid_height+square, mid_width-square:mid_width+square]

def resize_face(face, x_max=AUTOENCODER_WIDTH, y_max=AUTOENCODER_HEIGHT):
  """Scale an image uniformly so that it fits inside an x_max by y_max box.

  The same factor is applied to both axes (the smaller of the two
  target/actual ratios), so the aspect ratio is preserved.
  """
  height, width = face.shape[0], face.shape[1]
  height_ratio = y_max/height
  width_ratio = x_max/width
  factor = min(height_ratio, width_ratio)
  return cv2.resize(face, None, fx=factor, fy=factor, interpolation=cv2.INTER_AREA)

def pad_face(face, x_max=AUTOENCODER_WIDTH, y_max=AUTOENCODER_HEIGHT):
  """Centre an image on a white x_max by y_max canvas.

  Missing rows/columns are split evenly between the two sides; when the
  difference is odd, the extra pixel goes to the bottom/right border.
  """
  height, width = face.shape[0], face.shape[1]
  top, odd_row = divmod(y_max - height, 2)
  left, odd_col = divmod(x_max - width, 2)
  bottom = top + odd_row
  right = left + odd_col
  white = [255, 255, 255]
  return cv2.copyMakeBorder(face, top, bottom, left, right, cv2.BORDER_CONSTANT, value=white)

def gray_face(face):
  """Convert a BGR colour image to a single-channel grayscale image."""
  grayscale = cv2.cvtColor(face, cv2.COLOR_BGR2GRAY)
  return grayscale

def is_gray(face):
    """Tell whether an image array carries no colour information.

    True for 2-D arrays, for arrays with a single channel, and for arrays
    whose first three channels are identical pixel by pixel.
    """
    if len(face.shape) < 3 or face.shape[2] == 1:
        return True
    first = face[:, :, 0]
    second = face[:, :, 1]
    third = face[:, :, 2]
    # all() short-circuits, mirroring the chained comparison of the channels.
    return all((first == other).all() for other in (second, third))

utils.py

In [4]:
import os

def get_img_filename(year: str, state: str, sq_candidato: str):
  """Locate a candidate's photo inside the extracted images of a given year.

  Tries, in order, `F<state><sq_candidato>` followed by `_div.jpg`,
  `_div.jpeg`, `.jpg` and `.jpeg` under LOCAL_DATA_PATH_INPUT_IMG/<year>.

  Returns:
    The full path of the first file that exists, or None when none does.
  """
  year_dir = os.path.join(LOCAL_DATA_PATH_INPUT_IMG, year)
  stem = 'F'+state+sq_candidato
  suffixes = ('_div.jpg', '_div.jpeg', '.jpg', '.jpeg')

  candidates = (os.path.join(year_dir, stem+suffix) for suffix in suffixes)
  return next((candidate for candidate in candidates if os.path.exists(candidate)), None)

local_disk.py

In [5]:
import re
import os
import pandas as pd
import numpy as np
import zipfile

def open_local_image(path):
  """Read an image file with OpenCV.

  Args:
    path: location of the image on disk.

  Returns:
    The decoded image array, or None when OpenCV cannot read the file
    (cv2.imread reports failure that way instead of raising).
  """
  # imread's second argument is a read-mode flag, not a colour-conversion code.
  # IMREAD_ANYCOLOR has the same numeric value (4) as cv2.COLOR_BGR2RGB, so the
  # decoding is unchanged; no channel swap happens and colour images stay in
  # OpenCV's BGR order.
  img = cv2.imread(path, cv2.IMREAD_ANYCOLOR)
  return img

def save_local_image(filename: str, face, bw: bool, eleito: bool):
  """Write an image into the processed-image tree.

  The destination folder is
  LOCAL_DATA_PATH_OUTPUT_IMG/<bw|color>/<elected|not_elected>; it is created
  when missing.

  Args:
    filename: name of the file to create inside the destination folder.
    face: image array handed to cv2.imwrite.
    bw: True to store under 'bw', False to store under 'color'.
    eleito: True to store under 'elected', False under 'not_elected'.

  Returns:
    None.
  """
  folder = os.path.join(LOCAL_DATA_PATH_OUTPUT_IMG,
                        'bw' if bw else 'color',
                        'elected' if eleito else 'not_elected')

  # exist_ok avoids the check-then-create race of a separate os.path.exists test.
  os.makedirs(folder, exist_ok=True)

  target = os.path.join(folder, filename)
  # cv2.imwrite signals failure through its return value rather than raising.
  if not cv2.imwrite(target, face):
    print(f"⚠️ could not write image {target}")

  return None

def get_pandas_chunk(year: str,
                     state: str,
                     index: int,
                     chunk_size: int,
                     verbose=True) -> pd.DataFrame:
    """
    Read one chunk of LOCAL_DATA_PATH_CSV/<year>/consulta_cand_<year>_<state>.csv
    and attach the path of each candidate's photo.

    The header line is always kept; the first `index` data lines are skipped
    and at most `chunk_size` rows are read. Only the COLUMN_NAMES columns are
    loaded. A filename column is added holding the result of
    get_img_filename for every row (None when no photo is found).

    Returns None when pandas reports the file as empty.

    NOTE(review): a second definition of this function appears later in the
    same notebook cell; the later one is what Python binds at call time.
    """

    full_path = os.path.join(
        LOCAL_DATA_PATH_CSV,
        year,
        f"consulta_cand_{year}_{state}.csv")

    if verbose:
        print(f"Source data from {full_path}: {chunk_size if chunk_size is not None else 'all'} rows (from row {index})")

    read_options = dict(
        skiprows=np.arange(1, index+1),  # lines 1..index: data already consumed (line 0 is the header)
        nrows=chunk_size,
        header=0,
        encoding='iso-8859-1',
        on_bad_lines='warn',
        sep=';',
        usecols=COLUMN_NAMES,  # load only the columns the pipeline uses
    )

    try:
        df = pd.read_csv(full_path, **read_options)
    except pd.errors.EmptyDataError:
        return None  # end of data

    def locate_photo(id_candidato):
        return get_img_filename(year, state, str(id_candidato))

    df[FILENAME_COLUMN_NAME[0]] = df[ID_COLUMN_NAME[0]].map(locate_photo)

    return df

def save_local_chunk(data: pd.DataFrame):
    """
    Write the processed face of every row of `data` to local disk.

    Rows whose election result is in neither ELEITO nor NAO_ELEITO are
    skipped. Grayscale faces are stored once (black-and-white tree); colour
    faces are stored in the colour tree and, converted with gray_face, in the
    black-and-white tree. Files are named <year>F<state><candidate id>_div.jpg.

    NOTE(review): a second definition of this function appears later in the
    same notebook cell; the later one is what Python binds at call time.
    """
    state_col = STATE_COLUMN_NAME[0]
    year_col = YEAR_COLUMN_NAME[0]
    result_col = ELLECTED_COLUMN_NAME[0]
    id_col = ID_COLUMN_NAME[0]
    face_col = FACE_COLUMN_NAME[0]
    known_results = ELEITO + NAO_ELEITO

    for row_label in data.index:
        result = data.at[row_label, result_col]
        if result not in known_results:
            continue

        state = data.at[row_label, state_col]
        year = str(data.at[row_label, year_col])
        candidate_id = str(data.at[row_label, id_col])
        face = data.at[row_label, face_col]

        elected = result in ELEITO
        out_name = year+'F'+state+candidate_id+'_div.jpg'

        if is_gray(face):
            save_local_image(out_name, face, True, elected)
            continue

        save_local_image(out_name, face, False, elected)
        save_local_image(out_name, gray_face(face), True, elected)


def extract_local_files() -> dict:
    """
    Unzip the source archives and report which states have a CSV per year.

    Every directory of LOCAL_DATA_PATH_SRC whose name starts with digits is
    treated as a year. Inside it, archives named `consulta*.zip` are extracted
    to LOCAL_DATA_PATH_CSV/<year> and archives named `foto*.zip` to
    LOCAL_DATA_PATH_INPUT_IMG/<year>.

    Returns:
        dict mapping the year folder name to the list of state codes for
        which a file ending in `<state>.csv` is present in the CSV folder.
        Years without a `consulta*.zip` archive have no entry.

    NOTE(review): a second definition of this function appears later in the
    same notebook cell; the later one is what Python binds at call time.
    """
    # Built once: the pattern only depends on the module-level state list.
    # The dot is escaped so that only a real `.csv` extension matches.
    state_alternatives = '|'.join(states)
    state_csv_pattern = re.compile(rf'.*({state_alternatives})\.csv$')

    def unzip_local_files(year: str, filename: str, csv: bool):
        """Extract one archive; for CSV archives return the states found."""
        from_path = os.path.join(
            LOCAL_DATA_PATH_SRC,
            year,
            filename)

        to_path = os.path.join(
            LOCAL_DATA_PATH_CSV if csv else LOCAL_DATA_PATH_INPUT_IMG,
            year)

        # The context manager closes the archive even if extraction fails.
        with zipfile.ZipFile(from_path, 'r') as zip_ref:
            zip_ref.extractall(to_path)

        if not csv:
            return None

        states_found = []
        for extracted_filename in os.listdir(to_path):
            match = state_csv_pattern.match(extracted_filename)
            if match is not None:
                states_found.append(match.group(1))
                print(f"{year}: ✅ found state {match.group(1)} to preprocess 👌")
        return states_found

    result = dict()
    for year in os.listdir(LOCAL_DATA_PATH_SRC):
        match = re.match(r'(\d+)', year)
        if match is None:
            continue

        src_year_folder = os.path.join(LOCAL_DATA_PATH_SRC, year)
        # A plain file whose name starts with digits is not a year folder.
        if not os.path.isdir(src_year_folder):
            continue

        print(f"✅ found year {match.group(1)} to preprocess 👌")
        for zipped_file in os.listdir(src_year_folder):
            if not zipped_file.endswith('.zip'):
                continue
            if zipped_file.startswith('consulta'):
                result[year] = unzip_local_files(year, zipped_file, csv=True)
            elif zipped_file.startswith('foto'):
                unzip_local_files(year, zipped_file, csv=False)
    return result

def load_local_chunk_images(df: pd.DataFrame) -> pd.DataFrame:
  """Load and normalise the photo of every row of `df`.

  Each path of the filename column is read, cropped around the face, resized
  and padded; the resulting array is stored in the face column. The frame is
  modified in place and also returned.

  NOTE(review): a second definition of this function appears later in the
  same notebook cell; the later one is what Python binds at call time.
  """
  def open_local_images(filename: str) -> np.ndarray:
    raw = open_local_image(filename)
    return pad_face(resize_face(crop_face(raw)))

  df[FACE_COLUMN_NAME[0]] = df[FILENAME_COLUMN_NAME[0]].map(open_local_images)
  return df


def get_pandas_chunk(year: str,
                     state: str,
                     index: int,
                     chunk_size: int,
                     verbose=True) -> pd.DataFrame:
    """
    Read one chunk of LOCAL_DATA_PATH_CSV/<year>/consulta_cand_<year>_<state>.csv
    and attach the path of each candidate's photo.

    The header line is always kept; the first `index` data lines are skipped
    and at most `chunk_size` rows are read. Only the COLUMN_NAMES columns are
    loaded. A filename column is added holding the result of
    get_img_filename for every row (None when no photo is found).

    Returns None when pandas reports the file as empty.

    NOTE(review): an earlier definition of this function appears in the same
    notebook cell; this later one is what Python binds at call time.
    """

    full_path = os.path.join(
        LOCAL_DATA_PATH_CSV,
        year,
        f"consulta_cand_{year}_{state}.csv")

    if verbose:
        print(f"Source data from {full_path}: {chunk_size if chunk_size is not None else 'all'} rows (from row {index})")

    read_options = dict(
        skiprows=np.arange(1, index+1),  # lines 1..index: data already consumed (line 0 is the header)
        nrows=chunk_size,
        header=0,
        encoding='iso-8859-1',
        on_bad_lines='warn',
        sep=';',
        usecols=COLUMN_NAMES,  # load only the columns the pipeline uses
    )

    try:
        df = pd.read_csv(full_path, **read_options)
    except pd.errors.EmptyDataError:
        return None  # end of data

    def locate_photo(id_candidato):
        return get_img_filename(year, state, str(id_candidato))

    df[FILENAME_COLUMN_NAME[0]] = df[ID_COLUMN_NAME[0]].map(locate_photo)

    return df

def save_local_chunk(data: pd.DataFrame):
    """
    Write the processed face of every row of `data` to local disk.

    Rows whose election result is in neither ELEITO nor NAO_ELEITO are
    skipped. Grayscale faces are stored once (black-and-white tree); colour
    faces are stored in the colour tree and, converted with gray_face, in the
    black-and-white tree. Files are named <year>F<state><candidate id>_div.jpg.

    NOTE(review): an earlier definition of this function appears in the same
    notebook cell; this later one is what Python binds at call time.
    """
    state_col = STATE_COLUMN_NAME[0]
    year_col = YEAR_COLUMN_NAME[0]
    result_col = ELLECTED_COLUMN_NAME[0]
    id_col = ID_COLUMN_NAME[0]
    face_col = FACE_COLUMN_NAME[0]
    known_results = ELEITO + NAO_ELEITO

    for row_label in data.index:
        result = data.at[row_label, result_col]
        if result not in known_results:
            continue

        state = data.at[row_label, state_col]
        year = str(data.at[row_label, year_col])
        candidate_id = str(data.at[row_label, id_col])
        face = data.at[row_label, face_col]

        elected = result in ELEITO
        out_name = year+'F'+state+candidate_id+'_div.jpg'

        if is_gray(face):
            save_local_image(out_name, face, True, elected)
            continue

        save_local_image(out_name, face, False, elected)
        save_local_image(out_name, gray_face(face), True, elected)


def extract_local_files() -> dict:
    """
    Unzip the source archives and report which states have a CSV per year.

    Every directory of LOCAL_DATA_PATH_SRC whose name starts with digits is
    treated as a year. Inside it, archives named `consulta*.zip` are extracted
    to LOCAL_DATA_PATH_CSV/<year> and archives named `foto*.zip` to
    LOCAL_DATA_PATH_INPUT_IMG/<year>.

    Returns:
        dict mapping the year folder name to the list of state codes for
        which a file ending in `<state>.csv` is present in the CSV folder.
        Years without a `consulta*.zip` archive have no entry.

    NOTE(review): an earlier definition of this function appears in the same
    notebook cell; this later one is what Python binds at call time.
    """
    # Built once: the pattern only depends on the module-level state list.
    # The dot is escaped so that only a real `.csv` extension matches.
    state_alternatives = '|'.join(states)
    state_csv_pattern = re.compile(rf'.*({state_alternatives})\.csv$')

    def unzip_local_files(year: str, filename: str, csv: bool):
        """Extract one archive; for CSV archives return the states found."""
        from_path = os.path.join(
            LOCAL_DATA_PATH_SRC,
            year,
            filename)

        to_path = os.path.join(
            LOCAL_DATA_PATH_CSV if csv else LOCAL_DATA_PATH_INPUT_IMG,
            year)

        # The context manager closes the archive even if extraction fails.
        with zipfile.ZipFile(from_path, 'r') as zip_ref:
            zip_ref.extractall(to_path)

        if not csv:
            return None

        states_found = []
        for extracted_filename in os.listdir(to_path):
            match = state_csv_pattern.match(extracted_filename)
            if match is not None:
                states_found.append(match.group(1))
                print(f"{year}: ✅ found state {match.group(1)} to preprocess 👌")
        return states_found

    result = dict()
    for year in os.listdir(LOCAL_DATA_PATH_SRC):
        match = re.match(r'(\d+)', year)
        if match is None:
            continue

        src_year_folder = os.path.join(LOCAL_DATA_PATH_SRC, year)
        # A plain file whose name starts with digits is not a year folder.
        if not os.path.isdir(src_year_folder):
            continue

        print(f"✅ found year {match.group(1)} to preprocess 👌")
        for zipped_file in os.listdir(src_year_folder):
            if not zipped_file.endswith('.zip'):
                continue
            if zipped_file.startswith('consulta'):
                result[year] = unzip_local_files(year, zipped_file, csv=True)
            elif zipped_file.startswith('foto'):
                unzip_local_files(year, zipped_file, csv=False)
    return result

def load_local_chunk_images(df: pd.DataFrame) -> pd.DataFrame:
  """Load and normalise the photo of every row of `df`.

  Each path of the filename column is read, cropped around the face, resized
  and padded; the resulting array is stored in the face column. The frame is
  modified in place and also returned.

  NOTE(review): an earlier definition of this function appears in the same
  notebook cell; this later one is what Python binds at call time.
  """
  def open_local_images(filename: str) -> np.ndarray:
    raw = open_local_image(filename)
    return pad_face(resize_face(crop_face(raw)))

  df[FACE_COLUMN_NAME[0]] = df[FILENAME_COLUMN_NAME[0]].map(open_local_images)
  return df


data.py

In [6]:
import pandas as pd
import numpy as np

def clean_data(df: pd.DataFrame) -> pd.DataFrame:
    """
    Keep only the COLUMN_NAMES_FULL columns and drop every row whose
    filename is missing (i.e. candidates without a photo on disk).
    """
    selected = df[COLUMN_NAMES_FULL]
    return selected.dropna(subset=FILENAME_COLUMN_NAME)

def get_chunk(year: str,
              state: str,
              index: int = 0,
              chunk_size: int = None,
              verbose=False) -> pd.DataFrame:
    """
    Fetch up to `chunk_size` rows of the candidate table of one year and
    state, starting at data row `index` (the header is not counted).

    All arguments are forwarded to `get_pandas_chunk`, whose result is
    returned unchanged.
    """
    # NOTE: a BigQuery-backed branch (get_bq_chunk, selected through the
    # DATA_SOURCE environment variable) was sketched here but is disabled;
    # only the local CSV reader is wired in.
    return get_pandas_chunk(year=year,
                            state=state,
                            index=index,
                            chunk_size=chunk_size,
                            verbose=verbose)

def save_chunk(data: pd.DataFrame) -> None:
    """
    Persist a processed chunk by handing it to `save_local_chunk`.
    """
    # NOTE: a BigQuery-backed branch (save_bq_chunk, selected through the
    # DATA_SOURCE environment variable) was sketched here but is disabled;
    # only local storage is wired in.
    save_local_chunk(data=data)
    return None

def extract_files() -> dict:
    """
    Extract the source archives; returns whatever `extract_local_files` returns.
    """
    # NOTE: a placeholder for a BigQuery-backed branch (selected through the
    # DATA_SOURCE environment variable) was left here but is disabled.
    extracted = extract_local_files()
    return extracted

def load_chunk_images(df: pd.DataFrame) -> pd.DataFrame:
    """
    Load the images referenced by a chunk.

    Delegates to `load_local_chunk_images` and returns its result, i.e. the
    data frame it was given with the processed faces added.
    """
    # NOTE: a placeholder for a BigQuery-backed branch (selected through the
    # DATA_SOURCE environment variable) was left here but is disabled.
    return load_local_chunk_images(df)


In [7]:
def _preprocess_state(year, state):
    """Process every chunk of one year/state; return (chunks, rows, cleaned rows)."""
    chunk_id = 0
    row_count = 0
    cleaned_row_count = 0

    while True:

        print(f"\n{year}, {state}: Processing chunk n°{chunk_id}...")

        data_chunk = get_chunk(year=year,
                               state=state,
                               index=chunk_id * CHUNK_SIZE,
                               chunk_size=CHUNK_SIZE)

        # End of data: the reader returned nothing, or a frame without rows.
        if data_chunk is None or len(data_chunk) == 0:
            print(f"{year}, {state}: No data in latest chunk...")
            break

        row_count += len(data_chunk)

        data_chunk_cleaned = clean_data(data_chunk)
        cleaned_row_count += len(data_chunk_cleaned)

        # The chunk was consumed either way, so the next read must start after it.
        chunk_id += 1

        # Nothing usable in this chunk does not mean the file is exhausted:
        # move on to the next chunk instead of abandoning the state.
        if len(data_chunk_cleaned) == 0:
            print(f"{year}, {state}: ❌ No cleaned data in latest chunk...")
            continue

        print(f"{year}, {state}: ✅ data cleaned")

        images_processed_chunk = load_chunk_images(data_chunk_cleaned)
        save_chunk(images_processed_chunk)

    return chunk_id, row_count, cleaned_row_count


def preprocess():
    """
    Run the whole preprocessing pipeline.

    Extracts the source archives, then for every year and state found reads
    the candidate CSV chunk by chunk, keeps the rows that have a photo,
    loads and normalises the photos and saves them. Progress and row counts
    are printed per state, per year and overall.

    Returns:
        None.
    """
    year_and_states_list = extract_files()
    all_chunk_count = 0
    all_rows_count = 0
    all_cleaned_rows_count = 0

    for year, states_list in year_and_states_list.items():
        year_chunk_count = 0
        year_rows_count = 0
        year_cleaned_rows_count = 0

        for state in states_list:
            chunk_count, row_count, cleaned_row_count = _preprocess_state(year, state)

            if row_count == 0:
                print(f"{year}, {state}: ✅ no new data for the preprocessing 👌")
                # Only this state is empty; the remaining states still need processing.
                continue

            print(f"{year}, {state}: ✅ data processed saved entirely: {row_count} rows ({cleaned_row_count} cleaned)")
            year_chunk_count += chunk_count
            year_rows_count += row_count
            year_cleaned_rows_count += cleaned_row_count

        print(f"{year}: ✅ data processed saved entirely: {year_chunk_count} chunks {year_rows_count} rows ({year_cleaned_rows_count} cleaned)")
        all_chunk_count += year_chunk_count
        all_rows_count += year_rows_count
        all_cleaned_rows_count += year_cleaned_rows_count

    print(f"✅ data processed saved entirely: {all_chunk_count} chunks {all_rows_count} rows ({all_cleaned_rows_count} cleaned)")
    return None


In [None]:
# Run the full pipeline: extract the archives, then process every year/state found.
preprocess()

✅ found year 2022 to preprocess 👌
2022: ✅ found state AC to preprocess 👌
2022: ✅ found state AL to preprocess 👌
2022: ✅ found state AM to preprocess 👌
2022: ✅ found state AP to preprocess 👌
2022: ✅ found state BA to preprocess 👌
2022: ✅ found state CE to preprocess 👌
2022: ✅ found state DF to preprocess 👌
2022: ✅ found state ES to preprocess 👌
2022: ✅ found state GO to preprocess 👌
2022: ✅ found state MA to preprocess 👌
2022: ✅ found state MG to preprocess 👌
2022: ✅ found state MS to preprocess 👌
2022: ✅ found state MT to preprocess 👌
2022: ✅ found state PA to preprocess 👌
2022: ✅ found state PB to preprocess 👌
2022: ✅ found state PE to preprocess 👌
2022: ✅ found state PI to preprocess 👌
2022: ✅ found state PR to preprocess 👌
2022: ✅ found state RJ to preprocess 👌
2022: ✅ found state RN to preprocess 👌
2022: ✅ found state RO to preprocess 👌
2022: ✅ found state RR to preprocess 👌
2022: ✅ found state RS to preprocess 👌
2022: ✅ found state SC to preprocess 👌
2022: ✅ found state SE to prep