In [None]:
!pip install geopy

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from dotenv import load_dotenv
import os
import requests

from geopy.distance import geodesic


import warnings
warnings.filterwarnings("ignore")

In [5]:
# Read the three ABIA collation sheets in a fixed order: cross-checked,
# not-found, unsure.
crosschecked_df, notfound_df, unsure_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        './Data/Nigeria Presidential Election 2023 Electoral Sheets Collation/ABIA_crosschecked.csv',
        './Data/Nigeria Presidential Election 2023 Electoral Sheets Collation/ABIA_notfound.csv',
        './Data/Nigeria Presidential Election 2023 Electoral Sheets Collation/ABIA_unsure.csv',
    )
)

# Stack the sheets into a single frame with a fresh 0..n-1 index.
abia_frames = [crosschecked_df, notfound_df, unsure_df]
combined_df = pd.concat(abia_frames, ignore_index=True)
combined_df.head()

Unnamed: 0,State,LGA,Ward,PU-Code,PU-Name,Accredited_Voters,Registered_Voters,Results_Found,Transcription_Count,Result_Sheet_Stamped,Result_Sheet_Corrected,Result_Sheet_Invalid,Result_Sheet_Unclear,Result_Sheet_Unsigned,APC,LP,PDP,NNPP,Results_File
0,ABIA,ABA NORTH,EZIAMA,01-01-01-001,RAILWAY QUARTERS - RAILWAY QUARTERS I,85,968,True,-1,True,True,False,False,UNKNOWN,7,56,25,1,https://docs.inecelectionresults.net/elections...
1,ABIA,ABA NORTH,EZIAMA,01-01-01-002,RAILWAY QUARTERS - RAILWAY QUARTERS II,90,750,True,-1,False,False,False,False,UNKNOWN,0,0,0,0,https://docs.inecelectionresults.net/elections...
2,ABIA,ABA NORTH,EZIAMA,01-01-01-003,RAILWAY QUARTERS - RAILWAY QUARTERS III,105,750,True,-1,False,False,False,False,UNKNOWN,0,0,0,0,https://docs.inecelectionresults.net/elections...
3,ABIA,ABA NORTH,EZIAMA,01-01-01-005,ABIA POLY - ABIA POLY I,138,750,True,-1,False,False,False,False,UNKNOWN,0,0,0,0,https://docs.inecelectionresults.net/elections...
4,ABIA,ABA NORTH,INDUSTRIAL AREA,01-01-02-012,LEVER BROTHERS GATE- LEVER BROTHERS GATE IV,71,774,True,-1,False,False,False,False,UNKNOWN,0,71,0,0,https://docs.inecelectionresults.net/elections...


In [6]:
combined_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4062 entries, 0 to 4061
Data columns (total 19 columns):
 #   Column                  Non-Null Count  Dtype 
---  ------                  --------------  ----- 
 0   State                   4062 non-null   object
 1   LGA                     4062 non-null   object
 2   Ward                    4062 non-null   object
 3   PU-Code                 4062 non-null   object
 4   PU-Name                 4062 non-null   object
 5   Accredited_Voters       4062 non-null   int64 
 6   Registered_Voters       4062 non-null   int64 
 7   Results_Found           4062 non-null   object
 8   Transcription_Count     4062 non-null   int64 
 9   Result_Sheet_Stamped    4062 non-null   object
 10  Result_Sheet_Corrected  4062 non-null   object
 11  Result_Sheet_Invalid    4062 non-null   object
 12  Result_Sheet_Unclear    4062 non-null   object
 13  Result_Sheet_Unsigned   4062 non-null   object
 14  APC                     4062 non-null   int64 
 15  LP  

In [8]:
# Pull variables from a local .env file (if any) into the process environment.
load_dotenv()

api_key = os.getenv('OPENCAGE_API_KEY')
if not api_key:
    # Fail here instead of sending every geocoding request with key=None,
    # which would only surface later as a column full of missing coordinates.
    raise RuntimeError(
        "OPENCAGE_API_KEY is not set; define it in the environment or in a .env file."
    )

def get_coordinates(address, timeout=10):
    """Geocode ``address`` with the OpenCage API and return ``(lat, lng)``.

    Parameters
    ----------
    address : str
        Free-text query sent as the ``q`` parameter.
    timeout : float, optional
        Seconds to wait for the HTTP request before ``requests`` raises;
        without it a stalled connection would block the notebook indefinitely.

    Returns
    -------
    tuple
        ``(lat, lng)`` taken from the first result, or ``(None, None)`` when
        the response contains no results.
    """
    response = requests.get(
        'https://api.opencagedata.com/geocode/v1/json',
        # Let requests URL-encode the query: names containing '&', '#' or
        # spaces would otherwise corrupt a hand-built query string.
        params={'q': address, 'key': api_key},
        timeout=timeout,
    )
    # .get() so a payload without a 'results' key is treated as "not found"
    # instead of raising KeyError.
    results = response.json().get('results')
    if results:
        geometry = results[0]['geometry']
        return geometry['lat'], geometry['lng']
    return None, None

# Geocode each distinct polling-unit name only once: names repeat across rows,
# and every lookup is a network round trip to a rate-limited API.
# NOTE(review): the query is the bare PU-Name with no ward/LGA/state context,
# so results may be ambiguous — confirm whether a fuller address is wanted.
coords_by_name = {
    name: get_coordinates(name) for name in combined_df['PU-Name'].unique()
}
coords = [coords_by_name[name] for name in combined_df['PU-Name']]

# Build the columns from plain lists so an empty frame does not break the
# unpacking (zip(*[]) yields nothing to unpack).
combined_df['latitude'] = [lat for lat, _ in coords]
combined_df['longitude'] = [lng for _, lng in coords]


KeyboardInterrupt: 

In [None]:
# Keep only the polling units for which both a latitude and a longitude exist.
combined_df = combined_df.dropna(subset=['latitude', 'longitude'])

In [None]:
def find_neighbours(polling_unit, all_units, radius=1, id_col=None):
    """Return the ids of the other polling units within ``radius`` km.

    Parameters
    ----------
    polling_unit : mapping / pandas.Series
        Row providing ``'latitude'``, ``'longitude'`` and the id column.
    all_units : pandas.DataFrame
        Candidate units; must provide the same three columns.
    radius : float, optional
        Maximum geodesic distance in kilometres (inclusive).
    id_col : str, optional
        Column holding the unit identifier. When omitted,
        ``'polling_unit_id'`` is used if ``all_units`` has it, otherwise
        ``'PU-Code'``.

    Returns
    -------
    list
        Identifiers of all units within ``radius`` km, excluding the unit
        itself.
    """
    if id_col is None:
        id_col = 'polling_unit_id' if 'polling_unit_id' in all_units.columns else 'PU-Code'

    # Loop-invariant: the reference point and id do not change per candidate.
    origin = (polling_unit['latitude'], polling_unit['longitude'])
    own_id = polling_unit[id_col]

    neighbours = []
    candidates = zip(all_units[id_col], all_units['latitude'], all_units['longitude'])
    for unit_id, latitude, longitude in candidates:
        # Exclude the unit itself by id rather than by "distance > 0":
        # distinct units that geocode to the same point are genuine
        # neighbours and must not be dropped.
        if unit_id == own_id:
            continue
        distance = geodesic(origin, (latitude, longitude)).kilometers
        if distance <= radius:
            neighbours.append(unit_id)
    return neighbours

def _neighbours_for_row(row):
    """Look up the neighbours of one row against the full ``combined_df``."""
    return find_neighbours(row, combined_df)


combined_df['neighbours'] = combined_df.apply(_neighbours_for_row, axis=1)

In [None]:
def calculate_outlier_score(polling_unit, all_units, party, id_col=None):
    """Absolute gap between a unit's votes and its neighbours' mean votes.

    Parameters
    ----------
    polling_unit : mapping / pandas.Series
        Row providing ``party`` (the vote count) and ``'neighbours'`` (an
        iterable of neighbour identifiers).
    all_units : pandas.DataFrame
        Frame in which the neighbours are looked up.
    party : str
        Name of the vote-count column to compare.
    id_col : str, optional
        Column holding the unit identifier. When omitted,
        ``'polling_unit_id'`` is used if ``all_units`` has it, otherwise
        ``'PU-Code'``.

    Returns
    -------
    float or None
        ``abs(own votes - mean neighbour votes)``, or ``None`` when none of
        the listed neighbours is present in ``all_units``.
    """
    if id_col is None:
        id_col = 'polling_unit_id' if 'polling_unit_id' in all_units.columns else 'PU-Code'

    is_neighbour = all_units[id_col].isin(polling_unit['neighbours'])
    neighbours = all_units[is_neighbour]
    if neighbours.empty:
        # No reference group, so no score can be computed.
        return None
    return abs(polling_unit[party] - neighbours[party].mean())

# Vote-count columns present in the collation sheets; each one gets a matching
# '<party>_outlier_score' column.
for party in ['APC', 'LP', 'PDP', 'NNPP']:
    combined_df[f'{party}_outlier_score'] = combined_df.apply(lambda row: calculate_outlier_score(row, combined_df, party), axis=1)

In [None]:
# Sort by whichever '<party>_outlier_score' columns exist, in column order,
# instead of hard-coding placeholder party names.
score_columns = [col for col in combined_df.columns if str(col).endswith('_outlier_score')]
if not score_columns:
    raise ValueError("combined_df has no '*_outlier_score' columns; compute the outlier scores first.")

# Highest scores first; rows without a score (NaN) sort to the end.
sorted_df = combined_df.sort_values(by=score_columns, ascending=False)
sorted_df.to_csv('sorted_polling_units.csv', index=False)

In [None]:
# Write a plain-text summary: the methodology, then the highest-ranked units
# from sorted_df with their scores and neighbours.
with open('outlier_report.txt', 'w') as report:
    report.write("Outlier Detection Report\n")
    report.write("=======================\n\n")
    report.write("Methodology:\n")
    report.write("1. Dataset Preparation: Loaded and combined datasets, added geospatial coordinates using OpenCage Geocoding API.\n")
    report.write("2. Neighbour Identification: Identified neighbours within a 1 km radius using geodesic distance.\n")
    report.write("3. Outlier Score Calculation: Calculated outlier scores for each party based on vote deviation from neighbours.\n")
    report.write("4. Sorting and Reporting: Sorted the dataset by outlier scores and identified top outliers.\n\n")

    # Derive the score columns and the id column from the frame itself rather
    # than relying on placeholder names that are not in the data.
    score_suffix = '_outlier_score'
    score_columns = [col for col in sorted_df.columns if str(col).endswith(score_suffix)]
    id_col = 'polling_unit_id' if 'polling_unit_id' in sorted_df.columns else 'PU-Code'

    report.write("Top 3 Outliers:\n")
    # head(3) copes with frames that have fewer than three rows, where
    # positional indexing with range(3) would raise IndexError.
    for _, outlier in sorted_df.head(3).iterrows():
        report.write(f"Polling Unit: {outlier[id_col]}\n")
        for col in score_columns:
            party = str(col)[:-len(score_suffix)]
            report.write(f"{party} Outlier Score: {outlier[col]}\n")
        # str() each id so non-string identifiers do not break the join.
        report.write("Neighbours: {}\n".format(", ".join(map(str, outlier['neighbours']))))
        report.write("\n")