In [1]:
import pdfplumber
import pandas as pd
import re
import os

In [17]:
# Patterns are compiled once at definition time instead of once per page.
_STATE_PATTERN = re.compile(r'(.+?) Lok Sabha Result \d+')
_SEAT_PATTERN = re.compile(r'Total Seats:\s+(\d+)')
# Groups: 1 = party name, 2 = first integer column, 3 = second integer column
# (reported as seats won), 4 = vote percentage.
_PARTY_PERFORMANCE_PATTERN = re.compile(
    r'([A-Za-z0-9\+\s]+)\s+(\d+)\s+(\d+)\s+([0-9\.]+)'
)


def _parse_party_rows(text):
    """Return a list of (party_name, seats_won, vote_percent) tuples found in `text`."""
    party_rows = []
    for line in text.split('\n'):
        match = _PARTY_PERFORMANCE_PATTERN.search(line)
        if not match:
            continue
        try:
            vote_percent = float(match.group(4))
        except ValueError:
            # The character class also accepts tokens such as "." or "1.2.3";
            # skip those rows instead of aborting the whole PDF.
            continue
        # Rows at exactly (or above) 100% are ignored.
        if vote_percent < 100.0:
            # Read seats from its own capture group: re-scanning the matched
            # text for digits picks the wrong column when the party name
            # itself contains a digit.
            party_rows.append(
                (match.group(1).strip(), int(match.group(3)), vote_percent)
            )
    return party_rows


def extract_data_from_pdf(pdf_path):
    """Summarise each page of an election-result PDF as one record.

    For every page, the state name is taken from a "<state> Lok Sabha Result
    <digits>" heading ('Unknown' when absent), the seat total from a
    "Total Seats: <n>" line (0 when absent), and party rows from lines that
    look like "<name> <int> <int> <percentage>". Pages without any party row
    produce no record.

    NOTE(review): 'Winning Party' is the party with the highest vote
    percentage on the page, not the one with the most seats - confirm this
    is the intended definition.

    Args:
        pdf_path: Path to the PDF (str or os.PathLike). The first run of
            digits in the file name is used as the year; if the file name has
            none, the first run of digits anywhere in the path is used.

    Returns:
        list[dict]: One dict per page with party data, with keys 'Year' (str),
        'State', 'Winning Party', 'Vote %', 'Seats Won by Majority Party' and
        'Total Seats'.

    Raises:
        IndexError: If the path contains no digits at all.
    """
    path_text = os.fspath(pdf_path)
    # Prefer the file name so digits in parent folders (e.g. "/home/user42/")
    # are not mistaken for the year.
    year_candidates = (
        re.findall(r'\d+', os.path.basename(path_text))
        or re.findall(r'\d+', path_text)
    )
    year = year_candidates[0]

    data = []
    with pdfplumber.open(pdf_path) as pdf:
        for page in pdf.pages:
            # Guard against pages with no extractable text; extract_text()
            # may return None in some pdfplumber versions.
            text = page.extract_text() or ''

            state_match = _STATE_PATTERN.search(text)
            state_name = state_match.group(1).strip() if state_match else 'Unknown'

            seat_match = _SEAT_PATTERN.search(text)
            total_seats = int(seat_match.group(1)) if seat_match else 0

            party_data = _parse_party_rows(text)
            if not party_data:
                continue

            # max() keeps the first row on ties.
            winning_party = max(party_data, key=lambda row: row[2])
            data.append({
                'Year': year,
                'State': state_name,
                'Winning Party': winning_party[0],
                'Vote %': winning_party[2],
                'Seats Won by Majority Party': winning_party[1],
                'Total Seats': total_seats
            })

    return data

In [13]:
def save_data_to_csv(data, output_path):
    """Write `data` to `output_path` as a CSV file without the index column.

    Args:
        data: Anything accepted by the ``pd.DataFrame`` constructor; the
            callers in this notebook pass a list of dicts.
        output_path: Destination path handed to ``DataFrame.to_csv``.
    """
    pd.DataFrame(data).to_csv(output_path, index=False)

In [18]:
# Convert every PDF in the input folder into a "<year>_summary.csv" file.
cwd = os.getcwd()

# os.path.join instead of string concatenation keeps the paths portable.
input_directory = os.path.join(cwd, 'pdf_files')  # Replace with your PDF folder path
output_directory = os.path.join(cwd, 'csv_files')  # Replace with your output folder path

# exist_ok makes this a no-op when the folder is already there and avoids the
# check-then-create race of a separate os.path.exists() test.
os.makedirs(output_directory, exist_ok=True)

# os.listdir returns entries in arbitrary order; sorting makes the processing
# order and the printed log reproducible.
for filename in sorted(os.listdir(input_directory)):
    # Accept upper-case extensions such as ".PDF" as well.
    if not filename.lower().endswith('.pdf'):
        continue

    # Progress marker: shows which file is being processed.
    print(f"{filename}")

    # The first run of digits in the file name is used as the year. Check it
    # before extracting so one badly named file cannot abort the whole run.
    year_matches = re.findall(r'\d+', filename)
    if not year_matches:
        print(f"Skipping {filename}: no year found in the filename")
        continue
    year = year_matches[0]

    pdf_path = os.path.join(input_directory, filename)
    data = extract_data_from_pdf(pdf_path)

    # NOTE(review): two PDFs with the same year write to the same CSV, so the
    # later one overwrites the earlier one.
    output_filename = f"{year}_summary.csv"
    output_path = os.path.join(output_directory, output_filename)
    save_data_to_csv(data, output_path)
    print(f"Data for {filename} has been exported to {output_path}")

2009_result_by_state.pdf
Data for 2009_result_by_state.pdf has been exported to /Users/nayeshagandotra/Desktop/india-election-stats/csv_files/2009_summary.csv
2014_result_by_state.pdf
Data for 2014_result_by_state.pdf has been exported to /Users/nayeshagandotra/Desktop/india-election-stats/csv_files/2014_summary.csv
2019_result_by_state.pdf
Data for 2019_result_by_state.pdf has been exported to /Users/nayeshagandotra/Desktop/india-election-stats/csv_files/2019_summary.csv
1999_result_by_state.pdf
Data for 1999_result_by_state.pdf has been exported to /Users/nayeshagandotra/Desktop/india-election-stats/csv_files/1999_summary.csv
2004_result_by_state.pdf
Data for 2004_result_by_state.pdf has been exported to /Users/nayeshagandotra/Desktop/india-election-stats/csv_files/2004_summary.csv


In [19]:
# Rewrite every exported CSV in place, dropping the rows whose state name
# could not be parsed (stored as 'Unknown').
for filename in sorted(os.listdir(output_directory)):
    if not filename.endswith('.csv'):
        continue

    csv_path = os.path.join(output_directory, filename)
    try:
        df = pd.read_csv(csv_path)
    except pd.errors.EmptyDataError:
        # An export with no records has no header row, which read_csv
        # rejects; there is nothing to clean in such a file.
        print(f"Skipping {filename}: file contains no data")
        continue

    # Remove rows where 'State' is 'Unknown'
    df_cleaned = df[df['State'] != 'Unknown']

    # Save the cleaned DataFrame back to the same CSV file
    df_cleaned.to_csv(csv_path, index=False)

    # Print the final number of rows in the cleaned CSV file
    print(f"File: {filename}, Final number of rows: {len(df_cleaned)}")


File: 2019_summary.csv, Final number of rows: 36
File: 2009_summary.csv, Final number of rows: 35
File: 1999_summary.csv, Final number of rows: 32
File: 2014_summary.csv, Final number of rows: 36
File: 2004_summary.csv, Final number of rows: 35
