In [95]:
import pdfplumber
import pandas as pd
import re
import os

In [None]:
# For pdf files type 1
def extract_data_from_pdf(pdf_path):
    data = []
    year = re.findall(r'\d+', pdf_path)[0]  # Extract the year from the filename or path

    with pdfplumber.open(pdf_path) as pdf:
        for page in pdf.pages:
            text = page.extract_text()
            state_pattern = re.compile(r'(.+?) Lok Sabha Result \d+')
            seat_pattern = re.compile(r'Total Seats:\s+\d+')
            party_performance_pattern = re.compile(r'([A-Za-z0-9\+\s]+)\s+\d+\s+\d+\s+([0-9\.]+)')

            state = re.search(state_pattern, text)

            # DEBUG:
            # print(f"{state}")

            if state:
                state_name = state.group(1).strip()
            else:
                state_name = 'Unknown'

            seat_match = re.search(seat_pattern, text)
            if seat_match:
                total_seats = int(re.findall(r'\d+', seat_match.group())[0])
            else:
                total_seats = 0

            party_data = []
            for line in text.split('\n'):
                match = party_performance_pattern.search(line)
                if match:
                    party_name = match.group(1).strip()
                    seats_won = int(re.findall(r'\d+', match.group())[1])
                    vote_percent = float(match.group(2).strip())
                    if vote_percent < 100.0:  # Ignore 100% vote percentages
                        party_data.append((party_name, seats_won, vote_percent))

            if party_data:
                winning_party = max(party_data, key=lambda x: x[2])
                data.append({
                    'Year': year,
                    'State': state_name,
                    'Winning Party': winning_party[0],
                    'Vote %': winning_party[2],
                    'Seats Won by Majority Party': winning_party[1],
                    'Total Seats': total_seats
                })

    return data

def save_data_to_csv(data, output_path):
    df = pd.DataFrame(data)
    df.to_csv(output_path, index=False)

In [106]:
# For pdf files type 2

def extract_data_from_pdf(pdf_path):
    data = []
    with pdfplumber.open(pdf_path) as pdf:

        for page_num, page in enumerate(pdf.pages):

            # DEBUGGING
            # im = page.to_image()
            # im.draw_rects(page.extract_words())
            # im.show()
            # break

            table = page.extract_table(table_settings = 
                                       {"vertical_strategy" : "text",
                                        # "min_words_vertical": 4,
                                        "horizontal_strategy": "text"})
            # print(f"table is {table}")

            if table:
                data.extend(table)
    return data


def save_data_to_csv(data, csv_path):
    if data:
        df = pd.DataFrame(data)
        if not df.empty and len(df) > 1:
            df.columns = df.iloc[0]  # Set the first row as the header
            df = df[1:]
            df.dropna(how='all', inplace=True)  # Remove the header row from the data
            df.to_csv(csv_path, index=False)
        else:
            print(f"Warning: The data extracted from {csv_path} is empty or has insufficient rows.")
    else:
        print(f"Warning: No data was extracted from the PDF.")
    return df

In [107]:
cwd = os.getcwd()

input_directory = cwd + '/pdf_files_type2'  # Replace with your PDF folder path
output_directory = cwd + '/csv_files_type2'  # Replace with your output folder path

if not os.path.exists(output_directory):
    os.makedirs(output_directory)

for filename in os.listdir(input_directory):
    if filename.endswith('.pdf'):
        # debug
        print(f"{filename}")

        pdf_path = os.path.join(input_directory, filename)
        data = extract_data_from_pdf(pdf_path)
        year = re.findall(r'\d+', filename)[0]  # Extract the year from the filename
        output_filename = f"{year}_summary.csv"
        output_path = os.path.join(output_directory, output_filename)
        df = save_data_to_csv(data, output_path)
        print(f"Data for {filename} has been exported to {output_path}")

1962 (Vol I)-90-94.pdf
Data for 1962 (Vol I)-90-94.pdf has been exported to /Users/nayeshagandotra/Desktop/india-election-stats/csv_files_type2/1962_summary.csv
1991 (Vol I)-94-107.pdf
Data for 1991 (Vol I)-94-107.pdf has been exported to /Users/nayeshagandotra/Desktop/india-election-stats/csv_files_type2/1991_summary.csv
1967 (Vol I)-95-104.pdf
Data for 1967 (Vol I)-95-104.pdf has been exported to /Users/nayeshagandotra/Desktop/india-election-stats/csv_files_type2/1967_summary.csv
1984 (Vol I)-97-103.pdf
Data for 1984 (Vol I)-97-103.pdf has been exported to /Users/nayeshagandotra/Desktop/india-election-stats/csv_files_type2/1984_summary.csv
1996 (Vol I)-138-153.pdf
Data for 1996 (Vol I)-138-153.pdf has been exported to /Users/nayeshagandotra/Desktop/india-election-stats/csv_files_type2/1996_summary.csv
1989 (Vol I)-119-129.pdf
Data for 1989 (Vol I)-119-129.pdf has been exported to /Users/nayeshagandotra/Desktop/india-election-stats/csv_files_type2/1989_summary.csv
1985 (Vol I)-20.pdf


In [103]:
# DEBUGGING
df = pd.read_csv(output_directory + "/1962_summary.csv")
df.dropna(how='all', inplace=True)

In [None]:
for filename in os.listdir(output_directory):
        if filename.endswith('.csv'):
            csv_path = os.path.join(output_directory, filename)
            df = pd.read_csv(csv_path)

            # Remove rows where 'State' is 'Unknown'
            df_cleaned = df[df['State'] != 'Unknown']

            # Save the cleaned DataFrame back to the same CSV file
            df_cleaned.to_csv(csv_path, index=False)

            # Print the final number of rows in the cleaned CSV file
            print(f"File: {filename}, Final number of rows: {len(df_cleaned)}")
