In [75]:
# imports and libraries
import pdfplumber
import pandas as pd
import re
import os
import numpy as np

In [None]:
# For pdf files type 1
def extract_data_from_pdf(pdf_path):
    data = []
    year = re.findall(r'\d+', pdf_path)[0]  # Extract the year from the filename or path

    with pdfplumber.open(pdf_path) as pdf:
        for page in pdf.pages:
            text = page.extract_text()
            state_pattern = re.compile(r'(.+?) Lok Sabha Result \d+')
            seat_pattern = re.compile(r'Total Seats:\s+\d+')
            party_performance_pattern = re.compile(r'([A-Za-z0-9\+\s]+)\s+\d+\s+\d+\s+([0-9\.]+)')

            state = re.search(state_pattern, text)

            # DEBUG:
            # print(f"{state}")

            if state:
                state_name = state.group(1).strip()
            else:
                state_name = 'Unknown'

            seat_match = re.search(seat_pattern, text)
            if seat_match:
                total_seats = int(re.findall(r'\d+', seat_match.group())[0])
            else:
                total_seats = 0

            party_data = []
            for line in text.split('\n'):
                match = party_performance_pattern.search(line)
                if match:
                    party_name = match.group(1).strip()
                    seats_won = int(re.findall(r'\d+', match.group())[1])
                    vote_percent = float(match.group(2).strip())
                    if vote_percent < 100.0:  # Ignore 100% vote percentages
                        party_data.append((party_name, seats_won, vote_percent))

            if party_data:
                winning_party = max(party_data, key=lambda x: x[2])
                data.append({
                    'Year': year,
                    'State': state_name,
                    'Winning Party': winning_party[0],
                    'Vote %': winning_party[2],
                    'Seats Won by Majority Party': winning_party[1],
                    'Total Seats': total_seats
                })

    return data

def save_data_to_csv(data, output_path):
    df = pd.DataFrame(data)
    df.to_csv(output_path, index=False)

In [None]:
# For pdf files type 2

def extract_data_from_pdf(pdf_path):
    data = []
    with pdfplumber.open(pdf_path) as pdf:

        for page_num, page in enumerate(pdf.pages):

            # DEBUGGING
            # im = page.to_image()
            # im.draw_rects(page.extract_words())
            # im.show()
            # break

            table = page.extract_table(table_settings = 
                                       {"vertical_strategy" : "text",
                                        # "min_words_vertical": 4,
                                        "horizontal_strategy": "text"})
            # print(f"table is {table}")

            if table:
                data.extend(table)
    return data


def save_data_to_csv(data, csv_path):
    if data:
        df = pd.DataFrame(data)
        if not df.empty and len(df) > 1:
            df.columns = df.iloc[0]  # Set the first row as the header
            df = df[1:]
            df.dropna(how='all', inplace=True)  # Remove the header row from the data
            df.to_csv(csv_path, index=False)
        else:
            print(f"Warning: The data extracted from {csv_path} is empty or has insufficient rows.")
    else:
        print(f"Warning: No data was extracted from the PDF.")
    return df

In [76]:
# Common execution: path definitions
cwd = os.getcwd()

input_directory = cwd + '/pdf_files_type2'  # Replace with your PDF folder path
output_directory = cwd + '/csv_files_type2'  # Replace with your output folder path


In [None]:
# Common execution: run this after pdf reader is defined
if not os.path.exists(output_directory):
    os.makedirs(output_directory)

for filename in os.listdir(input_directory):
    if filename.endswith('.pdf'):
        # debug
        print(f"{filename}")

        pdf_path = os.path.join(input_directory, filename)
        data = extract_data_from_pdf(pdf_path)
        year = re.findall(r'\d+', filename)[0]  # Extract the year from the filename
        output_filename = f"{year}_summary.csv"
        output_path = os.path.join(output_directory, output_filename)
        df = save_data_to_csv(data, output_path)
        print(f"Data for {filename} has been exported to {output_path}")

In [None]:
# DEBUGGING
df = pd.read_csv(output_directory + "/1962_summary.csv")
df.dropna(how='all', inplace=True)

In [None]:
# Cleanup type 1
for filename in os.listdir(output_directory):
        if filename.endswith('.csv'):
            csv_path = os.path.join(output_directory, filename)
            df = pd.read_csv(csv_path)

            # Remove rows where 'State' is 'Unknown'
            df_cleaned = df[df['State'] != 'Unknown']

            # Save the cleaned DataFrame back to the same CSV file
            df_cleaned.to_csv(csv_path, index=False)

            # Print the final number of rows in the cleaned CSV file
            print(f"File: {filename}, Final number of rows: {len(df_cleaned)}")


In [96]:
# Cleanup type 2 
for filename in os.listdir(output_directory):
        if filename.endswith('.csv'):
                df_final = pd.DataFrame()
                csv_path = os.path.join(output_directory, filename)
                df = pd.read_csv(csv_path)

                # remove all Nan cells
                df.dropna(how='all', inplace=True)

                # Rename column names to 1,2,3,4...
                df.columns = list(range(1, len(df.columns) + 1))

                """finding state names"""
                # Create a mask for cells containing a percentage sign
                percentage_mask = df.apply(lambda x: x.str.contains(r'\d+%') if x.dtype == 'object' else False)

                # Shift the DataFrame to check for NaN values in the row above
                nan_mask = df.shift(1).isna()

                # Combine the masks
                combined_mask = (percentage_mask & nan_mask)

                # Get the indices of the rows right above where the condition is met
                indices_above_with_nan = combined_mask.any(axis=1)

                # To get the rows above, we need to shift the indices by 1 down
                indices_above_with_nan_shifted = indices_above_with_nan.shift(-1).fillna(False)
                true_indices = indices_above_with_nan_shifted[indices_above_with_nan_shifted].index.tolist()

                # Get the rows of the DataFrame corresponding to result_indices
                df_filtered = df.loc[true_indices]

                # get state names by summing up the relevant columns
                df_filtered['col_3_sum'] = df_filtered.iloc[:, :4].apply(lambda row: ''.join([x for x in row if isinstance(x, str) and pd.notna(x) and x != 'NaN']), axis=1)
                
                # save state names in the final df (to covert to csv)
                df_final["State Name"] = df_filtered[['col_3_sum']]
                """finding state names"""

                """finding winning party"""
                # Iterate over the indices and process sections of the DataFrame
                sections = [df.iloc[start:end] for start, end in zip(true_indices, true_indices[1:])] + [df.iloc[true_indices[-1]:]]    

                # List to collect new columns
                highest_percentages = []
                winning_parties = []

                for i, section in enumerate(sections):

                        # Find the vote % column
                        percent_column = next((col for col in section.columns if section[col].astype(str).str.contains(r'\d+%').any()), None)

                        if percent_column is None:
                                highest_percentages.append(0)
                                winning_parties.append(0)
                                print(f" none {i}")
                                continue

                        # Filter the column to keep only valid percentage rows
                        section = section[section[percent_column].astype(str).str.contains(r'\d+%')]
                        section[percent_column] = section[percent_column].str.rstrip('%').astype(float)

                        # Find the row with the highest percentage
                        # print(f"{section[percent_column].argmax()}")
                        highest_percentages.append(section[percent_column].max())
                        
                        # Find the winning party
                        max_percent_row = section.loc[section[percent_column].idxmax()]
                        winning_party = next((value for value in max_percent_row if isinstance(value, str)), None)
                        winning_parties.append(winning_party)
                        
                        

                # Convert highest_percentages list to a Series and reindex to match df_final length

                # Add the highest percentages as a new column to df_final
                df_final['Winning party'] = winning_parties
                df_final['vote %'] = highest_percentages
                
                # Save df_final to a CSV file
                csv_file_path = output_directory + f"/csv_files_type2_filtered/{filename}"
                df_final.to_csv(csv_file_path, index=False)

                """finding winning party"""

                




A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  section[percent_column] = section[percent_column].str.rstrip('%').astype(float)


 none 13
 none 14
 none 15
 none 16
 none 17
 none 18
 none 19
 none 20
 none 21
 none 22
 none 23
 none 24
 none 25
 none 26
 none 27
 none 28


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  section[percent_column] = section[percent_column].str.rstrip('%').astype(float)


 none 10
 none 11
 none 12
 none 13
 none 14
 none 15
 none 16
 none 17


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  section[percent_column] = section[percent_column].str.rstrip('%').astype(float)


 none 15
 none 16
 none 17
 none 18
 none 19
 none 20
 none 21
 none 22
 none 23
 none 24
 none 25
 none 26
 none 27
 none 28
 none 29
 none 30
 none 31
 none 32


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  section[percent_column] = section[percent_column].str.rstrip('%').astype(float)


 none 20
 none 21
 none 22
 none 23
 none 24
 none 25
 none 26
 none 27
 none 28
 none 29
 none 30
 none 31
 none 32
 none 33
 none 34
 none 35
 none 36
 none 37
 none 38
 none 39
 none 40


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  section[percent_column] = section[percent_column].str.rstrip('%').astype(float)


 none 22
 none 25
 none 26
 none 27
 none 28
 none 29
 none 30
 none 31
 none 32
 none 33
 none 34
 none 35
 none 36
 none 37
 none 38
 none 39
 none 40
 none 41
 none 42
 none 43


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  section[percent_column] = section[percent_column].str.rstrip('%').astype(float)


 none 15
 none 16
 none 17
 none 18
 none 19
 none 20
 none 21
 none 22
 none 23
 none 24
 none 25
 none 26
 none 27
 none 28
 none 29
 none 30


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  section[percent_column] = section[percent_column].str.rstrip('%').astype(float)


 none 11
 none 12
 none 13
 none 14
 none 15
 none 16
 none 17
 none 18
 none 19
 none 20
 none 21


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  section[percent_column] = section[percent_column].str.rstrip('%').astype(float)


 none 15
 none 16
 none 17
 none 18
 none 19
 none 20
 none 21
 none 22
 none 23
 none 24
 none 25
 none 26
 none 27
 none 28
 none 29
 none 30
 none 31


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  section[percent_column] = section[percent_column].str.rstrip('%').astype(float)


 none 16
 none 17
 none 18
 none 19
 none 20
 none 21
 none 22
 none 23
 none 24
 none 25
 none 26
 none 27
 none 28
 none 29
 none 30
 none 31
 none 32
 none 1


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  section[percent_column] = section[percent_column].str.rstrip('%').astype(float)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  section[percent_column] = section[percent_column].str.rstrip('%').astype(float)


 none 20
 none 21
 none 22
 none 23
 none 24
 none 25
 none 26
 none 27
 none 28
 none 29
 none 30
 none 31
 none 32
 none 33
 none 34
 none 35


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  section[percent_column] = section[percent_column].str.rstrip('%').astype(float)


 none 22
 none 23
 none 24
 none 25
 none 26
 none 27
 none 28
 none 29
 none 30
 none 31
 none 32
 none 33
 none 34
 none 35
 none 36
 none 37
 none 38
 none 39
