In [12]:
import os

# Root folder of the project layout and the folders expected inside it
project_dir = "election_processing"
subdirectories = ["data", "notebooks", "scripts", "config"]
data_subdirectories = ["raw", "bronze"]

# Collect every folder first (root, first-level folders, then data/* folders)
# so that a single loop creates the whole tree.
layout = [project_dir]
layout += [os.path.join(project_dir, name) for name in subdirectories]
layout += [os.path.join(project_dir, "data", name) for name in data_subdirectories]

# exist_ok=True keeps the cell safe to re-run on an existing tree
for folder in layout:
    os.makedirs(folder, exist_ok=True)

print(f"Project directory structure created under '{project_dir}'")

Project directory structure created under 'election_processing'


In [13]:
# Install required packages
!pip install pyspark findspark

import findspark
findspark.init()

from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import *
from pyspark.sql.window import Window
import pandas as pd
from google.colab import files
import re



In [14]:
# Build (or reuse, via getOrCreate) the Spark session shared by all
# bronze-layer cells below.
spark = (
    SparkSession.builder
    .appName("Election Bronze Layer Processing")
    # Adaptive query execution, including coalescing of shuffle partitions
    .config("spark.sql.adaptive.enabled", "true")
    .config("spark.sql.adaptive.coalescePartitions.enabled", "true")
    # Use the Kryo serializer class for Spark's internal serialization
    .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
    .getOrCreate()
)

# The banner previously contained a mis-decoded emoji (UTF-8 bytes read as
# cp1252); the intended rocket character is restored here.
print("🚀 Spark Session initialized for Bronze Layer Processing!")

# Only show warnings and errors from Spark in the cell output
spark.sparkContext.setLogLevel("WARN")

# Last expression: displays the session summary in the notebook
spark

🚀 Spark Session initialized for Bronze Layer Processing!


In [15]:
def load_election_data(file_path):
  """
  Loads election data from a CSV file into a Spark DataFrame,
  cleans column names, adds metadata columns, and shows the first 5 rows.

  Column names are stripped of surrounding whitespace, spaces become
  underscores and "%" becomes "percent". Three metadata columns are appended:
  "IngestationTime" (current timestamp), "File_name" (base name of
  file_path) and "row_id" (monotonically increasing id).

  Relies on the module-level `spark` session.

  Args:
    file_path (str): The path to the CSV file.

  Returns:
    pyspark.sql.DataFrame: The loaded and processed Spark DataFrame, or
    None when any step fails (the error is printed, not raised).
  """
  try:
    # Read CSV with a header row and let Spark infer column types
    raw_df = spark.read.csv(file_path, header=True, inferSchema=True)

    # Rename every column in one positional pass. This avoids one chained
    # withColumnRenamed per column, and the loop variable no longer reuses
    # the name `col`, which the notebook star-imports as a Spark function.
    clean_names = [
      name.strip().replace(' ', '_').replace("%", "percent")
      for name in raw_df.columns
    ]
    bronze_df = raw_df.toDF(*clean_names)

    # Add metadata columns (names kept as-is: downstream output depends on them)
    bronze_df = (
      bronze_df
      .withColumn("IngestationTime", current_timestamp())
      .withColumn("File_name", lit(os.path.basename(file_path)))
      .withColumn("row_id", monotonically_increasing_id())
    )

    print(f"Successfully loaded data from {file_path}")
    bronze_df.show(5)

    return bronze_df

  except Exception as e:
    # Deliberate best-effort: report the problem and let callers test for None
    print(f"Error loading data from {file_path}: {e}")
    return None

In [16]:
def save_bronze_data(df, output_path="/content/bronze_layer/election_results", filename=None):
  """
  Writes the bronze layer DataFrame as CSV (with header) in overwrite mode.

  The target handed to the DataFrame's CSV writer is `output_path`, or
  `output_path/filename` when a non-empty filename is given. With Spark the
  target is a directory that receives the part files, so `filename` names
  that directory rather than a single CSV file.

  Args:
    df (pyspark.sql.DataFrame): The DataFrame to save.
    output_path (str): Base location for the CSV output.
    filename (str, optional): Extra path component appended to output_path.
                              Ignored when None or empty.
  """
  # Resolve the destination up front; an empty/None filename means "use the base path"
  target = os.path.join(output_path, filename) if filename else output_path

  # Existing output at the target is replaced
  writer = df.write.mode("overwrite").option("header", "true")
  writer.csv(target)

  print(f"Bronze data saved to: {target}")

In [17]:
def bronze_data_quality_check(df, id_column="ID"):
  """
  Prints a data quality report for a bronze layer DataFrame.

  The report covers the total row count, missing values per column,
  duplicated identifiers, the schema, summary statistics, and the ten most
  frequent values of every string column. Nothing is returned.

  Args:
    df (pyspark.sql.DataFrame): The DataFrame to inspect.
    id_column (str): Column expected to hold unique identifiers. When the
                     column is absent, the duplicate check is skipped with a
                     message instead of raising. Defaults to "ID".
  """
  # Imported locally and namespaced so the helpers cannot be shadowed by
  # notebook-level variables (the notebook star-imports these modules).
  from pyspark.sql import functions as F
  from pyspark.sql.types import StringType

  print("=== Bronze Layer Data Quality Report ===")

  print("\n=== Total Records ===")
  total_records = df.count()
  print(f"Total Records: {total_records}")

  string_cols = [
    field.name for field in df.schema.fields
    if isinstance(field.dataType, StringType)
  ]

  print("\n=== Missing Values ===")
  if df.columns:
    missing_exprs = []
    for position, col_name in enumerate(df.columns):
      is_missing = F.col(col_name).isNull()
      # Only string columns can hold "". Comparing other types with ""
      # forces an implicit cast that can never match and may raise when
      # ANSI mode is enabled.
      if col_name in string_cols:
        is_missing = is_missing | (F.col(col_name) == "")
      # Positional aliases keep the aggregate valid for any column name
      missing_exprs.append(F.count(F.when(is_missing, 1)).alias(f"missing_{position}"))

    # One aggregation for all columns instead of one Spark job per column
    missing_counts = df.agg(*missing_exprs).first()
    for col_name, missing_count in zip(df.columns, missing_counts):
      print(f"Column '{col_name}': {missing_count} missing values")

  print("\n=== Duplicate Values ===")
  if id_column in df.columns:
    # Number of distinct identifier values that occur more than once
    duplicate_ids_count = (
      df.groupBy(id_column).count().filter(F.col("count") > 1).count()
    )
    print(f"Number of duplicate IDs: {duplicate_ids_count}")
  else:
    print(f"Column '{id_column}' not found; skipping duplicate check")

  print("\n=== Schema and Data Types ===")
  df.printSchema()

  print("\n=== Basic Statistics ===")
  df.describe().show()

  print("\n=== Value Distribution for Categorical Columns (Top 10) ===")
  # String-typed columns are treated as the categorical ones
  for col_name in string_cols:
    print(f"\nValue distribution for '{col_name}':")
    df.groupBy(col_name).count().orderBy(F.desc("count")).show(10, truncate=False)

  print("\n=== Data Quality Check Complete ===")

In [18]:
def process_bronze_layer(file_path="/content/GE_2024_Results.csv", run_quality_check=False):
  """
  Orchestrates the bronze layer processing for election data.

  Loads the CSV with load_election_data, optionally prints the quality
  report, and saves the result with save_bronze_data using its default
  output location. When loading fails (the loader returns None) nothing
  else runs.

  Args:
    file_path (str): The path to the raw data CSV file.
    run_quality_check (bool): When True, run bronze_data_quality_check
                              before saving. Defaults to False, matching
                              the previous behaviour where the check was
                              commented out.
  """
  # Load the data
  bronze_df = load_election_data(file_path)

  # The loader signals failure with None; compare explicitly instead of
  # relying on the truthiness of a DataFrame object.
  if bronze_df is None:
    return

  # Optional data quality report
  if run_quality_check:
    bronze_data_quality_check(bronze_df)

  # Save the bronze data
  save_bronze_data(bronze_df)



In [23]:
# Path of the raw election results CSV
file_path = "/content/GE_2024_Results.csv"

# Load the data; load_election_data returns None when loading fails
bronze_df = load_election_data(file_path)

# Compare against None explicitly rather than relying on the truthiness of
# a DataFrame object.
if bronze_df is not None:
  # Print the data quality report
  bronze_data_quality_check(bronze_df)

  # Save the bronze data
  save_bronze_data(bronze_df)

Successfully loaded data from /content/GE_2024_Results.csv
+---+--------------------+--------------------+------------------+--------------------+---------+------------+-----------+----------------+------+--------------------+-------------------+------+
| ID|               State|        Constituency|         Candidate|               Party|EVM_Votes|Postal_Votes|Total_Votes|percent_of_Votes|Result|     IngestationTime|          File_name|row_id|
+---+--------------------+--------------------+------------------+--------------------+---------+------------+-----------+----------------+------+--------------------+-------------------+------+
|  1|Andaman & Nicobar...|Andaman & Nicobar...|   BISHNU PADA RAY|Bharatiya Janata ...|   102182|         254|     102436|           50.58|   Won|2025-08-24 13:10:...|GE_2024_Results.csv|     0|
|  2|Andaman & Nicobar...|Andaman & Nicobar...|KULDEEP RAI SHARMA|Indian National C...|    77829|         211|      78040|           38.54|  Lost|2025-08-24 13:1

In [20]:
# Run the bronze pipeline end to end on the 2024 results file
process_bronze_layer("/content/GE_2024_Results.csv")

Successfully loaded data from /content/GE_2024_Results.csv
+---+--------------------+--------------------+------------------+--------------------+---------+------------+-----------+----------------+------+--------------------+-------------------+------+
| ID|               State|        Constituency|         Candidate|               Party|EVM_Votes|Postal_Votes|Total_Votes|percent_of_Votes|Result|     IngestationTime|          File_name|row_id|
+---+--------------------+--------------------+------------------+--------------------+---------+------------+-----------+----------------+------+--------------------+-------------------+------+
|  1|Andaman & Nicobar...|Andaman & Nicobar...|   BISHNU PADA RAY|Bharatiya Janata ...|   102182|         254|     102436|           50.58|   Won|2025-08-24 13:05:...|GE_2024_Results.csv|     0|
|  2|Andaman & Nicobar...|Andaman & Nicobar...|KULDEEP RAI SHARMA|Indian National C...|    77829|         211|      78040|           38.54|  Lost|2025-08-24 13:0

**Reasoning**:
Create the necessary directories for the project structure.

