# Record Compilation

In [1]:
import os
from pathlib import Path
import utilities as util
import pandas as pd

In [2]:
from numpy import int64


def define_dtypes(df):
    df['Month'] = df['Month'].astype(str)
    df['Year'] = pd.to_numeric(df['Year'], errors='coerce').astype(int)
    df['Quarter'] = df['Quarter'].astype(str)
    df['District'] = df['District'].astype(str)
    df['Street'] = df['Street'].astype(str)
    df['Control Number'] = pd.to_numeric(df['Control Number'], errors='coerce').astype('Int64')
    df['Account Name'] = df['Account Name'].astype(str)
    df['Connection Type'] = df['Connection Type'].astype(str)
    df['Service Address'] = df['Service Address'].astype(str)
    df['Previous Reading'] = df['Previous Reading'].astype(str)
    df['Present Reading'] = df['Present Reading'].astype(str)
    df['Cleaned Previous Reading'] = pd.to_numeric(df['Cleaned Previous Reading'], errors='coerce').astype('Int64')
    df['Cleaned Present Reading'] = pd.to_numeric(df['Cleaned Present Reading'], errors='coerce').astype('Int64')
    df['Cleaned Consumption'] = pd.to_numeric(df['Cleaned Consumption'], errors='coerce').astype('Int64')
    df['Record Status'] = df['Record Status'].astype(str)
    df['Connection Status'] = df['Connection Status'].astype(str)
    return df

base_dir_path = Path("../../dataset/clean/pre_production")
years = os.listdir(base_dir_path)

output_dir_path = Path("../../dataset/clean/production/v1")
output_dir_path.mkdir(parents=True, exist_ok=True)

final_columns = [
    "Month",
    "Year",
    "Quarter",
    "District",
    "Street",
    "Control Number",
    "Connection Type",
    "Service Address",
    "Previous Reading",
    "Present Reading",
    "Cleaned Previous Reading",
    "Cleaned Present Reading",
    "Cleaned Consumption",
    "Record Status",
    "Connection Status",
]

consolidated_df = pd.DataFrame()

for year in years:
    year_path = base_dir_path / year
    files = [f for f in os.listdir(year_path) if f.endswith(".csv") and not f.startswith("cleaned_")]
    
    for file in files:
        file_path = year_path / file
        df = util.read_csv(file_path)
        if df is not None:
            try:
                consolidated_df = pd.concat([consolidated_df, df], ignore_index=True)
                print(f"✅ Successfully concatenated {file}")
            except Exception as e:
                print(f"Error concatenating {file}: {e}")

try:
    consolidated_df = define_dtypes(consolidated_df)
    util.save_csv(consolidated_df, output_dir_path / "consolidated.csv")
except Exception as e:
    print(f"Error saving consolidated file: {e}")

print(f"✅ Successfully saved consolidated file to {output_dir_path / "consolidated.csv"}")



✅ Successfully concatenated DEC_2019_pre_production.csv
✅ Successfully concatenated APR_2020_pre_production.csv
✅ Successfully concatenated AUG_2020_pre_production.csv
✅ Successfully concatenated DEC_2020_pre_production.csv
✅ Successfully concatenated FEB_2020_pre_production.csv
✅ Successfully concatenated JAN_2020_pre_production.csv
✅ Successfully concatenated JUL_2020_pre_production.csv
✅ Successfully concatenated JUN_2020_pre_production.csv
✅ Successfully concatenated MAR_2020_pre_production.csv
✅ Successfully concatenated MAY_2020_pre_production.csv
✅ Successfully concatenated NOV_2020_pre_production.csv
✅ Successfully concatenated OCT_2020_pre_production.csv
✅ Successfully concatenated SEP_2020_pre_production.csv
✅ Successfully concatenated APR_2022_pre_production.csv
✅ Successfully concatenated AUG_2022_pre_production.csv
✅ Successfully concatenated DEC_2022_pre_production.csv
✅ Successfully concatenated FEB_2022_pre_production.csv
✅ Successfully concatenated JAN_2022_pre_product

# Record Visualization

In [3]:
import utilities as utils

version = "v1"
consolidated_df = utils.read_csv(f"../../dataset/clean/production/{version}/consolidated.csv")
consolidated_df.head()
consolidated_df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 105040 entries, 0 to 105039
Data columns (total 16 columns):
 #   Column                    Non-Null Count   Dtype  
---  ------                    --------------   -----  
 0   Month                     105040 non-null  object 
 1   Year                      105040 non-null  int64  
 2   Quarter                   105040 non-null  object 
 3   District                  91601 non-null   object 
 4   Street                    105040 non-null  object 
 5   Control Number            104988 non-null  float64
 6   Account Name              105038 non-null  object 
 7   Connection Type           104884 non-null  object 
 8   Service Address           105040 non-null  object 
 9   Previous Reading          83155 non-null   object 
 10  Present Reading           76709 non-null   object 
 11  Cleaned Previous Reading  84191 non-null   float64
 12  Cleaned Present Reading   84191 non-null   float64
 13  Cleaned Consumption       105040 non-null  i

  df = pd.read_csv(file_path, encoding=encoding)


In [4]:
import matplotlib.pyplot as plt
import seaborn as sns

# --- Ensure numeric Cleaned Present Reading ---
consumer_record['Cleaned Present Reading'] = pd.to_numeric(
    consumer_record['Cleaned Present Reading'], errors='coerce'
)

# --- Month Order ---
month_order = ["JAN", "FEB", "MAR", "APR", "MAY", "JUN",
               "JUL", "AUG", "SEP", "OCT", "NOV", "DEC"]

# --- Group by Year + Month ---
monthly_data = (
    consumer_record.groupby(['Year', 'Month'], sort=False)['Cleaned Present Reading']
    .sum()
    .reset_index()
)

# --- Ensure Month ordering ---
monthly_data['Month'] = pd.Categorical(
    monthly_data['Month'], categories=month_order, ordered=True
)

monthly_data = monthly_data.sort_values(['Year', 'Month'])

# --- Unique Years ---
years = monthly_data['Year'].unique()

# --- Plot: One Graph per Year ---
for year in years:
    yearly_data = monthly_data[monthly_data['Year'] == year]
    
    plt.figure(figsize=(10, 5))
    sns.lineplot(
        x='Month', y='Cleaned Present Reading', 
        marker='o', data=yearly_data
    )
    
    plt.title(f"Consumer 500750 - Cleaned Present Reading ({year})")
    plt.ylabel("Cleaned Present Reading")
    plt.xlabel("Month")
    plt.xticks(month_order)  # ensure consistent x-axis
    plt.tight_layout()
    plt.show()


NameError: name 'consumer_record' is not defined