## Data Assessment of Waterflow Historical Data

**Metadata Summary**  
- üìÖ **Date of Retrieval:** JULY 1, 2025  
- üåê **Source of Data:** LGU San Jacinto Treasury Records
- üìÑ **License/Permission:**  
- üßë‚Äçüíº **Prepared by:** MARK JUNE E. ALMOJUELA

This notebook is used to split the compiled records with more than one month in one file to create chunks of records for each month.

In [76]:
# Initialization
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os 

# Split MAR_APR2020 record to create MAR2020 and APR2020

In [77]:
# Initialize df as None at the start
df = None

# Define the file path
file_path = os.path.normpath("../../dataset/raw/2020/MAR_APR2020.csv")

# Print the full path for verification
print(f"Attempting to load file from: {os.path.abspath(file_path)}")

try:
    if not os.path.exists(file_path):
        print("Error: File not found at the specified location.")
        dir_path = os.path.dirname(file_path)
        if not os.path.exists(dir_path):
            print(f"Error: Directory not found: {os.path.abspath(dir_path)}")
        else:
            print("Files in directory:")
            print(os.listdir(dir_path))
    else:
        # Try UTF-8 encoding first
        try:
            df = pd.read_csv(file_path)
            print("File loaded successfully with UTF-8 encoding!")
        except UnicodeDecodeError:
            print("Trying with 'latin1' encoding...")
            df = pd.read_csv(file_path, encoding='latin1')
            print("File loaded successfully with 'latin1' encoding!")
        
        # Display info if df was loaded
        if df is not None:
            print(f"\nNumber of rows: {len(df)}")
            print("\nFirst few rows:")
            print(df.head())
            print("\nColumns in the dataset:")
            print(df.columns.tolist())
            
except Exception as e:
    print(f"An error occurred: {e}")

# The df variable is now available for use in subsequent cells

Attempting to load file from: c:\Users\Mark June Almojuela\OneDrive - Bicol University\WaterFlow\AI\Model Training\dataset\raw\2020\MAR_APR2020.csv
Trying with 'latin1' encoding...
File loaded successfully with 'latin1' encoding!

Number of rows: 1633

First few rows:
   Control Number      Consumer's Name       Address Water Meter Serial #  \
0        501549.0       Alba√±o, Lilane  Alicante St.                  NaN   
1        500750.0  Aljecera, Marcelino  Alicante St.                  NaN   
2        500990.0       Almi√±ana, Irus  Alicante St.                  NaN   
3        500505.0       Almi√±e, Edison  Alicante St.             95022096   
4        501542.0       Almi√±e, Filben  Alicante St.                  NaN   

  Previous Present  Cons.    Amount  
0      218     247   29.0    87.00   
1     3030    3051   21.0    63.00   
2      471     537   66.0   198.00   
3        2      63   61.0   183.00   
4     3271    3314   43.0   129.00   

Columns in the dataset:
['Control N

In [78]:
# Count of null/NaN values in each column
null_counts = df.isnull().sum()
print("Count of null/NaN values per column:")
print(null_counts[null_counts > 0])  # Only show columns with null values

# Count of rows with any null/NaN values
rows_with_nulls = df[df.isnull().any(axis=1)]
print(f"\nNumber of rows with any null/NaN values: {len(rows_with_nulls)}")

Count of null/NaN values per column:
Control Number            1
Water Meter Serial #    698
Previous                201
Present                 403
Cons.                   548
Amount                  416
dtype: int64

Number of rows with any null/NaN values: 1026


Creating MAR2020 AND APR2020 records

In [68]:
# Logic test for MAR_APR2020.csv record split
for index, row in df.iterrows():
    try:
        control_number = row["Control Number"]
        consumer_name = row["Consumer's Name"]
        address = row["Address"]
        serial_number = row["Water Meter Serial #"]
        try:
            previous_reading = int(row["Previous"])
        except ValueError:
            previous_reading = 0
        
        try:
            present_reading = int(row["Present"])
        except ValueError:
            if previous_reading > 0:
                present_reading = previous_reading
            else:
                present_reading = 0
        
        current_reading = present_reading - ((present_reading - previous_reading) / 2)
        
        total_consumption = present_reading - previous_reading
        total_amount = total_consumption * 10

        print(control_number, consumer_name, address, serial_number, 
              previous_reading, current_reading, total_consumption, total_amount)
              
    except Exception as e:
        print(f"Error processing row {index}: {e}")

501549.0 Alba√±o, Lilane Alicante St. nan 218 232.5 29 290
500750.0 Aljecera, Marcelino Alicante St. nan 3030 3040.5 21 210
500990.0 Almi√±ana, Irus Alicante St. nan 471 504.0 66 660
500505.0 Almi√±e, Edison Alicante St. 95022096 2 32.5 61 610
501542.0 Almi√±e, Filben Alicante St. nan 3271 3292.5 43 430
500431.0 Almi√±e, Franchie Alicante St. 121006093 0 0.0 0 0
500263.0 Almodal, Arna Alicante St. 9588526 5228 5240.5 25 250
501240.0 Almocera, Owen Alicante St. nan 67 102.5 71 710
500484.0 Almodal, Erlinda Alicante St. 028086-02 0 0.0 0 0
500739.0 Almodal, Jolly Alicante St. 017902-02 1795 1861.5 133 1330
500544.0 Almodal, Noe Alicante St. nan 2418 2418.0 0 0
500187.0 Almodiel, Arles Alicante St. 9074313 3210 3210.0 0 0
501447.0 Almodiel, Mary Grace Alicante St. nan 238 240.5 5 50
501453.0 Alcantara, Hilda Alicante St. nan 183 189.5 13 130
501317.0 Almoete, Ike Alicante St. nan 595 603.0 16 160
501280.0 Almojuela, Arlic Alicante St. nan 424 448.0 48 480
500248.0 Almojuela, Rogelio Alica

In [89]:
import csv

# Create the output directory if it doesn't exist
mar_output_dir = os.path.dirname("../../dataset/raw/2020/MAR2020.csv")
apr_output_dir = os.path.dirname("../../dataset/raw/2020/APR2020.csv")
os.makedirs(mar_output_dir, exist_ok=True)
os.makedirs(apr_output_dir, exist_ok=True)

with open("../../dataset/raw/2020/MAR2020.csv", "w", newline="", encoding='latin-1') as mar_file \
    , open("../../dataset/raw/2020/APR2020.csv", "w", newline="", encoding='latin-1') as apr_file:
    mar_csv_writer = csv.writer(mar_file)
    apr_csv_writer = csv.writer(apr_file)
    # Write header
    mar_csv_writer.writerow([
        "Control Number", "Consumer's Name", "Address", 
        "Water Meter Serial #", "Previous", "Present", 
        "Cons.", "Amount", "Connection Status"
    ])
    apr_csv_writer.writerow([
        "Control Number", "Consumer's Name", "Address", 
        "Water Meter Serial #", "Previous", "Present", 
        "Cons.", "Amount", "Connection Status"
    ])

    for index, row in df.iterrows():
        try:
            control_number = row["Control Number"]
            consumer_name = row["Consumer's Name"]
            address = row["Address"]
            serial_number = row["Water Meter Serial #"]
            connection_status = None
            
            # Handle Previous Reading
            try:
                mar_previous_reading = int(float(str(row["Previous"]).strip()))
                connection_status = "Connected"
            except (ValueError, TypeError):
                if row['Previous'] is not None:
                    prev_status = str(row['Previous']).strip().upper() if pd.notna(row['Previous']) else ""
                    if prev_status in ["DISC", "DISC."]:
                        connection_status = "Disconnected"
                    elif prev_status:   
                        connection_status = prev_status.capitalize()
                else:
                    mar_previous_reading = 0
                    connection_status = "Unknown"
            
            # Handle Present Reading
            try:
                mar_present_reading = int(float(str(row["Present"]).strip()))
                connection_status = "Connected" if connection_status is None else connection_status
            except (ValueError, TypeError):
                mar_present_reading = mar_previous_reading if mar_previous_reading is not None else 0
                connection_status = "Unknown" if connection_status is None else connection_status
            
            # Calculate values for March
            mar_current_reading = mar_previous_reading + round((mar_present_reading - mar_previous_reading) / 2)
            mar_total_consumption = mar_current_reading - mar_previous_reading
            mar_total_amount = mar_total_consumption * 10 

            new_record_mar = [
                control_number, consumer_name, address, serial_number,
                mar_previous_reading, round(mar_current_reading),
                mar_total_consumption, mar_total_amount, connection_status
            ]

            # Calculate values April
            apr_previous_reading = mar_current_reading
            
            # Handle Present Reading
            try:
                apr_current_reading = int(float(str(row["Present"]).strip()))
            except (ValueError, TypeError):
                apr_current_reading = apr_previous_reading if apr_previous_reading is not None else 0
            
            # Calculate values for April
            apr_total_consumption = apr_current_reading - apr_previous_reading
            apr_total_amount = apr_total_consumption * 10 

            new_record_apr = [
                control_number, consumer_name, address, serial_number,
                apr_previous_reading, round(apr_current_reading),
                apr_total_consumption, apr_total_amount, connection_status
            ]            
            # Print Record
            print(f"Processed MAR {index} rows: {new_record_mar}")
            print(f"Processed APR {index} rows: {new_record_apr}")
            
            # Write row
            mar_csv_writer.writerow(new_record_mar)
            apr_csv_writer.writerow(new_record_apr) 

            # Reset connection status
            connection_status = None
               
        except Exception as e:
            print(f"Error processing row {index}: {e}")
            continue

print("Processing complete!")

Processed MAR 0 rows: [501549.0, 'Alba√±o, Lilane', 'Alicante St.', nan, 218, 232, 14, 140, 'Connected']
Processed APR 0 rows: [501549.0, 'Alba√±o, Lilane', 'Alicante St.', nan, 232, 247, 15, 150, 'Connected']
Processed MAR 1 rows: [500750.0, 'Aljecera, Marcelino', 'Alicante St.', nan, 3030, 3040, 10, 100, 'Connected']
Processed APR 1 rows: [500750.0, 'Aljecera, Marcelino', 'Alicante St.', nan, 3040, 3051, 11, 110, 'Connected']
Processed MAR 2 rows: [500990.0, 'Almi√±ana, Irus', 'Alicante St.', nan, 471, 504, 33, 330, 'Connected']
Processed APR 2 rows: [500990.0, 'Almi√±ana, Irus', 'Alicante St.', nan, 504, 537, 33, 330, 'Connected']
Processed MAR 3 rows: [500505.0, 'Almi√±e, Edison', 'Alicante St.', '95022096', 2, 32, 30, 300, 'Connected']
Processed APR 3 rows: [500505.0, 'Almi√±e, Edison', 'Alicante St.', '95022096', 32, 63, 31, 310, 'Connected']
Processed MAR 4 rows: [501542.0, 'Almi√±e, Filben', 'Alicante St.', nan, 3271, 3293, 22, 220, 'Connected']
Processed APR 4 rows: [501542.0,

In [71]:
# Read the data with optimized dtypes
dtypes = {
    'Control Number': 'str',
    "Consumer's Name": 'str',
    'Address': 'str',
    'Water Meter Serial #': 'str',
    'Previous': 'float64',
    'Present': 'float64',
    'Current': 'float64',
    'Cons.': 'float64',
    'Amount': 'float64'
}

# Read the CSV
new_df = pd.read_csv("../../dataset/raw/2020/APR2020.csv", 
                    encoding='latin-1',
                    dtype=dtypes)

# Check for negative consumption
print("=== Negative Consumption Summary ===")
neg_consumption = new_df[new_df['Cons.'] < 0]
print(f"Total rows with negative consumption: {len(neg_consumption)}")
if not neg_consumption.empty:
    print("\nSample of rows with negative consumption:")
    print(neg_consumption[['Control Number', 'Previous', 'Present', 'Cons.']].head())

# Check for negative amount
print("\n=== Negative Amount Summary ===")
neg_amount = new_df[new_df['Amount'] < 0]
print(f"Total rows with negative amount: {len(neg_amount)}")
if not neg_amount.empty:
    print("\nSample of rows with negative amount:")
    print(neg_amount[['Control Number', 'Cons.', 'Amount']].head())

# Additional checks
print("\n=== Additional Data Quality Checks ===")
print(f"Total rows: {len(new_df)}")
print(f"Rows with zero consumption: {len(new_df[new_df['Cons.'] == 0])}")
print(f"Rows with missing values: {new_df.isnull().any(axis=1).sum()}")

=== Negative Consumption Summary ===
Total rows with negative consumption: 1

Sample of rows with negative consumption:
    Control Number  Previous  Present  Cons.
171       500741.0      58.0     14.0  -44.0

=== Negative Amount Summary ===
Total rows with negative amount: 1

Sample of rows with negative amount:
    Control Number  Cons.  Amount
171       500741.0  -44.0  -440.0

=== Additional Data Quality Checks ===
Total rows: 1633
Rows with zero consumption: 544
Rows with missing values: 698


# Create records for AUG2022 and SEP2022

In [128]:
# Review of consumption difference
july_df = pd.read_csv("../../dataset/raw/2022/JUL2022.csv", encoding='latin-1')
oct_df = pd.read_csv("../../dataset/raw/2022/OCT2022.csv", encoding='latin-1')
nov_df = pd.read_csv("../../dataset/raw/2022/NOV2022.csv", encoding='latin-1')
dec_df = pd.read_csv("../../dataset/raw/2022/DEC2022.csv", encoding='latin-1')

july_df['Present'] = pd.to_numeric(july_df['Present'], errors='coerce')
oct_df['Present'] = pd.to_numeric(oct_df['Present'], errors='coerce')
nov_df['Present'] = pd.to_numeric(nov_df['Present'], errors='coerce')
dec_df['Present'] = pd.to_numeric(dec_df['Present'], errors='coerce')

july_mini_df = july_df[['Control Number', 'Consumer\'s Name', 'Previous', 'Present']]
oct_mini_df = oct_df[['Control Number', 'Consumer\'s Name', 'Previous', 'Present']]
nov_mini_df = nov_df[['Control Number', 'Consumer\'s Name', 'Previous', 'Present']]
dec_mini_df = dec_df[['Control Number', 'Consumer\'s Name', 'Previous', 'Present']] 

merged_df = pd.merge(july_mini_df, oct_mini_df, on=['Control Number', 'Consumer\'s Name'], suffixes=('_july', '_oct'))
merged_df = pd.merge(merged_df, nov_mini_df, on=['Control Number', 'Consumer\'s Name'], suffixes=('_oct', '_nov'))
merged_df = pd.merge(merged_df, dec_mini_df, on=['Control Number', 'Consumer\'s Name'], suffixes=('_nov', '_dec'))

merged_df['Consumption Difference'] = pd.to_numeric(merged_df['Present_oct']) - pd.to_numeric(merged_df['Present_july'])

negative_df = merged_df[merged_df['Consumption Difference'] < 0]
print("Negative Consumption Difference Count: ", len(negative_df))
display(negative_df.head())
display(negative_df)


Negative Consumption Difference Count:  14


Unnamed: 0,Control Number,Consumer's Name,Previous_july,Present_july,Previous_oct,Present_oct,Previous_nov,Present_nov,Previous_dec,Present_dec,Consumption Difference
802,500606,"Almoete, Oscar",2571,2571.0,2548.0,2555.0,2555,2601.0,2349,2374.0,-16.0
882,500375,"Mira, Noe",3514,3514.0,3512.0,3512.0,3512,3514.0,3512,3512.0,-2.0
930,500682,"Almosara, Celin",3009,3024.0,,43.0,43,,2881,2903.0,-2981.0
963,501355,"Bocboc, Evelyn",1000,1030.0,,6.0,6,18.0,966,968.0,-1024.0
964,500881,"Bocboc, Lily",1456,1478.0,,74.0,74,,DEFECT,,-1404.0


Unnamed: 0,Control Number,Consumer's Name,Previous_july,Present_july,Previous_oct,Present_oct,Previous_nov,Present_nov,Previous_dec,Present_dec,Consumption Difference
802,500606,"Almoete, Oscar",2571,2571.0,2548,2555.0,2555,2601.0,2349,2374.0,-16.0
882,500375,"Mira, Noe",3514,3514.0,3512,3512.0,3512,3514.0,3512,3512.0,-2.0
930,500682,"Almosara, Celin",3009,3024.0,,43.0,43,,2881,2903.0,-2981.0
963,501355,"Bocboc, Evelyn",1000,1030.0,,6.0,6,18.0,966,968.0,-1024.0
964,500881,"Bocboc, Lily",1456,1478.0,,74.0,74,,DEFECT,,-1404.0
978,501592,"Dejino, Evangeline",1818,1881.0,,55.0,55,72.0,DEFECT,,-1826.0
1021,501158,"Gupalao, Roger",609,622.0,,8.0,8,40.0,494,509.0,-614.0
1046,501688,"Pinaranda, Maricel",347,350.0,,31.0,31,59.0,256,271.0,-319.0
1072,500066,"Almodal, Glenda",4755,4755.0,19,22.0,22,25.0,4743,4755.0,-4733.0
1221,500960,"Almojuela, Nila",973,973.0,26,35.0,35,48.0,DEFECT,973.0,-938.0


In [119]:
print(f'Null Present Readings Count: {july_mini_df['Present'].isnull().sum()}')
print(f'Null Previous Readings Count: {july_mini_df['Previous'].isnull().sum()}')

# Coerce to numeric and create masks for invalid entries
present_numeric = pd.to_numeric(july_mini_df["Present"], errors="coerce")
previous_numeric = pd.to_numeric(july_mini_df["Previous"], errors="coerce")

# Boolean masks where coercion failed (i.e., non-numeric values)
invalid_present_mask = present_numeric.isna() & july_mini_df["Present"].notna()
invalid_previous_mask = previous_numeric.isna() & july_mini_df["Previous"].notna()

# Extract invalid entries
invalid_present_values = july_mini_df.loc[invalid_present_mask, "Present"].unique()
invalid_previous_values = july_mini_df.loc[invalid_previous_mask, "Previous"].unique()

# Report results
print(f"Non-Numeric Present Readings Count: {invalid_present_mask.sum()}")
print(f"Values: {invalid_present_values.tolist()}")

print(f"Non-Numeric Previous Readings Count: {invalid_previous_mask.sum()}")
print(f"Values: {invalid_previous_values.tolist()}")

Null Present Readings Count: 539
Null Previous Readings Count: 58
Non-Numeric Present Readings Count: 0
Values: []
Non-Numeric Previous Readings Count: 478
Values: ['DISC.', 'DEFECT', 'TECH. DEFECT', 'TEMP. DISC.']


In [127]:
# Null row check for Present and Previous columns of july_mini_df
july_null_rows = july_mini_df[july_mini_df[['Present', 'Previous']].isnull().any(axis=1)]
print('Null row count: ', len(july_null_rows))
display(july_null_rows)

# Remove null rows from oct_mini_df
oct_null_rows = oct_mini_df[oct_mini_df[['Present', 'Previous']].isnull().any(axis=1)]
print('Null row count: ', len(oct_null_rows))
display(oct_null_rows)

Null row count:  542


Unnamed: 0,Control Number,Consumer's Name,Previous,Present
5,501542,"Almi√±e, Filben",DISC.,
6,500431,"Almi√±e, Franchie",DISC.,
10,500739,"Almodal, Jolly",DEFECT,
13,501447,"Almodiel, Mary Grace",DEFECT,
21,500260,"Altiche, Antonio",DEFECT,
...,...,...,...,...
1918,501112,"Espiloy, Nemia",DISC.,
1919,500919,"Granado, Nida",DISC.,
1924,500641,"Moya, Concepcion",DISC.,
1927,501109,"Almocera, Ricky",DISC.,


Null row count:  619


Unnamed: 0,Control Number,Consumer's Name,Previous,Present
4,501704,"Almi√±ana, Violeta",184,
6,501542,"Almi√±e, Filben",DISC.,
7,500431,"Almi√±e, Franchie",DISC.,
10,500484,"Almodal, Judem",1147,
11,500739,"Almodal, Jolly",DEFECT,
...,...,...,...,...
1969,501112,"Espiloy, Nemia",DISC.,
1970,500919,"Granado, Nida",DISC.,
1975,500641,"Moya, Concepcion",DISC.,
1978,501109,"Almocera, Ricky",DISC.,


In [183]:
import re 

jul_input_df = pd.read_csv("../../dataset/raw/2022/JUL2022.csv", encoding='latin-1')
oct_input_df = pd.read_csv("../../dataset/raw/2022/OCT2022.csv", encoding='latin-1')

jul_str_list = []
oct_str_list = []

def check_instance(x, month):
    if isinstance(x, str) and re.search(r"[a-zA-Z]", x):
        if month == 'JUL':
            jul_str_list.append(x)
        elif month == 'OCT':
            oct_str_list.append(x)
    else:   
        pass

jul_input_df["Present"].apply(lambda x: check_instance(x, 'JUL'))
oct_input_df["Present"].apply(lambda x: check_instance(x, 'OCT'))

jul_input_df["Previous"].apply(lambda x: check_instance(x, 'JUL'))
oct_input_df["Previous"].apply(lambda x: check_instance(x, 'OCT'))

print(set(jul_str_list))
print(set(oct_str_list))
print(set(jul_str_list).union(set(oct_str_list)))

{'DEFECT', 'TEMP. DISC.', 'DISC.', 'TECH. DEFECT'}
{'DEFECT', 'DISC.', 'TEMP. DISC.', 'TEMP. CLOSED'}
{'DEFECT', 'DISC.', 'TEMP. DISC.', 'TEMP. CLOSED', 'TECH. DEFECT'}


In [199]:
# DATASET PREPARATION
# Step 1: Read the input CSVs
jul_input_df = pd.read_csv("../../dataset/raw/2022/JUL2022.csv", encoding='latin-1')
oct_input_df = pd.read_csv("../../dataset/raw/2022/OCT2022.csv", encoding='latin-1')

# Step 2: Define the function to clean Present/Previous readings
def fill_reading(row, primary, secondary):
    if pd.isna(row[primary]) and pd.notna(row[secondary]):
        return row[secondary]
    elif pd.isna(row[primary]) and pd.isna(row[secondary]):
        return 0
    else:
        return row[primary]

def analyze_connection(row, primary, secondary):
    invalid_values = {'DEFECT': 'Defect', 'DISC.': 'Disconnected', 'TEMP. DISC.': 'Temporarily Disconnected', 'TEMP. CLOSED': 'Temporarily Closed', 'TECH. DEFECT': 'Technical Defect', '0': 'Invalid'}
    if pd.isna(row[primary]) and pd.notna(row[secondary]):
        if row[secondary] in invalid_values:
            return invalid_values[row[secondary]]
        elif pd.notna(row[f'Cleaned {primary}']):
            return "Resolved"
        else:
            return "Invalid"
    elif pd.notna(row[primary]) and pd.isna(row[secondary]):
        if pd.notna(row[f'Cleaned {primary}']):
            return "Resolved"
        else:
            return "Invalid"
    elif pd.isna(row[primary]) and pd.isna(row[secondary]):
        return "Invalid"
    else:
        return "Valid"

# Step 3: Apply row-wise filling
jul_input_df["Cleaned Present"] = jul_input_df.apply(lambda row: fill_reading(row, "Present", "Previous"), axis=1)
jul_input_df["Cleaned Previous"] = jul_input_df.apply(lambda row: fill_reading(row, "Previous", "Present"), axis=1)

oct_input_df["Cleaned Present"] = oct_input_df.apply(lambda row: fill_reading(row, "Present", "Previous"), axis=1)
oct_input_df["Cleaned Previous"] = oct_input_df.apply(lambda row: fill_reading(row, "Previous", "Present"), axis=1)

# Step 4: Add status columns
jul_input_df["Status"] = jul_input_df.apply(lambda row: analyze_connection(row, "Present", "Previous"), axis=1)
oct_input_df["Status"] = oct_input_df.apply(lambda row: analyze_connection(row, "Present", "Previous"), axis=1)

# Step 5: Display the cleaned columns
display(jul_input_df[["Present", "Previous", "Cleaned Present", "Cleaned Previous", "Status"]].head())
display(oct_input_df[["Present", "Previous", "Cleaned Present", "Cleaned Previous", "Status"]].head())

Unnamed: 0,Present,Previous,Cleaned Present,Cleaned Previous,Status
0,544,544,544,544,Valid
1,3274,3274,3274,3274,Valid
2,1401,1401,1401,1401,Valid
3,147,147,147,147,Valid
4,894,894,894,894,Valid


Unnamed: 0,Present,Previous,Cleaned Present,Cleaned Previous,Status
0,610.0,585,610,585,Valid
1,321.0,315,321,315,Valid
2,3359.0,3327,3359,3327,Valid
3,1631.0,1581,1631,1581,Valid
4,,184,184,184,Resolved


In [200]:
# Count Check
print("JUL2022 shape:", jul_input_df.shape)
print("OCT2022 shape:", oct_input_df.shape)

# Step 1: Check for duplicate records
print("Duplicate records in JUL2022:", jul_input_df.duplicated().sum())
print("Duplicate records in OCT2022:", oct_input_df.duplicated().sum())

merged_ref_df = pd.merge(
    jul_input_df,
    oct_input_df,
    on=["Control Number", "Consumer's Name", "Address", "Water Meter Serial #"],
    how="inner",
    suffixes=('_JUL', '_OCT')
)

# Final display with Status included
display(merged_ref_df.head())

JUL2022 shape: (1929, 11)
OCT2022 shape: (1980, 11)
Duplicate records in JUL2022: 0
Duplicate records in OCT2022: 0


Unnamed: 0,Control Number,Consumer's Name,Address,Water Meter Serial #,Previous_JUL,Present_JUL,Cons._JUL,Amount_JUL,Cleaned Present_JUL,Cleaned Previous_JUL,Status_JUL,Previous_OCT,Present_OCT,Cons._OCT,Amount_OCT,Cleaned Present_OCT,Cleaned Previous_OCT,Status_OCT
0,501549,"Alba√±o, Lilane",Alicante St.,,544,544,,60.0,544,544,Valid,585,610.0,25.0,150.0,610,585,Valid
1,500750,"Aljecera, Marcelino",Alicante St.,,3274,3274,,60.0,3274,3274,Valid,3327,3359.0,32.0,192.0,3359,3327,Valid
2,500990,"Almi√±ana, Irus",Alicante St.,,1401,1401,,60.0,1401,1401,Valid,1581,1631.0,50.0,300.0,1631,1581,Valid
3,501704,"Almi√±ana, Violeta",Alicante St.,,147,147,,,147,147,Valid,184,,,,184,184,Resolved
4,500505,"Almi√±e, Edison",Alicante St.,95022096.0,894,894,,60.0,894,894,Valid,983,1023.0,40.0,240.0,1023,983,Valid


In [None]:
# Final Check
print("Final shape:", merged_ref_df.shape)
print("Duplicate records:", merged_ref_df.duplicated().sum())
display(merged_ref_df["Status_JUL"].value_counts())
display(merged_ref_df["Status_OCT"].value_counts())


Final shape: (1879, 18)
Duplicate records: 0


Status_JUL
Valid       1815
Invalid       52
Resolved      12
Name: count, dtype: int64

Status_OCT
Valid           1779
Resolved          42
Invalid           39
Disconnected      19
Name: count, dtype: int64

In [204]:
display(merged_ref_df[merged_ref_df["Status_JUL"] == "Invalid"])

Unnamed: 0,Control Number,Consumer's Name,Address,Water Meter Serial #,Previous_JUL,Present_JUL,Cons._JUL,Amount_JUL,Cleaned Present_JUL,Cleaned Previous_JUL,Status_JUL,Previous_OCT,Present_OCT,Cons._OCT,Amount_OCT,Cleaned Present_OCT,Cleaned Previous_OCT,Status_OCT
74,500985,"Escorel, Potito",Alicante St.,17881,,,,,0,0,Invalid,,,,,0,0,Invalid
90,500623,"Grencio, Maribel",Alicante St.,,,,,,0,0,Invalid,782,819,37.0,222.0,819,782,Valid
104,500559,"Laurio, Josefa",Alicante St.,,,,,,0,0,Invalid,,,,,0,0,Invalid
128,501878,"Ramiro, Geny",Alicante St.,114604-21,,,,,0,0,Invalid,42,54,12.0,72.0,54,42,Valid
143,500381,UCCP,Alicante St.,955353,,,,,0,0,Invalid,7388,7393,5.0,60.0,7393,7388,Valid
202,500070,"Bartolay, Victor 2",Altarejos St.,,,,,,0,0,Invalid,2952,2952,,60.0,2952,2952,Valid
238,500704,"Sola, Andres Jr.",Altarejos St.,,,,,,0,0,Invalid,DEFECT,791,,60.0,791,DEFECT,Valid
247,501899,"Almoete, Joey",Amican St.,,,,,,0,0,Invalid,16,16,,60.0,16,16,Valid
299,500509,Ticao Dist. Hospital,Amican St.,,,,,900.0,0,0,Invalid,,,,900.0,0,0,Invalid
340,501382,"Balingasa, Maria Elena",Bailon St.,10114796,,,,,0,0,Invalid,,,,,0,0,Invalid


In [None]:
# SEP2022 Records Creation 
import os
import csv
import pandas as pd

# Mount the reference billings
jul_input_df = pd.read_csv("../../dataset/raw/2022/JUL2022.csv", encoding='latin-1')
oct_input_df = pd.read_csv("../../dataset/raw/2022/OCT2022.csv", encoding='latin-1')

# Merge the reference billings
merged_ref_df = pd.merge(jul_input_df, oct_input_df, on=["Control Number", "Consumer\'s Name", "Address"], how="inner")

# Create the output directory if it doesn't exist
sep_output_dir = os.path.dirname("../../dataset/raw/2022/SEP2022.csv")
oct_output_dir = os.path.dirname("../../dataset/raw/2022/OCT2022.csv")
os.makedirs(sep_output_dir, exist_ok=True)
os.makedirs(oct_output_dir, exist_ok=True)

with open("../../dataset/raw/2022/SEP2022.csv", "w", newline="", encoding='latin-1') as sep_file \
    , open("../../dataset/raw/2022/OCT2022.csv", "w", newline="", encoding='latin-1') as oct_file:
