In [50]:
# NINO validation script

import pandas as pd
# Load the sample NINO workbook from a hard-coded absolute Windows path.
df = pd.read_excel(
    io=r"D:\END TO END PROJECTS - GITHUB\National Insurance Number (NINO) validation project\sample_nino_dataset.xlsx"
)

In [22]:
#print(df.head(10))

        NINO
0  12AB3456C
1   AA123456
2        NaN
3  AA123789A
4  AA12C456A
5  AA234109D
6           
7  AA2341O9D
8  AB!23456A
9  AB123@56C


In [51]:
# Row count of the frame as loaded, captured before the cleaning cells modify df.
total_records = df.shape[0]

In [52]:
# Show how many rows the frame currently holds.
print('Total records: ', df.shape[0])

Total records:  114


In [53]:
# Count the two kinds of blank NINO cells separately: true NaN vs. empty string.
print("NaN count:", df['NINO'].isnull().sum())
print("Empty string count:", df['NINO'].eq('').sum())

# If you want to see the actual rows
#print("Rows with NaN:\n", df[df['NINO'].isna()])
#print("Rows with empty string:\n", df[df['NINO'] == ''])


NaN count: 2
Empty string count: 0


In [54]:
# Treat empty strings in the NINO column as missing, then drop every row whose
# NINO is missing (in the sample dataset this removes 2 rows).
# The mapping {'NINO': ''} scopes the replacement to the NINO column; a set
# literal such as {'NINO', ''} is not a column -> value mapping.
# Whitespace-only cells are not caught here because stripping happens in a later cell.
df = df.replace({'NINO': ''}, pd.NA).dropna(subset=['NINO'])

In [55]:
# Cast NINO to pandas' string dtype and trim leading/trailing whitespace.
df['NINO'] = df['NINO'].astype(pd.StringDtype()).str.strip()

In [56]:
# Normalise every NINO to uppercase so the validator's case check can pass.
df = df.assign(NINO=lambda frame: frame['NINO'].str.upper())

In [57]:
# Row count versus distinct NINO count; a gap between the two means duplicates exist.
print('Total records: ', df.shape[0])
print('Unique values: ', df['NINO'].nunique(dropna=True))

Total records:  112
Unique values:  101


In [58]:
# Keep only the first occurrence of each NINO; later repeats are discarded.
df = df.drop_duplicates(subset=['NINO'], keep='first')

In [59]:
# Re-check after de-duplication: both numbers should now agree.
print('Total records: ', df.shape[0])
print('Unique values: ', df['NINO'].nunique(dropna=True))

Total records:  101
Unique values:  101


In [60]:
# NINO format Validation:
# Structure: AA999999A - two prefix letters, six digits, one suffix letter (exactly 9 characters)

# Prefix rules (first two letters):
# Must be letters only (no digits).
# First letter cannot be: D, F, I, Q, U, V.
# Second letter cannot be: D, F, I, O, Q, U, V.
# Certain prefixes are not used (like BG, GB, NK, KN, TN, NT, ZZ).

# Digits (positions 3–8):
# Always six numeric digits.
# Leading zeros are allowed (e.g., AB001234C).

# Suffix (last character):
# Must be one of: A, B, C, D.
# Some systems store the NINO without its suffix (rare); this script still requires the suffix, so 8-character values are marked Invalid.

# Case sensitivity:
# Standard is uppercase (e.g., ab123456c → should be normalized to AB123456C).

import re
# Compiled once: the NINO layout enforced by is_valid_NINO.
_NINO_PATTERN = re.compile(
    r"(?!BG|GB|NK|KN|TN|NT|ZZ)"   # prefixes that are never used
    r"[A-CEGHJ-PR-TW-Z]"          # first letter: anything but D, F, I, Q, U, V
    r"[A-CEGHJ-NPR-TW-Z]"         # second letter: same exclusions plus O
    r"[0-9]{6}"                   # six ASCII digits (\d would also accept other Unicode digits)
    r"[A-D]"                      # suffix letter
)


def is_valid_NINO(NINO: str) -> bool:
    """Validate the format of a UK National Insurance Number (NINO).

    A value is accepted only when, after surrounding whitespace is removed,
    it is exactly nine characters long and consists of:

    * a two-letter prefix whose first letter is not D, F, I, Q, U or V,
      whose second letter is not D, F, I, O, Q, U or V, and which is not
      one of BG, GB, NK, KN, TN, NT, ZZ;
    * six ASCII digits;
    * a suffix letter A, B, C or D.

    Args:
        NINO: The candidate value. Anything that is not a ``str`` is rejected.

    Returns:
        True if the value matches the format, otherwise False. Values that
        contain lowercase letters are rejected rather than normalised, so
        callers should uppercase their data first.
    """
    # Non-string input (None, NaN, pd.NA, numbers) can never be a NINO.
    if not isinstance(NINO, str):
        return False

    # Reject anything that is not already uppercase.
    if NINO != NINO.upper():
        return False

    # Surrounding whitespace is tolerated; everything else must match exactly.
    NINO = NINO.strip()
    if len(NINO) != 9:
        return False

    # fullmatch anchors both ends, so no trailing characters can slip through.
    return _NINO_PATTERN.fullmatch(NINO) is not None


In [62]:
# Label every NINO as "Valid" or "Invalid" according to is_valid_NINO.
df['Status'] = [
    "Valid" if is_valid_NINO(candidate) else "Invalid"
    for candidate in df['NINO']
]
print(df.head(10))

         NINO   Status
0   12AB3456C  Invalid
1    AA123456  Invalid
3   AA123789A    Valid
4   AA12C456A  Invalid
5   AA234109D    Valid
6              Invalid
7   AA2341O9D  Invalid
8   AB!23456A  Invalid
9   AB123@56C  Invalid
10    AB12345  Invalid


In [64]:
# Tally the validation outcomes.
valid_count = df['Status'].eq('Valid').sum()
invalid_count = df['Status'].eq('Invalid').sum()
# NOTE(review): this is "every row removed since total_records was taken",
# not only missing NINOs. Rows dropped as duplicates are included too: the
# outputs above show 2 NaN rows and 11 duplicates behind the reported 13.
missing_count = total_records - valid_count - invalid_count

In [65]:
# Print the headline figures, one per line.
for label, count in (
    ('Total records = ', total_records),
    ('Valid = ', valid_count),
    ('Invalid = ', invalid_count),
    ('Missing = ', missing_count),
):
    print(label, count)


Total records =  114
Valid =  32
Invalid =  69
Missing =  13


In [69]:
# One-row summary table holding the run's headline counts.
df_summary = pd.DataFrame([{
    "TOTAL PROCESSED RECORDS": total_records,
    "TOTAL VALID COUNT": valid_count,
    "TOTAL INVALID COUNT": invalid_count,
    "TOTAL MISSING NINO": missing_count,
}])
print(df_summary.head())

# Write the per-NINO results and the summary to separate sheets of one workbook.
with pd.ExcelWriter("NINO VALIDATION RESULT.xlsx") as workbook:
    df.to_excel(workbook, sheet_name="NINO Validations", index=False)
    df_summary.to_excel(workbook, sheet_name="SUMMARY_DATA_NINO", index=False)

   TOTAL PROCESSED RECORDS  TOTAL VALID COUNT  TOTAL INVALID COUNT  \
0                      114                 32                   69   

   TOTAL MISSING NINO  
0                  13  
