In [62]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Loading Census Data

In [63]:
print('=' *60)
print('LOADING CENSUS DATA')
print('=' *60)

df = pd.read_csv('T1_A25census-3.csv')
print('Dataset loaded!')
print(f'Total rows: {len(df)}')
print(f'Total columns: {len(df.columns)}')
print()

df['Age'] = pd.to_numeric(df['Age'], errors = 'coerce')

LOADING CENSUS DATA
Dataset loaded!
Total rows: 8786
Total columns: 12



In [64]:
print('=' *60)
print('INITIAL DATA EXPLORATION')
print('=' *60)

print('First 5 rows:')
print(df.head())
print()

print('Data types and info:')
df.info()
print()

INITIAL DATA EXPLORATION
First 5 rows:
   Unnamed: 0 House Number             Street First Name  Surname   Age  \
0           0            1  Inverness Mission     Darren     Kaur  79.0   
1           1            2  Inverness Mission       Dean   Hughes  66.0   
2           2            3  Inverness Mission     Andrew  Freeman  19.0   
3           3            4  Inverness Mission      Sally     Ward  40.0   
4           4            4  Inverness Mission    Brandon   Thorpe  38.0   

  Relationship to Head of House Marital Status  Gender  \
0                          Head        Widowed    Male   
1                          Head        Widowed    Male   
2                          Head         Single    Male   
3                          Head         Single  Female   
4                       Partner         Single    Male   

                           Occupation Infirmity   Religion  
0  Retired Surveyor, building control       NaN  Christian  
1                    Financial trader  

# Data Quality Assessment

In [65]:
print('=' *60)
print('IDENTIFYING DATA QUALITY ISSUES')
print('=' *60)

print('Missing values BEFORE cleaning:')
missing_before = (df.isnull().sum())
print(missing_before)
print()

IDENTIFYING DATA QUALITY ISSUES
Missing values BEFORE cleaning:
Unnamed: 0                          0
House Number                        0
Street                              0
First Name                          0
Surname                             0
Age                                 3
Relationship to Head of House     563
Marital Status                   2140
Gender                              0
Occupation                          0
Infirmity                        8709
Religion                         5093
dtype: int64



In [66]:
#store original counts for later comparison

original_age_missing = df['Age'].isnull().sum()
original_religion_missing = df['Religion'].isnull().sum()

#convert age to numeric 
print('Converting Age to numeric format...')
df['Age'] = pd.to_numeric(df['Age'], errors = 'coerce')
print('Age converted to numeric')
print()

Converting Age to numeric format...
Age converted to numeric



In [67]:
#check Age statistics

print('Age column check:')
print(f' Missing ages (after conversion); {df['Age'].isnull().sum()}')
print(f' Minimum age: {df['Age'].min()}')
print(f' Maximum age: {df['Age'].max()}')
print(f' Ages over 122 (world record): {len(df[df['Age'] > 122])}')
print(f' Negative ages: {len(df[df['Age'] < 0])}')
print()

#check Religion
print('Religion column check:')
print(f' Missing religions: {df['Religion'].isnull().sum()}')
print(f' Unique religion values: {df['Religion'].nunique()}')
print()

Age column check:
 Missing ages (after conversion); 3
 Minimum age: 0.0
 Maximum age: 105.0
 Ages over 122 (world record): 0
 Negative ages: 0

Religion column check:
 Missing religions: 5093
 Unique religion values: 13



In [68]:
#check logical consistency issues

print('Logical consistency checks:')

young_married = df[(df['Marital Status'] == 'Married') & (df['Age'] < 18)]
print(f' People married under age 18: {len(young_married)}')

child_workers = df[(df['Age'] < 16) & (df['Occupation'] != 'Student') & (df['Occupation'] != 'Child')]
print(f' Child workers (under 16 not student): {len(child_workers)}')

young_heads = df[(df['Relationship to Head of House'] == 'Head') & (df['Age'] < 18)]
print(f' Heads of Household under 18: {len(young_heads)}')
print()                                                             

Logical consistency checks:
 People married under age 18: 0
 Child workers (under 16 not student): 0
 Heads of Household under 18: 2



# Age Column Cleaning

In [69]:
print('=' *60)
print('CLEANING AGE COLUMN')
print('=' *60)

#calculate median for imputation

median_age = df['Age'].median()
print(f' Median age calculated: {median_age} years')
print()

print('Using Median for Age imputation because it is more robust to outliers than mean.')

#fix impossible ages BEFORE imputation

ages_over_122 = len(df[df['Age'] > 122])
ages_negative = len(df[df['Age'] < 0])

if ages_over_122 > 0:
    df.loc[df['Age'] > 122, 'Age'] = median_age
    print(f' Fixed {ages_over_122} ages over 122 years(replaced with median)')

if ages_negative > 0:
    df.loc[df['Age'] < 0, 'Age'] = median_age
    print(f' Fixed {ages_negative} negative ages(replaced with median)')

#filling missing values

ages_missing = df['Age'].isnull().sum()
df['Age'] = df['Age'].fillna(median_age)
print(f'Filled {ages_missing} missing ages with median: {median_age}')
print()

#converting to integer

df['Age'] = df['Age'].astype(int)
print('Age converted to integer')
print()

#verifying Age cleaning 

print('Age statistics AFTER cleaning:')
print(f' Missing values: {df['Age'].isnull().sum()}')
print(f' Minimum age: {df['Age'].min()}')
print(f' Maximum age: {df['Age'].max()}')
print(f' Mean age: {df['Age'].mean():.2f}')
print(f' Median age: {df['Age'].median()}')
print()

CLEANING AGE COLUMN
 Median age calculated: 35.0 years

Using Median for Age imputation because it is more robust to outliers than mean.
Filled 3 missing ages with median: 35.0

Age converted to integer

Age statistics AFTER cleaning:
 Missing values: 0
 Minimum age: 0
 Maximum age: 105
 Mean age: 35.54
 Median age: 35.0



# Religion Column Cleaning (Two-Stage Process)

In [70]:
print('=' *60)
print('CLEANING RELIGION COLUMN')
print('=' *60)

print('Checking for invalid Religion entries')

valid_religions = ['Christian', 'Catholic', 'Methodist', 'Muslim', 'Jewish', 'Sikh', 'Bahai', 'Buddhist', 'Hindu', 'Pagan', 'None']

invalid_mask = ~ df['Religion'].isin(valid_religions) & df['Religion'].notna()
invalid_count = invalid_mask.sum()
print(f'Invalid religion entries found: {invalid_count}')

if invalid_count > 0:
    print('Examples of invalid religion values:')
    print(df[invalid_mask]['Religion'].unique())
    print()

#replace invalid entries with 'Other'
    df.loc[invalid_mask, 'Religion'] = 'Other'
    print(f'Replaced {invalid_count} invalid religion entries with "Other"')
    print()

#calculating mode for filling missing values
mode_religion = df['Religion'].mode()[0]
print(f'Most common religion (mode): {mode_religion}')
print()

print('Using Mode for Religion imputation because mode preserves the most common pattern')

#filling missing religions
religion_missing = df['Religion'].isnull().sum()
df['Religion'] = df['Religion'].fillna(mode_religion)
print(f'Filled {religion_missing} missing religions with mode: {mode_religion}')
print()

#verifying religion cleaning
print('Religion distribution AFTER cleaning:')
print(df['Religion'].value_counts())
print()
print(f'Missing religions now: {df['Religion'].isnull().sum()}')
print()

CLEANING RELIGION COLUMN
Checking for invalid Religion entries
Invalid religion entries found: 9
Examples of invalid religion values:
['Buddist' 'Housekeeper' ' ' 'Sith' 'Nope']

Replaced 9 invalid religion entries with "Other"

Most common religion (mode): Christian

Using Mode for Religion imputation because mode preserves the most common pattern
Filled 5093 missing religions with mode: Christian

Religion distribution AFTER cleaning:
Religion
Christian    7046
Catholic      895
Methodist     613
Muslim        123
Jewish         48
Sikh           47
Other           9
Bahai           3
Pagan           2
Name: count, dtype: int64

Missing religions now: 0



# Logical Consistency Fixes

In [71]:
print('=' *60)
print('FIXING LOGICAL CONSISTENCY ISSUES')
print('=' *60)

#fix 1 - young married couple
young_married_count = len(df[(df['Marital Status'] == 'Married') & (df['Age'] < 18)])

if young_married_count > 0:
    print(f'Issue: {young_married_count} people married under 18')
    print('Action: Setting marital status to NA (not applicable for minors)')
    print('Note: Marriage under 18 is illegal in UK, so marital status does not apply')
    df.loc[(df['Marital Status'] == 'Married') & (df['Age'] < 18), 'Marital Status'] = 'NA'
    print('Fixed: Changed marital status to NA')
    print()
else:
    print('No people married under 18 found')
    print()

#fix 2 - child workers
child_workers_count =len(df[(df['Age'] < 16) & (df['Occupation'] != 'Student') & (df['Occupation'] != 'Child')])
print(f'Identified: {child_workers_count} child workers (under 16)')
if child_workers_count > 0:
    print('These cases will be investigated later in report as it may cause data errors.')

#fix 3 - children marital status
children_marital_fix = df.loc[df['Age'] < 18, 'Marital Status'].isnull().sum()
if children_marital_fix > 0:
    df.loc[df['Age'] < 18, 'Marital Status'] = df.loc[df['Age'] < 18, 'Marital Status'].fillna('NA')
    print(f'Set {children_marital_fix} children marital status to "NA"')
    print()

#fix 4 - young heads of house
young_heads_count = len(df[(df['Relationship to Head of House'] == 'Head') & (df['Age'] < 18)])
if young_heads_count > 0:
    print(f'Identified: {young_heads_count} heads of household under 18. Review in report')

FIXING LOGICAL CONSISTENCY ISSUES
No people married under 18 found

Identified: 0 child workers (under 16)
Set 2140 children marital status to "NA"

Identified: 2 heads of household under 18. Review in report


In [72]:
#data cleaning verification 
print('=' * 60)
print('VERIFICATION AFTER CLEANING')
print('=' * 60)

print('Before vs After')
print('Age column')

print(f' Before leaning: {original_age_missing} missing values')
print(f' After cleaning: {df['Age'].isnull().sum()} missing values')
print(f' Improvement: {original_age_missing - df['Age'].isnull().sum()} values fixed')
print()

print('Religion column:')
print(f' Before cleaning: {original_religion_missing} missing values')
print(f' After cleaning: {df['Religion'].isnull().sum()} missing values')
print(f' Improvement: {original_religion_missing - df['Religion'].isnull().sum()} values fixed')
print()

print('Logical Consistency: Verification')
print()

young_married_after = df[(df['Marital Status'] == 'Married') & (df['Age'] < 18)]
print(f'People married under 18: {len(young_married_after)}')

child_workers_after = df[(df['Age'] < 16) & (df['Occupation'] != 'Student') & (df['Occupation'] != 'Child')]
print(f'Child workers: {len(child_workers_after)}')
print()

print('All missing values after cleaning:')
print(df.isnull().sum())
print()

print('Data cleaning complete!')
print()

VERIFICATION AFTER CLEANING
Before vs After
Age column
 Before leaning: 3 missing values
 After cleaning: 0 missing values
 Improvement: 3 values fixed

Religion column:
 Before cleaning: 5093 missing values
 After cleaning: 0 missing values
 Improvement: 5093 values fixed

Logical Consistency: Verification

People married under 18: 0
Child workers: 0

All missing values after cleaning:
Unnamed: 0                          0
House Number                        0
Street                              0
First Name                          0
Surname                             0
Age                                 0
Relationship to Head of House     563
Marital Status                      0
Gender                              0
Occupation                          0
Infirmity                        8709
Religion                            0
dtype: int64

Data cleaning complete!



# Key Findings

In [81]:
print('=' * 60)
print('EARLY ANALYSIS & KEY FINDINGS')
print('=' * 60)

#Analysis 1: Age Statistics
print('Age Statistics')
mean_age = df['Age'].mean()
median_age = df['Age'].median()
min_age = df['Age'].min()
max_age = df['Age'].max()

print(f'Mean age: {mean_age:.2f} years')
print(f'Median age: {median_age:.0f} years')
print(f'Age range: {min_age} to {max_age} years')
print()
print('Interpretation')
if mean_age < 40:
    print(f'This indicates a relatively young population (mean = {mean_age:.1f})')
else:
    print(f'This indicates a mature population (mean = {mean_age:.1f})')
print(f'The mean age of {mean_age:.1f} is comparable to typical UK demographics.')
print()

#Analysis 2:Gender Distribution
print('GENDER DISTRIBUTION')
gender_counts = df['Gender'].value_counts()
total_people = len(df)

#show top 2 genders with percentages
for gender in ['Female', 'Male']:
    if gender in gender_counts:
        count = gender_counts[gender]
        percentage = (count / total_people) * 100
        print(f'{gender}: {count:,} people ({percentage:.1f}%)')

print()
print('Interpretation')
female_pct = (gender_counts.get('Female', 0) / total_people) * 100
male_pct = (gender_counts.get('Male', 0) / total_people) * 100
print(f'Gender distribution is nearly balanced ({female_pct:.1f}% / {male_pct:.1f}%)')
print('This aligns with typical UK population patterns.')
print()

#analysis 3 - Marital status 
print('MARITAL STATUS DISTRIBUTION')
marital_counts = df['Marital Status'].value_counts()

#top 4 matital statuses with percentages
for status in ['Single', 'Married', 'Divorced', 'Widowed']:
    if status in marital_counts:
        count = marital_counts[status]
        percentage = (count / total_people) * 100
        print(f'{status}: {count:,} people ({percentage:.1f}%)')

print()
#analysis 4 - Religion
print('RELIGION DISTRIBUTION')
religion_counts = df['Religion'].value_counts()

#top 5 religions with percentages
for religion, count in religion_counts.head(5).items():
    percentage = (count / total_people) * 100
    print(f'{religion}: {count:,} people ({percentage:.1f}%)')

print()
print('Interpretation')
top_religion = religion_counts.index[0]
top_pct = (religion_counts.iloc[0] / total_people) * 100
print(f'{top_religion} is the most common religion at {top_pct:.1f}%')
print('Religious diversity present with mulitple faiths represented.')
print()

#analysis 5 - Age by Marital Status
print('AVERAGE AGE BY MARITAL STATUS')
age_by_marital = df.groupby('Marital Status')['Age'].mean().sort_values(ascending=False)
for status, age in age_by_marital.head(4).items():
    print(f'{status}: {age:.1f} years average')

print()
print('Interpretation')
print('Age patterns show expected progression:')
print('Widowed indiduals are oldest')
print('Married individuals are older than single (expected marriage age patterns)')
print()

#analysis 6 - Household information 
print('HOUSEHOLD CHARACTERISTICS')
household_sizes = df.groupby(['Street', 'House Number']).size()
avg_household = household_sizes.mean()
max_household = household_sizes.max()
min_household = household_sizes.min()

print(f'Average household size: {avg_household:.2f} people')
print(f'Largest household: {max_household} people')
print(f'Smallest household: {min_household} person')
print(f'Total households: ~{len(household_sizes):,}')
print()
print('Interpretation')
print(f'Average of {avg_household: 2f} people per household is typical for UK.')
print('UK national average is approximately 2.4 people per household)')
if max_household > 15:
    print(f'The largest household ({max_household} people is unusual and warrants further investigation')
print()

#analysis 7 - top occupations
print('TOP 10 OCCUPATIONS')
occupation_counts = df['Occupation'].value_counts().head(10)
for i, (occupation, count) in enumerate(occupation_counts.items(), 1):
    percentage = (count / total_people) * 100
    print(f'{i:2d}. {occupation}: {count} people ({percentage:.1f}%)')

#analysis 8 - lodgers and visitors analysis
print('LODGERS AND VISITORS')
lodgers = df[df['Relationship to Head of House'].isin(['Lodger', 'Visitor'])]
lodger_count = len(lodgers)
lodger_pct = (lodger_count / len(df)) * 100

print(f' Total lodgers/visitors: {lodger_count:,} ({lodger_pct:.1f}%)')
print(f' this indicates housing pressure in the town')

print()
print('Interpretation')
top_occupation = occupation_counts.index[0]
top_occ_count = occupation_counts.iloc[0]
top_occ_pct = (top_occ_count / total_people) * 100
print(f'{top_occupation} is the most common occupation at {top_occ_pct:.1f}%')

#count all student types
student_total = df[df['Occupation'].str.contains('Student', case=False, na=False)].shape[0]
student_pct = (student_total / total_people) * 100
print(f'Total in education (all student types): {student_total} ({student_pct:.1f}%)')
print('This suggest a young, education-focused population.')
print()

EARLY ANALYSIS & KEY FINDINGS
Age Statistics
Mean age: 35.54 years
Median age: 35 years
Age range: 0 to 105 years

Interpretation
This indicates a relatively young population (mean = 35.5)
The mean age of 35.5 is comparable to typical UK demographics.

GENDER DISTRIBUTION
Female: 4,483 people (51.0%)
Male: 4,042 people (46.0%)

Interpretation
Gender distribution is nearly balanced (51.0% / 46.0%)
This aligns with typical UK population patterns.

MARITAL STATUS DISTRIBUTION
Single: 3,031 people (34.5%)
Married: 2,482 people (28.2%)
Divorced: 754 people (8.6%)
Widowed: 355 people (4.0%)

RELIGION DISTRIBUTION
Christian: 7,046 people (80.2%)
Catholic: 895 people (10.2%)
Methodist: 613 people (7.0%)
Muslim: 123 people (1.4%)
Jewish: 48 people (0.5%)

Interpretation
Christian is the most common religion at 80.2%
Religious diversity present with mulitple faiths represented.

AVERAGE AGE BY MARITAL STATUS
M: 77.5 years average
W: 70.6 years average
Widowed: 65.3 years average
Married: 50.3 ye

# UK Comparison

In [82]:
print('=' * 60)
print('COMPARING TO UK NATIONAL AVERAGES')
print('=' * 60)

#this town stats
town_mean_age = df['Age'].mean()
town_household = df.groupby(['Street', 'House Number']).size().mean()
students = len(df[df['Occupation'].str.contains('Student', case=False, na=False)])
unemployed = len(df[df['Occupation'] == 'Unemployed'])
working_age = len (df[(df['Age'] >= 18) & (df['Age'] <= 64)])

#calculating rates
student_rate = (students / len(df)) * 100
unemp_rate = (unemployed / working_age) * 100

#UK averages (from census 2021 & ONS)
uk_mean_age = 40.7
uk_household = 2.36
uk_unemp_rate = 4.2
uk_student_rate = 4.8 

print('COMPARISON TABLE:')
print('-' * 70)
print(f'{"Metric":<30} {"This Town":>15} {"UK Average":>15} {"Difference":>10}')
print('-' * 70)

#Age
diff = town_mean_age - uk_mean_age
print(f'{'Mean age (years)':<30} {town_mean_age:>15.1f} {uk_mean_age:>15.1f} {diff:>10.1f}')

#Household
diff = town_household - uk_household
print(f'{'Household size':<30} {town_household:>15.2f} {uk_household:>15.2f} {diff:>10.2f}')

#Students
diff = student_rate - uk_student_rate
print(f'{'% Students':<30} {student_rate:>14.1f} {uk_student_rate:>14.1f} {diff:>9.1f}%')

#Unemployment
diff = unemp_rate - uk_unemp_rate
print(f'{'Unemployment rate':<30} {unemp_rate:>14.1f}% {uk_unemp_rate:>14.1f}% {diff:>9.1f}%')
print('-' * 70)

print('KEY FINDINGS')
if town_mean_age < uk_mean_age - 3:
    age_diff = uk_mean_age - town_mean_age
    print(f'• Town is YOUNGER than UK average by {age_diff:.1f} years')

if unemp_rate > uk_unemp_rate + 2:
    pct_increase = ((unemp_rate - uk_unemp_rate) / uk_unemp_rate) * 100
    print(f'• Unemployement is {unemp_rate - uk_unemp_rate:.1f} points higher than UK')
    print (f' This is a {pct_increase:.0f}% increase over national rate')

if student_rate > uk_student_rate + 10:
    print(f'• Student population is significatly higher  ({student_rate:.1f}% vs {uk_student_rate:.1f}%)')
    print(f' This suggests education-focused community with lots of commuters')

if town_household > uk_household + 0.3:
    print(f'• Households are larger than UK average ({town_household:.2f} vs {uk_household:.2f})')

print()

COMPARING TO UK NATIONAL AVERAGES
COMPARISON TABLE:
----------------------------------------------------------------------
Metric                               This Town      UK Average Difference
----------------------------------------------------------------------
Mean age (years)                          35.5            40.7       -5.2
Household size                            2.87            2.36       0.51
% Students                               25.9            4.8      21.1%
Unemployment rate                         9.8%            4.2%       5.6%
----------------------------------------------------------------------
KEY FINDINGS
• Town is YOUNGER than UK average by 5.2 years
• Unemployement is 5.6 points higher than UK
 This is a 133% increase over national rate
• Student population is significatly higher  (25.9% vs 4.8%)
 This suggests education-focused community with lots of commuters
• Households are larger than UK average (2.87 vs 2.36)



# Statistical Significance Testing

In [83]:
from scipy import stats
import numpy as np

print('=' * 60)
print('STATISTICAL SIGNIFICANCE TESTING')
print('=' * 60)

#Unemployment z-test
town_unemployed = 449
town_working_age = 4984
town_unemp_rate = town_unemployed / town_working_age
uk_unemp_rate = 0.042

#two-proportion z-test
z_unemp = (town_unemp_rate - uk_unemp_rate) / np.sqrt(uk_unemp_rate * (1-uk_unemp_rate) / town_working_age)
p_unemp = 2 * (1 - stats.norm.cdf(abs(z_unemp)))

print(f'Unemployment Test')
print(f' Town rate: {town_unemp_rate:.1%}')
print(f' UK rate: {uk_unemp_rate:.2f}')
print(f' Z-statistic: {z_unemp:.2f}')
print(f' P-value: {p_unemp:.4f}')

#Age t-test
town_age_mean = df['Age'].mean()
town_age_std = df['Age'].std()
uk_age_mean = 40.7
n = len(df)

t_age = (town_age_mean - uk_age_mean) / (town_age_std / np.sqrt(n))
p_age = 2 * (1 - stats.t.cdf(abs(t_age), n-1))

print(f'Age Test:')
print(f' Town mean: {town_age_mean:.1f} years')
print(f' UK mean: {uk_age_mean} years')
print(f' T-statistics: {t_age:.2f}')
print(f' P-value: {p_age:.4f}')

#Household size test
town_household_size = df.groupby(['Street', 'House Number']).size().mean()
uk_household_size = 2.36
household_std = df.groupby(['Street', 'House Number']).size().std()
n_households = df.groupby(['Street', 'House Number']).ngroups

t_household = (town_household_size - uk_household_size) / (household_std / np.sqrt(n_households))
p_household = 2 * (1 - stats.t.cdf(abs(t_household), n_households-1))

print(f'Household Size Test:')
print(f' Town mean:  {town_household_size:.2f} people')
print(f' UK mean: {uk_household_size} people')
print(f' T-statistic: {t_household:.2f}')
print(f' P-value: {p_household:.4f}')

#student population z-test
students_count = len(df[df['Occupation'] == 'Student'])
working_age_adults = len(df[(df['Age'] >= 18) & (df['Age'] < 65)])
town_students = students_count
town_working_adults = working_age_adults
town_student_rate = town_students / town_working_adults
uk_student_rate = 0.082

#two-proportion z-test
z_students = (town_student_rate - uk_student_rate) / np.sqrt(uk_student_rate * (1-uk_student_rate) / town_working_adults)
p_students = 2 * (1 - stats.norm.cdf(abs(z_students)))

print(f'Student Population Test:')
print(f' Town rate:  {town_student_rate:.1%}')
print(f' UK rate: {uk_student_rate:.1%}')
print(f' Z-statistic: {z_students:.2f}')
print(f' P-value: {p_students:.4f}')

print('All tests shows p<0.001, confirming significant differences from UK norms.')

STATISTICAL SIGNIFICANCE TESTING
Unemployment Test
 Town rate: 9.0%
 UK rate: 0.04
 Z-statistic: 16.92
 P-value: 0.0000
Age Test:
 Town mean: 35.5 years
 UK mean: 40.7 years
 T-statistics: -22.53
 P-value: 0.0000
Household Size Test:
 Town mean:  2.87 people
 UK mean: 2.36 people
 T-statistic: 15.88
 P-value: 0.0000
Student Population Test:
 Town rate:  29.6%
 UK rate: 8.2%
 Z-statistic: 59.08
 P-value: 0.0000
All tests shows p<0.001, confirming significant differences from UK norms.


# Birth and Death Rate Estimation

In [84]:
print('=' * 60)
print('BIRTH ANND DEATH RATE ESTIMATION')
print('=' * 60)

#birth rate - children 0-4 years
children_0_4 = len(df[df['Age'] <= 4])
total_pop = len(df)

#estimate annual births
annual_births = children_0_4 / 5
birth_rate_per_1000 = (annual_births / total_pop) * 1000

print(f'Birth Rate:')
print(f' Children aged 0-4: {children_0_4}')
print(f' Estimated annual birthdate: {annual_births:.0f}')
print(f' Birth rate: {birth_rate_per_1000:.1f} per 1,000 people')

#death rate
elderly_65_69 = len(df[(df['Age'] >= 65) & (df['Age'] < 70)])
elderly_70_74 = len(df[(df['Age'] >= 70) & (df['Age'] < 75)])
elderly_75_79 = len(df[(df['Age'] >= 75) & (df['Age'] < 80)])
elderly_80_84 = len(df[(df['Age'] >= 80) & (df['Age'] < 85)])

#estimate deaths from cohort decline
deaths_per_5yr = elderly_65_69 - elderly_70_74
annual_deaths = deaths_per_5yr / 5
death_rate_per_1000 = (annual_deaths / total_pop) * 1000

print(f'Death Rate (estimated):')
print(f' 65-69 age group: {elderly_65_69}')
print(f' 70-74 age group: {elderly_70_74}')
print(f' Estimated annual deaths: {annual_deaths:.0f}')
print(f' Death rate: {death_rate_per_1000:.1f} per 1,000 people')

print(f'Net population chnage: +{birth_rate_per_1000 - death_rate_per_1000:.1f} per 1,000')

BIRTH ANND DEATH RATE ESTIMATION
Birth Rate:
 Children aged 0-4: 543
 Estimated annual birthdate: 109
 Birth rate: 12.4 per 1,000 people
Death Rate (estimated):
 65-69 age group: 286
 70-74 age group: 231
 Estimated annual deaths: 11
 Death rate: 1.3 per 1,000 people
Net population chnage: +11.1 per 1,000


In [85]:
print('=' * 60)
print('SAVING CLEANED DATA')
print('=' * 60)

df.to_csv('census_cleaned.csv', index = False)
print('Cleaned data saved to: census_cleaned.csv')
print(f' Rows: {len(df):,}')
print(f' Columns: {len(df.columns)}')
print()

SAVING CLEANED DATA
Cleaned data saved to: census_cleaned.csv
 Rows: 8,786
 Columns: 12



# Visualizations

In [86]:
plt.figure(figsize=(10, 6))
plt.hist(df['Age'], bins=30, edgecolor = 'black', color = 'skyblue')
plt.xlabel('Age (years)')
plt.ylabel('Number of People')
plt.title('Age Distribution in Census Data')
plt.grid(True, alpha = 0.3, axis = 'y')
plt.tight_layout()
plt.savefig('viz_age_distribution.png', dpi = 300, bbox_inches = 'tight')
plt.close()
print('viz_age_distribution.png')

plt.figure(figsize=(8, 6))
sns.countplot(data= df, x = 'Gender', order = ['Female', 'Male'])
plt.title('Gender Distribution')
plt.xlabel('Gender')
plt.ylabel('Count')
plt.tight_layout()
plt.savefig('viz_gender.png', dpi = 300, bbox_inches = 'tight')
plt.close()
print('viz_gender.png')

plt.figure(figsize=(10, 6))
sns.countplot(data= df, x = 'Marital Status', order = ['Single', 'Married', 'Divorced', 'Widowed', 'NA'])
plt.title('Marital Status Distribution')
plt.xlabel('Marital Status')
plt.ylabel('Count')
plt.xticks(rotation = 45, ha = 'right')
plt.tight_layout()
plt.savefig('viz_marital_status.png', dpi = 300, bbox_inches = 'tight')
plt.close()
print('viz_marital_status.png')

plt.figure(figsize=(10, 6))
sns.boxplot(data= df, x = 'Marital Status', y = 'Age', order = ['Single', 'Married', 'Divorced', 'Widowed'])
plt.title('Age Distribution by Marital Status')
plt.xlabel('Marital Status')
plt.ylabel('Age (years)')
plt.xticks(rotation = 45, ha = 'right')
plt.tight_layout()
plt.savefig('viz_age_by_marital_status.png', dpi = 300, bbox_inches = 'tight')
plt.close()
print('viz_marital_status.png')

plt.figure(figsize=(10, 6))
sns.countplot(data= df, x = 'Marital Status', hue = 'Gender', order = ['Single', 'Married', 'Divorced', 'Widowed'], hue_order = ['Female', 'Male'])
plt.title('Marital Status by Gender')
plt.xlabel('Marital Status')
plt.ylabel('Count')
plt.xticks(rotation = 45, ha = 'right')
plt.tight_layout()
plt.savefig('viz_marital_by_gender.png', dpi = 300, bbox_inches = 'tight')
plt.close()
print('viz_marital_by_gender.png')

plt.figure(figsize=(8, 6))
sns.boxplot(data= df, x = 'Gender', y = 'Age', order = ['Female', 'Male'])
plt.title('Age Distribution by Gender')
plt.xlabel('Gender')
plt.ylabel('Age (years)')
plt.tight_layout()
plt.savefig('viz_age_by_gender.png', dpi = 300, bbox_inches = 'tight')
plt.close()
print('viz_age_by_gender.png')

plt.figure(figsize=(10, 6))
religion_counts = df['Religion'].value_counts().sort_values(ascending=False)
plt.bar(religion_counts.index, religion_counts.values, color='lightcoral', edgecolor='black')
plt.title('Religion Distribution After Cleaning')
plt.xlabel('Religion')
plt.ylabel('Count')
plt.xticks(rotation=45, ha='right')
plt.grid(True, alpha=0.3, axis='y')
plt.tight_layout()
plt.savefig('viz_religion.png', dpi=300, bbox_inches='tight')
plt.close()
print('viz_religion.png saved!')

print()
print('All visualizations saved!')
print()

viz_age_distribution.png
viz_gender.png
viz_marital_status.png
viz_marital_status.png
viz_marital_by_gender.png
viz_age_by_gender.png
viz_religion.png saved!

All visualizations saved!

