In [2]:
import pandas as pd
import numpy as np


In [3]:
children_registered = pd.read_csv("/Users/vardr.DESKTOP-R06GCLK/CodeOp_Data_Analytics_Mod3on/Group_Project/UK_ed_under_5/data/2_early_years_provision_provider_type_2018_2024.csv")
disadvanted_data = pd.read_csv("/Users/vardr.DESKTOP-R06GCLK/CodeOp_Data_Analytics_Mod3on/Group_Project/UK_ed_under_5/data/3_early_years_provision_disadvantaged_children_2018_2024.csv")

In [4]:
def display_unique_vals(df):
    for column in df.columns:
        unique_values = df[column].unique()
        print(f' Column: {column}')
        print(f'Number of unique values: {len(unique_values)}')
        print(f'Unique values: {unique_values}')
        print('-' * 40)


In [5]:
display_unique_vals(children_registered)

 Column: time_period
Number of unique values: 7
Unique values: [2024 2023 2022 2021 2020 2019 2018]
----------------------------------------
 Column: time_identifier
Number of unique values: 1
Unique values: ['Reporting year']
----------------------------------------
 Column: geographic_level
Number of unique values: 3
Unique values: ['National' 'Regional' 'Local authority']
----------------------------------------
 Column: country_code
Number of unique values: 1
Unique values: ['E92000001']
----------------------------------------
 Column: country_name
Number of unique values: 1
Unique values: ['England']
----------------------------------------
 Column: region_code
Number of unique values: 12
Unique values: [nan 'E12000001' 'E12000002' 'E12000003' 'E12000004' 'E12000005'
 'E12000006' 'E12000007' 'E12000008' 'E12000009' 'E13000001' 'E13000002']
----------------------------------------
 Column: region_name
Number of unique values: 12
Unique values: [nan 'North East' 'North West' 'Yorks

In [6]:
invalid_values = ['c', 'u', 'x', 'z']


Disadvantaged children data cleaning

In [7]:
disadvanted_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 82584 entries, 0 to 82583
Data columns (total 17 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   time_period          82584 non-null  int64  
 1   time_identifier      82584 non-null  object 
 2   geographic_level     82584 non-null  object 
 3   country_code         82584 non-null  object 
 4   country_name         82584 non-null  object 
 5   region_code          82080 non-null  object 
 6   region_name          82080 non-null  object 
 7   old_la_code          76536 non-null  float64
 8   new_la_code          76536 non-null  object 
 9   la_name              76536 non-null  object 
 10  entitlement_type     82584 non-null  object 
 11  age                  82584 non-null  object 
 12  year_group           82584 non-null  object 
 13  disadvantage_type    82584 non-null  object 
 14  basis_for_funding    82584 non-null  object 
 15  number_children      82584 non-null 

In [8]:
display_unique_vals(disadvanted_data)

 Column: time_period
Number of unique values: 7
Unique values: [2024 2023 2022 2021 2020 2019 2018]
----------------------------------------
 Column: time_identifier
Number of unique values: 1
Unique values: ['Reporting year']
----------------------------------------
 Column: geographic_level
Number of unique values: 3
Unique values: ['National' 'Regional' 'Local authority']
----------------------------------------
 Column: country_code
Number of unique values: 1
Unique values: ['E92000001']
----------------------------------------
 Column: country_name
Number of unique values: 1
Unique values: ['England']
----------------------------------------
 Column: region_code
Number of unique values: 12
Unique values: [nan 'E12000001' 'E12000002' 'E12000003' 'E12000004' 'E12000005'
 'E12000006' 'E12000007' 'E12000008' 'E12000009' 'E13000001' 'E13000002']
----------------------------------------
 Column: region_name
Number of unique values: 12
Unique values: [nan 'North East' 'North West' 'Yorks

## Filtering out 2-year-olds on children_registered df

In [9]:
def filter_out_age(df):
    """
    Removes rows where the column 'age' is equal to '2-year-olds' from a dataframe.

    Parameters:
    df: The DataFrame to modify

    Returns:
    pandas.DataFrame: The modified DataFrame with rows removed. 
    """
    if 'age' in df.columns:
        return df[df['age']!= '2-year-olds'].reset_index(drop=True)
    


In [45]:
# Check the count of '2-year-olds' before filtering
print(f"Before filtering, count of '2-year-olds': {(children_registered['age'] == '2-year-olds').sum()}")

# Apply the filter and reassign the DataFrame
children_registered = filter_out_age(children_registered)

# Check the count of '2-year-olds' after filtering
print(f"After filtering, count of '2-year-olds': {(children_registered['age'] == '2-year-olds').sum()}")


Before filtering, count of '2-year-olds': 0
After filtering, count of '2-year-olds': 0


In [46]:
def filter_out_age_more(df):
    """
    Removes rows where the column 'age' is equal to '3 and 4-year-olds' from a dataframe.

    Parameters:
    df: The DataFrame to modify

    Returns:
    pandas.DataFrame: The modified DataFrame with rows removed. 
    """
    if 'age' in df.columns:
        return df[df['age']!= '3 and 4-year-olds'].reset_index(drop=True)

In [47]:
children_registered = filter_out_age_more(children_registered)

In [48]:
children_registered["age"].value_counts(dropna=False)

age
3-year-olds    14882
4-year-olds    14882
Name: count, dtype: int64

## Filtering out 2-year-olds on disadvanted_data df

In [49]:
# Check the count of '2-year-olds' before filtering
print(f"Before filtering, count of '2-year-olds': {(disadvanted_data['age'] == '2-year-olds').sum()}")

# Apply the filter and reassign the DataFrame
disadvanted_data = filter_out_age(disadvanted_data)

# Check the count of '2-year-olds' after filtering
print(f"After filtering, count of '2-year-olds': {(children_registered['age'] == '2-year-olds').sum()}")


Before filtering, count of '2-year-olds': 0
After filtering, count of '2-year-olds': 0


## Counting invalid rows after removal of 2-year-olds

In [50]:
def count_invalid_rows(df, column, invalid_values):
    """
    Counts the number of rows with invalid values in a specified column.

    Parameters:
    df (pandas.DataFrame): The DataFrame to check.
    column (str): The column to check for invalid values.
    invalid_values (list): A list of invalid values to search for.

    Returns:
    int: The count of rows with invalid values.
    """
    if 'is_invalid' in df.columns:
              df.drop(columns=['is_invalid'], inplace = True)
    df['is_invalid'] = df[column].isin(invalid_values)
       
    invalid_count = df['is_invalid'].sum()

    return invalid_count
       


In [51]:
count_invalid_rows(children_registered,'number_children',invalid_values)

np.int64(0)

In [52]:
count_invalid_rows(disadvanted_data,'number_children',invalid_values)

np.int64(0)

In [53]:
# children_registered[children_registered['is_invalid'] == True].head(100)

In [54]:
children_registered_columns_check = ['age', 'provider_type', 'provider_type_group']
disadvantaged_columns_check = ['age', 'disadvantage_type', 'basis_for_funding']

def remove_rows_with_total(df, columns_to_check):
    """
    Removes rows containing the value 'total' in specified columns.

    Parameters:
    df (pandas.DataFrame): The DataFrame to process.
    columns_to_check (list): List of column names to check for the value 'total'.

    Returns:
    pandas.DataFrame: The cleaned DataFrame with rows containing 'total' removed.
    """
    indices_to_remove = []

    for idx, row in df.iterrows():
        for column in columns_to_check:
            if str(row[column]) == 'Total':
              indices_to_remove.append(idx)
              break
    df = df.drop(indices_to_remove).reset_index(drop=True)

    return df

In [55]:
children_registered = remove_rows_with_total(children_registered, children_registered_columns_check)
disadvanted_data = remove_rows_with_total(disadvanted_data, disadvantaged_columns_check)

replacing 'z' with NaN

In [56]:
children_registered[['number_children', 'number_providers']] = (children_registered[['number_children', 'number_providers']].replace('z', np.nan))

In [57]:
children_registered[['number_children', 'number_providers']] = children_registered[['number_children', 'number_providers']].astype(float)

In [58]:
# Disadvantaged had more invalid values than children_registered
disadvanted_data[['number_children', 'percentage_children']] = (
    disadvanted_data[['number_children', 'percentage_children']].replace(['z','c','u','low'],np.nan)
    .astype(float))

In [59]:
disadvantaged_data["number_children"].value_counts(dropna=False).get('z', 0)

NameError: name 'disadvantaged_data' is not defined

Removing rows where la code is NaN, this prevents double dipping numbers at national/regional levels


In [60]:
children_registered = children_registered.dropna(subset=['new_la_code']).reset_index(drop =True)
disadvanted_data = disadvanted_data.dropna(subset=['new_la_code']).reset_index(drop=True)

Changing timeperiod from int to datetime

In [61]:
children_registered['time_period'] = pd.to_datetime(children_registered['time_period'], format = '%Y') 
disadvanted_data['time_period'] = pd.to_datetime(disadvanted_data['time_period'], format = '%Y') 

In [62]:
disadvanted_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 0 entries
Data columns (total 18 columns):
 #   Column               Non-Null Count  Dtype         
---  ------               --------------  -----         
 0   time_period          0 non-null      datetime64[ns]
 1   time_identifier      0 non-null      object        
 2   geographic_level     0 non-null      object        
 3   country_code         0 non-null      object        
 4   country_name         0 non-null      object        
 5   region_code          0 non-null      object        
 6   region_name          0 non-null      object        
 7   old_la_code          0 non-null      float64       
 8   new_la_code          0 non-null      object        
 9   la_name              0 non-null      object        
 10  entitlement_type     0 non-null      object        
 11  age                  0 non-null      object        
 12  year_group           0 non-null      object        
 13  disadvantage_type    0 non-null      object    

In [63]:
children_registered.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 29764 entries, 0 to 29763
Data columns (total 14 columns):
 #   Column               Non-Null Count  Dtype         
---  ------               --------------  -----         
 0   time_period          29764 non-null  datetime64[ns]
 1   geographic_level     29764 non-null  object        
 2   region_code          29764 non-null  object        
 3   region_name          29764 non-null  object        
 4   new_la_code          29764 non-null  object        
 5   la_name              29764 non-null  object        
 6   entitlement_type     29764 non-null  object        
 7   provider_type_group  29764 non-null  object        
 8   provider_type        29764 non-null  object        
 9   year_group           29764 non-null  object        
 10  age                  29764 non-null  object        
 11  number_children      29764 non-null  float64       
 12  number_providers     29764 non-null  float64       
 13  is_invalid           29764 non-

In [64]:
children_registered = children_registered.drop(columns='is_invalid')
disadvanted_data = disadvanted_data.drop(columns='is_invalid')

## Removing values except "Total" from year_group table of children_registered

In [65]:
children_registered["year_group"].value_counts(dropna=False)

year_group
Total    29764
Name: count, dtype: int64

In [66]:
children_registered = keep_total = children_registered[children_registered["year_group"] == 'Total'] 

In [67]:
children_registered["year_group"].value_counts(dropna=False)

year_group
Total    29764
Name: count, dtype: int64

## remove time_identifier,country_code, country_name, old_la_code columns

In [68]:
children_registered = children_registered.drop(
    ["time_identifier", "country_code","country_name", "old_la_code"], axis = 1
)

KeyError: "['time_identifier', 'country_code', 'country_name', 'old_la_code'] not found in axis"

In [69]:
children_registered

Unnamed: 0,time_period,geographic_level,region_code,region_name,new_la_code,la_name,entitlement_type,provider_type_group,provider_type,year_group,age,number_children,number_providers
0,2024-01-01,Local authority,E12000001,North East,E06000047,County Durham,15-hour entitlement,"Private, voluntary and independent providers",All private and voluntary providers,Total,3-year-olds,2140.0,118.0
1,2024-01-01,Local authority,E12000001,North East,E06000047,County Durham,15-hour entitlement,"Private, voluntary and independent providers",All private and voluntary providers,Total,4-year-olds,692.0,105.0
2,2024-01-01,Local authority,E12000001,North East,E06000047,County Durham,30-hour entitlement,"Private, voluntary and independent providers",All private and voluntary providers,Total,3-year-olds,1389.0,112.0
3,2024-01-01,Local authority,E12000001,North East,E06000047,County Durham,30-hour entitlement,"Private, voluntary and independent providers",All private and voluntary providers,Total,4-year-olds,462.0,100.0
4,2024-01-01,Local authority,E12000001,North East,E06000047,County Durham,15-hour entitlement,"Private, voluntary and independent providers",Independent schools,Total,3-year-olds,21.0,3.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
29759,2018-01-01,Local authority,E13000002,Outer London,E09000031,Waltham Forest,30-hour entitlement,State-funded schools,State-funded secondary schools,Total,4-year-olds,8.0,2.0
29760,2018-01-01,Local authority,E13000002,Outer London,E09000031,Waltham Forest,15-hour entitlement,State-funded schools,Special schools,Total,3-year-olds,4.0,2.0
29761,2018-01-01,Local authority,E13000002,Outer London,E09000031,Waltham Forest,15-hour entitlement,State-funded schools,Special schools,Total,4-year-olds,18.0,3.0
29762,2018-01-01,Local authority,E13000002,Outer London,E09000031,Waltham Forest,30-hour entitlement,State-funded schools,Special schools,Total,3-year-olds,0.0,0.0


In [71]:
children_registered.to_csv("/Users/vardr.DESKTOP-R06GCLK/CodeOp_Data_Analytics_Mod3on/Group_Project/Preprocessed_files/children_registered_2.csv", index=False)



In [None]:
#disadvanted_data.to_csv('/Users/mel/Desktop/disadvantaged_data_test.csv', index=False)