In [1]:
import pandas as pd

#Load the two Excel files
file_path1='XLS_Child-labour-database_July-2024.xlsx'
file_path2='XLS_Child-marriage-database_May-2024.xlsx'


df1=pd.read_excel(file_path1,sheet_name='Child labour')
df2=pd.read_excel(file_path2,sheet_name='Child marriage')


In [6]:
#Merge the two datasets on a common column
#Replace 'common_column'with the actual column name present in the box

merged_df=pd.merge(df1,df2,how='outer', on='Countries and areas')

In [7]:
#check for duplicates and remove them
merged_df.drop_duplicates(inplace=True)

In [8]:
#check for missing data
missing_data_summary=merged_df.isnull().sum()

In [9]:
#Eliminate mismatches(removing rows with missing values)
merged_df.dropna(inplace=True)

In [10]:
merged_df

Unnamed: 0,Countries and areas,Child labour (%)+ \n(2015-2023)*,Unnamed: 2_x,Unnamed: 3,Unnamed: 4_x,Unnamed: 5,Unnamed: 6,Unnamed: 7,Married by 15,Unnamed: 2_y,Married by 18,Unnamed: 4_y,Reference year,Observation footnote,Data source,Married by 18.1,Unnamed: 9,Reference year.1,Observation footnote.1,Data source.1
18,Belize,3.3,x,3.9,x,2.6,x,"CAS 2013, UNICEF and ILO calculations",6.3,y,33.5,y,2015-16,Includes reference to visiting unions,MICS 2015-16,22.2,y,2015-16,Includes reference to visiting unions,MICS 2015-16
19,Benin,19.9,y,20.4,y,19.4,y,MICS 2021-22,5.9,y,27.5,y,2021-22,Two clusters could not be visited due to insec...,MICS 2021-22,4.6,y,2021-22,Two clusters could not be visited due to insec...,MICS 2021-22
130,Nigeria,31.5,y,33.0,y,30.0,y,MICS 2021,12.3,y,30.3,y,2021,"Due to prolonged insecurity concerns, some par...",MICS 2021,1.6,y,2021,"Due to prolonged insecurity concerns, some par...",MICS 2021
135,Pakistan,11.4,y,12.5,y,10.1,y,"LFS 2017-18, UNICEF and ILO calculations",3.6,y,18.3,y,2017-18,Azad Jammu and Kashmir (AJK) and Gilgit Baltis...,DHS 2017-18,4.7,y,2017-18,Azad Jammu and Kashmir (AJK) and Gilgit Baltis...,DHS 2017-18


In [13]:
#clean line breaks, spaces, and special characters
#assuming text columns need this cleaning

def clean_text(text):
    if isinstance(text,str):
        #Remove line breaks and extra spaces
        text=' '.join(text.split())
        #Remove special characters (keeping only alphanumeric and spaces)
        text=''.join(e for e in text if e.isalnum() or e.isspace())
    return text

#Apply the cleaning function to all columns
merged_df=merged_df.applymap(clean_text)

In [14]:
merged_df

Unnamed: 0,Countries and areas,Child labour (%)+ \n(2015-2023)*,Unnamed: 2_x,Unnamed: 3,Unnamed: 4_x,Unnamed: 5,Unnamed: 6,Unnamed: 7,Married by 15,Unnamed: 2_y,Married by 18,Unnamed: 4_y,Reference year,Observation footnote,Data source,Married by 18.1,Unnamed: 9,Reference year.1,Observation footnote.1,Data source.1
18,Belize,3.3,x,3.9,x,2.6,x,CAS 2013 UNICEF and ILO calculations,6.3,y,33.5,y,201516,Includes reference to visiting unions,MICS 201516,22.2,y,201516,Includes reference to visiting unions,MICS 201516
19,Benin,19.9,y,20.4,y,19.4,y,MICS 202122,5.9,y,27.5,y,202122,Two clusters could not be visited due to insec...,MICS 202122,4.6,y,202122,Two clusters could not be visited due to insec...,MICS 202122
130,Nigeria,31.5,y,33.0,y,30.0,y,MICS 2021,12.3,y,30.3,y,2021,Due to prolonged insecurity concerns some part...,MICS 2021,1.6,y,2021,Due to prolonged insecurity concerns some part...,MICS 2021
135,Pakistan,11.4,y,12.5,y,10.1,y,LFS 201718 UNICEF and ILO calculations,3.6,y,18.3,y,201718,Azad Jammu and Kashmir AJK and Gilgit Baltista...,DHS 201718,4.7,y,201718,Azad Jammu and Kashmir AJK and Gilgit Baltista...,DHS 201718


In [15]:
#Save the cleaned data to a new Excel file
cleaned_file_path='Cleaned_combined_data.xlsx'
merged_df.to_excel(cleaned_file_path,index=False)

In [16]:
#Print summary of missing data before cleaning
print("Summary of missing data before cleaning")
print(missing_data_summary)

Summary of missing data before cleaning
Countries and areas                   1
Child labour (%)+ \n(2015-2023)*      2
Unnamed: 2_x                        188
Unnamed: 3                            2
Unnamed: 4_x                        188
Unnamed: 5                            2
Unnamed: 6                          188
Unnamed: 7                          112
Married by 15                        16
Unnamed: 2_y                        173
Married by 18                        16
Unnamed: 4_y                        171
Reference year                       78
Observation footnote                195
Data source                          78
Married by 18.1                      16
Unnamed: 9                          185
Reference year.1                    120
Observation footnote.1              208
Data source.1                       120
dtype: int64


In [17]:
#Print summary of the cleaned data
print("Summary of cleaned data:")
print(merged_df.info())

Summary of cleaned data:
<class 'pandas.core.frame.DataFrame'>
Index: 4 entries, 18 to 135
Data columns (total 20 columns):
 #   Column                           Non-Null Count  Dtype  
---  ------                           --------------  -----  
 0   Countries and areas              4 non-null      object 
 1   Child labour (%)+ 
(2015-2023)*  4 non-null      float64
 2   Unnamed: 2_x                     4 non-null      object 
 3   Unnamed: 3                       4 non-null      float64
 4   Unnamed: 4_x                     4 non-null      object 
 5   Unnamed: 5                       4 non-null      float64
 6   Unnamed: 6                       4 non-null      object 
 7   Unnamed: 7                       4 non-null      object 
 8   Married by 15                    4 non-null      float64
 9   Unnamed: 2_y                     4 non-null      object 
 10  Married by 18                    4 non-null      float64
 11  Unnamed: 4_y                     4 non-null      object 
 12  Ref