In [8]:
import pandas as pd
from pathlib import Path
from IPython.display import display

In [9]:
data_dir = Path('../data/raw')

# Display label -> raw CSV file name inside data_dir.
raw_files = {
    'Applications from turkey': 'asylum_applications_originating_tur.csv',
    'Applications to turkey': 'asylum_applications_residing_tur.csv',
    'Decisions from turkey': 'asylum_decisions_residing_tur.csv',
    'Decisions from other countries': 'asylum_decisions_originating_tur.csv',
}

# Read every file once; dict order follows raw_files.
datasets = {
    label: pd.read_csv(data_dir / file_name)
    for label, file_name in raw_files.items()
}

# The grouped views below hold the SAME DataFrame objects as `datasets`
# (no copies), so an in-place change to one is visible through the other.
applications_datasets = {
    label: datasets[label]
    for label in ('Applications from turkey', 'Applications to turkey')
}

decisions_datasets = {
    label: datasets[label]
    for label in ('Decisions from turkey', 'Decisions from other countries')
}

print('Data loaded successfully')
print(f"Number of datasets loaded: {len(datasets)}" )

Data loaded successfully
Number of datasets loaded: 4


In [10]:
# Print a short profile (row/column counts and a preview) for every loaded dataset

separator = '-' * 50

for label, frame in datasets.items():
    row_count, column_count = frame.shape
    print(separator)
    print(f"\nDataset: {label}")
    print(f"Number of rows: {row_count}")
    print(f"Number of columns: {column_count}")

    print('First few rows:')
    display(frame.head())
print(separator)

--------------------------------------------------

Dataset: Applications from turkey
Number of rows: 1786
Number of columns: 13
First few rows:


Unnamed: 0,Year,Country of Origin Code,Country of Asylum Code,Country of Origin Name,Country of Asylum Name,Procedure Type,Procedure Name,Application Type Code,Application Type,Application Data Type,Application Data,Application Average Persons Per Case,Number of Applications
0,#date+year,#country+code+origin,#country+code+asylum,#country+name+origin,#country+name+asylum,#indicator+procedure_type,#indicator+procedure+name,#indicator+application_type+code,#indicator+application_type,#indicator+application_data_type,#indicator+application_data,#indicator+application_average_persons+num,#inneed+asylum_seekers+applications
1,2000,TUR,ALB,Türkiye,Albania,G,Government,V,Various,C,Cases,0,34
2,2000,TUR,AUS,Türkiye,Australia,G,Government,V,Various,C,Cases,0,75
3,2000,TUR,AUS,Türkiye,Australia,G,Government,V,Various,C,Cases,0,111
4,2000,TUR,AUT,Türkiye,Austria,G,Government,V,Various,P,Persons,0,592


--------------------------------------------------

Dataset: Applications to turkey
Number of rows: 940
Number of columns: 13
First few rows:


Unnamed: 0,Year,Country of Origin Code,Country of Asylum Code,Country of Origin Name,Country of Asylum Name,Procedure Type,Procedure Name,Application Type Code,Application Type,Application Data Type,Application Data,Application Average Persons Per Case,Number of Applications
0,#date+year,#country+code+origin,#country+code+asylum,#country+name+origin,#country+name+asylum,#indicator+procedure_type,#indicator+procedure+name,#indicator+application_type+code,#indicator+application_type,#indicator+application_data_type,#indicator+application_data,#indicator+application_average_persons+num,#inneed+asylum_seekers+applications
1,2000,AFG,TUR,Afghanistan,Türkiye,U,UNHCR,V,Various,P,Persons,0,81
2,2000,DZA,TUR,Algeria,Türkiye,U,UNHCR,V,Various,P,Persons,0,5
3,2000,CHN,TUR,China,Türkiye,U,UNHCR,V,Various,P,Persons,0,11
4,2000,COG,TUR,Congo,Türkiye,U,UNHCR,V,Various,P,Persons,0,5


--------------------------------------------------

Dataset: Decisions from turkey
Number of rows: 902
Number of columns: 16
First few rows:


Unnamed: 0,Year,Country of Origin Code,Country of Asylum Code,Country of Origin Name,Country of Asylum Name,Procedure Type,Procedure Name,Decision Type Code,Decision Data Type,Decision Data,Decisions Average Persons Per Case,Recognized,Complementary Protection,Otherwise Closed,Rejected,Total Decided
0,#date+year,#country+code+origin,#country+code+asylum,#country+name+origin,#country+name+asylum,#indicator+procedure_type,#indicator+procedure+name,#indicator+decision_type+code,#indicator+decision_data_type,#indicator+decision_data,#indicator+decision_average_persons+num,#inneed+asylum_seekers+recognized,#inneed+asylum_seekers+recognized_other,#inneed+asylum_seekers+otherwise_closed,#inneed+asylum_seekers+rejected,#inneed+asylum_seekers+total_decided
1,2000,AFG,TUR,Afghanistan,Türkiye,U,UNHCR,FI,P,Persons,0,29,0,49,24,102
2,2000,DZA,TUR,Algeria,Türkiye,U,UNHCR,FI,P,Persons,0,0,0,5,0,5
3,2000,CHN,TUR,China,Türkiye,U,UNHCR,FI,P,Persons,0,14,0,0,0,14
4,2000,ERI,TUR,Eritrea,Türkiye,U,UNHCR,FI,P,Persons,0,0,0,0,14,14


--------------------------------------------------

Dataset: Decisions from other countries
Number of rows: 1687
Number of columns: 16
First few rows:


Unnamed: 0,Year,Country of Origin Code,Country of Asylum Code,Country of Origin Name,Country of Asylum Name,Procedure Type,Procedure Name,Decision Type Code,Decision Data Type,Decision Data,Decisions Average Persons Per Case,Recognized,Complementary Protection,Otherwise Closed,Rejected,Total Decided
0,#date+year,#country+code+origin,#country+code+asylum,#country+name+origin,#country+name+asylum,#indicator+procedure_type,#indicator+procedure+name,#indicator+decision_type+code,#indicator+decision_data_type,#indicator+decision_data,#indicator+decision_average_persons+num,#inneed+asylum_seekers+recognized,#inneed+asylum_seekers+recognized_other,#inneed+asylum_seekers+otherwise_closed,#inneed+asylum_seekers+rejected,#inneed+asylum_seekers+total_decided
1,2000,TUR,ALB,Türkiye,Albania,G,Government,FI,C,Cases,0,10,0,30,0,40
2,2000,TUR,AUS,Türkiye,Australia,G,Government,AR,C,Cases,0,17,0,5,56,78
3,2000,TUR,AUS,Türkiye,Australia,G,Government,FI,C,Cases,0,38,0,5,99,142
4,2000,TUR,AUT,Türkiye,Austria,G,Government,FA,P,Persons,0,18,0,157,165,340


--------------------------------------------------


In [11]:
def check_data_quality(name, df):
    """Print duplicate statistics and display a per-column quality summary.

    Parameters
    ----------
    name : str
        Label printed in the report header.
    df : pandas.DataFrame
        Frame to inspect. It is only read, never modified.

    The function prints the number and percentage of fully duplicated rows,
    then displays a table indexed by column name with the column dtype, the
    count of missing values and the percentage of missing values.
    """
    # Compute the duplicate count once instead of scanning the frame twice.
    duplicate_count = int(df.duplicated().sum())
    total_rows = len(df)
    # An empty frame would otherwise evaluate 0 / 0; report 0% in that case.
    duplicate_pct = duplicate_count / total_rows * 100 if total_rows else 0.0

    print(f"\nDataset: {name}")
    print(f"Total duplicated rows: {duplicate_count}")
    print(f"Duplicate percentage: {duplicate_pct:.2f}%")

    print("\nColumn summary:")
    # Reuse one null mask for both the count and the percentage columns.
    missing_mask = df.isnull()
    summary_columns = pd.DataFrame({
        'Type': df.dtypes,
        'Missing Values': missing_mask.sum(),
        'Missing %': (missing_mask.mean() * 100).round(2)
    })
    display(summary_columns)

# Report duplicates and missing values for each loaded dataset.
for label, frame in datasets.items():
    check_data_quality(label, frame)


Dataset: Applications from turkey
Total duplicated rows: 6
Duplicate percentage: 0.34%

Column summary:


Unnamed: 0,Type,Missing Values,Missing %
Year,object,0,0.0
Country of Origin Code,object,0,0.0
Country of Asylum Code,object,0,0.0
Country of Origin Name,object,0,0.0
Country of Asylum Name,object,0,0.0
Procedure Type,object,0,0.0
Procedure Name,object,0,0.0
Application Type Code,object,4,0.22
Application Type,object,0,0.0
Application Data Type,object,0,0.0



Dataset: Applications to turkey
Total duplicated rows: 1
Duplicate percentage: 0.11%

Column summary:


Unnamed: 0,Type,Missing Values,Missing %
Year,object,0,0.0
Country of Origin Code,object,0,0.0
Country of Asylum Code,object,0,0.0
Country of Origin Name,object,0,0.0
Country of Asylum Name,object,0,0.0
Procedure Type,object,0,0.0
Procedure Name,object,0,0.0
Application Type Code,object,0,0.0
Application Type,object,0,0.0
Application Data Type,object,0,0.0



Dataset: Decisions from turkey
Total duplicated rows: 0
Duplicate percentage: 0.00%

Column summary:


Unnamed: 0,Type,Missing Values,Missing %
Year,object,0,0.0
Country of Origin Code,object,0,0.0
Country of Asylum Code,object,0,0.0
Country of Origin Name,object,0,0.0
Country of Asylum Name,object,0,0.0
Procedure Type,object,0,0.0
Procedure Name,object,0,0.0
Decision Type Code,object,0,0.0
Decision Data Type,object,0,0.0
Decision Data,object,0,0.0



Dataset: Decisions from other countries
Total duplicated rows: 0
Duplicate percentage: 0.00%

Column summary:


Unnamed: 0,Type,Missing Values,Missing %
Year,object,0,0.0
Country of Origin Code,object,0,0.0
Country of Asylum Code,object,0,0.0
Country of Origin Name,object,0,0.0
Country of Asylum Name,object,0,0.0
Procedure Type,object,0,0.0
Procedure Name,object,0,0.0
Decision Type Code,object,21,1.24
Decision Data Type,object,0,0.0
Decision Data,object,0,0.0


In [12]:
def detect_numeric_object_columns(df, threshold=0.9):
    """Return the object-dtype columns whose values mostly parse as numbers.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to scan. It is only read, never modified.
    threshold : float, default 0.9
        Minimum share of a column's non-missing values that must be
        convertible by ``pd.to_numeric`` for the column to be reported.

    Returns
    -------
    list
        Column labels, in frame order, that meet the threshold.
    """
    numeric_like_cols = []

    for col in df.select_dtypes(include='object').columns:
        # Judge the column on the values it actually has: missing entries
        # say nothing about whether the rest is numeric, and counting them
        # as failures would hide sparse numeric columns.
        present = df[col].dropna()
        if present.empty:
            # Nothing to evaluate; leave the column as it is.
            continue

        parsed_share = pd.to_numeric(present, errors='coerce').notna().mean()

        if parsed_share >= threshold:
            numeric_like_cols.append(col)

    return numeric_like_cols

separator = '-' * 50

print(separator)
print('Object columns turned into numeric in each dataframe:')
print()
for label, frame in datasets.items():
    numeric_like = detect_numeric_object_columns(frame)

    # Deliberately mutates the frame in place with plain column assignment:
    # applications_datasets / decisions_datasets reference these same
    # objects, so the cleaning step further down sees the numeric columns.
    # Values that fail to parse become NaN (errors='coerce').
    coerced = frame[numeric_like].apply(pd.to_numeric, errors='coerce')
    frame[numeric_like] = coerced

    print(f"{label}: {numeric_like}")
print(separator)

# Re-run the quality report so the new dtypes and NaN counts are visible.
for label, frame in datasets.items():
    check_data_quality(label, frame)

--------------------------------------------------
Object columns turned into numeric in each dataframe:

Applications from turkey: ['Year', 'Application Average Persons Per Case', 'Number of Applications']
Applications to turkey: ['Year', 'Application Average Persons Per Case', 'Number of Applications']
Decisions from turkey: ['Year', 'Decisions Average Persons Per Case', 'Recognized', 'Complementary Protection', 'Otherwise Closed', 'Rejected', 'Total Decided']
Decisions from other countries: ['Year', 'Decisions Average Persons Per Case', 'Recognized', 'Complementary Protection', 'Otherwise Closed', 'Rejected', 'Total Decided']
--------------------------------------------------

Dataset: Applications from turkey
Total duplicated rows: 6
Duplicate percentage: 0.34%

Column summary:


Unnamed: 0,Type,Missing Values,Missing %
Year,float64,1,0.06
Country of Origin Code,object,0,0.0
Country of Asylum Code,object,0,0.0
Country of Origin Name,object,0,0.0
Country of Asylum Name,object,0,0.0
Procedure Type,object,0,0.0
Procedure Name,object,0,0.0
Application Type Code,object,4,0.22
Application Type,object,0,0.0
Application Data Type,object,0,0.0



Dataset: Applications to turkey
Total duplicated rows: 1
Duplicate percentage: 0.11%

Column summary:


Unnamed: 0,Type,Missing Values,Missing %
Year,float64,1,0.11
Country of Origin Code,object,0,0.0
Country of Asylum Code,object,0,0.0
Country of Origin Name,object,0,0.0
Country of Asylum Name,object,0,0.0
Procedure Type,object,0,0.0
Procedure Name,object,0,0.0
Application Type Code,object,0,0.0
Application Type,object,0,0.0
Application Data Type,object,0,0.0



Dataset: Decisions from turkey
Total duplicated rows: 0
Duplicate percentage: 0.00%

Column summary:


Unnamed: 0,Type,Missing Values,Missing %
Year,float64,1,0.11
Country of Origin Code,object,0,0.0
Country of Asylum Code,object,0,0.0
Country of Origin Name,object,0,0.0
Country of Asylum Name,object,0,0.0
Procedure Type,object,0,0.0
Procedure Name,object,0,0.0
Decision Type Code,object,0,0.0
Decision Data Type,object,0,0.0
Decision Data,object,0,0.0



Dataset: Decisions from other countries
Total duplicated rows: 0
Duplicate percentage: 0.00%

Column summary:


Unnamed: 0,Type,Missing Values,Missing %
Year,float64,1,0.06
Country of Origin Code,object,0,0.0
Country of Asylum Code,object,0,0.0
Country of Origin Name,object,0,0.0
Country of Asylum Name,object,0,0.0
Procedure Type,object,0,0.0
Procedure Name,object,0,0.0
Decision Type Code,object,21,1.24
Decision Data Type,object,0,0.0
Decision Data,object,0,0.0


In [13]:
# NOTE: From this point forward, we work with applications_datasets and
# decisions_datasets. Their values are replaced below by new, filtered frames;
# the frames still held by `datasets` keep every row, but they are NOT raw
# any more - the numeric coercion above modified them in place.

important_columns_applications_datasets = ['Number of Applications', 'Year', 'Country of Origin Name', 'Country of Asylum Name']
important_columns_decisions_datasets = ['Recognized', 'Complementary Protection', 'Otherwise Closed', 'Rejected', 'Total Decided', 'Year', 'Country of Origin Name']

# Drop rows lacking any key column, then drop exact duplicate rows.
# Presumably this is also what removes the '#...' tag row seen in the
# previews: its Year cannot be parsed and is NaN after coercion - verify.
for group, key_columns in (
    (applications_datasets, important_columns_applications_datasets),
    (decisions_datasets, important_columns_decisions_datasets),
):
    for name in list(group):
        group[name] = group[name].dropna(subset=key_columns).drop_duplicates()

In [14]:
banner = "=" * 50
print(banner)
print("CLEANED DATA SUMMARY")
print(banner)

# Same three checks for both groups; only the key-column list differs.
for group, key_columns in (
    (applications_datasets, important_columns_applications_datasets),
    (decisions_datasets, important_columns_decisions_datasets),
):
    for name, df in group.items():
        print(f"\n{name}:")
        print(f"  Rows: {len(df)}")
        print(f"  Duplicates: {df.duplicated().sum()}")
        print(f"  Missing values in key columns: {df[key_columns].isnull().sum().sum()}")

CLEANED DATA SUMMARY

Applications from turkey:
  Rows: 1779
  Duplicates: 0
  Missing values in key columns: 0

Applications to turkey:
  Rows: 938
  Duplicates: 0
  Missing values in key columns: 0

Decisions from turkey:
  Rows: 901
  Duplicates: 0
  Missing values in key columns: 0

Decisions from other countries:
  Rows: 1686
  Duplicates: 0
  Missing values in key columns: 0


In [15]:
clean_data_dir = Path("../data/clean")
clean_data_dir.mkdir(exist_ok=True)

# Applications first, then decisions; the labels are distinct, so nothing is overwritten.
all_clean_datasets = dict(applications_datasets)
all_clean_datasets.update(decisions_datasets)

for label, frame in all_clean_datasets.items():
    # File name is the lower-cased label with spaces turned into underscores.
    stem = label.lower().replace(" ", "_")
    filename = stem + "_clean.csv"
    target = clean_data_dir / filename
    frame.to_csv(target, index=False)
    print(f"Saved: {filename}")

Saved: applications_from_turkey_clean.csv
Saved: applications_to_turkey_clean.csv
Saved: decisions_from_turkey_clean.csv
Saved: decisions_from_other_countries_clean.csv
