In [10]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
from IPython.display import display

In [None]:
sns.set_style(style="whitegrid")
data_dir = Path("../data/raw")

datasets = {
    "Applications from turkey": pd.read_csv(data_dir / "asylum_applications_originating_tur.csv"),
    "Applications to turkey": pd.read_csv(data_dir / "asylum_applications_residing_tur.csv"),
    "Decisions from turkey": pd.read_csv(data_dir / "asylum_decisions_residing_tur.csv"),
    "Decisions from other countries": pd.read_csv(data_dir / "asylum_decisions_originating_tur.csv"),
}

primary_datasets = {
    "Applications to turkey": datasets["Applications to turkey"],
    "Decisions from turkey": datasets["Decisions from turkey"],
}

secondary_datasets = {
    "Applications from turkey": datasets["Applications from turkey"],
    "Decisions from other countries": datasets["Decisions from other countries"],
}

print("Data loaded successfully")
print(f"Number of datasets loaded: {len(datasets)}" )

Data loaded successfully
Number of datasets loaded: 4


In [12]:
# Display basic information about each dataset

for name, df in primary_datasets.items():
    print("-" * 50)
    print(f"\nDataset: {name}")
    print(f"Number of rows: {len(df)}")
    print(f"Number of columns: {len(df.columns)}")
    
    print("First few rows:")
    display(df.head())
print("-" * 50)

--------------------------------------------------

Dataset: Applications to turkey
Number of rows: 940
Number of columns: 13
First few rows:


Unnamed: 0,Year,Country of Origin Code,Country of Asylum Code,Country of Origin Name,Country of Asylum Name,Procedure Type,Procedure Name,Application Type Code,Application Type,Application Data Type,Application Data,Application Average Persons Per Case,Number of Applications
0,#date+year,#country+code+origin,#country+code+asylum,#country+name+origin,#country+name+asylum,#indicator+procedure_type,#indicator+procedure+name,#indicator+application_type+code,#indicator+application_type,#indicator+application_data_type,#indicator+application_data,#indicator+application_average_persons+num,#inneed+asylum_seekers+applications
1,2000,AFG,TUR,Afghanistan,Türkiye,U,UNHCR,V,Various,P,Persons,0,81
2,2000,DZA,TUR,Algeria,Türkiye,U,UNHCR,V,Various,P,Persons,0,5
3,2000,CHN,TUR,China,Türkiye,U,UNHCR,V,Various,P,Persons,0,11
4,2000,COG,TUR,Congo,Türkiye,U,UNHCR,V,Various,P,Persons,0,5


--------------------------------------------------

Dataset: Decisions from turkey
Number of rows: 902
Number of columns: 16
First few rows:


Unnamed: 0,Year,Country of Origin Code,Country of Asylum Code,Country of Origin Name,Country of Asylum Name,Procedure Type,Procedure Name,Decision Type Code,Decision Data Type,Decision Data,Decisions Average Persons Per Case,Recognized,Complementary Protection,Otherwise Closed,Rejected,Total Decided
0,#date+year,#country+code+origin,#country+code+asylum,#country+name+origin,#country+name+asylum,#indicator+procedure_type,#indicator+procedure+name,#indicator+decision_type+code,#indicator+decision_data_type,#indicator+decision_data,#indicator+decision_average_persons+num,#inneed+asylum_seekers+recognized,#inneed+asylum_seekers+recognized_other,#inneed+asylum_seekers+otherwise_closed,#inneed+asylum_seekers+rejected,#inneed+asylum_seekers+total_decided
1,2000,AFG,TUR,Afghanistan,Türkiye,U,UNHCR,FI,P,Persons,0,29,0,49,24,102
2,2000,DZA,TUR,Algeria,Türkiye,U,UNHCR,FI,P,Persons,0,0,0,5,0,5
3,2000,CHN,TUR,China,Türkiye,U,UNHCR,FI,P,Persons,0,14,0,0,0,14
4,2000,ERI,TUR,Eritrea,Türkiye,U,UNHCR,FI,P,Persons,0,0,0,0,14,14


--------------------------------------------------


In [13]:
# Data quality checks

def check_data_quality(name, df):
    print(f"\nDataset: {name}")
    print(f"Total duplicated rows: {df.duplicated().sum()}")
    print(f"Duplicate percentage: {(df.duplicated().sum() / len(df) * 100):.2f}%")
    
    print("\nColumn summary:")
    summary_columns = pd.DataFrame({
        "Type": df.dtypes,
        "Missing Values": df.isnull().sum(),
        "Missing %": (df.isnull().mean() * 100).round(2)
    })
    display(summary_columns)

for name, df in primary_datasets.items():
    check_data_quality(name, df)


Dataset: Applications to turkey
Total duplicated rows: 1
Duplicate percentage: 0.11%

Column summary:


Unnamed: 0,Type,Missing Values,Missing %
Year,object,0,0.0
Country of Origin Code,object,0,0.0
Country of Asylum Code,object,0,0.0
Country of Origin Name,object,0,0.0
Country of Asylum Name,object,0,0.0
Procedure Type,object,0,0.0
Procedure Name,object,0,0.0
Application Type Code,object,0,0.0
Application Type,object,0,0.0
Application Data Type,object,0,0.0



Dataset: Decisions from turkey
Total duplicated rows: 0
Duplicate percentage: 0.00%

Column summary:


Unnamed: 0,Type,Missing Values,Missing %
Year,object,0,0.0
Country of Origin Code,object,0,0.0
Country of Asylum Code,object,0,0.0
Country of Origin Name,object,0,0.0
Country of Asylum Name,object,0,0.0
Procedure Type,object,0,0.0
Procedure Name,object,0,0.0
Decision Type Code,object,0,0.0
Decision Data Type,object,0,0.0
Decision Data,object,0,0.0


In [14]:
for name, df in primary_datasets.items():
    print(f"\n{name}:")
    print(df.columns.tolist())


Applications to turkey:
['Year', 'Country of Origin Code', 'Country of Asylum Code', 'Country of Origin Name', 'Country of Asylum Name', 'Procedure Type', 'Procedure Name', 'Application Type Code', 'Application Type', 'Application Data Type', 'Application Data', 'Application Average Persons Per Case', 'Number of Applications']

Decisions from turkey:
['Year', 'Country of Origin Code', 'Country of Asylum Code', 'Country of Origin Name', 'Country of Asylum Name', 'Procedure Type', 'Procedure Name', 'Decision Type Code', 'Decision Data Type', 'Decision Data', 'Decisions Average Persons Per Case', 'Recognized', 'Complementary Protection', 'Otherwise Closed', 'Rejected', 'Total Decided']
