# Principal Data Science Task Template

**Author:** Leila Yousefi   
**Date:** 24/07/2025 ({{ today().strftime("%Y-%m-%d") }})
**Objective:** Briefly restate the problem.

## 1. Installations & Imports

## 2. Data pre-processing
### 2.1 Load CSV file into a DataFrame
### 2.2 Summary statistics
### 2.3 Data Quality Checks & Solutions
#### 2.3.1 Validation
#### 2.3.2 Completeness
#### 2.3.3 Uniqueness

## 3. Exploratory Data Analysis
### 3.1 Univariate distributions
### 3.2 Bivariate relationships

## 4. Feature Engineering & Modelling
### 4.1 Train/test split


## 5. Evaluation & Next Steps


In [None]:
# 1. Installations & Imports: Adjust or add libraries as needed for the task.

# suppress that specific package RuntimeWarning
import warnings
warnings.filterwarnings(
    "ignore",
    category=RuntimeWarning,
    message=".*invalid value encountered in cast.*"
)

# standard libs
import os
import sys
from datetime import datetime

# data libs
import pandas as pd
import numpy as np

# viz libs
import matplotlib.pyplot as plt

# modeling
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

# Fixed seed reused by the split and the models so reruns give the same results.
RANDOM_STATE = 42

# Input and output folders, expressed relative to the current working directory.
DATA_DIR, OUTPUT_DIR = (
    os.path.join("..", "data", stage) for stage in ("raw", "processed")
)

# Echo where the code is running from and which folders it will use.
print("Working directory:", os.getcwd())
print("Notebooks are here:", os.listdir())
print("DATA_DIR:", DATA_DIR)
print("OUTPUT_DIR:", OUTPUT_DIR)

In [None]:
# 2. Data pre-processing: read the raw extract from DATA_DIR and profile it.

### 2.1 Load the CSV file into a DataFrame
filename = 'pre2018_linked_inv_lpa_data.csv'
raw_csv_path = os.path.join(DATA_DIR, filename)
# low_memory=False parses the file in one go rather than in chunks, which
# avoids mixed dtype inference within a column.
df = pd.read_csv(raw_csv_path, low_memory=False)

# Preview the first few records
df.head()

### 2.2 Summary statistics & missing values
df.info()  # column dtypes and non-null counts
df.describe(include="all")  # descriptive statistics for every column


### 2.3 Data Quality Checks & Solutions:

#### 2.3.1 Validation: **Correct format** 

#### 2.3.2 Completeness: **Decisions on missing data**  
- Column dates → drop rows (where both dates are missing)
- Column X → build a derived ID to detect and delete duplicates
- Column Y → impute median  

#### 2.3.3 Uniqueness: **Decisions on duplicates**


In [None]:
#### 2.3.1 Validation: **Correct format**
# Parse both date columns as day-first dates (e.g. "10/11/12" -> 2012-11-10).
# errors='coerce' turns anything unparseable or out of bounds into NaT
# instead of raising.
for date_col in ('registrationdate', 'date_received_in_opg'):
    df[date_col] = pd.to_datetime(df[date_col], errors='coerce', dayfirst=True)

# Report how many values are missing (NaT) in each date column after parsing.
n_reg_missing = df['registrationdate'].isna().sum()
n_opg_missing = df['date_received_in_opg'].isna().sum()
print(f"Missing registrationdate: {n_reg_missing}")
print(f"Missing date_received_in_opg: {n_opg_missing}")

# Calendar parts of the receipt date; year_month is the key for monthly grouping.
received = df['date_received_in_opg'].dt
df['year_month'] = received.to_period('M').dt.to_timestamp()  # start of the month
df['year'] = received.year
df['month'] = received.month
df['day'] = received.day

# Compute the delay in days as receipt date minus registration date. The
# result is NaN whenever either date is missing; those gaps are then filled
# with the mean delay of the same calendar year, falling back to the overall
# mean when a whole year-group has no observed delay.
# NOTE(review): the sign convention (receipt minus registration) is taken
# as-is from the original code — confirm it is the intended direction.
# NOTE(review): section 2.3.2 later drops every row with a missing date, which
# are exactly the rows imputed here — confirm the imputation is still needed.
df['delay_days'] = (df['date_received_in_opg'] - df['registrationdate']).dt.days

# Flag the rows to impute *before* filling them, so the very same rows can be
# reported afterwards (matching on case_no would also pick up rows that merely
# share a case number or have none).
delay_missing_mask = df['delay_days'].isna()
n_delays_missing = delay_missing_mask.sum()
print(f"Missing delays: {n_delays_missing}")
delays_missing_ids = df.loc[delay_missing_mask, 'case_no']

# Pick a "year" to group on: registration year if present, otherwise receipt
# year. Rows with neither date get the sentinel -1, because casting NaN to int
# raises; that sentinel group has no observed delay and so falls through to
# the overall-mean fallback below.
delay_year = (
    df['registrationdate'].dt.year
    .fillna(df['date_received_in_opg'].dt.year)
    .fillna(-1)
    .astype(int)
    .rename('delay_year')
)

# Impute missing delays with the mean for that year. Grouping by the Series
# directly avoids adding and later dropping a temporary column.
year_mean_delay = df.groupby(delay_year)['delay_days'].transform('mean')
df['delay_days'] = df['delay_days'].fillna(year_mean_delay)

# If an entire year had only missing delays, fill those with the overall mean
overall_mean = df['delay_days'].mean()
df['delay_days'] = df['delay_days'].fillna(overall_mean)

# Re-count: this is 0 unless no delay could be computed for any row at all.
n_delays_missing = df['delay_days'].isna().sum()
print(f"Missing delays: {n_delays_missing}")

# Show the values that were filled in, for exactly the rows flagged above.
imputed_delays_days = df.loc[delay_missing_mask, 'delay_days']
print(f"imputed delays (per day): {imputed_delays_days}")

print(f"imputed df: {df}")

In [None]:
#### 2.3.2 Completeness: **Decisions on missing data**
# Missing Data Imputation: keep only rows where both key dates are present.
# .copy() makes the filtered result an independent DataFrame, so columns
# assigned to it afterwards do not trigger pandas' SettingWithCopyWarning.
has_both_dates = df['registrationdate'].notna() & df['date_received_in_opg'].notna()
df = df[has_both_dates].copy()

#### 2.3.3 Uniqueness: **Decisions on duplicates**
# Remove duplicates
# Build hybrid unique ID and remove duplicate
def make_derived_id(row):
    """Return the de-duplication key for one record.

    When ``case_no`` is present and not blank, the key is
    ``"<case_no>_<YYYYMMDD>"``, with the date part taken from
    ``date_received_in_opg``. Otherwise the key is ``unique_id`` converted to
    a string.
    """
    case_no = row['case_no']
    has_case_no = pd.notna(case_no) and bool(str(case_no).strip())
    if not has_case_no:
        # No usable case number: fall back to the record's own identifier.
        return str(row['unique_id'])
    received_stamp = row['date_received_in_opg'].strftime('%Y%m%d')
    return f"{case_no}_{received_stamp}"

# Tag every row with its hybrid ID, then keep only the first row per ID.
df['derived_id'] = df.apply(make_derived_id, axis=1)
df = df.drop_duplicates(subset=['derived_id'], keep='first')

# Show both ends of the processed DataFrame.
print("The first few records:", df.head())
print("The last few records:", df.tail())

In [None]:
#### 2.3.4 Accuracy: **measures the correctness of the content of data** 
# Establish which attributes of the data are required and 
# design the logic used to test them based on the business requirement. 
# Consistency is part of Accuracy

In [None]:
# 3. Exploratory Data Analysis: Insert code cells for plots and summary statistics.


# Define the target variable among the columns
# NOTE(review): delay_days is numeric, whereas section 4 fits classifiers
# scored with ROC AUC — confirm which target is intended.
df["target"] = df['delay_days']
# df["target"] = df["concern_type"]

### 3.1 Univariate distributions
fig, ax = plt.subplots()
# A bar per distinct value is unreadable for a numeric target with many
# distinct values, so bin it into a histogram; categorical or low-cardinality
# targets keep the per-value bar chart.
target_is_continuous = (
    pd.api.types.is_numeric_dtype(df["target"]) and df["target"].nunique() > 20
)
if target_is_continuous:
    df["target"].plot(kind="hist", bins=50, ax=ax)
else:
    df["target"].value_counts().plot(kind="bar", ax=ax)
ax.set_title("Target distribution")


# ### 3.2 Bivariate relationships
# plt.scatter(df["feature1"], df["feature2"])
# plt.xlabel("feature1")
# plt.ylabel("feature2")
# plt.show()


In [None]:
# 4. Feature Engineering & Modelling: Develop pipelines under the specified headings and record decisions in Markdown.
### 4.1 Train/test split
y = df["target"]
# The target is created as a copy of an existing column, so that source column
# must not stay among the features: a feature identical to the label leaks the
# answer to the model. Drop every verbatim copy of the target along with it.
leaky_cols = [
    col for col in df.columns
    if col != "target" and df[col].equals(y)
]
X = df.drop(columns=["target", *leaky_cols])
# Hold out 20% of the rows for the final evaluation; the seed fixes the split.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=RANDOM_STATE
)


In [None]:
### 4.2 Preprocessing pipelines
# Split the feature columns by dtype. Columns of any other dtype (for example
# datetimes) land in neither list and are not passed to either transformer.
# NOTE(review): only int64/float64 are selected; columns stored as other
# integer widths (e.g. int32) would be left out — confirm this is intended.
num_feats = X.select_dtypes(include=["int64", "float64"]).columns
cat_feats = X.select_dtypes(include=["object", "category"]).columns

# Numeric columns: fill gaps with the median, then standardise.
numeric_steps = [
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler()),
]
num_pipeline = Pipeline(steps=numeric_steps)

# Categorical columns: fill gaps with the most frequent value, then one-hot
# encode; categories unseen during fit are ignored at transform time.
categorical_steps = [
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("onehot", OneHotEncoder(handle_unknown="ignore")),
]
cat_pipeline = Pipeline(steps=categorical_steps)

# Route each column group to its own pipeline.
preprocessor = ColumnTransformer(
    transformers=[
        ("num", num_pipeline, num_feats),
        ("cat", cat_pipeline, cat_feats),
    ]
)


In [None]:
### 4.3 Baseline model: Logistic Regression
# Preprocessing and classifier are chained in one pipeline, so the whole chain
# is refitted inside every cross-validation fold.
baseline_steps = [
    ("prep", preprocessor),
    ("clf", LogisticRegression(random_state=RANDOM_STATE)),
]
baseline = Pipeline(steps=baseline_steps)

# 5-fold cross-validated ROC AUC on the training split; report the fold mean.
scores = cross_val_score(
    estimator=baseline, X=X_train, y=y_train, cv=5, scoring="roc_auc"
)
print("Baseline AUC:", scores.mean())


In [None]:
### 4.4 Advanced model: Random Forest
# Same preprocessing as the baseline, followed by a 100-tree random forest.
rf_steps = [
    ("prep", preprocessor),
    ("clf", RandomForestClassifier(n_estimators=100, random_state=RANDOM_STATE)),
]
rf = Pipeline(steps=rf_steps)

# 5-fold cross-validated ROC AUC on the training split; report the fold mean.
scores_rf = cross_val_score(
    estimator=rf, X=X_train, y=y_train, cv=5, scoring="roc_auc"
)
print("RF AUC:", scores_rf.mean())


In [None]:
# 5. Evaluation & Next Steps: Clearly report metrics, visualizations, and recommended follow‑up actions.

### 5.1 Final test performance
from sklearn.metrics import roc_auc_score, classification_report

# Fit the random-forest pipeline on the full training split.
rf.fit(X_train, y_train)

# Score the held-out split using the second column of predict_proba.
# NOTE(review): taking column 1 presumes a binary target — confirm.
y_pred = rf.predict_proba(X_test)[:, 1]
test_auc = roc_auc_score(y_test, y_pred)
print("Test AUC:", test_auc)

# Per-class precision/recall/F1 from the hard label predictions.
test_labels = rf.predict(X_test)
print(classification_report(y_test, test_labels))



### 5.2 Insights & Recommendations
- **Key finding 1:** …
- **Key finding 2:** …
- **Limitations:** data quality, potential biases
- **Next steps:** hyper-parameter tuning, fairness audit, productionize pipeline
