# Data Cleaning Workshop
This notebook implements data cleaning for candidate registration data.

In [1]:
import pandas as pd
import numpy as np
from datetime import datetime
import re
import logging

In [2]:
# Load the data
df = pd.read_csv('data.csv')
print('Data shape:', df.shape)
df.head()

Data shape: (148, 33)


Unnamed: 0.1,Unnamed: 0,Unnamed: 1,Unnamed: 2,Unnamed: 3,Unnamed: 4,Unnamed: 5,Unnamed: 6,Unnamed: 7,Unnamed: 8,Unnamed: 9,...,Unnamed: 23,Unnamed: 24,Unnamed: 25,Unnamed: 26,Unnamed: 27,Unnamed: 28,Unnamed: 29,Unnamed: 30,Unnamed: 31,Unnamed: 32
0,nom,prenom,date_naissance,cin,tel,email,diplome,etablissment,formation,lettre_motivation,...,,,,,,,,,,
1,ER-RAKI,MERYEM,1994-07-21,G664463,672619167,meryem.er-raki@outlook.fr,bac+3,UNIVERSITE IBN TOFAI,mi,A Monsieur le directeur de l’École National De...,...,,,,,,,,,,
2,Lihyaoui,Amine,0000-00-00,DA70851,634432230,amustube@gmail.com,bac+2,ISTA IFRANE,lar,Lihyaoui Amine 244 Bloc Elmassira 25000/khour...,...,,,,,,,,,,
3,MEDIOUNE,Badr,1991-04-03,BB 80536,622431159,badr.medioune@gmail.com,bac+2,ISTA Sidi Moumen,lar,"Monsieur, Titulaire du diplôme technicien spéc...",...,,,,,,,,,,
4,ELGZOULI,AMAl,1992-05-25,G634563,619957741,amale.elgzouli@gmail.com,bac+3,FST,mi,,...,,,,,,,,,,


In [None]:
def detect_missing_values(df):
    """Detect missing values in dataframe"""
    missing = df.isnull().sum()
    return missing[missing > 0]

def detect_date_format_errors(df, date_cols):
    """Detect invalid dates"""
    errors = {} 
    for col in date_cols:
        mask = pd.to_datetime(df[col], errors='coerce').isnull()
        errors[col] = df[mask][col].tolist()
    return errors

def detect_email_format_errors(df, email_col):
    """Detect invalid email formats"""
    email_pattern = r'^[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}$'
    mask = ~df[email_col].str.match(email_pattern)
    return df[mask][email_col].tolist()

def detect_phone_format_errors(df, phone_col):
    """Detect invalid phone numbers"""
    phone_pattern = r'^[0-9]{9,10}$'
    mask = ~df[phone_col].astype(str).str.match(phone_pattern)
    return df[mask][phone_col].tolist()

In [4]:
def correct_dates(df, date_cols):
    """Correct date formats"""
    for col in date_cols:
        df[col] = pd.to_datetime(df[col], errors='coerce')
    return df           

def correct_phone_numbers(df, phone_col):
    """Standardize phone number format"""
    df[phone_col] = df[phone_col].astype(str).str.replace(r'[^0-9]', '')
    return df

def correct_email_case(df, email_col):
    """Convert emails to lowercase"""
    df[email_col] = df[email_col].str.lower()
    return df

In [5]:
# Setup logging
logging.basicConfig(
    filename='data_cleaning.log',
    level=logging.INFO,
    format='%(asctime)s:%(levelname)s:%(message)s'
)

def log_anomalies(anomaly_type, count, details):
    """Log detected anomalies"""
    logging.info(f'{anomaly_type} - Count: {count}')
    logging.info(f'Details: {details}')

In [6]:
# Execute cleaning workflow
date_cols = ['date_naissance', 'created']

# Detect anomalies
missing_vals = detect_missing_values(df)
date_errors = detect_date_format_errors(df, date_cols)
email_errors = detect_email_format_errors(df, 'email')
phone_errors = detect_phone_format_errors(df, 'tel')

# Log anomalies
log_anomalies('Missing Values', len(missing_vals), missing_vals)
log_anomalies('Date Format Errors', sum(len(v) for v in date_errors.values()), date_errors)
log_anomalies('Email Format Errors', len(email_errors), email_errors)
log_anomalies('Phone Format Errors', len(phone_errors), phone_errors)

# Apply corrections
df = correct_dates(df, date_cols)
df = correct_phone_numbers(df, 'tel')
df = correct_email_case(df, 'email')

# Save cleaned data
df.to_csv('cleaned_data.csv', index=False)
print('Cleaning complete - see data_cleaning.log for details')

KeyError: 'date_naissance'