In [162]:
import pandas as pd
import numpy as np
from datetime import datetime
import re
import logging

##  Set up logging configuration

In [163]:
logging.basicConfig(
    filename="data_cleaning.log",
    level=logging.INFO,
    format="%(asctime)s - %(levelname)s - %(message)s"
)

def log_step(step_name):
    logging.info(f"Started: {step_name}")
    
def log_error(step_name, error):
    logging.error(f"Error in {step_name}: {error}")

def log_success(step_name):
    logging.info(f"Success: {step_name}")

##  Load the CSV file into a DataFrame

In [164]:
columns_to_keep = [
    "nom",
    "prenom",
    "date_naissance",
    "cin",
    "tel",
    "email",
    "diplome",
    "etablissment",
    "formation",
    "lettre_motivation",
    "etat",
    "viewed",
    "contacte",
    "inscrit",
    "created",
    "ville",
]
data = pd.read_csv("data.csv")

In [165]:
data = data[columns_to_keep]

In [None]:
data

## Define validation and correction functions

### nom and prenom columns

In [166]:
def is_valid_name(value):
    if not isinstance(value, str):
        return False
    pattern = re.compile(r"^[A-Za-z -]*$")
    return bool(pattern.match(value))


def correct_name(value):
    if is_valid_name(value):
        return value.strip().title()
    return None

### date_naissance column

In [167]:
def is_valid_date(value):
    try:
        pd.to_datetime(value, format="%Y-%m-%d", errors="raise")
        return True
    except (ValueError, TypeError):
        return False


def correct_date(value):
    if is_valid_date(value):
        return value
    else:
        return None

### cin column

In [168]:
def is_valid_cin(value):
    if not isinstance(value, str):
        return False
    pattern = re.compile(r"^[A-Za-z]{1,2}\d{3,}$")
    return bool(pattern.match(value))


def correct_cin(value):
    if is_valid_cin(value):
        return value.upper()
    else:
        return None

### tel column

In [169]:
def is_valid_tel(value):
    if not isinstance(value, str):
        return False
    pattern = re.compile(r"^\+2126\d{8}$")
    return bool(pattern.match(value))


def correct_tel(value):
    if is_valid_tel(value):
        return value
    pattern = re.compile(r"^6\d{8}$")
    if bool(pattern.match(value)) == True:
        return "+212" + value
    pattern = re.compile(r"^2126\d{8}$")
    if bool(pattern.match(value)) == True:
        return "+" + value
    return None

### email column

In [170]:
def is_valid_email(value):
    if not isinstance(value, str):
        return False
    pattern = re.compile(r"^[a-zA-Z0-9._-]+@[a-zA-Z]+\.[a-zA-Z]+$")
    return bool(pattern.match(value))


email_domains = [
    "@gmail.com",
    "@yahoo.com",
    "@outlook.com",
    "@hotmail.com",
    "@icloud.com",
    "@aol.com",
    "@protonmail.com",
    "@zoho.com",
    "@gmx.com",
    "@mail.com",
    "@yandex.com",
]


def correct_email(value):
    value = value.replace(" ", "")
    if is_valid_email(value):
        return value.lower()
    else:
        if value:
            if "@" in value:
                username, partial_domain = email.split("@", 1)
                partial_domain = "@" + partial_domain
                for domain in email_domains:
                    if domain.startswith(partial_domain):
                        return username + domain
            else:
                return value + "@gmail.com"
    return None

### diplome column

In [171]:
def is_valid_diplome(value):
    if not isinstance(value, str):
        return False
    pattern = re.compile(r"^bac\+\d$")
    return bool(pattern.match(value)) or value == "autre"


def correct_diplome(value):
    value = value.replace(" ", "").lower()
    if is_valid_diplome(value):
        return value
    else:
        return None

### created column

In [172]:
def is_valid_created(value):
    if not isinstance(value, str):
        return False
    try:
        pd.to_datetime(value, format="%Y-%m-%d %H:%M:%S", errors="raise")
        return True
    except ValueError:
        return False


def correct_created(value):
    if is_valid_created(value):
        return value
    else:
        return None

### etat, viewed, contacte and inscrit columns

In [173]:
def is_single_digit(value):
    pattern = re.compile(r"^\d$")
    return bool(pattern.match(value))


def correct_single_digit(value):
    if is_single_digit(value):
        return value
    else:
        return 0

### fill ville column

In [174]:
cin_city = {
    "A": "Rabat",
    "AA": "Rabat",
    "AC": "Rabat",
    "AJ": "Rabat",
    "AB": "Salé",
    "AE": "Salé",
    "AY": "Salé",
    "AS": "Salé",
    "AD": "Témara",
    "B": "Casablanca",
    "BA": "Casablanca",
    "BB": "Casablanca",
    "BE": "Casablanca",
    "BH": "Casablanca",
    "BJ": "Casablanca",
    "BK": "Casablanca",
    "BL": "Casablanca",
    "BM": "Casablanca",
    "BF": "Casablanca",
    "BV": "Casablanca",
    "BW": "Casablanca",
    "BX": "Moroccans residing abroad (MRE)",
    "DF": "Moroccans residing abroad (MRE)",
    "PK": "Moroccans residing abroad (MRE)",
    "PP": "Moroccans residing abroad (MRE)",
    "PS": "Moroccans residing abroad (MRE)",
    "PH": "Moroccans residing abroad (MRE)",
    "C": "Fez",
    "CC": "Fez",
    "CD": "Fez",
    "CB": "Sefrou",
    "CN": "Boulemane",
    "D": "Meknes",
    "DN": "Meknes",
    "DA": "Azrou",
    "DB": "Ifrane",
    "DC": "Moulay Idriss Zerhoun",
    "DJ": "Ain Taoujdate",
    "DN": "El Hajeb",
    "DO": "Ouislane",
    "E": "Marrakesh",
    "EE": "Marrakesh",
    "EA": "Ben Guerir",
    "F": "Oujda",
    "FA": "Berkane",
    "FB": "Taourirt",
    "FC": "El Aioun Sidi Mellouk",
    "FD": "Ain Bni Mathar",
    "FE": "Saïdia",
    "FG": "Figuig",
    "FH": "Jerada",
    "FJ": "Ahfir",
    "FK": "Touissit",
    "FL": "Bouarfa",
    "G": "Kenitra, Sidi Yahya El Gharb",
    "GA": "Sidi Slimane, Sidi Yahya El Gharb",
    "GB": "Souk El Arbaa",
    "GK": "Sidi Kacem",
    "GM": "Ouazzane",
    "GN": "Mechra Bel Ksiri",
    "GJ": "Jorf El Melha",
    "H": "Safi",
    "HH": "Safi",
    "HA": "Youssoufia",
    "I": "Beni Mellal",
    "IA": "Kasba Tadla",
    "IB": "Fquih Ben Saleh",
    "IC": "Azilal",
    "ID": "Souk Sebt Ould Nemma",
    "IE": "Demnate",
    "J": "Agadir",
    "JK": "Agadir",
    "JA": "Guelmim",
    "JB": "Inezgane, Dcheira El Jihadia",
    "JC": "Taroudant",
    "JD": "Sidi Ifni",
    "JE": "Tiznit",
    "JF": "Tan-Tan",
    "JH": "Chtouka Aït Baha",
    "JM": "Aït Melloul, Temsia, Lqliâa, Oulad Dahou",
    "JT": "Oulad Teima",
    "JY": "Tata",
    "JZ": "Assa-Zag",
    "K": "Tangier",
    "KB": "Tangier",
    "KA": "Asilah",
    "L": "Tétouan",
    "LA": "Larache",
    "LB": "Ksar el-Kebir",
    "LC": "Chefchaouen",
    "LE": "Martil",
    "LF": "Fnideq",
    "LG": "M'diq",
    "M": "El Jadida",
    "MA": "Azemmour",
    "MC": "Sidi Bennour",
    "MD": "Zemamra",
    "N": "Essaouira",
    "O": "Dakhla",
    "OD": "Dakhla",
    "P": "Ouarzazate",
    "PA": "Tinghir",
    "PB": "Zagora",
    "Q": "Khouribga",
    "QA": "Oued Zem",
    "R": "Al Hoceima",
    "RB": "Imzouren",
    "RC": "Targuist",
    "RX": "Bni Bouayach",
    "S": "Nador",
    "SA": "Nador",
    "SH": "Laayoune",
    "SJ": "Smara",
    "SK": "Tarfaya",
    "SL": "Boujdour",
    "T": "Mohammedia",
    "TA": "Benslimane",
    "TK": "Benslimane",
    "U": "Errachida",
    "UA": "Goulmima",
    "UB": "Er-Rich",
    "UC": "Erfoud",
    "UD": "Rissani",
    "V": "Khenifra",
    "VA": "Midelt, Itzer",
    "VM": "M'rirt",
    "W": "Settat",
    "WA": "Berrechid",
    "WB": "Ben Ahmed",
    "X": "Khemisset",
    "XA": "Tifelt",
    "Y": "Kalaat Sraghna",
    "Z": "Taza",
    "ZG": "Guercif",
    "ZH": "Karia Ba Mohamed",
    "ZT": "Taounate",
}


def get_ville(cin):
    if is_valid_cin(cin):
        cin_code = None
        pattern = re.compile(r"^[A-Za-z]+")
        match = pattern.match(cin)
        if match:
            cin_code = match.group(0).upper()
            return cin_city[cin_code]
    return None

In [175]:
correct_func = {
	"nom": correct_name,
	"prenom": correct_name,
	"date_naissance": correct_date,
	"cin": correct_cin,
	"tel": correct_tel,
	"email": correct_email,
	"diplome": correct_diplome,
	"created": correct_created,
	"viewed": is_single_digit,
	"contacte": is_single_digit,
	"inscrit": is_single_digit,
}

check_func = {
	"nom": is_valid_name,
	"prenom": is_valid_name,
	"date_naissance": is_valid_date,
	"cin": is_valid_cin,
	"tel": is_valid_tel,
	"email": is_valid_email,
	"diplome": is_valid_diplome,
	"created": is_valid_created,
	"viewed": correct_single_digit,
	"contacte": correct_single_digit,
	"inscrit": correct_single_digit,
}

In [185]:
column = 'nom'
data[column].apply(correct_func[column])

0         Er-Raki
1        Lihyaoui
2        Medioune
3        Elgzouli
4      Belkotaine
          ...    
142        Hamcha
143       Mastour
144        Bennah
145        Hidara
146        Fakkar
Name: nom, Length: 147, dtype: object

In [135]:
# logging.basicConfig(
#     filename="data_cleaning.log",
#     level=logging.INFO,
#     format="%(asctime)s:%(levelname)s:%(message)s",
# )


# def log_anomalies(anomaly_type, count, details):
#     """Log detected anomalies"""
#     logging.info(f"{anomaly_type} - Count: {count}")
#     logging.info(f"Details: {details}")

In [136]:
def check_missing_values(df):
    missing_values = df.isnull().sum()
    return missing_values

In [137]:

# def check_name_column(df, column_name):
# 	return df[column_name].apply(is_valid_name)

# def check_date_column(df, column_name):
#     return df[column_name].apply(is_valid_date)

# def check_cin_column(df, column_name):
#     return df[column_name].apply(is_valid_cin)

# def check_tel_column(df, column_name) :
#     return df[column_name].apply(is_valid_email)

# def check_email_column(df, column_name) :
#     return df[column_name].apply(is_valid_email)

# def check_diplome_column(df, column_name) :
#     return df[column_name].apply(is_valid_diplome)

# def check_created_column(df, column_name) :
#     return df[column_name].apply(is_valid_created)

# def check_single_digit_column(df, column_name):
#     return df[column_name].apply(is_single_digit)


## Apply anomaly detection and correction to columns

In [138]:
def check_name_column(df, column_name):
	return df[column_name].apply(is_valid_name)
def check_column(df, column_name) :
	

IndentationError: expected an indented block (4172942962.py, line 4)

In [139]:
data.loc[0, 'nom'] = 'bahi'

In [140]:
data

Unnamed: 0,nom,prenom,date_naissance,cin,tel,email,diplome,etablissment,formation,lettre_motivation,etat,viewed,contacte,inscrit,created,ville
0,bahi,MERYEM,1994-07-21,G664463,672619167,meryem.er-raki@outlook.fr,bac+3,UNIVERSITE IBN TOFAI,mi,A Monsieur le directeur de l’École National De...,0,1,1,0,2015-05-20 21:11:59,
1,Lihyaoui,Amine,0000-00-00,DA70851,634432230,amustube@gmail.com,bac+2,ISTA IFRANE,lar,Lihyaoui Amine 244 Bloc Elmassira 25000/khour...,0,1,1,0,2015-05-22 17:52:26,
2,MEDIOUNE,Badr,1991-04-03,BB 80536,622431159,badr.medioune@gmail.com,bac+2,ISTA Sidi Moumen,lar,"Monsieur, Titulaire du diplôme technicien spéc...",0,1,1,0,2015-05-23 15:00:16,
3,ELGZOULI,AMAl,1992-05-25,G634563,619957741,amale.elgzouli@gmail.com,bac+3,FST,mi,,0,1,1,0,2015-05-06 00:46:07,
4,belkotaine,manal,1995-07-03,ID64786,635531363,witch_girl16@hotmail.com,bac+2,ISTA NTIC Beni-Mella,lar,,0,1,1,0,2015-05-07 15:40:07,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
142,hamcha,abdrrahman,1969-12-30,i245113,661414268,a.hamcha@africlight.ma,bac+2,ista beni mellal,MGP,bonjour,0,1,1,0,2015-10-28 13:12:27,
143,mastour,amina,1993-03-21,gh679863,694950654,imdoha@gmail.com,bac+2,ista ouedzem,LRT,ddd,1,0,1,0,2015-10-28 18:07:02,
144,bennah,khalil,1993-11-04,OD44737,649278532,khalilbennah@gmail.com,bac+3,faculté polydisciplinaire Errachidia,mi,"Après avoir obtenu mon ""diplôme de Licence pro...",0,0,0,0,2015-11-04 14:25:14,
145,HIDARA,ADIL,1986-02-07,Q260030,615227007,hid.adil@gmail.com,bac+2,ISTA & ENMIG,lil,Très motivé pour continuer mes etude en ce dom...,0,1,0,0,2015-11-11 17:58:45,


In [142]:
for column in data.columns:
    if column in check_func :
        anomalies = data[column].apply(check_func[column])
        if anomalies.any():
            # logging.info(f"Anomalies detected in column: {column}. Applying correction function.")
            data[column] = data[column].apply(correct_func[column])
        else:
            pass
    else :
        pass	


TypeError: expected string or bytes-like object