In [1]:
import pandas as pd
import numpy as np
from datetime import datetime
import re
import logging

In [2]:
# Load the raw CSV; the path is relative to the notebook's working directory.
data = pd.read_csv('data.csv')


In [4]:
# Configure the root logger: records at INFO level and above are written to
# data_cleaning.log as "timestamp:LEVEL:message".
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s:%(levelname)s:%(message)s',
    filename='data_cleaning.log',
)

def log_anomalies(anomaly_type, count, details):
    """Emit two INFO records for a detected anomaly.

    The first record holds the anomaly type and its count, the second the
    details. Both are sent through the module-level ``logging.info``.
    """
    summary_line = f'{anomaly_type} - Count: {count}'
    logging.info(summary_line)

    details_line = f'Details: {details}'
    logging.info(details_line)

In [5]:
def check_missing_values(df):
    """Return, for each column of ``df``, the number of null entries."""
    null_mask = df.isnull()
    return null_mask.sum()

In [None]:
def is_valid_name(value):
    """Return True when ``value`` is a plausible personal name.

    A valid name is a ``str`` that contains at least one alphabetic character
    and otherwise only spaces or hyphens. Non-string values return False.
    """
    if not isinstance(value, str):
        return False
    # An empty, blank or hyphen-only string has no character to reject, so
    # it must be ruled out explicitly.
    if not any(char.isalpha() for char in value):
        return False
    return all(char.isalpha() or char in (' ', '-') for char in value)
def is_valid_date(value):
    """Return True when ``value`` parses as a ``YYYY-MM-DD`` date.

    Missing values (``None``, NaN, ``NaT``, empty string) return False.
    ``pd.to_datetime`` does not raise for them; it returns ``None`` or
    ``NaT``, so the parsed result has to be inspected as well.
    """
    try:
        parsed = pd.to_datetime(value, format='%Y-%m-%d', errors='raise')
    except (ValueError, TypeError):
        return False
    # A null result means the input was a missing-value marker, not a date.
    return parsed is not None and parsed is not pd.NaT

        
def is_valid_cin(value):
    """Return True when ``value`` looks like a CIN identifier.

    Accepted form: one or two ASCII letters followed by at least three ASCII
    digits, with nothing before or after. Non-string values return False.
    """
    if not isinstance(value, str):
        return False
    # fullmatch (rather than match + '$') rejects a trailing newline, and
    # [0-9] (rather than \d) rejects non-ASCII digit characters.
    return re.fullmatch(r'[A-Za-z]{1,2}[0-9]{3,}', value) is not None

def is_valid_tel(value):
    """Return True when ``value`` is a phone number in canonical form.

    Canonical form is ``+2126`` followed by exactly eight ASCII digits, with
    nothing before or after. Non-string values return False.
    """
    if not isinstance(value, str):
        return False
    # fullmatch (rather than match + '$') rejects a trailing newline, and
    # [0-9] (rather than \d) rejects non-ASCII digit characters.
    return re.fullmatch(r'\+2126[0-9]{8}', value) is not None

def is_valid_email(value):
    """Return True when ``value`` has the shape of an e-mail address.

    Local part: ASCII letters, digits, ``.``, ``_``, ``+`` and ``-``.
    Domain: one or more dot-separated labels of letters, digits or hyphens,
    followed by an alphabetic top-level label. This accepts everything the
    earlier single-label pattern accepted, plus addresses such as
    ``name@um5.ac.ma`` or ``first-last@mail-host.com``.
    Non-string values return False.
    """
    if not isinstance(value, str):
        return False
    # fullmatch anchors both ends and, unlike '$', rejects a trailing newline.
    pattern = r'[A-Za-z0-9._+-]+@(?:[A-Za-z0-9-]+\.)+[A-Za-z]+'
    return re.fullmatch(pattern, value) is not None

def is_valid_diplome(value):
    """Return True when ``value`` is an accepted diploma label.

    Accepted labels are the literal ``'autre'`` or ``'bac+'`` followed by
    exactly one ASCII digit. Non-string values return False.
    """
    if not isinstance(value, str):
        return False
    # fullmatch (rather than match + '$') rejects a trailing newline, and
    # [0-9] (rather than \d) rejects non-ASCII digit characters.
    return value == 'autre' or re.fullmatch(r'bac\+[0-9]', value) is not None

def is_valid_created(value):
    """Return True when ``value`` is a ``YYYY-MM-DD HH:MM:SS`` timestamp string.

    Non-string values return False. Strings that ``pd.to_datetime`` treats as
    missing-value markers (for example the empty string) parse to ``NaT``
    without raising, so the parsed result is checked too.
    """
    if not isinstance(value, str):
        return False
    try:
        parsed = pd.to_datetime(value, format='%Y-%m-%d %H:%M:%S', errors='raise')
    except ValueError:
        return False
    # NaT means the string was a null marker rather than a real timestamp.
    return parsed is not pd.NaT

def is_single_digit(value):
    """Return True when ``value`` is a string made of exactly one ASCII digit.

    Non-string values return False, so an integer such as ``1`` is rejected.
    NOTE(review): confirm the columns checked with this are read as text.
    NOTE: the same name is defined again later in the notebook; the later
    definition is the one in effect once both cells have run.
    """
    if not isinstance(value, str):
        return False
    # fullmatch (rather than match + '$') rejects a trailing newline, and
    # [0-9] (rather than \d) rejects non-ASCII digit characters.
    return re.fullmatch(r'[0-9]', value) is not None


In [None]:
def correct_name(value):
    """Return ``value`` upper-cased if it passes ``is_valid_name``, else ``None``."""
    return value.upper() if is_valid_name(value) else None

def correct_date(value):
    """Return ``value`` unchanged if it passes ``is_valid_date``, else ``None``."""
    if not is_valid_date(value):
        return None
    return value
        
def correct_cin(value):
    """Return ``value`` upper-cased if it passes ``is_valid_cin``, else ``None``."""
    cin_ok = is_valid_cin(value)
    return value.upper() if cin_ok else None


def correct_tel(value):
    """Normalise a phone number to the ``+2126XXXXXXXX`` form.

    Returns:
        * ``value`` unchanged when ``is_valid_tel`` already accepts it;
        * ``'+212' + value`` when it is ``6`` followed by eight digits;
        * ``'+' + value`` when it is ``2126`` followed by eight digits;
        * ``None`` for anything else, including non-string input such as NaN.
    """
    # The regex calls below raise TypeError on non-strings, so bail out first.
    if not isinstance(value, str):
        return None
    if is_valid_tel(value):
        return value
    # Number written without any country prefix.
    if re.fullmatch(r'6[0-9]{8}', value):
        return '+212' + value
    # Country code present but the leading '+' is missing.
    if re.fullmatch(r'2126[0-9]{8}', value):
        return '+' + value
    return None

    

def correct_email(value):
    """Try to repair an e-mail address; return the fixed address or ``None``.

    Steps:
        1. Non-string input returns ``None``.
        2. Spaces are removed.
        3. If the result passes ``is_valid_email`` it is returned lower-cased.
        4. Otherwise, if it contains ``@``, the text after the first ``@`` is
           treated as a truncated domain and completed with the first known
           provider domain that starts with it.
        5. Otherwise, if it is non-empty, ``@gmail.com`` is appended.
        6. A repaired address is returned (lower-cased) only if it passes
           ``is_valid_email``; in every other case the result is ``None``.
    """
    if not isinstance(value, str):
        return None

    value = value.replace(' ', '')
    if is_valid_email(value):
        return value.lower()
    if not value:
        return None

    email_domains = (
        "@gmail.com",
        "@yahoo.com",
        "@outlook.com",
        "@hotmail.com",
        "@icloud.com",
        "@aol.com",
        "@protonmail.com",
        "@zoho.com",
        "@gmx.com",
        "@mail.com",
        "@yandex.com",
    )

    if '@' in value:
        # Split the cleaned input itself; the earlier code referenced an
        # undefined name here and raised NameError.
        username, partial_domain = value.split("@", 1)
        partial_domain = "@" + partial_domain.lower()
        candidate = next(
            (username + domain for domain in email_domains
             if domain.startswith(partial_domain)),
            None,
        )
    else:
        candidate = value + "@gmail.com"

    # Never hand back a "corrected" address that the validator would reject.
    if candidate is not None and is_valid_email(candidate):
        return candidate.lower()
    return None

def correct_diplome(value):
    """Return the diploma label with spaces removed, or ``None`` if invalid.

    Non-string input (for example NaN from a missing cell) returns ``None``
    instead of raising ``AttributeError`` on ``.replace``.
    """
    if not isinstance(value, str):
        return None
    value = value.replace(' ', '')
    return value if is_valid_diplome(value) else None

def correct_created(value):
    """Return ``value`` unchanged if it passes ``is_valid_created``, else ``None``.

    The definition previously had no body, which is a syntax error for the
    whole cell. This follows the same keep-or-None contract as ``correct_date``.
    """
    return value if is_valid_created(value) else None


def is_single_digit(value):
    """Return True when ``value`` is a string made of exactly one ASCII digit.

    Non-string values return False.
    NOTE: this name is also defined earlier in the notebook; this later
    definition replaces the earlier one when the cells run in order.
    """
    if not isinstance(value, str):
        return False
    # fullmatch (rather than match + '$') rejects a trailing newline, and
    # [0-9] (rather than \d) rejects non-ASCII digit characters.
    return re.fullmatch(r'[0-9]', value) is not None

In [7]:
def check_name_column(df, column_name):
    """Apply ``is_valid_name`` to every entry of ``df[column_name]`` and return the result."""
    names = df[column_name]
    return names.apply(is_valid_name)

def check_date_column(df, column_name):
    """Apply ``is_valid_date`` to every entry of ``df[column_name]`` and return the result."""
    dates = df[column_name]
    return dates.apply(is_valid_date)

def check_cin_column(df, column_name):
    """Apply ``is_valid_cin`` to every entry of ``df[column_name]`` and return the result."""
    cins = df[column_name]
    return cins.apply(is_valid_cin)

def check_tel_column(df, column_name):
    """Apply ``is_valid_tel`` to every entry of ``df[column_name]`` and return the result.

    The phone validator is used here; the earlier code applied the e-mail
    validator, which flagged every phone number as invalid.
    """
    return df[column_name].apply(is_valid_tel)

def check_email_column(df, column_name):
    """Apply ``is_valid_email`` to every entry of ``df[column_name]`` and return the result."""
    emails = df[column_name]
    return emails.apply(is_valid_email)

def check_diplome_column(df, column_name):
    """Apply ``is_valid_diplome`` to every entry of ``df[column_name]`` and return the result."""
    diplomas = df[column_name]
    return diplomas.apply(is_valid_diplome)

def check_created_column(df, column_name):
    """Apply ``is_valid_created`` to every entry of ``df[column_name]`` and return the result."""
    timestamps = df[column_name]
    return timestamps.apply(is_valid_created)

def check_single_digit_column(df, column_name):
    """Apply ``is_single_digit`` to every entry of ``df[column_name]`` and return the result."""
    flags = df[column_name]
    return flags.apply(is_single_digit)
    

In [8]:
# Columns retained for cleaning; every other column of the loaded frame is dropped.
columns_to_keep = [
    'nom', 'prenom', 'date_naissance', 'cin', 'tel', 'email',
    'diplome', 'etablissment', 'formation', 'lettre_motivation',
    'etat', 'viewed', 'contacte', 'inscrit', 'created',
]
data = data[columns_to_keep]

In [None]:
# Preview only the first rows: the frame holds personal data (names, CIN,
# phone, e-mail), so avoid rendering the whole table into the saved notebook.
data.head()

In [29]:
# Per-row result of is_valid_created for the 'created' column.
print(check_created_column(data, 'created'))

0      True
1      True
2      True
3      True
4      True
       ... 
142    True
143    True
144    True
145    True
146    True
Name: created, Length: 147, dtype: bool


In [None]:
def get_data(data):
    """Return a cleaned copy of ``data``.

    Rows containing missing values are dropped, then duplicate rows are
    dropped, and the index is rebuilt as 0..n-1. The input frame itself is
    not modified.
    """
    return (
        data
        .dropna()
        .drop_duplicates()
        .reset_index(drop=True)
    )