In [2]:
import hashlib
import re
from datetime import datetime, timezone
import pandas as pd

In [3]:
EXCLUSION_LIST = ['BLANK', '-', 'NA', 'NONE', '{NULL}']

In [4]:
# clean column names
def clean_cols(df: pd.DataFrame) -> pd.DataFrame:
    df.columns = [col.strip().lower().replace(' ', '_') for col in df.columns]
    return df

In [5]:
def clean_dob(dob, age):
    if dob > datetime.now() and age < 100:
        dob = dob.replace(year=datetime.now().year - age)
    if age >= 100 and dob.year > datetime.now().year - 100:
        dob = dob.replace(year=datetime.now().year - age)
    return dob

In [6]:
def clean_column(value, exclusions=EXCLUSION_LIST):
    if isinstance(value, str) and value.strip().upper() in exclusions:
        return None
    elif isinstance(value, float) and pd.isna(value):
        return None
    else:
        return value

In [7]:
def hash_password(pw: str) -> str:
    if pw is None:
        return None
    else:
        return hashlib.sha256(pw.encode('utf-8')).hexdigest()

In [8]:
def clean_salary(salary: str) -> str:
    if salary is None:
        return None
    else:
        return re.sub(r'[^\d.]', '', salary)

In [None]:
users_uk = pd.read_csv('data/UK User Data.csv', encoding='latin1')
users_uk = clean_cols(users_uk)
users_uk['dob'] = pd.to_datetime(users_uk['dob'], format='%d/%m/%y', errors='coerce')
users_uk.head()

In [31]:
from psutil import users


users_fr = pd.read_csv('data/FR User Data.csv', encoding='utf8')
users_fr = clean_cols(users_fr)
users_fr['dob'] = pd.to_datetime(users_fr['ddn'], format='%y/%m/%d', errors='coerce')
users_fr.head()




Unnamed: 0,prénom,nom_de_famille,ddn,âge_dernier_anniversaire,couleur_préférée,animal_préféré,plat_préféré,genre,mot_de_passe,ville,département,code_postal,adresse_électronique,téléphone,portable,bac+,salaire,visites_du_site_web_au_cours_des_30_derniers_jours,dob
0,Adèle Françoise,Bisset,16-10-01,108,Jaune,Tigre,Ratatouille,F,BUXe$E2Y/4+mX!J,Villevenard,Marne,51270,bisset16@live.com,03 26 80 52 40,06 11 53 00 93,Baccalauréat,"1.581,00",17,NaT
1,Adrien Jacques,Abadie,85-05-05,39,Bleu,Cheval,Cassoulet,M,t4BPtPe.Nis/EJS,Lille,Nord,59800,ajabadie@outlook.com,03 20 15 84 40,06 81 43 00 10,Licentiate,"2.979,50",25,NaT
2,Bruno Jean-Baptiste,Chevrolet,30-06-26,94,Gris,Mouton,Quiche lorraine,M,"68,cj%L4wALVksu",Tarbes,Hautes-Pyrénées,65000,bjbchevy30@live.com,05 62 34 32 36,06 88 76 27 26,Baccalauréat,"1.058,00",29,NaT
3,Cassandre,Fortier,02-03-01,22,Marron,Poule,Crêpes,vide,"vXE,E!9dK,cq4_2",Béziers,Hérault,34500,fortier02@webmail.free.fr,04 67 36 73 73,06 77 70 77 03,Master,"3.785,50",44,NaT
4,Ugène,Gagnon,48-01-05,77,Rouge,Cochon,Bouillabaisse,-,?de/7C9eJ?SdmsZ,Créteil,Val-de-Marne,94000,rougecouchon@mail.ru,01 83 75 56 56,06 01 00 00 69,CFA,"1.581,00",7,NaT


In [30]:
list(users_uk.columns)


['first_name',
 'surname',
 'middle_initials',
 'dob',
 'age_last_birthday',
 'favourite_colour',
 'favourite_animal',
 'favourite_food',
 'gender',
 'password',
 'city',
 'county',
 'postcode',
 'email',
 'phone',
 'mobile',
 'rqf',
 'salary',
 'website_visits_last_30_days']

In [33]:


fr_columns = list(users_uk.columns)
fr_columns.remove('middle_initials')
fr_columns[-3] = 'education' 

users_fr.columns = users_uk.columns 

In [34]:
users_fr.head()

Unnamed: 0,first_name,surname,middle_initials,dob,age_last_birthday,favourite_colour,favourite_animal,favourite_food,gender,password,city,county,postcode,email,phone,mobile,rqf,salary,website_visits_last_30_days
0,Adèle Françoise,Bisset,16-10-01,108,Jaune,Tigre,Ratatouille,F,BUXe$E2Y/4+mX!J,Villevenard,Marne,51270,bisset16@live.com,03 26 80 52 40,06 11 53 00 93,Baccalauréat,"1.581,00",17,NaT
1,Adrien Jacques,Abadie,85-05-05,39,Bleu,Cheval,Cassoulet,M,t4BPtPe.Nis/EJS,Lille,Nord,59800,ajabadie@outlook.com,03 20 15 84 40,06 81 43 00 10,Licentiate,"2.979,50",25,NaT
2,Bruno Jean-Baptiste,Chevrolet,30-06-26,94,Gris,Mouton,Quiche lorraine,M,"68,cj%L4wALVksu",Tarbes,Hautes-Pyrénées,65000,bjbchevy30@live.com,05 62 34 32 36,06 88 76 27 26,Baccalauréat,"1.058,00",29,NaT
3,Cassandre,Fortier,02-03-01,22,Marron,Poule,Crêpes,vide,"vXE,E!9dK,cq4_2",Béziers,Hérault,34500,fortier02@webmail.free.fr,04 67 36 73 73,06 77 70 77 03,Master,"3.785,50",44,NaT
4,Ugène,Gagnon,48-01-05,77,Rouge,Cochon,Bouillabaisse,-,?de/7C9eJ?SdmsZ,Créteil,Val-de-Marne,94000,rougecouchon@mail.ru,01 83 75 56 56,06 01 00 00 69,CFA,"1.581,00",7,NaT


Headers
Fix doB so it's consistent
map gender to conform to uk
check location matches UK format
telephone number spaces
BAC+ mapping to RQF
Salary is in euros, . and , are swapped. monthly instead of annually

In [21]:
# clean columns
for col in users.columns:
    users[col] = users[col].apply(clean_column)
# hash passwords
users['password'] = users['password'].apply(hash_password)
# clean dob
users['dob'] = users.apply(lambda x: clean_dob(x['dob'], x['age_last_birthday']), axis=1)
# clean salary
users['salary'] = users['salary'].apply(clean_salary)

In [22]:
# import other file
# 'Website Visits' validation

In [23]:
def clean_timestamp(ts):
    return datetime.fromtimestamp(ts, tz=timezone.utc).strftime('%Y-%m-%d %H:%M:%S')

In [43]:
logins = pd.read_csv('data/UK-User-LoginTS.csv')
logins = clean_cols(logins)
logins['logints'] = logins['logints'].apply(clean_timestamp)
logins.columns = ['login_id', 'username', 'login_timestamp']
logins.head()

Unnamed: 0,login_id,username,login_timestamp
0,1,card49a@gmail.com,2025-01-05 10:12:40
1,2,card49a@gmail.com,2025-01-09 20:39:23
2,3,card49a@gmail.com,2025-01-14 06:52:53
3,4,card49a@gmail.com,2025-01-18 17:10:01
4,5,card49a@gmail.com,2025-01-23 03:28:32


In [29]:
# quick validation
# logins.isna().sum()
# logins.info()

In [44]:
import sqlite3

print(sqlite3.sqlite_version)

3.50.4


In [45]:
import sqlite3

conn = sqlite3.connect("customers.db")

with open("create_database.sql", "r", encoding="utf-8") as f:
    sql_script = f.read()

conn.executescript(sql_script)
conn.commit()
conn.close()


In [46]:
conn = sqlite3.connect("customers.db")
users.to_sql('users', conn, if_exists='append', index=False)
logins.to_sql('logins', conn, if_exists='append', index=False)

conn.close()