In [1]:
import hashlib
import re
from datetime import datetime, timezone
import pandas as pd
import sqlite3

In [2]:
EXCLUSION_LIST = ['BLANK', '-', 'NA', 'NONE', '{NULL}', 'VIDE']

In [3]:
# clean column names
def clean_cols(df: pd.DataFrame) -> pd.DataFrame:
    df.columns = [col.strip().lower().replace(' ', '_') for col in df.columns]
    return df

In [4]:
def clean_dob(dob, age):
    if dob > datetime.now() and age < 100:
        dob = dob.replace(year=datetime.now().year - age)
    if age >= 100 and dob.year > datetime.now().year - 100:
        dob = dob.replace(year=datetime.now().year - age)
    return dob

In [5]:
def clean_column(value, exclusions=EXCLUSION_LIST):
    if isinstance(value, str) and value.strip().upper() in exclusions:
        return None
    elif isinstance(value, float) and pd.isna(value):
        return None
    else:
        return value

In [6]:
def hash_password(pw: str) -> str:
    if pw is None:
        return None
    else:
        return hashlib.sha256(pw.encode('utf-8')).hexdigest()

In [10]:
def clean_salary(salary: str, period:int=1) -> float:
    if salary is None:
        return None
    else:
        salary = (int(re.sub(r'[^\d]', '', salary))/100) * period
        return salary

In [11]:
users_uk = pd.read_csv('data/UK User Data.csv', encoding='latin1')
users_uk = clean_cols(users_uk)
users_uk['dob'] = pd.to_datetime(users_uk['dob'], format='%d/%m/%y', errors='coerce')
users_uk.head()

Unnamed: 0,first_name,surname,middle_initials,dob,age_last_birthday,favourite_colour,favourite_animal,favourite_food,gender,password,city,county,postcode,email,phone,mobile,rqf,salary,website_visits_last_30_days
0,Derek,Card,A,2065-01-07,60,Red,Elephant,Bangers and Mash,Male,Parishaggis17%,Arundel,West Sussex,BN18 9PA,card49a@gmail.com,01903 882543,07787 557197,3,"19,500.00",7
1,David,Button,none,1999-08-22,25,Green,Giraffe,Cottage Pie,Male,CarTrain 56$,Bath,Avon,BA1 2QZ,button76@outlook.com,01225 413106,07961 102199,4,"21,000.00",15
2,Ian,Smythe,JO,2025-01-03,100,Blue,Cat,Toad in the Hole,blank,1945Tank*,Chester,Cheshire,CH2 1EU,long.65.morning@icloud.com,01244 380280,07594 146913,5,"23,000.00",28
3,Samantha,Jones,D,1991-03-24,33,Indigo,Wolf,Roast,Female,Yorkshire!3Pig,Dursley,Gloucestershire,GL11 4CD,busybusy@yahoo.com,01453 580136,07577 752530,6,"32,500.00",34
4,Wendy,Brown,L,2014-01-29,11,Pink,Puppy,Fish and Chips,Female,Snoopy78Peanut!,Frome,Somerset,BA11 7RT,brownsheep@flock.com,01373 253333,07768 852327,-,na,3


In [12]:
# clean columns
for col in users_uk.columns:
    users_uk[col] = users_uk[col].apply(clean_column)
# hash passwords
users_uk['password'] = users_uk['password'].apply(hash_password)
# clean dob
users_uk['dob'] = users_uk.apply(lambda x: clean_dob(x['dob'], x['age_last_birthday']), axis=1)
# clean salary
users_uk['salary'] = users_uk['salary'].apply(clean_salary)
users_uk

Unnamed: 0,first_name,surname,middle_initials,dob,age_last_birthday,favourite_colour,favourite_animal,favourite_food,gender,password,city,county,postcode,email,phone,mobile,rqf,salary,website_visits_last_30_days
0,Derek,Card,A,1965-01-07,60,Red,Elephant,Bangers and Mash,Male,5e30d824b17bd930b9280c126a717d59ccdb4cd05aa8ee...,Arundel,West Sussex,BN18 9PA,card49a@gmail.com,01903 882543,07787 557197,3.0,19500.0,7
1,David,Button,,1999-08-22,25,Green,Giraffe,Cottage Pie,Male,22aa055adf8caa10b761514ffed59044adbc14a363c34c...,Bath,Avon,BA1 2QZ,button76@outlook.com,01225 413106,07961 102199,4.0,21000.0,15
2,Ian,Smythe,JO,1925-01-03,100,Blue,Cat,Toad in the Hole,,1d82e587a6c6a44b1833e2a1ce7460a1ae0b74ca24afc5...,Chester,Cheshire,CH2 1EU,long.65.morning@icloud.com,01244 380280,07594 146913,5.0,23000.0,28
3,Samantha,Jones,D,1991-03-24,33,Indigo,Wolf,Roast,Female,3bedb97c70c5ae128ef084645556bfbcf4572dde3e028d...,Dursley,Gloucestershire,GL11 4CD,busybusy@yahoo.com,01453 580136,07577 752530,6.0,32500.0,34
4,Wendy,Brown,L,2014-01-29,11,Pink,Puppy,Fish and Chips,Female,59700b2f9a7569c7a4e3862b29e4b04806714c79acaabf...,Frome,Somerset,BA11 7RT,brownsheep@flock.com,01373 253333,07768 852327,,,3
5,Jude,Thomas,,1952-10-06,73,Black,Badger,Curry,Male,0cf67c5ec09b4211deea15515beea2485d96e2d80a1566...,Ipswich,Suffolk,IP1 2DA,thomasold@gmail.com,01473 712233,07570 282737,2.0,11541.9,16
6,Blake,Abney-James,,2008-10-02,16,Teal,Goose,Pizza,Female,bac8c6138fac3aade4b2ed077a25a7fb73856d3e99f49c...,Andover,Hampshire,SP10 2EA,abneyallseeing@outlook.com,01264 338733,07812 132687,2.0,1331.2,22
7,Indigo,Pearce,Y,1955-07-25,70,Grey,Crab,Curry,Non-binary,9fca73975f6e7db416bde669f4360a35647b918db570ec...,Rhyl,Clwyd,LL18 1AS,junk@icloud.com,01745 344567,03301 623763,6.0,33000.0,42
8,Rowan,Weaver,,1973-08-25,51,Cyan,Cow,Crumpets,,be1cd42a7a307da7fdc0f01eadfc7384edbcdbfcae8128...,Warminster,Wiltshire,BA12 9BT,myotheraddress@gmail.com,01985 068271,07305 268271,7.0,41275.0,52
9,Jordan,Mayfield,,1975-11-14,49,Violet,Beaver,Pie and Chips,Prefer not to answer,397f623fe2e928e1a455e6ae2985ad4082824a9d1b7908...,Yelverton,Devon,PL20 6DT,mayfield_all@gmail.com,01822 618440,07903 438339,8.0,52370.0,29


In [13]:
fr_columns = list(users_uk.columns)
fr_columns.remove('middle_initials')
fr_columns[-3] = 'education'

In [14]:
fr_mapping_gender = {'F':'Female', 'M': 'Male', 'NB': 'Non-Binary'}

In [15]:
def map_gender(df:pd.DataFrame, mapping:dict):
    df['gender'] = df['gender'].map(mapping)
    return df

In [16]:
def clean_phone_number(df:pd.DataFrame, column:str) -> pd.DataFrame:
    df[column] = df[column].str.replace(' ', '')
    return df

In [17]:
users_fr = pd.read_csv('data/FR User Data.csv')
users_fr.columns = fr_columns
users_fr['dob'] = pd.to_datetime(users_fr['dob'], format='%y-%m-%d', errors='coerce')
users_fr.head(10)

Unnamed: 0,first_name,surname,dob,age_last_birthday,favourite_colour,favourite_animal,favourite_food,gender,password,city,county,postcode,email,phone,mobile,education,salary,website_visits_last_30_days
0,Adèle Françoise,Bisset,2016-10-01,108,Jaune,Tigre,Ratatouille,F,BUXe$E2Y/4+mX!J,Villevenard,Marne,51270,bisset16@live.com,03 26 80 52 40,06 11 53 00 93,Baccalauréat,"1.581,00",17
1,Adrien Jacques,Abadie,1985-05-05,39,Bleu,Cheval,Cassoulet,M,t4BPtPe.Nis/EJS,Lille,Nord,59800,ajabadie@outlook.com,03 20 15 84 40,06 81 43 00 10,Licentiate,"2.979,50",25
2,Bruno Jean-Baptiste,Chevrolet,2030-06-26,94,Gris,Mouton,Quiche lorraine,M,"68,cj%L4wALVksu",Tarbes,Hautes-Pyrénées,65000,bjbchevy30@live.com,05 62 34 32 36,06 88 76 27 26,Baccalauréat,"1.058,00",29
3,Cassandre,Fortier,2002-03-01,22,Marron,Poule,Crêpes,vide,"vXE,E!9dK,cq4_2",Béziers,Hérault,34500,fortier02@webmail.free.fr,04 67 36 73 73,06 77 70 77 03,Master,"3.785,50",44
4,Ugène,Gagnon,2048-01-05,77,Rouge,Cochon,Bouillabaisse,-,?de/7C9eJ?SdmsZ,Créteil,Val-de-Marne,94000,rougecouchon@mail.ru,01 83 75 56 56,06 01 00 00 69,CFA,"1.581,00",7
5,José-Maria,Lamar,2011-10-13,13,Vert,Lapin,Chocolate soufflé,F,F8%cM3?sjQP@JnY,Poitiers,Vienne,86000,lapinfou67@list-manage.com,05 49 88 12 34,06 95 83 13 62,Collège,na,14
6,Sacha,Martel,2008-11-02,16,Rose,Poisson Rouge,Tarte Tatin,NB,i9/_yz&3mG4+Za$,Vannes,Morbihan,56000,sachalepoisson@live.com,02 97 54 34 34,07 88 15 75 58,Lycée,12500,32
7,Elvire Françoise,Sartre,2063-02-11,62,Noir,Souris,Croque monsieur,F,gSNzDVa?rur2GT5,Nevers,Nièvre,58000,noirsartre@outlook.com,03 86 36 15 15,07 89 63 13 57,Doctorat,"4.800,00",24
8,Émile Jean,Travers,1993-02-09,32,Argent,Cerf,Coq au vin,M,"?nEz?@x,C$6wK*@",Arras,Pas-de-Calais,62000,travers93@live.com,03 21 23 69 69,06 61 51 90 25,Baccalauréat,"1.925,00",56
9,Capucine,Verne,1977-05-15,47,Pourpre,Loup,Boeuf Bourguignon,NB,&P2D_xH&%dhFdg3,Nice,Alpes-Maritimes,6400,verne77@webmail.free.fr,04 93 68 11 49,06 10 82 11 71,Licentiate,"3.025,00",37


In [31]:
# clean columns
for col in users_fr.columns:
    users_fr[col] = users_fr[col].apply(lambda x: clean_column(x, EXCLUSION_LIST))
# hash passwords
users_fr['password'] = users_fr['password'].apply(hash_password)
# clean dob
users_fr['dob'] = users_fr.apply(lambda x: clean_dob(x['dob'], x['age_last_birthday']), axis=1)
# clean gender
users_fr = map_gender(users_fr, fr_mapping_gender)
# clean phone numbers
clean_phone_number(users_fr, 'phone')
clean_phone_number(users_fr, 'mobile')
# clean salary
users_fr['salary'] = users_fr['salary'].apply(clean_salary)
users_fr

TypeError: expected string or bytes-like object, got 'float'

In [35]:
#users_fr['salary'].apply(clean_salary)


s = users_fr["salary"]

# convert to string only where not null, strip symbols/commas; keep dot for decimals
s_clean = s.where(s.isna(), s.astype(str).str.replace(r"[^\d.]", "", regex=True))

# to numeric with errors='coerce' will turn bad/empty strings into NaN
nums = pd.to_numeric(s_clean, errors="coerce")

users_fr["salary_clean"] = (nums / 100.0) * 12  # example: monthly -> annual

users_fr

Unnamed: 0,first_name,surname,dob,age_last_birthday,favourite_colour,favourite_animal,favourite_food,gender,password,city,county,postcode,email,phone,mobile,education,salary,website_visits_last_30_days,salary_clean
0,Adèle Françoise,Bisset,1917-10-01,108,Jaune,Tigre,Ratatouille,,2ba831556eaaea730f6a9b922b50dbc4770c42c9364b37...,Villevenard,Marne,51270,bisset16@live.com,326805240,611530093,Baccalauréat,1581.0,17,189.72
1,Adrien Jacques,Abadie,1985-05-05,39,Bleu,Cheval,Cassoulet,,af68b910075327aa070afdb2293c1963498efa7c599914...,Lille,Nord,59800,ajabadie@outlook.com,320158440,681430010,Licentiate,2979.5,25,357.54
2,Bruno Jean-Baptiste,Chevrolet,1931-06-26,94,Gris,Mouton,Quiche lorraine,,75c9470799d574a2e560813e1e8fb4617b4744e7b6fbd7...,Tarbes,Hautes-Pyrénées,65000,bjbchevy30@live.com,562343236,688762726,Baccalauréat,1058.0,29,126.96
3,Cassandre,Fortier,2002-03-01,22,Marron,Poule,Crêpes,,eae5649b48f35da084d93f4551eb1824ad705ddc8309cb...,Béziers,Hérault,34500,fortier02@webmail.free.fr,467367373,677707703,Master,3785.5,44,454.26
4,Ugène,Gagnon,1948-01-05,77,Rouge,Cochon,Bouillabaisse,,408d2bc59e7987fde2a2eb97ca8f1ffaea4438cec42d5b...,Créteil,Val-de-Marne,94000,rougecouchon@mail.ru,183755656,601000069,CFA,1581.0,7,189.72
5,José-Maria,Lamar,2011-10-13,13,Vert,Lapin,Chocolate soufflé,,24728f73a4778c01703fb763677d9910915b95f3d2cbb4...,Poitiers,Vienne,86000,lapinfou67@list-manage.com,549881234,695831362,Collège,,14,
6,Sacha,Martel,2008-11-02,16,Rose,Poisson Rouge,Tarte Tatin,,c26ad42dea87e57b40e2240bac2a11a687e6246583e135...,Vannes,Morbihan,56000,sachalepoisson@live.com,297543434,788157558,Lycée,125.0,32,15.0
7,Elvire Françoise,Sartre,1963-02-11,62,Noir,Souris,Croque monsieur,,99c7eb4f1f78ee162e46a83c7b20ee293d72ba6a5a32ad...,Nevers,Nièvre,58000,noirsartre@outlook.com,386361515,789631357,Doctorat,4800.0,24,576.0
8,Émile Jean,Travers,1993-02-09,32,Argent,Cerf,Coq au vin,,56d201df5de2d7bd569fd220e770a4499add9e7686f927...,Arras,Pas-de-Calais,62000,travers93@live.com,321236969,661519025,Baccalauréat,1925.0,56,231.0
9,Capucine,Verne,1977-05-15,47,Pourpre,Loup,Boeuf Bourguignon,,d068f18c83e45cff0fd252d81bdbce0ecebd9d242d01c1...,Nice,Alpes-Maritimes,6400,verne77@webmail.free.fr,493681149,610821171,Licentiate,3025.0,37,363.0


In [None]:
# identify opportunities to "tidy code"
# give it a good clean too!
# BAC mapping to RQF

In [20]:
def clean_timestamp(ts):
    return datetime.fromtimestamp(ts, tz=timezone.utc).strftime('%Y-%m-%d %H:%M:%S')

In [21]:
logins = pd.read_csv('data/UK-User-LoginTS.csv')
logins = clean_cols(logins)
logins['logints'] = logins['logints'].apply(clean_timestamp)
logins.columns = ['login_id', 'username', 'login_timestamp']
logins.head()

Unnamed: 0,login_id,username,login_timestamp
0,1,card49a@gmail.com,2025-01-05 10:12:40
1,2,card49a@gmail.com,2025-01-09 20:39:23
2,3,card49a@gmail.com,2025-01-14 06:52:53
3,4,card49a@gmail.com,2025-01-18 17:10:01
4,5,card49a@gmail.com,2025-01-23 03:28:32


In [22]:
# quick validation
logins.isna().sum()

login_id           0
username           0
login_timestamp    0
dtype: int64

In [23]:
logins.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 248 entries, 0 to 247
Data columns (total 3 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   login_id         248 non-null    int64 
 1   username         248 non-null    object
 2   login_timestamp  248 non-null    object
dtypes: int64(1), object(2)
memory usage: 5.9+ KB


In [28]:
with open('create_database.sql', 'r', encoding='utf-8') as f:
    create_sql = f.read()
try:
    conn = sqlite3.connect('customers.db')
    conn.executescript(create_sql)
    users.to_sql('users', conn, if_exists='append', index=False)
    logins.to_sql('logins', conn, if_exists='append', index=False)
    conn.commit()
finally:
    conn.close()

NameError: name 'users' is not defined