In [1]:
import hashlib
import re
from datetime import datetime, timezone
import pandas as pd

In [2]:
# Placeholder strings that stand in for "no value" in the raw data.
# Entries are upper-case because clean_column upper-cases (and strips) a
# value before testing membership in this list.
EXCLUSION_LIST = [
    'BLANK',
    '-',
    'NA',
    'NONE',
    '{NULL}',
]

In [3]:
# clean column names
def clean_cols(df: pd.DataFrame) -> pd.DataFrame:
    """Normalise the column labels of ``df`` and return the same frame.

    Each string label is stripped of surrounding whitespace, lower-cased and
    has its spaces replaced by underscores (``' First Name '`` becomes
    ``'first_name'``). Labels that are not strings (for example the integer
    labels of a header-less CSV) are kept unchanged instead of raising
    ``AttributeError``.

    Args:
        df: Frame whose column labels should be normalised.

    Returns:
        ``df`` itself; the labels are replaced in place.
    """
    df.columns = [
        col.strip().lower().replace(' ', '_') if isinstance(col, str) else col
        for col in df.columns
    ]
    return df

In [4]:
def clean_dob(dob, age, today=None):
    """Repair a date of birth whose year is implausible for the stated age.

    The year is rebuilt from ``age`` when either
      * ``dob`` lies after the reference date, or
      * ``age`` is at least 100 but ``dob`` is less than 100 calendar years
        before the reference year.
    Month and day are kept. Otherwise ``dob`` is returned unchanged.

    Args:
        dob: Date of birth (``datetime`` / ``pandas.Timestamp``); may be NaT.
        age: Age at last birthday, in whole years; may be missing.
        today: Reference "now" used for every comparison. Defaults to
            ``datetime.now()``; pass a fixed value for reproducible results.

    Returns:
        The corrected date of birth, or ``dob`` unchanged when it is already
        plausible or when either input is missing.
    """
    # Missing inputs cannot be cross-checked; pass the dob through untouched.
    if pd.isna(dob) or pd.isna(age):
        return dob

    # Take one snapshot so all comparisons below use the same instant.
    now = datetime.now() if today is None else today

    in_future = dob > now
    too_recent_for_age = age >= 100 and dob.year > now.year - 100
    if not (in_future or too_recent_for_age):
        return dob

    # "Age at last birthday": if this year's birthday is still to come, the
    # person was born one calendar year earlier than now.year - age.
    birth_year = now.year - int(age)
    if (dob.month, dob.day) > (now.month, now.day):
        birth_year -= 1

    try:
        return dob.replace(year=birth_year)
    except ValueError:
        # Only a 29 February moved into a non-leap year is recoverable here;
        # anything else (e.g. an out-of-range year) is a genuine error.
        if (dob.month, dob.day) != (2, 29):
            raise
        return dob.replace(year=birth_year, day=28)

In [5]:
def clean_column(value, exclusions=EXCLUSION_LIST):
    """Map placeholder and NaN cell values to ``None``.

    A string counts as a placeholder when its stripped, upper-cased form is
    in ``exclusions``; a float counts as missing when it is NaN. Every other
    value is returned unchanged.

    Args:
        value: A single cell value.
        exclusions: Upper-case placeholder strings to treat as missing.

    Returns:
        ``None`` for placeholders and NaN floats, otherwise ``value``.
    """
    if isinstance(value, str):
        is_missing = value.strip().upper() in exclusions
    else:
        is_missing = isinstance(value, float) and pd.isna(value)
    return None if is_missing else value

In [6]:
def hash_password(pw: str) -> str:
    """Return the SHA-256 hex digest of ``pw`` encoded as UTF-8.

    ``None`` is passed through as ``None`` so missing passwords stay missing.

    NOTE(review): this is a plain, unsalted digest, so equal passwords produce
    equal digests — confirm that is acceptable for how the column is used.
    """
    if pw is None:
        return None
    hasher = hashlib.sha256()
    hasher.update(pw.encode('utf-8'))
    return hasher.hexdigest()

In [7]:
def clean_salary(salary: str) -> str:
    """Strip everything except digits and dots from a salary string.

    ``'19,500.00'`` becomes ``'19500.00'``. The result is still a string.

    Args:
        salary: Raw salary text, or ``None`` for a missing value.

    Returns:
        The digits-and-dots string, or ``None`` when ``salary`` is ``None`` or
        contains no digit at all (so non-numeric text is reported as missing
        rather than as ``''`` or a run of dots).
    """
    if salary is None:
        return None
    cleaned = re.sub(r'[^\d.]', '', salary)
    # Without a single digit there is no number to keep.
    if re.search(r'\d', cleaned) is None:
        return None
    return cleaned

In [8]:
# Load the raw user export and normalise its column names in one step.
users = clean_cols(pd.read_csv('data/UK User Data.csv', encoding='latin1'))
# Parse dob as day/month/two-digit-year; unparseable values become NaT.
# Because %y only carries two digits, older birth years can land in the
# future (the preview shows 2065 for a 60-year-old); clean_dob repairs
# these in a later cell.
users = users.assign(
    dob=pd.to_datetime(users['dob'], format='%d/%m/%y', errors='coerce')
)
# NOTE(review): this preview is rendered before passwords are hashed, so the
# saved output contains plaintext passwords and personal data — consider
# clearing it before sharing the notebook.
users.head()

Unnamed: 0,first_name,surname,middle_initials,dob,age_last_birthday,favourite_colour,favourite_animal,favourite_food,gender,password,city,county,postcode,email,phone,mobile,rqf,salary,website_visits_last_30_days
0,Derek,Card,A,2065-01-07,60,Red,Elephant,Bangers and Mash,Male,Parishaggis17%,Arundel,West Sussex,BN18 9PA,card49a@gmail.com,01903 882543,07787 557197,3,"19,500.00",7
1,David,Button,none,1999-08-22,25,Green,Giraffe,Cottage Pie,Male,CarTrain 56$,Bath,Avon,BA1 2QZ,button76@outlook.com,01225 413106,07961 102199,4,"21,000.00",15
2,Ian,Smythe,JO,2025-01-03,100,Blue,Cat,Toad in the Hole,blank,1945Tank*,Chester,Cheshire,CH2 1EU,long.65.morning@icloud.com,01244 380280,07594 146913,5,"23,000.00",28
3,Samantha,Jones,D,1991-03-24,33,Indigo,Wolf,Roast,Female,Yorkshire!3Pig,Dursley,Gloucestershire,GL11 4CD,busybusy@yahoo.com,01453 580136,07577 752530,6,"32,500.00",34
4,Wendy,Brown,L,2014-01-29,11,Pink,Puppy,Fish and Chips,Female,Snoopy78Peanut!,Frome,Somerset,BA11 7RT,brownsheep@flock.com,01373 253333,07768 852327,-,na,3


In [9]:
# Clean the users frame in place. The order of the steps matters:
# placeholders must become None first, because hash_password and
# clean_salary only special-case None.
# NOTE: this cell is not idempotent — running it a second time hashes the
# already-hashed passwords again. Reload `users` before re-running.

# 1. Replace placeholder strings and NaN floats with None in every column.
for col in users.columns:
    users[col] = users[col].apply(clean_column)
# 2. Replace each password with its SHA-256 hex digest (None stays None).
users['password'] = users['password'].apply(hash_password)
# 3. Repair dob row by row, using age_last_birthday as the cross-check.
users['dob'] = users.apply(lambda x: clean_dob(x['dob'], x['age_last_birthday']), axis=1)
# 4. Keep only digits and dots in salary; the column still holds strings.
users['salary'] = users['salary'].apply(clean_salary)

In [10]:
# import other file
# 'Website Visits' validation

In [11]:
def clean_timestamp(ts):
    """Format a Unix timestamp as a ``'YYYY-MM-DD HH:MM:SS'`` string in UTC.

    Args:
        ts: Seconds since the Unix epoch, or a missing value (``None``/NaN).

    Returns:
        The formatted UTC time, or ``None`` when ``ts`` is missing — matching
        the other cleaners in this notebook, which pass missing values through
        as ``None`` instead of raising.
    """
    if pd.isna(ts):
        return None
    return datetime.fromtimestamp(ts, tz=timezone.utc).strftime('%Y-%m-%d %H:%M:%S')

In [12]:
# Load the login log and normalise its column names in one step.
logins = clean_cols(pd.read_csv('data/UK-User-LoginTS.csv'))
# Convert each epoch value in logints to a UTC 'YYYY-MM-DD HH:MM:SS' string
# (the column holds strings afterwards, not datetimes).
logins = logins.assign(logints=logins['logints'].apply(clean_timestamp))
logins.head()

Unnamed: 0,rowid,username,logints
0,1,card49a@gmail.com,2025-01-05 10:12:40
1,2,card49a@gmail.com,2025-01-09 20:39:23
2,3,card49a@gmail.com,2025-01-14 06:52:53
3,4,card49a@gmail.com,2025-01-18 17:10:01
4,5,card49a@gmail.com,2025-01-23 03:28:32
