# Data Preparation

In [1]:
import numpy as np 
import pandas as pd

from datetime import datetime

In [8]:
# Read the raw dataset from disk, then discard the 'Unnamed: 0' column
# that comes along with the CSV.
dataset_path = 'ass1/data/dataset_mood_smartphone.csv'
df = pd.read_csv(dataset_path, index_col=False)
df = df.drop(columns='Unnamed: 0')

def clean_dataset(df):
    """Return a cleaned, column-reduced copy of the raw records.

    The input frame is left untouched: every derived column is added to an
    internal copy, so calling this function again on the same raw frame
    yields the same result.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw records with at least the columns ``id`` (strings containing
        ``AS14.<two digits>``), ``time`` (strings formatted as
        ``%Y-%m-%d %H:%M:%S.%f``), ``variable`` (strings, optionally
        dot-separated) and ``value``.

    Returns
    -------
    pandas.DataFrame
        A new frame with the columns ``id``, ``time``, ``date``,
        ``day_of_the_week``, ``hour_of_the_day``, ``record_type``,
        ``record_detail`` and ``value`` (in that order).

    Raises
    ------
    ValueError
        If any ``id`` does not contain ``AS14.<two digits>``.
    """
    # Work on a copy so the caller's frame is not mutated as a side effect.
    cleaned = df.copy()

    # from id: keep only the two-digit number that follows the "AS14." prefix
    id_digits = cleaned["id"].str.extract(r"AS14\.(\d{2})")[0]
    unmatched = id_digits.isna()
    if unmatched.any():
        # Fail with a readable message instead of an opaque NaN-to-int cast error.
        examples = cleaned.loc[unmatched, "id"].drop_duplicates().head().tolist()
        raise ValueError(
            f"id values not matching 'AS14.<two digits>': {examples}")
    cleaned["id"] = id_digits.astype(float).astype(int)

    # from time (ensure it's in datetime format)
    cleaned["time"] = pd.to_datetime(
        cleaned["time"], format="%Y-%m-%d %H:%M:%S.%f")  # Keep as datetime

    # Extract required columns
    cleaned["date"] = cleaned["time"].dt.date  # calendar date of the timestamp
    cleaned["day_of_the_week"] = cleaned["time"].dt.day_name()  # e.g. "Wednesday"
    cleaned["hour_of_the_day"] = cleaned["time"].dt.hour  # hour component

    # from variable: split each name once on "." and reuse the parts for
    # both derived columns
    record_type_aliases = {'appCat': 'app', 'circumplex': 'sensor'}
    name_parts = cleaned['variable'].map(lambda name: name.split('.'))
    # The first part is the record type; known prefixes are renamed through
    # the alias table, anything else is kept as-is.
    cleaned['record_type'] = name_parts.map(
        lambda parts: record_type_aliases.get(parts[0], parts[0]))
    # The second part (if any) is the detail; names without a "." get None.
    cleaned['record_detail'] = name_parts.map(
        lambda parts: parts[1] if len(parts) > 1 else None)

    # select relevant columns
    relevant_cols = [
        'id',
        'time',
        'date',
        'day_of_the_week',
        'hour_of_the_day',
        'record_type',
        'record_detail',
        'value']
    return cleaned[relevant_cols]

# The raw frame `df` was already read from `dataset_path` at the top of this
# cell, so the CSV is not loaded from disk a second time here.

# Clean dataset
df = clean_dataset(df)

# Save cleaned dataset (index=False: the row index is not written to the file)
cleaned_dataset_path = 'ass1/data/dataset_clean.csv'
df.to_csv(cleaned_dataset_path, index=False)

In [9]:
# Show the cleaned DataFrame as this cell's output.
df

Unnamed: 0,id,time,date,day_of_the_week,hour_of_the_day,record_type,record_detail,value
0,1,2014-02-26 13:00:00.000,2014-02-26,Wednesday,13,mood,,6.000
1,1,2014-02-26 15:00:00.000,2014-02-26,Wednesday,15,mood,,6.000
2,1,2014-02-26 18:00:00.000,2014-02-26,Wednesday,18,mood,,6.000
3,1,2014-02-26 21:00:00.000,2014-02-26,Wednesday,21,mood,,7.000
4,1,2014-02-27 09:00:00.000,2014-02-27,Thursday,9,mood,,6.000
...,...,...,...,...,...,...,...,...
376907,30,2014-04-11 07:51:16.948,2014-04-11,Friday,7,app,weather,8.032
376908,30,2014-04-19 11:00:32.747,2014-04-19,Saturday,11,app,weather,3.008
376909,30,2014-04-26 10:19:07.434,2014-04-26,Saturday,10,app,weather,7.026
376910,30,2014-04-27 00:44:48.450,2014-04-27,Sunday,0,app,weather,23.033
