# Data Preparation

In [20]:
import numpy as np 
import pandas as pd

from datetime import datetime

In [88]:
# Load dataset
dataset_path = 'ass1/data/dataset_mood_smartphone.csv'
df = pd.read_csv(dataset_path, index_col=False).drop(columns='Unnamed: 0')

def clean_dataset(df):
    
    # from id
    df["id"] = df["id"].str.extract(r"AS14\.(\d{2})")[0].astype(float).astype(int)
    
    # from time
    df["time"] = pd.to_datetime(df["time"]).dt.strftime("%Y-%m-%d %H:%M:%S.%f").str[:-3]
    # Extract required columns
    df["date"] = df["time"].dt.date  # Extracts date (YYYY-MM-DD)
    df["day_of_the_week"] = df["time"].dt.day_name()  # Extracts day of the week
    df["hour_of_the_day"] = df["time"].dt.hour  # Extracts hour (0-23)
    
    # from variable
    df['record_type'] = df['variable'].apply(lambda x: {'appCat': 'app', 'circumplex': 'sensor'}.get(x.split('.')[0], x.split('.')[0]))
    df['record_detail'] = df['variable'].apply(lambda x: x.split('.')[1] if len(x.split('.'))>1 else None)
    


    return df

df = clean_dataset(df)
df

AttributeError: Can only use .dt accessor with datetimelike values

In [93]:
# Load dataset
dataset_path = 'ass1/data/dataset_mood_smartphone.csv'
df = pd.read_csv(dataset_path, index_col=False).drop(columns='Unnamed: 0')

def clean_dataset(df):
    # from id
    df["id"] = df["id"].str.extract(r"AS14\.(\d{2})")[0].astype(float).astype(int)
    
    # from time (ensure it's in datetime format)
    df["time"] = pd.to_datetime(df["time"], format="%Y-%m-%d %H:%M:%S.%f")  # Keep as datetime

    # Extract required columns
    df["date"] = df["time"].dt.date  # Extract date
    df["day_of_the_week"] = df["time"].dt.day_name()  # Extract day name
    df["hour_of_the_day"] = df["time"].dt.hour  # Extract hour

    # from variable
    df['record_type'] = df['variable'].apply(lambda x: {'appCat': 'app', 'circumplex': 'sensor'}.get(x.split('.')[0], x.split('.')[0]))
    df['record_detail'] = df['variable'].apply(lambda x: x.split('.')[1] if len(x.split('.')) > 1 else None)

    # select relevant columns
    relevant_cols = [
        'id',
        'time',
        'date',
        'day_of_the_week',
        'hour_of_the_day',
        'hour_of_the_day',
        'record_type',
        'record_detail',
        'value']
    
    return df

df = clean_dataset(df)
df

Unnamed: 0,id,time,variable,value,date,day_of_the_week,hour_of_the_day,record_type,record_detail
0,1,2014-02-26 13:00:00.000,mood,6.000,2014-02-26,Wednesday,13,mood,
1,1,2014-02-26 15:00:00.000,mood,6.000,2014-02-26,Wednesday,15,mood,
2,1,2014-02-26 18:00:00.000,mood,6.000,2014-02-26,Wednesday,18,mood,
3,1,2014-02-26 21:00:00.000,mood,7.000,2014-02-26,Wednesday,21,mood,
4,1,2014-02-27 09:00:00.000,mood,6.000,2014-02-27,Thursday,9,mood,
...,...,...,...,...,...,...,...,...,...
376907,30,2014-04-11 07:51:16.948,appCat.weather,8.032,2014-04-11,Friday,7,app,weather
376908,30,2014-04-19 11:00:32.747,appCat.weather,3.008,2014-04-19,Saturday,11,app,weather
376909,30,2014-04-26 10:19:07.434,appCat.weather,7.026,2014-04-26,Saturday,10,app,weather
376910,30,2014-04-27 00:44:48.450,appCat.weather,23.033,2014-04-27,Sunday,0,app,weather


array([None, 'arousal', 'valence', 'builtin', 'communication',
       'entertainment', 'finance', 'game', 'office', 'other', 'social',
       'travel', 'unknown', 'utilities', 'weather'], dtype=object)

In [67]:
df['variable'].unique()

array(['mood', 'sensor', 'activity', 'screen', 'call', 'sms', 'app'],
      dtype=object)

In [32]:
df[df['value'].isna()]['variable'].unique()

array(['circumplex.arousal', 'circumplex.valence'], dtype=object)

In [46]:
df["id"] = df["id"].str.extract(r"AS14\.(\d{2})")[0].astype(float).astype(pd.Int64Dtype())
df


Unnamed: 0,id,time,variable,value,id2
0,1,2014-02-26 13:00:00.000,mood,6.000,AS14.01
1,1,2014-02-26 15:00:00.000,mood,6.000,AS14.01
2,1,2014-02-26 18:00:00.000,mood,6.000,AS14.01
3,1,2014-02-26 21:00:00.000,mood,7.000,AS14.01
4,1,2014-02-27 09:00:00.000,mood,6.000,AS14.01
...,...,...,...,...,...
376907,30,2014-04-11 07:51:16.948,appCat.weather,8.032,AS14.30
376908,30,2014-04-19 11:00:32.747,appCat.weather,3.008,AS14.30
376909,30,2014-04-26 10:19:07.434,appCat.weather,7.026,AS14.30
376910,30,2014-04-27 00:44:48.450,appCat.weather,23.033,AS14.30
