In [56]:
%reset -f

In [57]:
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

In [58]:
advanced_df = pd.read_csv("dataset_mood_smartphone (1).csv", index_col=0)

In [59]:
advanced_df.head()

Unnamed: 0,id,time,variable,value
1,AS14.01,2014-02-26 13:00:00.000,mood,6.0
2,AS14.01,2014-02-26 15:00:00.000,mood,6.0
3,AS14.01,2014-02-26 18:00:00.000,mood,6.0
4,AS14.01,2014-02-26 21:00:00.000,mood,7.0
5,AS14.01,2014-02-27 09:00:00.000,mood,6.0


## Define functions

In [60]:
# Check the shape of a dataframe, along with NaN values
def check_df(df):
    print(df.shape)
    for item in df.columns:
        print(item, ': ', type(df[item]), ', contains NaN:',df[item].isnull().values.any())

# Plot all the variables vs time
def plot_feature(df, variable_name, participant=None):
    ylabel_text = f"{variable_name} value"
    title_text = f"{variable_name} vs time"

    if participant:
        plot = df[(df['variable'] == variable_name) & (df['id'] == participant)]
    else:
        plot = df[(df['variable'] == variable_name)]

    plt.plot(plot['time'], plot['value'], label = variable_name)
    plt.xlabel('time')
    plt.ylabel(ylabel_text)
    plt.title(title_text)
    plt.legend(loc='lower right')
    plt.show()

# Plot histograms of all the variables
def plot_histograms(df, variable_name, participant=None):

    if participant:
        df[(df['variable'] == variable_name) & (df['id'] == participant)].plot(kind='hist', edgecolor='black', legend = variable_name)
    else:
        df[(df['variable'] == variable_name)].plot(kind='hist', edgecolor='black', legend = variable_name)
    plt.xlabel(variable_name)
    plt.legend(loc='lower right')

# Remove outliers >0.75 and <0.25 of all values, excepting mood, call, sms etc
def remove_outliers(df, column_name, multiplier=1.5):
    exception_list = ['mood', 'sms', 'call', 'circumplex.arousal', 'circumplex.valence', 'activity']
    if column_name in exception_list:
        return
    print(column_name)
    q1 = df[column_name].quantile(0.25)
    q2 = df[column_name].quantile(0.75)
    iqr = q2 - q1
    lower_bound = q1 - multiplier * iqr
    upper_bound = q2 + multiplier * iqr
    return df[(df[column_name] >= lower_bound) & (df[column_name] <= upper_bound)]

# Check a dataframe for class balance
def check_class_balance(df, target_column):
    print(df[target_column].value_counts())

## Check for dataframe shape and NaN values

In [61]:
check_df(advanced_df)

(376912, 4)
id :  <class 'pandas.core.series.Series'> , contains NaN: False
time :  <class 'pandas.core.series.Series'> , contains NaN: False
variable :  <class 'pandas.core.series.Series'> , contains NaN: False
value :  <class 'pandas.core.series.Series'> , contains NaN: True


## Checking which variables have NaN values

In [62]:
advanced_df.query('value.isnull()')

Unnamed: 0,id,time,variable,value
5709,AS14.01,2014-04-02 18:00:00.000,circumplex.arousal,
5731,AS14.01,2014-04-07 15:00:00.000,circumplex.arousal,
5773,AS14.01,2014-04-16 12:00:00.000,circumplex.arousal,
5797,AS14.01,2014-04-21 13:00:00.000,circumplex.arousal,
5836,AS14.01,2014-04-29 09:00:00.000,circumplex.arousal,
...,...,...,...,...
16859,AS14.33,2014-05-16 20:00:00.000,circumplex.valence,
16862,AS14.33,2014-05-17 12:00:00.000,circumplex.valence,
16882,AS14.33,2014-05-21 16:00:00.000,circumplex.valence,
16899,AS14.33,2014-05-24 23:00:00.000,circumplex.valence,


### The only variables that contain NaN values are circumplex.arousal and circumplex.valence. Discussion needs to be made if these values should be removed or be converted to 0s

## Drop rows with NaN values

In [63]:
advanced_df.dropna(inplace=True)
check_df(advanced_df)

(376710, 4)
id :  <class 'pandas.core.series.Series'> , contains NaN: False
time :  <class 'pandas.core.series.Series'> , contains NaN: False
variable :  <class 'pandas.core.series.Series'> , contains NaN: False
value :  <class 'pandas.core.series.Series'> , contains NaN: False


## Sort dataframe by id and time

In [64]:
advanced_df.sort_values(['id', 'time'], inplace=True)

## Plot all features vs time for participant AS14.01

In [65]:
#for name in advanced_df['variable'].unique():
#    plot_feature(advanced_df, name, 'AS14.01')

## Plot variable values to check for distribution for participant AS14.01

In [66]:
#for name in advanced_df['variable'].unique():
#    plot_histograms(advanced_df, name, 'AS14.01')

## Calculate mean and sd for each variable

In [67]:
result = advanced_df.groupby('variable')['value'].agg(['mean', 'std'])

## Remove outliers for all participants

In [68]:
filtered_data = []
exception_list = ['mood', 'sms', 'call', 'circumplex.arousal', 'circumplex.valence', 'activity']

for name in advanced_df['variable'].unique():
    if name in exception_list:
        filtered_data.append(advanced_df[advanced_df['variable'] == name])
        continue
    else:
        group = advanced_df[advanced_df['variable'] == name]
        filtered_group = remove_outliers(group, 'value')
        filtered_data.append(filtered_group)

clean_df = pd.concat(filtered_data)

value
value
value
value
value
value
value
value
value
value
value
value
value


## Check variable vs time after outliers removed

In [69]:
#for name in clean_df['variable'].unique():
#    plot_feature(clean_df, name, 'AS14.01')

## Check variable distribution for participant AS14.01 with the new data

In [70]:
#for name in advanced_df['variable'].unique():
#   plot_histograms(advanced_df, name, 'AS14.01')

# Data engineering

## For each variable, calculate the average value if the variable is continuous and the count if the value is categorical, for each day, and distinguish the variable column to different features.

In [71]:
# Convert timestamp to datetime object and extract date
clean_df['time'] = pd.to_datetime(clean_df['time']).dt.date

# List of continuous and categorical variables
continuous_vars = ['mood', 'screen', 'appCat.builtin', 'appCat.communication', 'appCat.entertainment', 'appCat.finance', 'appCat.game', 'appCat.office', 'appCat.other', 'appCat.social', 'appCat.travel', 'appCat.unknown', 'appCat.utilities', 'appCat.weather', 'circumplex.arousal', 'circumplex.valence', 'activity']
categorical_vars = ['call', 'sms']

# Define aggregation functions for continuous and categorical variables
aggregations = {}
for var in continuous_vars:
    aggregations[var] = 'mean'
for var in categorical_vars:
    aggregations[var] = 'count'


In [72]:
# Filter the dataframe for continuous and categorical variables
cont_df = clean_df[clean_df['variable'].isin(continuous_vars)].pivot_table(index=['id', 'time'], columns='variable', values='value').reset_index()
cat_df = clean_df[clean_df['variable'].isin(categorical_vars)].pivot_table(index=['id', 'time'], columns='variable', values='value', aggfunc='count').reset_index()

# Merge the dataframes
result = pd.merge(cont_df, cat_df, on=['id', 'time'], how='outer')
print(result)


variable       id        time  activity  appCat.builtin  appCat.communication   
0         AS14.01  2014-02-26       NaN             NaN                   NaN  \
1         AS14.01  2014-02-27       NaN             NaN                   NaN   
2         AS14.01  2014-03-20  0.081548        3.036650             29.724576   
3         AS14.01  2014-03-21  0.134050        5.320083             29.082061   
4         AS14.01  2014-03-22  0.236880        5.126204             24.033123   
...           ...         ...       ...             ...                   ...   
1968      AS14.33  2014-04-11       NaN             NaN                   NaN   
1969      AS14.33  2014-04-12       NaN             NaN                   NaN   
1970      AS14.33  2014-04-13       NaN             NaN                   NaN   
1971      AS14.33  2014-04-14       NaN             NaN                   NaN   
1972      AS14.33  2014-04-15       NaN             NaN                   NaN   

variable  appCat.entertainm

In [73]:
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer

# Initialize the imputer with default settings
imputer = IterativeImputer(random_state=42)

# Fit and transform the imputer on the result DataFrame without 'id' and 'time' columns
result_numeric = result.drop(columns=['id', 'time'])
result_imputed = imputer.fit_transform(result_numeric)

# Replace the original numeric columns with the imputed values
result[result_numeric.columns] = result_imputed

# Check for NaN values in the imputed dataset
print(result.isnull().sum())


variable
id                      0
time                    0
activity                0
appCat.builtin          0
appCat.communication    0
appCat.entertainment    0
appCat.finance          0
appCat.game             0
appCat.office           0
appCat.other            0
appCat.social           0
appCat.travel           0
appCat.unknown          0
appCat.utilities        0
appCat.weather          0
circumplex.arousal      0
circumplex.valence      0
mood                    0
screen                  0
call                    0
sms                     0
dtype: int64




# ML classifier

In [74]:
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

# Split the data into features (X) and target (y)
X = result.loc[:, result.columns != 'mood']
y = result['mood']

# Convert the target variable ('mood') to integer values
y = y.round().astype(int)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Exclude non-numeric columns like 'id' and 'time', and categorical variables 'call' and 'sms'
X_train_numeric = X_train.drop(columns=['id', 'time', 'call', 'sms'])
X_test_numeric = X_test.drop(columns=['id', 'time', 'call', 'sms'])

# Scale the numeric data using StandardScaler
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_numeric)
X_test_scaled = scaler.transform(X_test_numeric)

# Convert scaled arrays back to DataFrames
X_train_scaled = pd.DataFrame(X_train_scaled, columns=X_train_numeric.columns, index=X_train_numeric.index)
X_test_scaled = pd.DataFrame(X_test_scaled, columns=X_test_numeric.columns, index=X_test_numeric.index)

# Concatenate categorical variables 'call' and 'sms' back to the feature sets
X_train_scaled = pd.concat([X_train_scaled, X_train[['call', 'sms']]], axis=1)
X_test_scaled = pd.concat([X_test_scaled, X_test[['call', 'sms']]], axis=1)

# Continue with the machine learning model training and evaluation using X_train_scaled and X_test_scaled


## Section for classic ML models

In [75]:
X = result.loc[:,result.columns != 'mood']
y = result['mood']


In [76]:
y = y.round().astype(int)

In [77]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

## Check for class balance

In [78]:
y_train = pd.DataFrame(y_train)
check_class_balance(y_train, 'mood')

mood
7    1068
8     257
6     212
5      22
9      10
4       7
3       2
Name: count, dtype: int64


## We need to balance these classes to train the model, if not the model will be biased towards predicting more 0s and 7s

## Upsampling with SMOTE

In [80]:
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split

# Drop 'id' and 'time' columns from the dataset
X = result.drop(columns=['id', 'time', 'mood'])
y = result['mood']

# Convert the target variable ('mood') to integer values
y = y.round().astype(int)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Apply SMOTE to balance the classes
smote = SMOTE(random_state=42, k_neighbors=1)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

# Check the class balance after resampling
y_train_resampled.value_counts()


mood
7    1068
6    1068
8    1068
5    1068
3    1068
9    1068
4    1068
Name: count, dtype: int64

## Undersampling (probably not useful)

## Section for ARIMA model. The difference is that ARIMA does not need the rest features, only time and mood.