In [1]:
%reset -f

In [48]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
%matplotlib inline

In [2]:
advanced_df = pd.read_csv("dataset_mood_smartphone (1).csv", index_col=0)

In [3]:
advanced_df.head()

Unnamed: 0,id,time,variable,value
1,AS14.01,2014-02-26 13:00:00.000,mood,6.0
2,AS14.01,2014-02-26 15:00:00.000,mood,6.0
3,AS14.01,2014-02-26 18:00:00.000,mood,6.0
4,AS14.01,2014-02-26 21:00:00.000,mood,7.0
5,AS14.01,2014-02-27 09:00:00.000,mood,6.0


## Define functions

In [9]:
# Check the shape of a dataframe, along with NaN values
def check_df(df):
    print(df.shape)
    for item in df.columns:
        print(item, ': ', type(df[item]), ', contains NaN:',df[item].isnull().values.any())

# Plot all the variables vs time
def plot_feature(df, variable_name, participant=None):
    ylabel_text = f"{variable_name} value"
    title_text = f"{variable_name} vs time"

    if participant:
        plot = df[(df['variable'] == variable_name) & (df['id'] == participant)]
    else:
        plot = df[(df['variable'] == variable_name)]

    plt.plot(plot['time'], plot['value'], label = variable_name)
    plt.xlabel('time')
    plt.ylabel(ylabel_text)
    plt.title(title_text)
    plt.legend(loc='lower right')
    plt.show()

# Plot histograms of all the variables
def plot_histograms(df, variable_name, participant=None):

    if participant:
        df[(df['variable'] == variable_name) & (df['id'] == participant)].plot(kind='hist', edgecolor='black', legend = variable_name)
    else:
        df[(df['variable'] == variable_name)].plot(kind='hist', edgecolor='black', legend = variable_name)
    plt.xlabel(variable_name)
    plt.legend(loc='lower right')

# Remove outliers >0.75 and <0.25 of all values, excepting mood, call, sms etc
def remove_outliers(df, column_name, multiplier=1.5):
    q1 = df[column_name].quantile(0.05)
    q2 = df[column_name].quantile(0.95)
    iqr = q2 - q1
    lower_bound = q1 - multiplier * iqr
    upper_bound = q2 + multiplier * iqr
    return df[(df[column_name] >= lower_bound) & (df[column_name] <= upper_bound)]

# Check a dataframe for class balance
def check_class_balance(df, target_column):
    print(df[target_column].value_counts())

## Check for dataframe shape and NaN values

In [5]:
check_df(advanced_df)

(376912, 4)
id :  <class 'pandas.core.series.Series'> , contains NaN: False
time :  <class 'pandas.core.series.Series'> , contains NaN: False
variable :  <class 'pandas.core.series.Series'> , contains NaN: False
value :  <class 'pandas.core.series.Series'> , contains NaN: True


## Checking which variables have NaN values

In [6]:
advanced_df.query('value.isnull()')

Unnamed: 0,id,time,variable,value
5709,AS14.01,2014-04-02 18:00:00.000,circumplex.arousal,
5731,AS14.01,2014-04-07 15:00:00.000,circumplex.arousal,
5773,AS14.01,2014-04-16 12:00:00.000,circumplex.arousal,
5797,AS14.01,2014-04-21 13:00:00.000,circumplex.arousal,
5836,AS14.01,2014-04-29 09:00:00.000,circumplex.arousal,
...,...,...,...,...
16859,AS14.33,2014-05-16 20:00:00.000,circumplex.valence,
16862,AS14.33,2014-05-17 12:00:00.000,circumplex.valence,
16882,AS14.33,2014-05-21 16:00:00.000,circumplex.valence,
16899,AS14.33,2014-05-24 23:00:00.000,circumplex.valence,


### The only variables that contain NaN values are circumplex.arousal and circumplex.valence. Discussion needs to be made if these values should be removed or be converted to 0s

## Drop rows with NaN values

In [7]:
advanced_df.dropna(inplace=True)
check_df(advanced_df)

(376710, 4)
id :  <class 'pandas.core.series.Series'> , contains NaN: False
time :  <class 'pandas.core.series.Series'> , contains NaN: False
variable :  <class 'pandas.core.series.Series'> , contains NaN: False
value :  <class 'pandas.core.series.Series'> , contains NaN: False


## Sort dataframe by id and time

In [8]:
advanced_df.sort_values(['id', 'time'], inplace=True)

## Plot all features vs time for participant AS14.01

In [None]:
for name in advanced_df['variable'].unique():
    plot_feature(advanced_df, name, 'AS14.01')

## Plot variable values to check for distribution for participant AS14.01

In [None]:
for name in advanced_df['variable'].unique():
    plot_histograms(advanced_df, name, 'AS14.01')

## Calculate mean and sd for each variable

In [None]:
result = advanced_df.groupby('variable')['value'].agg(['mean', 'std'])

## Remove outliers for all participants

In [10]:
filtered_data = []
exception_list = ['mood', 'sms', 'call', 'circumplex.arousal', 'circumplex.valence', 'activity']

for name in advanced_df['variable'].unique():
    if name in exception_list:
        filtered_data.append(advanced_df[advanced_df['variable'] == name])
        continue
    else:
        group = advanced_df[advanced_df['variable'] == name]
        filtered_group = remove_outliers(group, 'value')
        filtered_data.append(filtered_group)

clean_df = pd.concat(filtered_data)

## Check variable vs time after outliers removed

In [None]:
for name in clean_df['variable'].unique():
    plot_feature(clean_df, name, 'AS14.01')

## Check variable distribution for participant AS14.01 with the new data

In [None]:
for name in advanced_df['variable'].unique():
    plot_histograms(advanced_df, name, 'AS14.01')

# Data engineering

## For each variable, calculate the average value if the variable is continuous and the count if the value is categorical, for each day, and distinguish the variable column to different features.

In [11]:
# Convert timestamp to datetime object and extract date
clean_df['time'] = pd.to_datetime(clean_df['time']).dt.date

# List of continuous and categorical variables
continuous_vars = ['mood', 'screen', 'appCat.builtin', 'appCat.communication', 'appCat.entertainment', 'appCat.finance', 'appCat.game', 'appCat.office', 'appCat.other', 'appCat.social', 'appCat.travel', 'appCat.unknown', 'appCat.utilities', 'appCat.weather', 'circumplex.arousal', 'circumplex.valence', 'activity']
categorical_vars = ['call', 'sms']

# Define aggregation functions for continuous and categorical variables
aggregations = {}
for var in continuous_vars:
    aggregations[var] = 'mean'
for var in categorical_vars:
    aggregations[var] = 'count'

In [12]:
# Filter the dataframe for continuous and categorical variables
cont_df = clean_df[clean_df['variable'].isin(continuous_vars)].pivot_table(index=['id', 'time'], columns='variable', values='value').reset_index()
cat_df = clean_df[clean_df['variable'].isin(categorical_vars)].pivot_table(index=['id', 'time'], columns='variable', values='value', aggfunc='count').reset_index()

# Merge the dataframes
result = pd.merge(cont_df, cat_df, on=['id', 'time'], how='outer')

## Impute missing values

In [49]:
# Create an empty DataFrame with the same columns as the original
imputed_data = pd.DataFrame(columns=result.columns)

# Iterate through unique participant ids
for participant in result['id'].unique():
    participant_data = result[result['id'] == participant]

    # Calculate the mean of each column for the participant, excluding the 'id' column
    participant_mean = participant_data.drop(columns=['id']).mean()

    # Fill missing values with the participant-specific mean for each column
    participant_imputed = participant_data.copy()
    for column in participant_mean.index:
        participant_imputed[column] = participant_data[column].fillna(participant_mean[column])

    # Append the imputed participant data to the imputed_data DataFrame
    imputed_data = imputed_data.append(participant_imputed, ignore_index=True)


  participant_mean = participant_data.drop(columns=['id']).mean()
  imputed_data = imputed_data.append(participant_imputed, ignore_index=True)
  participant_mean = participant_data.drop(columns=['id']).mean()
  imputed_data = imputed_data.append(participant_imputed, ignore_index=True)
  participant_mean = participant_data.drop(columns=['id']).mean()
  imputed_data = imputed_data.append(participant_imputed, ignore_index=True)
  participant_mean = participant_data.drop(columns=['id']).mean()
  imputed_data = imputed_data.append(participant_imputed, ignore_index=True)
  participant_mean = participant_data.drop(columns=['id']).mean()
  imputed_data = imputed_data.append(participant_imputed, ignore_index=True)
  participant_mean = participant_data.drop(columns=['id']).mean()
  imputed_data = imputed_data.append(participant_imputed, ignore_index=True)
  participant_mean = participant_data.drop(columns=['id']).mean()
  imputed_data = imputed_data.append(participant_imputed, ignore_index=True)

In [50]:
imputed_data.replace(np.nan,0, inplace=True)

In [51]:
imputed_data.set_index('time', inplace=True)

# ML classifier

In [22]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_score, KFold
from sklearn.metrics import mean_absolute_error, mean_squared_error

## Section for classic ML models

In [52]:
X = imputed_data.drop(columns=['mood'])
y = imputed_data['mood']
X.drop(['id'], axis = 1, inplace = True)

In [53]:
y = y.round().astype(int)

In [54]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

## Random Forest

In [55]:
rf_regressor = RandomForestRegressor(random_state=42)
kf = KFold(n_splits=5, shuffle=True, random_state=42)

In [56]:
mae_scores = []
mse_scores = []

In [57]:
for train_index, test_index in kf.split(X):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    rf_regressor.fit(X_train, y_train)
    y_pred = rf_regressor.predict(X_test)

    mae = mean_absolute_error(y_test, y_pred)
    mse = mean_squared_error(y_test, y_pred)

    mae_scores.append(mae)
    mse_scores.append(mse)

In [59]:
mean_mae = np.mean(mae_scores)
mean_mse = np.mean(mse_scores)
print(f'Mean Absolute Error (MAE) across 5 folds: {mean_mae}')
print(f'Mean Squared Error (MSE) across 5 folds: {mean_mse}')

Mean Absolute Error (MAE) across 5 folds: 0.2672120147908807
Mean Squared Error (MSE) across 5 folds: 0.18721925991759994
