In [156]:
%reset -f

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
%matplotlib inline

In [2]:
advanced_df = pd.read_csv("dataset_mood_smartphone (1).csv", index_col=0)

In [3]:
advanced_df.head()

Unnamed: 0,id,time,variable,value
1,AS14.01,2014-02-26 13:00:00.000,mood,6.0
2,AS14.01,2014-02-26 15:00:00.000,mood,6.0
3,AS14.01,2014-02-26 18:00:00.000,mood,6.0
4,AS14.01,2014-02-26 21:00:00.000,mood,7.0
5,AS14.01,2014-02-27 09:00:00.000,mood,6.0


In [4]:
print(advanced_df['id'].nunique())

27


## Define functions

In [5]:
# Check the shape of a dataframe, along with NaN values
def check_df(df):
    print(df.shape)
    for item in df.columns:
        print(item, ': ', type(df[item]), ', contains NaN:',df[item].isnull().values.any())

# Plot all the variables vs time
def plot_feature(df, variable_name, participant=None):
    ylabel_text = f"{variable_name} value"
    title_text = f"{variable_name} vs time"

    if participant:
        plot = df[(df['variable'] == variable_name) & (df['id'] == participant)]
    else:
        plot = df[(df['variable'] == variable_name)]

    plt.plot(plot['time'], plot['value'], label = variable_name)
    plt.xlabel('time')
    plt.ylabel(ylabel_text)
    plt.title(title_text)
    plt.legend(loc='lower right')
    plt.show()

# Plot histograms of all the variables
def plot_histograms(df, variable_name, participant=None):

    if participant:
        df[(df['variable'] == variable_name) & (df['id'] == participant)].plot(kind='hist', bins=10, edgecolor='black', legend = variable_name)
    else:
        df[(df['variable'] == variable_name)].plot(kind='hist', bins=10, edgecolor='black', legend = variable_name)
    plt.xlabel(variable_name)
    plt.legend(loc='lower right')

# Remove outliers >0.75 and <0.25 of all values, excepting mood, call, sms etc
def remove_outliers(df, column_name, multiplier=1.5):
    q1 = df[column_name].quantile(0.05)
    q2 = df[column_name].quantile(0.95)
    iqr = q2 - q1
    lower_bound = q1 - multiplier * iqr
    upper_bound = q2 + multiplier * iqr
    return df[(df[column_name] >= lower_bound) & (df[column_name] <= upper_bound)]

# Check a dataframe for class balance
def check_class_balance(df, target_column):
    print(df[target_column].value_counts())

# Remove negative values
def filter_negative_values(row):
    variable = row['variable']
    value = row['value']

    if variable not in ['circumplex.arousal', 'circumplex.valence'] and value < 0:
        return False
    return True

## Check for dataframe shape and NaN values

In [6]:
check_df(advanced_df)

(376912, 4)
id :  <class 'pandas.core.series.Series'> , contains NaN: False
time :  <class 'pandas.core.series.Series'> , contains NaN: False
variable :  <class 'pandas.core.series.Series'> , contains NaN: False
value :  <class 'pandas.core.series.Series'> , contains NaN: True


## Checking which variables have NaN values

In [7]:
advanced_df.query('value.isnull()')

Unnamed: 0,id,time,variable,value
5709,AS14.01,2014-04-02 18:00:00.000,circumplex.arousal,
5731,AS14.01,2014-04-07 15:00:00.000,circumplex.arousal,
5773,AS14.01,2014-04-16 12:00:00.000,circumplex.arousal,
5797,AS14.01,2014-04-21 13:00:00.000,circumplex.arousal,
5836,AS14.01,2014-04-29 09:00:00.000,circumplex.arousal,
...,...,...,...,...
16859,AS14.33,2014-05-16 20:00:00.000,circumplex.valence,
16862,AS14.33,2014-05-17 12:00:00.000,circumplex.valence,
16882,AS14.33,2014-05-21 16:00:00.000,circumplex.valence,
16899,AS14.33,2014-05-24 23:00:00.000,circumplex.valence,


In [8]:
print('circumplex.arousal NaNs count: ', advanced_df[advanced_df['variable']=='circumplex.arousal'].isna().sum())
print('circumplex.valence NaNs count: ', advanced_df[advanced_df['variable']=='circumplex.valence'].isna().sum())

circumplex.arousal NaNs count:  id           0
time         0
variable     0
value       46
dtype: int64
circumplex.valence NaNs count:  id            0
time          0
variable      0
value       156
dtype: int64


In [9]:
print(advanced_df['variable'].value_counts())

screen                  96578
appCat.builtin          91288
appCat.communication    74276
appCat.entertainment    27125
activity                22965
appCat.social           19145
appCat.other             7650
circumplex.valence       5643
circumplex.arousal       5643
appCat.office            5642
mood                     5641
call                     5239
appCat.travel            2846
appCat.utilities         2487
sms                      1798
appCat.finance            939
appCat.unknown            939
appCat.game               813
appCat.weather            255
Name: variable, dtype: int64


In [10]:
grouped_data = advanced_df.groupby('variable')

# Calculate the min and max for each group
min_values = grouped_data['value'].min()
max_values = grouped_data['value'].max()

# Calculate the range for each group
range_values = max_values - min_values

# Combine the min, max, and range values into a single DataFrame
ranges = pd.concat([min_values, max_values, range_values], axis=1)
ranges.columns = ['min_value', 'max_value', 'range']

# Print the range for each unique feature in the 'variable' column
print("Range of values for each unique feature in the 'variable' column:")
print(ranges)

Range of values for each unique feature in the 'variable' column:
                      min_value  max_value       range
variable                                              
activity                  0.000      1.000       1.000
appCat.builtin       -82798.871  33960.246  116759.117
appCat.communication      0.006   9830.777    9830.771
appCat.entertainment     -0.011  32148.677   32148.688
appCat.finance            0.131    355.513     355.382
appCat.game               1.003   5491.793    5490.790
appCat.office             0.003  32708.818   32708.815
appCat.other              0.014   3892.038    3892.024
appCat.social             0.094  30000.906   30000.812
appCat.travel             0.080  10452.615   10452.535
appCat.unknown            0.111   2239.937    2239.826
appCat.utilities          0.246   1802.649    1802.403
appCat.weather            1.003    344.863     343.860
call                      1.000      1.000       0.000
circumplex.arousal       -2.000      2.000       4.000

### The only variables that contain NaN values are circumplex.arousal and circumplex.valence. Discussion needs to be made if these values should be removed or be converted to 0s

## Drop rows with NaN values

In [11]:
advanced_df.dropna(inplace=True)
check_df(advanced_df)

(376710, 4)
id :  <class 'pandas.core.series.Series'> , contains NaN: False
time :  <class 'pandas.core.series.Series'> , contains NaN: False
variable :  <class 'pandas.core.series.Series'> , contains NaN: False
value :  <class 'pandas.core.series.Series'> , contains NaN: False


In [12]:
advanced_df = advanced_df[advanced_df.apply(filter_negative_values, axis=1)]

## Sort dataframe by id and time

In [13]:
advanced_df.sort_values(['id', 'time'], inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  advanced_df.sort_values(['id', 'time'], inplace=True)


## Plot all features vs time for participant AS14.01

In [None]:
for name in advanced_df['variable'].unique():
    plot_feature(advanced_df, name, 'AS14.01')

## Plot variable values to check for distribution for participant AS14.01

In [None]:
for name in advanced_df['variable'].unique():
    plot_histograms(advanced_df, name)

## Calculate mean and sd for each variable

In [14]:
result = advanced_df.groupby('variable')['value'].agg(['mean', 'std'])

## Remove outliers for all participants

In [15]:
filtered_data = []
exception_list = ['mood', 'sms', 'call', 'circumplex.arousal', 'circumplex.valence', 'activity']

for name in advanced_df['variable'].unique():
    if name in exception_list:
        filtered_data.append(advanced_df[advanced_df['variable'] == name])
        continue
    else:
        group = advanced_df[advanced_df['variable'] == name]
        filtered_group = remove_outliers(group, 'value')
        filtered_data.append(filtered_group)

clean_df = pd.concat(filtered_data)

## Check variable vs time after outliers removed

In [None]:
for name in clean_df['variable'].unique():
    plot_feature(clean_df, name, 'AS14.01')

## Check variable distribution for participant AS14.01 with the new data

In [None]:
for name in advanced_df['variable'].unique():
    plot_histograms(advanced_df, name)

# Data engineering

## For each variable, calculate the average value if the variable is continuous and the count if the value is categorical, for each day, and distinguish the variable column to different features.

In [16]:
# Convert timestamp to datetime object and extract date
clean_df['time'] = pd.to_datetime(clean_df['time']).dt.date

# List of continuous and categorical variables
continuous_vars = ['mood', 'screen', 'appCat.builtin', 'appCat.communication', 'appCat.entertainment', 'appCat.finance', 'appCat.game', 'appCat.office', 'appCat.other', 'appCat.social', 'appCat.travel', 'appCat.unknown', 'appCat.utilities', 'appCat.weather', 'circumplex.arousal', 'circumplex.valence', 'activity']
categorical_vars = ['call', 'sms']

# Define aggregation functions for continuous and categorical variables
aggregations = {}
for var in continuous_vars:
    aggregations[var] = 'mean'
for var in categorical_vars:
    aggregations[var] = 'count'

In [17]:
# Filter the dataframe for continuous and categorical variables
cont_df = clean_df[clean_df['variable'].isin(continuous_vars)].pivot_table(index=['id', 'time'], columns='variable', values='value').reset_index()
cat_df = clean_df[clean_df['variable'].isin(categorical_vars)].pivot_table(index=['id', 'time'], columns='variable', values='value', aggfunc='count').reset_index()

# Merge the dataframes
result = pd.merge(cont_df, cat_df, on=['id', 'time'], how='outer')

In [34]:
result.head()

variable,id,time,activity,appCat.builtin,appCat.communication,appCat.entertainment,appCat.finance,appCat.game,appCat.office,appCat.other,...,appCat.travel,appCat.unknown,appCat.utilities,appCat.weather,circumplex.arousal,circumplex.valence,mood,screen,call,sms
0,AS14.01,2014-02-26,,,,,,,,,...,,,,,-0.25,0.75,6.25,,1.0,2.0
1,AS14.01,2014-02-27,,,,,,,,,...,,,,,0.0,0.333333,6.333333,,,
2,AS14.01,2014-03-20,0.081548,5.928071,44.719306,58.476,,,,5.6725,...,,45.173,10.537,,,,,182.879625,1.0,
3,AS14.01,2014-03-21,0.13405,7.180986,49.185839,53.024,6.193,,57.402,29.968875,...,40.5785,,13.10324,,0.2,0.2,6.2,89.972486,6.0,
4,AS14.01,2014-03-22,0.23688,5.126204,51.697062,46.662,7.025333,,,14.020429,...,37.305,,39.207,,0.6,0.5,6.4,47.613651,3.0,1.0


## Impute missing values

In [18]:
# Create an empty DataFrame with the same columns as the original
imputed_data = pd.DataFrame(columns=result.columns)

# Iterate through unique participant ids
for participant in result['id'].unique():
    participant_data = result[result['id'] == participant]

    # Calculate the mean of each column for the participant, excluding the 'id' column
    participant_mean = participant_data.drop(columns=['id']).mean()

    # Fill missing values with the participant-specific mean for each column
    participant_imputed = participant_data.copy()
    for column in participant_mean.index:
        participant_imputed[column] = participant_data[column].fillna(participant_mean[column])

    # Append the imputed participant data to the imputed_data DataFrame
    imputed_data = imputed_data.append(participant_imputed, ignore_index=True)

  participant_mean = participant_data.drop(columns=['id']).mean()
  imputed_data = imputed_data.append(participant_imputed, ignore_index=True)
  participant_mean = participant_data.drop(columns=['id']).mean()
  imputed_data = imputed_data.append(participant_imputed, ignore_index=True)
  participant_mean = participant_data.drop(columns=['id']).mean()
  imputed_data = imputed_data.append(participant_imputed, ignore_index=True)
  participant_mean = participant_data.drop(columns=['id']).mean()
  imputed_data = imputed_data.append(participant_imputed, ignore_index=True)
  participant_mean = participant_data.drop(columns=['id']).mean()
  imputed_data = imputed_data.append(participant_imputed, ignore_index=True)
  participant_mean = participant_data.drop(columns=['id']).mean()
  imputed_data = imputed_data.append(participant_imputed, ignore_index=True)
  participant_mean = participant_data.drop(columns=['id']).mean()
  imputed_data = imputed_data.append(participant_imputed, ignore_index=True)

In [19]:
imputed_data.replace(np.nan,0, inplace=True)

In [20]:
imputed_data.set_index('time', inplace=True)

# ML classifier

In [21]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import KFold, GridSearchCV, train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error, make_scorer

## Section for classic ML models

In [22]:
X = imputed_data.drop(columns=['mood'])
y = imputed_data['mood']
X.drop(['id'], axis = 1, inplace = True)

In [23]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

## Random forest classifier

In [24]:
rf_regressor = RandomForestRegressor(random_state=42)
kf = KFold(n_splits=5, shuffle=True, random_state=42)

In [25]:
# Set up the hyperparameter grid for the random forest
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20]
}

In [26]:
# Create custom scorers
scorers = {
    'mae': make_scorer(mean_absolute_error, greater_is_better=False),
    'mse': make_scorer(mean_squared_error, greater_is_better=False)
}

In [27]:
# Create the GridSearchCV object
grid_search = GridSearchCV(estimator=rf_regressor, param_grid=param_grid, cv=kf, scoring=scorers, refit='mae')

In [28]:
# Fit the GridSearchCV object to the data
grid_search.fit(X_train, y_train)

GridSearchCV(cv=KFold(n_splits=5, random_state=42, shuffle=True),
             estimator=RandomForestRegressor(random_state=42),
             param_grid={'max_depth': [None, 10, 20],
                         'n_estimators': [50, 100, 200]},
             refit='mae',
             scoring={'mae': make_scorer(mean_absolute_error, greater_is_better=False),
                      'mse': make_scorer(mean_squared_error, greater_is_better=False)})

In [29]:
# Get the best hyperparameters and the corresponding scores
best_params = grid_search.best_params_
best_mae_score = -grid_search.cv_results_['mean_test_mae'][grid_search.best_index_]
best_mse_score = -grid_search.cv_results_['mean_test_mse'][grid_search.best_index_]

In [30]:
print("Best hyperparameters:", best_params)
print(f"Best Mean Absolute Error (MAE): {best_mae_score}")
print(f"Best Mean Squared Error (MSE): {best_mse_score}")

# Train the random forest with the best hyperparameters on the whole dataset
best_rf_regressor = RandomForestRegressor(**best_params, random_state=42)

Best hyperparameters: {'max_depth': None, 'n_estimators': 100}
Best Mean Absolute Error (MAE): 0.2160761888338379
Best Mean Squared Error (MSE): 0.12263683220746242


In [31]:
best_rf_regressor.fit(X_train, y_train)

RandomForestRegressor(random_state=42)

In [32]:
y_pred = best_rf_regressor.predict(X_test)
# Calculate the evaluation metrics on the test dataset
test_mae = mean_absolute_error(y_test, y_pred)
test_mse = mean_squared_error(y_test, y_pred)

print(f"Test Mean Absolute Error (MAE): {test_mae}")
print(f"Test Mean Squared Error (MSE): {test_mse}")

Test Mean Absolute Error (MAE): 0.20474988371363362
Test Mean Squared Error (MSE): 0.12084605616884014


In [33]:
# Make predictions on the training dataset
y_train_pred = best_rf_regressor.predict(X_train)

# Calculate the evaluation metrics on the training dataset
train_mae = mean_absolute_error(y_train, y_train_pred)
train_mse = mean_squared_error(y_train, y_train_pred)

print(f"Train Mean Absolute Error (MAE): {train_mae}")
print(f"Train Mean Squared Error (MSE): {train_mse}")

print(f"Test Mean Absolute Error (MAE): {test_mae}")
print(f"Test Mean Squared Error (MSE): {test_mse}")

Train Mean Absolute Error (MAE): 0.0792783031713945
Train Mean Squared Error (MSE): 0.016289351601723026
Test Mean Absolute Error (MAE): 0.20474988371363362
Test Mean Squared Error (MSE): 0.12084605616884014
