In [25]:
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

In [26]:
advanced_df = pd.read_csv("dataset_mood_smartphone (1).csv", index_col=0)

In [27]:
advanced_df.head()

Unnamed: 0,id,time,variable,value
1,AS14.01,2014-02-26 13:00:00.000,mood,6.0
2,AS14.01,2014-02-26 15:00:00.000,mood,6.0
3,AS14.01,2014-02-26 18:00:00.000,mood,6.0
4,AS14.01,2014-02-26 21:00:00.000,mood,7.0
5,AS14.01,2014-02-27 09:00:00.000,mood,6.0


## Define function

In [37]:
# Check the shape of a dataframe, along with NaN values
def check_df(df):
    print(df.shape)
    for item in df.columns:
        print(item, ': ', type(df[item]), ', contains NaN:',df[item].isnull().values.any())

# Plot all the variables vs time
def plot_feature(df, variable_name):
    ylabel_text = f"{variable_name} value"
    title_text = f"{variable_name} vs time"
    plot = df[(df['variable'] == variable_name) & (df['id'] == specific_id)]

    plt.plot(plot['time'], plot['value'])
    plt.xlabel('time')
    plt.ylabel(ylabel_text)
    plt.title(title_text)
    plt.show()

# Plot histograms of all the variables
def plot_histograms(df, variable_name):
    df[(df['variable'] == variable_name) & (df['id'] == specific_id)].plot(kind='hist', edgecolor='black');
    plt.xlabel(variable_name)

# Remove outliers >0.75 and <0.25 of all values
def remove_outliers(df, column_name, multiplier=1.5):
    q1 = df[column_name].quantile(0.25)
    q2 = df[column_name].quantile(0.75)
    iqr = q2 - q1
    lower_bound = q1 - multiplier * iqr
    upper_bound = q2 + multiplier * iqr
    return df[(df[column_name] >= lower_bound) & (df[column_name] <= upper_bound)]

## Check for dataframe shape and NaN values

In [29]:
check_df(advanced_df)

(376912, 4)
id :  <class 'pandas.core.series.Series'> , contains NaN: False
time :  <class 'pandas.core.series.Series'> , contains NaN: False
variable :  <class 'pandas.core.series.Series'> , contains NaN: False
value :  <class 'pandas.core.series.Series'> , contains NaN: True


## Drop rows with NaN values

In [30]:
advanced_df.dropna(inplace=True)
check_df(advanced_df)

(376710, 4)
id :  <class 'pandas.core.series.Series'> , contains NaN: False
time :  <class 'pandas.core.series.Series'> , contains NaN: False
variable :  <class 'pandas.core.series.Series'> , contains NaN: False
value :  <class 'pandas.core.series.Series'> , contains NaN: False


## Sort dataframe by id and time

In [31]:
advanced_df = advanced_df.sort_values(['id', 'time'])

## Set the participant to AS14.01, plot all features vs time

In [32]:
specific_id = 'AS14.01'

In [None]:
for name in advanced_df['variable'].unique():
    plot_feature(advanced_df, name)

## Plot variable values to check for distribution for participant AS14.01

In [None]:
for name in advanced_df['variable'].unique():
    plot_histograms(advanced_df, name)

## Calculate mean and sd for each variable

In [36]:
result = advanced_df.groupby('variable')['value'].agg(['mean', 'std'])

## Remove outliers for all participants

In [38]:
filtered_data = []
for name in advanced_df['variable'].unique():
    group = advanced_df[advanced_df['variable'] == name]
    filtered_group = remove_outliers(group, 'value')
    filtered_data.append(filtered_group)

clean_df = pd.concat(filtered_data)

## Check variable distribution for participant AS14.01 with the new data

In [None]:
for name in clean_df['variable'].unique():
    plot_histograms(clean_df, name)

In [42]:
X = clean_df[clean_df['variable']!='mood']
y = clean_df[clean_df['variable']=='mood']