In [None]:
import numpy as np
import matplotlib.pyplot as plt
import scipy.stats as stats
import seaborn as sns
import pandas as pd
import os
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import itertools
from DMT_functions import *
from utility_functions import *


In [None]:
# Read the preprocessed, not-yet-imputed table and discard its `date` column in one step.
df = pd.read_csv('tables/preprocessed/non_imputed.csv').drop(columns=["date"])

# `nan_exploration` is a project helper (star import); its return value is shown as the cell output.
nans_preview = nan_exploration(df, create_pivot=False)
nans_preview


### Transformations

In [None]:

# Hand every numeric column to the project's skewness plot helper.
numerical_df = df.select_dtypes(include=[np.number])
plot_skewness_pyplot(numerical_df=numerical_df)

# Columns passed one by one to `plot_original_vs_transformed` (call order is preserved).
columns_to_inspect = [
    'appCat.office',
    'appCat.travel',
    'appCat.entertainment',
    'appCat.game',
    'appCat.builtin',
    'appCat.other',
    'appCat.social',
    'appCat.unknown',
    'appCat.utilities',
    'appCat.finance',
    'appCat.weather',
    'sms',
    'call',
]
for column_name in columns_to_inspect:
    plot_original_vs_transformed(data=df, column_name=column_name)


In [None]:
# Every column whose name starts with 'appCat' is folded into one total.
appCat_columns = [name for name in df.columns if name.startswith('appCat')]

# Row-wise sum of the appCat columns; min_count=1 leaves the total as NaN
# when all of a row's appCat values are NaN (instead of producing 0).
df['app_usage'] = df[appCat_columns].sum(axis=1, min_count=1)

# Re-run the skewness and original-vs-transformed helpers with the new column included.
numerical_df = df.select_dtypes(include=[np.number])
plot_skewness_pyplot(numerical_df)
plot_original_vs_transformed(data=df, column_name='app_usage')

nan_exploration(df, create_pivot=False, title="combined_app_usage")

# count zeros for all appCat columns
def count_zeros(df, columns):
    """Return a dict mapping each name in `columns` to how many entries of `df[name]` equal 0.

    Counts are plain Python ints. NaN entries are not counted, since NaN == 0 is False.
    """
    return {name: int(df[name].eq(0).sum()) for name in columns}
# Zero counts for each appCat column plus the combined total (shown as the cell output).
count_zeros(numerical_df, [*appCat_columns, 'app_usage'])




In [None]:
# Relationship between combined app usage and screen time.

# Scatter plot on an explicit Axes so title and size belong to this figure only.
fig, ax = plt.subplots(figsize=(10, 6))
sns.scatterplot(data=df, x='app_usage', y='screen', ax=ax)
ax.set_title('App Usage vs Screen Time')

# Correlation matrix of the numeric columns. With IPython's default settings only
# the last expression of a cell is rendered, so this must come last; placed above
# the plotting code its result was computed and silently discarded.
numerical_df.corr()


In [None]:
# Remove the individual appCat columns; their row-wise sum is already stored in `app_usage`.
df = df.drop(columns=appCat_columns)

# Refresh the numeric view and the skewness plot for the reduced table.
numerical_df = df.select_dtypes(include=[np.number])
plot_skewness_pyplot(numerical_df=numerical_df)

# Same helper, same order as before: sms, call, screen, app_usage.
for column_name in ('sms', 'call', 'screen', 'app_usage'):
    plot_original_vs_transformed(data=df, column_name=column_name)



In [None]:
# NaNs in app_usage, screen, call and sms are assumed to be 0.
for usage_col in ('app_usage', 'screen', 'call', 'sms'):
    df[usage_col] = df[usage_col].fillna(0)



In [None]:
# Check for NaNs again via the project helper `nan_exploration`;
# its return value is kept in `nans_preview` and shown as the cell output.
nans_preview = nan_exploration(df, create_pivot=False)
nans_preview

In [None]:
# Rows that have no `next_day_mood` value are removed (row-wise drop is the default axis).
df = df.dropna(subset=['next_day_mood'])
# NaN count per column after the drop (cell output).
df.isna().sum()

In [None]:
# NaN count per column; no imputation happens in this cell.
# NOTE(review): with IPython's default settings only a cell's last expression is rendered,
# and more statements follow in this cell, so this result is not displayed.
df.isna().sum()


# plot histograms with sns in subplots for all columns that contain NaNs
def plot_histograms(df, columns):
    """Draw a histogram (with KDE) for every column in `columns` that has missing values.

    Parameters
    ----------
    df : pandas.DataFrame
        Table to inspect.
    columns : list-like of str
        Candidate column names. Only candidates with at least one NaN in `df`
        are plotted, in the order they appear in `columns`.

    Returns
    -------
    None
        The figure is displayed through `plt.show()`. When no candidate column
        contains NaNs, a short message is printed and no figure is created.
    """
    nan_counts = df[columns].isna().sum()
    cols_with_nans = nan_counts[nan_counts > 0].index.tolist()

    if not cols_with_nans:
        # Without this guard the clean-up loop below would reference a loop
        # variable that was never bound.
        print("No columns with missing values to plot.")
        return

    ncols = 3
    # Size the grid by the number of columns actually plotted, not by all
    # candidates, so the figure is not dominated by deleted (empty) axes.
    n_plots = len(cols_with_nans)
    nrows = (n_plots + ncols - 1) // ncols

    # squeeze=False always yields a 2-D array of Axes, so flatten() is safe
    # for any grid shape.
    fig, axes = plt.subplots(
        nrows=nrows, ncols=ncols, figsize=(15, 5 * nrows), squeeze=False
    )
    axes = axes.flatten()

    for ax, col in zip(axes, cols_with_nans):
        sns.histplot(df[col], ax=ax, kde=True)
        ax.set_title(col)
        ax.set_xlabel('')
        ax.set_ylabel('')

    # Remove the unused axes at the end of the last row.
    for unused_ax in axes[n_plots:]:
        fig.delaxes(unused_ax)

    plt.tight_layout()
    plt.show()
# Offer every column of df as a candidate; the function itself keeps only those containing NaNs.
plot_histograms(df=df, columns=df.columns)



In [None]:
# mode imputation within each `id_num` group
def impute_mode_groupped(df, columns):
    """Fill NaNs in each of `columns` with the mode of the same column inside its `id_num` group.

    `df` is modified in place and also returned. A group whose values are all
    NaN has no mode and is left unchanged. When a group has several modes, the
    first entry of `Series.mode()` is used.
    """
    def _fill_with_group_mode(values):
        modes = values.mode()
        if modes.empty:
            return values
        return values.fillna(modes[0])

    for name in columns:
        grouped = df.groupby(['id_num'])[name]
        df[name] = grouped.transform(_fill_with_group_mode)
    return df

# Mode-impute every column whose name ends with 'min' or 'max'.
# NOTE(review): `impute_mode` is not defined in any cell of this notebook (it could only
# come from the star imports), while `impute_mode_groupped` defined just above was never
# called. The locally defined function is used here so the cell runs on a fresh kernel;
# confirm that per-`id_num` mode imputation is the intended behaviour.
min_max_columns = df.columns[df.columns.str.endswith('min') | df.columns.str.endswith('max')]
df = impute_mode_groupped(df, min_max_columns)


In [None]:
# mean imputation within each `id_num` group

def impute_mean_grouped(df, columns):
    """Fill NaNs in each of `columns` with the mean of the same column inside its `id_num` group.

    `df` is modified in place and also returned. A group whose values are all
    NaN has a NaN mean, so its entries stay NaN.
    """
    def _fill_with_group_mean(values):
        return values.fillna(values.mean())

    for name in columns:
        df[name] = df.groupby('id_num')[name].transform(_fill_with_group_mean)
    return df

# Group-mean imputation is applied to every numeric column of df.
numeric_columns = df.select_dtypes(include=[np.number]).columns.tolist()
print(numeric_columns)
df = impute_mean_grouped(df, numeric_columns)

# Persist the imputed table.
df.to_csv('tables/preprocessed/mean_mode_imputation_combinedAppCat.csv', index=False)

# Remaining NaN count per column. Kept as the cell's last expression so it is
# actually rendered; placed before `to_csv` its result was silently discarded.
df.isna().sum()

In [None]:
# Pass the imputed table to the project's plotting helper (star import).
# NOTE(review): presumably draws one time series per participant — confirm against the helper's definition.
plotly_all_participants_timeseries(df, show_plot=True, title="mean_mode_imputation")