In [None]:
!pip install torch pytorch-tabnet --quiet
import pandas as pd
import kagglehub
import os
import torch
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import (
    confusion_matrix,
    accuracy_score,
    precision_score,
    recall_score,
    f1_score
)
from pytorch_tabnet.tab_model import TabNetClassifier

In [None]:
# Get dataset
# Download the Kaggle dataset; `path` is the local directory holding its files.
path = kagglehub.dataset_download("mahdimashayekhi/social-media-vs-productivity")

# os.listdir() returns entries in arbitrary order and may contain non-CSV
# files, so select the CSV explicitly (sorted for a deterministic choice)
# rather than taking whatever entry happens to come first.
csv_candidates = sorted(
    name for name in os.listdir(path) if name.lower().endswith(".csv")
)
if not csv_candidates:
    raise FileNotFoundError(f"No CSV file found in downloaded dataset directory: {path}")
csv_file = csv_candidates[0]
csv_file_path = os.path.join(path, csv_file)
df = pd.read_csv(csv_file_path)
df

Unnamed: 0,age,gender,job_type,daily_social_media_time,social_platform_preference,number_of_notifications,work_hours_per_day,perceived_productivity_score,actual_productivity_score,stress_level,sleep_hours,screen_time_before_sleep,breaks_during_work,uses_focus_apps,has_digital_wellbeing_enabled,coffee_consumption_per_day,days_feeling_burnout_per_month,weekly_offline_hours,job_satisfaction_score
0,56,Male,Unemployed,4.180940,Facebook,61,6.753558,8.040464,7.291555,4.0,5.116546,0.419102,8,False,False,4,11,21.927072,6.336688
1,46,Male,Health,3.249603,Twitter,59,9.169296,5.063368,5.165093,7.0,5.103897,0.671519,7,True,True,2,25,0.000000,3.412427
2,32,Male,Finance,,Twitter,57,7.910952,3.861762,3.474053,4.0,8.583222,0.624378,0,True,False,3,17,10.322044,2.474944
3,60,Female,Unemployed,,Facebook,59,6.355027,2.916331,1.774869,6.0,6.052984,1.204540,1,False,False,0,4,23.876616,1.733670
4,25,Male,IT,,Telegram,66,6.214096,8.868753,,7.0,5.405706,1.876254,1,False,True,1,30,10.653519,9.693060
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
29995,34,Female,Health,1.877297,Facebook,59,10.226358,3.348512,3.465815,8.0,5.480462,1.412655,9,False,False,4,5,21.776927,
29996,39,Male,Health,4.437784,Instagram,46,4.692862,8.133213,6.659294,8.0,3.045393,0.148936,3,False,False,1,29,4.111370,6.155613
29997,42,Male,Education,17.724981,TikTok,64,10.915036,8.611005,8.658912,5.0,5.491520,1.224296,10,False,False,1,2,1.888315,6.285237
29998,20,Female,Education,3.796634,Instagram,56,6.937410,7.767076,6.895583,8.0,6.816069,0.234483,1,False,False,2,9,12.511871,7.854711


In [None]:
# Keep only the rows that have an actual_productivity_score; .copy() gives an
# independent frame so later column assignments do not touch a slice of the
# originally loaded data.
has_target = df['actual_productivity_score'].notna()
df = df.loc[has_target].copy()

In [None]:
# Binary target: 1 when actual_productivity_score is strictly above the median
# of the remaining rows, 0 otherwise (a score equal to the median is labelled 0).
median_score = df['actual_productivity_score'].median()
is_high_productivity = df['actual_productivity_score'].gt(median_score)
df['productivity_label'] = is_high_productivity.astype(int)

In [None]:
# Final Dataset
# Displays the labelled frame: rows without a target removed and the
# productivity_label column added.
# NOTE(review): the saved output of this cell already shows median-filled values
# and float-typed count columns, which suggests it was last executed after the
# imputation cell further down — confirm with Restart Kernel -> Run All.
df

Unnamed: 0,age,gender,job_type,daily_social_media_time,social_platform_preference,number_of_notifications,work_hours_per_day,perceived_productivity_score,actual_productivity_score,stress_level,sleep_hours,screen_time_before_sleep,breaks_during_work,uses_focus_apps,has_digital_wellbeing_enabled,coffee_consumption_per_day,days_feeling_burnout_per_month,weekly_offline_hours,job_satisfaction_score,productivity_label
0,56.0,Male,Unemployed,4.180940,Facebook,61.0,6.753558,8.040464,7.291555,4.0,5.116546,0.419102,8.0,False,False,4.0,11.0,21.927072,6.336688,1
1,46.0,Male,Health,3.249603,Twitter,59.0,9.169296,5.063368,5.165093,7.0,5.103897,0.671519,7.0,True,True,2.0,25.0,0.000000,3.412427,1
2,32.0,Male,Finance,3.028026,Twitter,57.0,7.910952,3.861762,3.474053,4.0,8.583222,0.624378,0.0,True,False,3.0,17.0,10.322044,2.474944,0
3,60.0,Female,Unemployed,3.028026,Facebook,59.0,6.355027,2.916331,1.774869,6.0,6.052984,1.204540,1.0,False,False,0.0,4.0,23.876616,1.733670,0
5,38.0,Male,Finance,1.512568,Twitter,50.0,6.429312,5.522335,4.081026,5.0,5.515251,1.518612,5.0,False,True,5.0,2.0,0.000000,4.568728,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
29995,34.0,Female,Health,1.877297,Facebook,59.0,10.226358,3.348512,3.465815,8.0,5.480462,1.412655,9.0,False,False,4.0,5.0,21.776927,4.944723,0
29996,39.0,Male,Health,4.437784,Instagram,46.0,4.692862,8.133213,6.659294,8.0,3.045393,0.148936,3.0,False,False,1.0,29.0,4.111370,6.155613,1
29997,42.0,Male,Education,17.724981,TikTok,64.0,10.915036,8.611005,8.658912,5.0,5.491520,1.224296,10.0,False,False,1.0,2.0,1.888315,6.285237,1
29998,20.0,Female,Education,3.796634,Instagram,56.0,6.937410,7.767076,6.895583,8.0,6.816069,0.234483,1.0,False,False,2.0,9.0,12.511871,7.854711,1


# Data Preprocessing

In [None]:
# Separate columns by numeric, categorical, and boolean
# The raw score and the label derived from it are targets, not model inputs.
target_cols = ['actual_productivity_score', 'productivity_label']

# include='number' matches every numeric width, so the label column is handled
# the same way whether astype(int) produced int64 or int32 (this can differ by
# platform / NumPy version). Filtering instead of list.remove() also avoids a
# ValueError when a target column is not part of the selection.
# Boolean columns are not matched by 'number'.
numeric_cols = [
    col for col in df.select_dtypes(include='number').columns
    if col not in target_cols
]
categorical_cols = df.select_dtypes(include=['object']).columns.tolist()
boolean_cols     = df.select_dtypes(include=['bool']).columns.tolist()

In [None]:
# Impute missing values in the numeric columns with each column's median
# NOTE(review): the imputer is fitted on the whole frame, before the train/test
# split performed later in the notebook, so the medians also reflect rows that
# end up in the test set.
imputer = SimpleImputer(strategy='median')
imputer.fit(df[numeric_cols])
df[numeric_cols] = imputer.transform(df[numeric_cols])

In [None]:
# Boolean flags become 0/1 integers so they can sit next to the numeric features
bools_df = df.loc[:, boolean_cols].astype(int)

In [None]:
# One-hot encode the categorical columns
# sparse_output=False yields a dense array that can be wrapped in a DataFrame
# directly; handle_unknown='ignore' makes categories not seen during fit encode
# as all zeros at transform time instead of raising.
encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
encoder.fit(df[categorical_cols])
encoded_cat = encoder.transform(df[categorical_cols])

# Keep df's index so the encoded columns line up row-for-row with the other
# feature groups when they are concatenated.
onehot_columns = encoder.get_feature_names_out(categorical_cols)
encoded_cat_df = pd.DataFrame(encoded_cat, index=df.index, columns=onehot_columns)

In [None]:
# Z-score standardization of the numeric columns (zero mean, unit variance)
scaler = StandardScaler()
scaler.fit(df[numeric_cols])
scaled_nums = scaler.transform(df[numeric_cols])

# Re-attach column names and df's index to the scaled array
scaled_nums_df = pd.DataFrame(data=scaled_nums, index=df.index, columns=numeric_cols)

In [None]:
# Assemble the model inputs column-wise (scaled numerics, 0/1 flags, one-hot
# categoricals — all indexed like df) and take the binary label as the target
feature_parts = [scaled_nums_df, bools_df, encoded_cat_df]
features = pd.concat(feature_parts, axis='columns')
labels   = df.loc[:, 'productivity_label']

In [None]:
# Final preprocessed data
# Displays the assembled feature matrix: standardized numeric columns, 0/1
# boolean flags, and one-hot encoded categorical columns.
features

Unnamed: 0,age,daily_social_media_time,number_of_notifications,work_hours_per_day,perceived_productivity_score,stress_level,sleep_hours,screen_time_before_sleep,breaks_during_work,coffee_consumption_per_day,...,job_type_Finance,job_type_Health,job_type_IT,job_type_Student,job_type_Unemployed,social_platform_preference_Facebook,social_platform_preference_Instagram,social_platform_preference_Telegram,social_platform_preference_TikTok,social_platform_preference_Twitter
0,1.049028,0.543356,0.135405,-0.118185,1.286565,-0.558910,-0.992073,-0.961602,0.948789,1.416122,...,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0
1,0.326566,0.072634,-0.123443,1.090402,-0.225762,0.521773,-1.001142,-0.561249,0.633784,-0.002207,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2,-0.684882,-0.039357,-0.382291,0.460856,-0.836163,-0.558910,1.493340,-0.636019,-1.571254,0.706958,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,1.338014,-0.039357,-0.123443,-0.317569,-1.316431,0.161545,-0.320699,0.284164,-1.256248,-1.420536,...,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0
5,-0.251404,-0.805309,-1.288260,-0.280404,0.007387,-0.198683,-0.706224,0.782306,0.003773,2.125286,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
29995,-0.540390,-0.620965,-0.123443,1.619247,-1.096888,0.882001,-0.731166,0.614251,1.263794,1.416122,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
29996,-0.179158,0.673172,-1.805956,-1.149146,1.333681,0.882001,-2.476974,-1.390106,-0.626238,-0.711371,...,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
29997,0.037581,7.388872,0.523677,1.963791,1.576393,-0.198683,-0.723237,0.315499,1.578800,-0.711371,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
29998,-1.551837,0.349118,-0.511715,-0.026205,1.147688,0.882001,0.226390,-1.254423,-1.256248,-0.002207,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0


In [None]:
# Split training and testing data (90:10)
# random_state pins the shuffle so the same split is produced on every run.
X_train, X_test, y_train, y_test = train_test_split(
    features, labels,
    test_size=0.10, random_state=42
)

# Interpolate the counts into the f-strings; the two spaces after the colon
# keep the printed text identical to the saved output.
print(f"Number of training sample:  {len(X_train)}")
print(f"Number of test     sample:  {len(X_test)}")

Number of training sample:  24871
Number of test     sample:  2764


# FNN/MLP Model

In [None]:
# Initialize and train the Feedforward Neural Network
# A single hidden layer of 10 ReLU units, trained with SGD using the adaptive
# learning-rate schedule starting at 0.01; random_state fixes the weight
# initialization and shuffling so the run is repeatable.
mlp = MLPClassifier(
    solver='sgd',
    learning_rate='adaptive',
    learning_rate_init=0.01,
    hidden_layer_sizes=(10,),
    activation='relu',
    alpha=1e-4,
    max_iter=2000,
    random_state=42
)
mlp.fit(X_train, y_train)

# Predict on the test set
y_pred_mlp = mlp.predict(X_test)

# Evaluate the model
# labels=[0, 1] forces a 2x2 matrix, so the four-way unpacking still works if
# the test labels or the predictions happen to contain only one class.
tn, fp, fn, tp = confusion_matrix(y_test, y_pred_mlp, labels=[0, 1]).ravel()
acc  = accuracy_score(y_test, y_pred_mlp)
prec = precision_score(y_test, y_pred_mlp)
rec  = recall_score(y_test, y_pred_mlp)
f1   = f1_score(y_test, y_pred_mlp)

print(f"Confusion Matrix -> TP: {tp}, FP: {fp}, FN: {fn}, TN: {tn}")
print(f"Accuracy  : {acc:.4f}")
print(f"Precision : {prec:.4f}")
print(f"Recall    : {rec:.4f}")
print(f"F1‐Score  : {f1:.4f}")

Confusion Matrix -> TP: 1269, FP: 85, FN: 92, TN: 1318
Accuracy  : 0.9360
Precision : 0.9372
Recall    : 0.9324
F1‐Score  : 0.9348


In [None]:
# Side-by-side table of the true test labels and the FNN/MLP predictions,
# renumbered from 0 so the original dataset row ids are not carried over
result_mlp = pd.DataFrame({
    'Productivity Level': y_test,
    'FNN/MLP Prediction': y_pred_mlp
}).reset_index(drop=True)
result_mlp

Unnamed: 0,Productivity Level,FNN/MLP Prediction
0,1,1
1,0,0
2,1,1
3,0,0
4,1,0
...,...,...
2759,1,1
2760,1,1
2761,1,1
2762,0,0


# TabNet

In [None]:
# Convert training/testing sets to float32 NumPy arrays
# (labels are flattened to 1-D int64 arrays)
X_train_np = X_train.values.astype(np.float32)
X_test_np  = X_test.values.astype(np.float32)
y_train_np = y_train.values.reshape(-1,).astype(np.int64)
y_test_np  = y_test.values.reshape(-1,).astype(np.int64)

# Hold out part of the training data for early stopping. TabNet monitors the
# last entry of eval_set to decide when to stop and which weights to keep, so
# passing the test set there would tune the model on the very data it is
# scored on afterwards and make the reported test metrics optimistic.
X_fit_np, X_valid_np, y_fit_np, y_valid_np = train_test_split(
    X_train_np, y_train_np,
    test_size=0.10, random_state=42, stratify=y_train_np
)

# Initialize and train the TabNet
tabnet_clf = TabNetClassifier(
    n_d=8,
    n_a=8,
    n_steps=5,
    gamma=1.5,
    lambda_sparse=1e-3,
    optimizer_fn=torch.optim.Adam,
    optimizer_params=dict(lr=2e-2),
    mask_type='sparsemax'
)
tabnet_clf.fit(
    X_train=X_fit_np, y_train=y_fit_np,
    # Validation split goes last: it is the one that drives early stopping.
    eval_set=[(X_fit_np, y_fit_np), (X_valid_np, y_valid_np)],
    eval_name=['train','valid'],
    eval_metric=['accuracy'],
    max_epochs=100,
    patience=10,
    batch_size=256,
    virtual_batch_size=128,
    num_workers=0,
    drop_last=False
)

# Predict on the test set (touched only here, after training has finished)
y_pred_tab = tabnet_clf.predict(X_test_np).reshape(-1)

# Evaluate the model
# labels=[0, 1] forces a 2x2 matrix, so the four-way unpacking still works if
# the test labels or the predictions happen to contain only one class.
tn_tab, fp_tab, fn_tab, tp_tab = confusion_matrix(y_test_np, y_pred_tab, labels=[0, 1]).ravel()
acc_tab   = accuracy_score(y_test_np, y_pred_tab)
prec_tab  = precision_score(y_test_np, y_pred_tab)
rec_tab   = recall_score(y_test_np, y_pred_tab)
f1_tab    = f1_score(y_test_np, y_pred_tab)

print(f"Confusion Matrix → TP: {tp_tab}, FP: {fp_tab}, FN: {fn_tab}, TN: {tn_tab}")
print(f"Accuracy  : {acc_tab:.4f}")
print(f"Precision : {prec_tab:.4f}")
print(f"Recall    : {rec_tab:.4f}")
print(f"F1‐Score  : {f1_tab:.4f}")



epoch 0  | loss: 0.45556 | train_accuracy: 0.90455 | test_accuracy: 0.90557 |  0:00:06s
epoch 1  | loss: 0.18779 | train_accuracy: 0.92586 | test_accuracy: 0.92583 |  0:00:12s
epoch 2  | loss: 0.16567 | train_accuracy: 0.9261  | test_accuracy: 0.92511 |  0:00:18s
epoch 3  | loss: 0.16338 | train_accuracy: 0.93462 | test_accuracy: 0.93126 |  0:00:23s
epoch 4  | loss: 0.16056 | train_accuracy: 0.9335  | test_accuracy: 0.93379 |  0:00:29s
epoch 5  | loss: 0.16649 | train_accuracy: 0.93185 | test_accuracy: 0.93162 |  0:00:35s
epoch 6  | loss: 0.16005 | train_accuracy: 0.93112 | test_accuracy: 0.92909 |  0:00:40s
epoch 7  | loss: 0.15735 | train_accuracy: 0.93506 | test_accuracy: 0.93596 |  0:00:46s
epoch 8  | loss: 0.15627 | train_accuracy: 0.93663 | test_accuracy: 0.93343 |  0:00:52s
epoch 9  | loss: 0.1565  | train_accuracy: 0.93442 | test_accuracy: 0.93524 |  0:00:58s
epoch 10 | loss: 0.15072 | train_accuracy: 0.93237 | test_accuracy: 0.93017 |  0:01:03s
epoch 11 | loss: 0.15642 | train



Confusion Matrix → TP: 1259, FP: 72, FN: 102, TN: 1331
Accuracy  : 0.9370
Precision : 0.9459
Recall    : 0.9251
F1‐Score  : 0.9354


In [None]:
# Side-by-side table of the true test labels and the TabNet predictions,
# renumbered from 0 so the original dataset row ids are not carried over
result_tab = pd.DataFrame({
    'Productivity Level': y_test,
    'TabNet Prediction': y_pred_tab
}).reset_index(drop=True)
result_tab

Unnamed: 0,Productivity Level,TabNet Prediction
0,1,1
1,0,0
2,1,1
3,0,0
4,1,0
...,...,...
2759,1,1
2760,1,1
2761,1,1
2762,0,0
