In [1]:
!pip install torch pytorch-tabnet --quiet
import pandas as pd
import kagglehub
import os
import torch
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import (
    confusion_matrix,
    accuracy_score,
    precision_score,
    recall_score,
    f1_score
)
from pytorch_tabnet.tab_model import TabNetClassifier

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m3.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m33.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.6/24.6 MB[0m [31m25.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m883.7/883.7 kB[0m [31m26.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m664.8/664.8 MB[0m [31m2.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m211.5/211.5 MB[0m [31m5.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m56.3/56.3 MB[0m [31m12.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m127.9/127.9 MB[0m [31m7.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [2]:
# Get dataset
path = kagglehub.dataset_download("mahdimashayekhi/social-media-vs-productivity")
csv_file = os.listdir(path)[0]
csv_file_path = os.path.join(path, csv_file)
df = pd.read_csv(csv_file_path)
df

Unnamed: 0,age,gender,job_type,daily_social_media_time,social_platform_preference,number_of_notifications,work_hours_per_day,perceived_productivity_score,actual_productivity_score,stress_level,sleep_hours,screen_time_before_sleep,breaks_during_work,uses_focus_apps,has_digital_wellbeing_enabled,coffee_consumption_per_day,days_feeling_burnout_per_month,weekly_offline_hours,job_satisfaction_score
0,56,Male,Unemployed,4.180940,Facebook,61,6.753558,8.040464,7.291555,4.0,5.116546,0.419102,8,False,False,4,11,21.927072,6.336688
1,46,Male,Health,3.249603,Twitter,59,9.169296,5.063368,5.165093,7.0,5.103897,0.671519,7,True,True,2,25,0.000000,3.412427
2,32,Male,Finance,,Twitter,57,7.910952,3.861762,3.474053,4.0,8.583222,0.624378,0,True,False,3,17,10.322044,2.474944
3,60,Female,Unemployed,,Facebook,59,6.355027,2.916331,1.774869,6.0,6.052984,1.204540,1,False,False,0,4,23.876616,1.733670
4,25,Male,IT,,Telegram,66,6.214096,8.868753,,7.0,5.405706,1.876254,1,False,True,1,30,10.653519,9.693060
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
29995,34,Female,Health,1.877297,Facebook,59,10.226358,3.348512,3.465815,8.0,5.480462,1.412655,9,False,False,4,5,21.776927,
29996,39,Male,Health,4.437784,Instagram,46,4.692862,8.133213,6.659294,8.0,3.045393,0.148936,3,False,False,1,29,4.111370,6.155613
29997,42,Male,Education,17.724981,TikTok,64,10.915036,8.611005,8.658912,5.0,5.491520,1.224296,10,False,False,1,2,1.888315,6.285237
29998,20,Female,Education,3.796634,Instagram,56,6.937410,7.767076,6.895583,8.0,6.816069,0.234483,1,False,False,2,9,12.511871,7.854711


In [3]:
# Remove rows where the actual_productivity_score column is null
df = df.dropna(subset=['actual_productivity_score']).copy()

In [4]:
# Create a new column to identify high (1) or low (0) productivity level
median_score = df['actual_productivity_score'].median()
df['productivity_label'] = (df['actual_productivity_score'] > median_score).astype(int)

In [5]:
# Final Dataset
df

Unnamed: 0,age,gender,job_type,daily_social_media_time,social_platform_preference,number_of_notifications,work_hours_per_day,perceived_productivity_score,actual_productivity_score,stress_level,sleep_hours,screen_time_before_sleep,breaks_during_work,uses_focus_apps,has_digital_wellbeing_enabled,coffee_consumption_per_day,days_feeling_burnout_per_month,weekly_offline_hours,job_satisfaction_score,productivity_label
0,56,Male,Unemployed,4.180940,Facebook,61,6.753558,8.040464,7.291555,4.0,5.116546,0.419102,8,False,False,4,11,21.927072,6.336688,1
1,46,Male,Health,3.249603,Twitter,59,9.169296,5.063368,5.165093,7.0,5.103897,0.671519,7,True,True,2,25,0.000000,3.412427,1
2,32,Male,Finance,,Twitter,57,7.910952,3.861762,3.474053,4.0,8.583222,0.624378,0,True,False,3,17,10.322044,2.474944,0
3,60,Female,Unemployed,,Facebook,59,6.355027,2.916331,1.774869,6.0,6.052984,1.204540,1,False,False,0,4,23.876616,1.733670,0
5,38,Male,Finance,1.512568,Twitter,50,6.429312,,4.081026,5.0,5.515251,1.518612,5,False,True,5,2,0.000000,4.568728,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
29995,34,Female,Health,1.877297,Facebook,59,10.226358,3.348512,3.465815,8.0,5.480462,1.412655,9,False,False,4,5,21.776927,,0
29996,39,Male,Health,4.437784,Instagram,46,4.692862,8.133213,6.659294,8.0,3.045393,0.148936,3,False,False,1,29,4.111370,6.155613,1
29997,42,Male,Education,17.724981,TikTok,64,10.915036,8.611005,8.658912,5.0,5.491520,1.224296,10,False,False,1,2,1.888315,6.285237,1
29998,20,Female,Education,3.796634,Instagram,56,6.937410,7.767076,6.895583,8.0,6.816069,0.234483,1,False,False,2,9,12.511871,7.854711,1


# Data Preprocessing

In [6]:
# Separate columns by numeric, categorical, and boolean
numeric_cols     = df.select_dtypes(include=['int64', 'float64']).columns.tolist()
numeric_cols.remove('actual_productivity_score')
numeric_cols.remove('productivity_label')
categorical_cols = df.select_dtypes(include=['object']).columns.tolist()
boolean_cols     = df.select_dtypes(include=['bool']).columns.tolist()

In [7]:
# Input median value to the missing values in numerical columns
imputer = SimpleImputer(strategy='median')
df[numeric_cols] = imputer.fit_transform(df[numeric_cols])

In [8]:
# Convert boolean columns to 1/0
bools_df = df[boolean_cols].astype(int)

In [9]:
# One-hot encoding to categorical columns
encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
encoded_cat = encoder.fit_transform(df[categorical_cols])
encoded_cat_df = pd.DataFrame(
    encoded_cat,
    columns=encoder.get_feature_names_out(categorical_cols),
    index=df.index
)

In [10]:
# Normalize numerical values using Z-Score Standardization
scaler = StandardScaler()
scaled_nums = scaler.fit_transform(df[numeric_cols])
scaled_nums_df = pd.DataFrame(scaled_nums, columns=numeric_cols, index=df.index)

In [11]:
# Combine the final preprocessed data
features = pd.concat([scaled_nums_df, bools_df, encoded_cat_df],axis=1)
labels   = df['productivity_label']

In [12]:
# Final preprocessed data
features

Unnamed: 0,age,daily_social_media_time,number_of_notifications,work_hours_per_day,perceived_productivity_score,stress_level,sleep_hours,screen_time_before_sleep,breaks_during_work,coffee_consumption_per_day,...,job_type_Finance,job_type_Health,job_type_IT,job_type_Student,job_type_Unemployed,social_platform_preference_Facebook,social_platform_preference_Instagram,social_platform_preference_Telegram,social_platform_preference_TikTok,social_platform_preference_Twitter
0,1.049028,0.543356,0.135405,-0.118185,1.286565,-0.558910,-0.992073,-0.961602,0.948789,1.416122,...,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0
1,0.326566,0.072634,-0.123443,1.090402,-0.225762,0.521773,-1.001142,-0.561249,0.633784,-0.002207,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2,-0.684882,-0.039357,-0.382291,0.460856,-0.836163,-0.558910,1.493340,-0.636019,-1.571254,0.706958,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,1.338014,-0.039357,-0.123443,-0.317569,-1.316431,0.161545,-0.320699,0.284164,-1.256248,-1.420536,...,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0
5,-0.251404,-0.805309,-1.288260,-0.280404,0.007387,-0.198683,-0.706224,0.782306,0.003773,2.125286,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
29995,-0.540390,-0.620965,-0.123443,1.619247,-1.096888,0.882001,-0.731166,0.614251,1.263794,1.416122,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
29996,-0.179158,0.673172,-1.805956,-1.149146,1.333681,0.882001,-2.476974,-1.390106,-0.626238,-0.711371,...,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
29997,0.037581,7.388872,0.523677,1.963791,1.576393,-0.198683,-0.723237,0.315499,1.578800,-0.711371,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
29998,-1.551837,0.349118,-0.511715,-0.026205,1.147688,0.882001,0.226390,-1.254423,-1.256248,-0.002207,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0


In [13]:
# Split training and testing data (80:20)
X_train, X_test, y_train, y_test = train_test_split(
    features, labels,
    test_size=0.20, random_state=42
)

print(f"Number of training sample: ", len(X_train))
print(f"Number of test     sample: ", len(X_test))

Number of training sample:  22108
Number of test     sample:  5527


# FNN/MLP Model

In [14]:
# Initialize and train the Feedforward Neural Network
mlp = MLPClassifier(
    solver='sgd',
    learning_rate='adaptive',
    learning_rate_init=0.01,
    hidden_layer_sizes=(10,),
    activation='relu',
    alpha=1e-4,
    max_iter=2000,
    random_state=42
)
mlp.fit(X_train, y_train)

# Predict on the test set
y_pred_mlp = mlp.predict(X_test)

# Evaluate the model
tn, fp, fn, tp = confusion_matrix(y_test, y_pred_mlp).ravel()
acc  = accuracy_score(y_test, y_pred_mlp)
prec = precision_score(y_test, y_pred_mlp)
rec  = recall_score(y_test, y_pred_mlp)
f1   = f1_score(y_test, y_pred_mlp)

print(f"Confusion Matrix -> TP: {tp}, FP: {fp}, FN: {fn}, TN: {tn}")
print(f"Accuracy  : {acc:.4f}")
print(f"Precision : {prec:.4f}")
print(f"Recall    : {rec:.4f}")
print(f"F1‐Score  : {f1:.4f}")

Confusion Matrix -> TP: 2556, FP: 160, FN: 195, TN: 2616
Accuracy  : 0.9358
Precision : 0.9411
Recall    : 0.9291
F1‐Score  : 0.9351


In [15]:
# Print Result
result_mlp = pd.DataFrame({
    'Productivity Level': y_test,
    'FNN/MLP Prediction': y_pred_mlp
})
result_mlp = result_mlp.reset_index(drop=True)
result_mlp

Unnamed: 0,Productivity Level,FNN/MLP Prediction
0,1,1
1,0,0
2,1,1
3,0,0
4,1,0
...,...,...
5522,1,0
5523,0,0
5524,0,0
5525,0,0


# TabNet

In [16]:
# Convert training/testing sets to float32 NumPy arrays
X_train_np = X_train.values.astype(np.float32)
X_test_np  = X_test.values.astype(np.float32)
y_train_np = y_train.values.reshape(-1,).astype(np.int64)
y_test_np  = y_test.values.reshape(-1,).astype(np.int64)

# Initialize and train the TabNet
tabnet_clf = TabNetClassifier(
    n_d=8,
    n_a=8,
    n_steps=5,
    gamma=1.5,
    lambda_sparse=1e-3,
    optimizer_fn=torch.optim.Adam,
    optimizer_params=dict(lr=2e-2),
    mask_type='sparsemax'
)
tabnet_clf.fit(
    X_train=X_train_np, y_train=y_train_np,
    eval_set=[(X_train_np, y_train_np), (X_test_np, y_test_np)],
    eval_name=['train','test'],
    eval_metric=['accuracy'],
    max_epochs=100,
    patience=10,
    batch_size=256,
    virtual_batch_size=128,
    num_workers=0,
    drop_last=False
)

# Predict on the test set
y_pred_tab = tabnet_clf.predict(X_test_np).reshape(-1)

# Evaluate the model
tn_tab, fp_tab, fn_tab, tp_tab = confusion_matrix(y_test_np, y_pred_tab).ravel()
acc_tab   = accuracy_score(y_test_np, y_pred_tab)
prec_tab  = precision_score(y_test_np, y_pred_tab)
rec_tab   = recall_score(y_test_np, y_pred_tab)
f1_tab    = f1_score(y_test_np, y_pred_tab)

print(f"Confusion Matrix -> TP: {tp_tab}, FP: {fp_tab}, FN: {fn_tab}, TN: {tn_tab}")
print(f"Accuracy  : {acc_tab:.4f}")
print(f"Precision : {prec_tab:.4f}")
print(f"Recall    : {rec_tab:.4f}")
print(f"F1‐Score  : {f1_tab:.4f}")



epoch 0  | loss: 0.42371 | train_accuracy: 0.92844 | test_accuracy: 0.9269  |  0:00:12s
epoch 1  | loss: 0.17688 | train_accuracy: 0.93428 | test_accuracy: 0.93541 |  0:00:26s
epoch 2  | loss: 0.16139 | train_accuracy: 0.93423 | test_accuracy: 0.93885 |  0:00:31s
epoch 3  | loss: 0.16446 | train_accuracy: 0.9355  | test_accuracy: 0.93559 |  0:00:37s
epoch 4  | loss: 0.16063 | train_accuracy: 0.93057 | test_accuracy: 0.93233 |  0:00:42s
epoch 5  | loss: 0.16535 | train_accuracy: 0.93306 | test_accuracy: 0.93631 |  0:00:48s
epoch 6  | loss: 0.15852 | train_accuracy: 0.93477 | test_accuracy: 0.93848 |  0:00:53s
epoch 7  | loss: 0.16209 | train_accuracy: 0.93188 | test_accuracy: 0.93487 |  0:00:59s
epoch 8  | loss: 0.16494 | train_accuracy: 0.93505 | test_accuracy: 0.93613 |  0:01:04s
epoch 9  | loss: 0.16068 | train_accuracy: 0.93636 | test_accuracy: 0.93667 |  0:01:10s
epoch 10 | loss: 0.15814 | train_accuracy: 0.93536 | test_accuracy: 0.93595 |  0:01:15s
epoch 11 | loss: 0.15788 | train



Confusion Matrix -> TP: 2576, FP: 163, FN: 175, TN: 2613
Accuracy  : 0.9388
Precision : 0.9405
Recall    : 0.9364
F1‐Score  : 0.9384


In [17]:
# Print Result
result_tab = pd.DataFrame({
    'Productivity Level': y_test,
    'TabNet Prediction': y_pred_tab
})
result_tab = result_tab.reset_index(drop=True)
result_tab

Unnamed: 0,Productivity Level,TabNet Prediction
0,1,1
1,0,0
2,1,1
3,0,0
4,1,0
...,...,...
5522,1,1
5523,0,0
5524,0,0
5525,0,0


In [19]:
# Print full result
features_np = features.values.astype(np.float32)
labels_np = labels.values.reshape(-1,)

y_pred_all = tabnet_clf.predict(features_np).reshape(-1)
result_all = pd.DataFrame({
    'Productivity Level': labels_np,
    'TabNet Prediction': y_pred_all
}).reset_index(drop=True)

result_all

Unnamed: 0,Productivity Level,TabNet Prediction
0,1,1
1,1,0
2,0,0
3,0,0
4,0,0
...,...,...
27630,0,0
27631,1,1
27632,1,1
27633,1,1


In [31]:
# Calculate the percentage of correct predictions
correct_predictions = (result_all['Productivity Level'] == result_all['TabNet Prediction']).sum()
total_predictions = result_all.shape[0]
wrong_predictions = total_predictions - correct_predictions
print("Number of correct predictions: " + str(correct_predictions))
print("Number of wrong predictions: " + str(wrong_predictions))

accuracy_percentage = (correct_predictions / total_predictions) * 100
print("Correct predictions: " + str(accuracy_percentage) + "%")

Number of correct predictions: 25843
Number of wrong predictions: 1792
Correct predictions: 93.51546951329836%
