Credits:

*Dataset* : [link](https://www.kaggle.com/datasets/muhammadroshaanriaz/time-wasters-on-social-media/data)

*Notebooks used as a reference* :
* [muhammadroshaanriaz](https://www.kaggle.com/code/muhammadroshaanriaz/eda-digital-time-sink-the-social-media-chronicles)
* [Devra AI](https://www.kaggle.com/code/devraai/social-media-usage-and-productivity-analysis)

In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

Steps:
1. EDA
2. Preprocessing
3. Model training
4. Model evaluation

In [None]:
# Filesystem path of the dataset CSV inside the Kaggle input directory.
data_url = (
    "/kaggle/input/time-wasters-on-social-media/"
    "Time-Wasters on Social Media.csv"
)

In [None]:
# Load the whole CSV into a DataFrame; every later cell reads from `df`.
df = pd.read_csv(filepath_or_buffer=data_url)

# 1. EDA

In [None]:
# Preview the first five rows.
df.head(n=5)

In [None]:
# Print a summary of df: column names, dtypes and non-null counts.
df.info()

In [None]:
# Summary statistics, transposed so each described column becomes a row.
df.describe().transpose()

## Distribution of age

In [None]:
# Histogram of 'Age' (40 bins) with a kernel-density overlay.
fig, ax = plt.subplots(figsize=(12, 6))
sns.histplot(data=df, x='Age', bins=40, kde=True, ax=ax)
ax.set_title('Age Distribution')
plt.show()

## Gender distribution

In [None]:
# Number of rows per 'Gender' value.
fig, ax = plt.subplots(figsize=(12, 6))
sns.countplot(x='Gender', data=df, ax=ax)
ax.set_title('Gender Distribution')
plt.show()

## Distribution of location

In [None]:
# Number of rows per 'Location' value.
fig, ax = plt.subplots(figsize=(12, 6))
sns.countplot(x='Location', data=df, ax=ax)
ax.set_title('Location Distribution')
plt.show()

## Income distribution

In [None]:
# Histogram of 'Income' (40 bins) with a kernel-density overlay.
fig, ax = plt.subplots(figsize=(12, 6))
sns.histplot(data=df, x='Income', bins=40, kde=True, ax=ax)
ax.set_title('Income Distribution')
plt.show()

## Distribution of debt

In [None]:
# Number of rows per 'Debt' value (the column later used as the model target).
fig, ax = plt.subplots(figsize=(12, 6))
sns.countplot(x="Debt", data=df, ax=ax)
ax.set_title('Debt count')
plt.show()

## Property ownership

In [None]:
# Number of rows per 'Owns Property' value.
fig, ax = plt.subplots(figsize=(12, 6))
sns.countplot(x='Owns Property', data=df, ax=ax)
ax.set_title('Property Ownership')
plt.show()

## Distribution of platforms

In [None]:
# Number of rows per 'Platform' value.
fig, ax = plt.subplots(figsize=(12, 6))
sns.countplot(x='Platform', data=df, ax=ax)
ax.set_title('Platform Usage Distribution')
plt.show()

## Distribution of Frequency

In [None]:
# Number of rows per 'Frequency' value.
fig, ax = plt.subplots(figsize=(12, 6))
sns.countplot(x='Frequency', data=df, ax=ax)
ax.set_title('Distribution of Frequency')
plt.show()

## Total Time Spent on Social Media

In [None]:
# Histogram of 'Total Time Spent' (40 bins) with a kernel-density overlay.
fig, ax = plt.subplots(figsize=(12, 6))
sns.histplot(data=df, x='Total Time Spent', bins=40, kde=True, ax=ax)
ax.set_title('Total Time Spent on Social Media')
plt.show()

## Bivariate Analysis

## Relationship between Total Time Spent and Productivity Loss

In [None]:
# One point per row: 'Total Time Spent' on x against 'ProductivityLoss' on y.
fig, ax = plt.subplots(figsize=(12, 6))
sns.scatterplot(x='Total Time Spent', y='ProductivityLoss', data=df, ax=ax)
ax.set_title('Total Time Spent vs Productivity Loss')
plt.show()

## Satisfaction level based on the platform

In [None]:
# Box plot of 'Satisfaction', one box per 'Platform' value.
fig, ax = plt.subplots(figsize=(12, 6))
sns.boxplot(x='Platform', y='Satisfaction', data=df, ax=ax)
ax.set_title('Satisfaction Levels by Platform')
plt.show()

## Correlation heatmap

In [None]:
# Restrict to the numeric columns, then compute their pairwise correlations.
numeric_df = df.select_dtypes(include=[np.number])
correlations = numeric_df.corr()

# Annotated heatmap of the correlation matrix, values shown to two decimals.
fig, ax = plt.subplots(figsize=(14, 10))
sns.heatmap(correlations, annot=True, cmap='coolwarm', fmt='.2f', ax=ax)
ax.set_title('Correlation Heatmap')
plt.show()

## Finding the biggest productivity loss by time of day (the `Frequency` column)

In [None]:
# Mean 'ProductivityLoss' per 'Frequency' value, returned as a two-column
# DataFrame (the group key is moved out of the index by reset_index).
mean_loss_by_frequency = df.groupby('Frequency')['ProductivityLoss'].mean()
frequency_productivity_loss = mean_loss_by_frequency.reset_index()

In [None]:
# Rank the groups so the largest average productivity loss comes first.
frequency_productivity_loss = frequency_productivity_loss.sort_values(
    'ProductivityLoss',
    ascending=False,
)

In [None]:
# Show the ranked table as text first.
print(frequency_productivity_loss)

# Then chart it: one bar per 'Frequency' value, in the sorted order.
fig, ax = plt.subplots(figsize=(12, 6))
ax.bar(
    frequency_productivity_loss['Frequency'],
    frequency_productivity_loss['ProductivityLoss'],
    color=['red', 'orange', 'yellow', 'green'],
)
ax.set_xlabel('Frequency')
ax.set_ylabel('Average Productivity Loss')
ax.set_title('Average Productivity Loss by Time of Day')
plt.show()

# 2. Preprocessing


In [None]:
# Disabled experiment, kept as an inert string literal: nothing inside the
# quotes is executed. It sketches three candidate feature-column selections
# (a hand-picked list, all columns minus some dropped ones, and all columns
# minus 'UserID' and 'Debt') plus the target column list.
'''
columns_hand_picked = ['Age', 'Gender', 'Income', 'Owns Property', 'Profession', 'Demographics', 'Platform',
       'Total Time Spent', 'Frequency', 'ProductivityLoss']
columns_w_drops = df.columns.drop(['UserID', 'Debt', 'Satisfaction', 'Self Control', 'Addiction Level'])
all_columns = df.columns.drop(['UserID', "Debt"])

columns_types = [columns_hand_picked, columns_w_drops, all_columns]

y_columns = ["Debt"]
'''

In [None]:
# Target column(s) for the classifier.
y_columns = ["Debt"]

# Features: the int64-typed columns only. The row identifier and the target
# are dropped explicitly so that an ID never becomes a predictor and the label
# cannot leak into the features if it happens to be stored as an integer.
# errors='ignore' makes the drop a no-op for any of these columns that is not
# int64 and therefore was never selected in the first place.
X_int = (
    df.select_dtypes(include=['int64'])
    .drop(columns=['UserID', *y_columns], errors='ignore')
)

In [None]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
# Disabled experiment, kept as an inert string literal: nothing inside the
# quotes is executed. It would build one feature matrix per entry of
# `columns_types` by passing pd.get_dummies output through a ColumnTransformer
# that one-hot encodes column 0, and take the target from `y_columns`.
# NOTE(review): `columns_types` is itself only defined in disabled code, so
# this would need that definition restored before it could run.
'''
ct = ColumnTransformer(transformers=[("encoder", OneHotEncoder(), [0])] , remainder="passthrough")

X = [];

for columns_type in columns_types:
    X_tmp = df[columns_type];
    X_tmp = ct.fit_transform(pd.get_dummies(X_tmp))
    X.append(np.array(X_tmp))

y = df[y_columns]
'''

# 3. Model training

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

In [None]:
X = X_int
# scikit-learn classifiers expect a 1-D target of shape (n_samples,).
# df[y_columns] is a single-column DataFrame (2-D), which makes fit() emit a
# DataConversionWarning, so flatten it to a 1-D array first. This assumes
# y_columns names exactly one column, as it does in this notebook.
y = df[y_columns].to_numpy().ravel()

# Hold out 30% of the rows for evaluation; the fixed seed keeps the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# max_iter is raised far above the library default; presumably this is to let
# the solver converge on the unscaled features (TODO confirm).
model = LogisticRegression(max_iter=10000)
model.fit(X_train, y_train)

In [None]:
# Disabled experiment, kept as an inert string literal: nothing inside the
# quotes is executed. It would repeat the split / fit / predict / accuracy
# sequence once per feature matrix in a list `X`.
# NOTE(review): the active code binds `X` to a single DataFrame, not a list
# of matrices, so this loop would not work as written if re-enabled.
'''
for X_type in X:

    X_train, X_test, y_train, y_test = train_test_split(X_type, y, test_size=0.3, random_state=42)

    model = LogisticRegression(max_iter=10000)
    model.fit(X_train, y_train)

    y_pred = model.predict(X_test)

    accuracy = accuracy_score(y_test, y_pred)

    accuracy
'''

# 4. Model evaluation

In [None]:
# Score the held-out split: predict labels for X_test, then compute the
# fraction that match y_test.
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)

# Bare expression so the notebook displays the value as the cell output.
accuracy