In [4]:

# IMPORTANT: RUN THIS CELL IN ORDER TO IMPORT YOUR KAGGLE DATA SOURCES
# TO THE CORRECT LOCATION (/kaggle/input) IN YOUR NOTEBOOK,
# THEN FEEL FREE TO DELETE THIS CELL.
# NOTE: THIS NOTEBOOK ENVIRONMENT DIFFERS FROM KAGGLE'S PYTHON
# ENVIRONMENT SO THERE MAY BE MISSING LIBRARIES USED BY YOUR
# NOTEBOOK.

import os
import sys
from tempfile import NamedTemporaryFile
from urllib.request import urlopen
from urllib.parse import unquote, urlparse
from urllib.error import HTTPError
from zipfile import ZipFile
import tarfile
import shutil

# Number of bytes requested per read() call while streaming a download.
CHUNK_SIZE = 40960
# Comma-separated list of "<directory>:<percent-encoded download URL>" entries
# (a single entry here). The URL is a signed link whose query string carries
# X-Goog-Date=20240821T111951Z and X-Goog-Expires=259200 (seconds).
# NOTE(review): the link has presumably expired by now; regenerate it if the
# download fails.
DATA_SOURCE_MAPPING = 'time-wasters-on-social-media:https%3A%2F%2Fstorage.googleapis.com%2Fkaggle-data-sets%2F5356905%2F8909196%2Fbundle%2Farchive.zip%3FX-Goog-Algorithm%3DGOOG4-RSA-SHA256%26X-Goog-Credential%3Dgcp-kaggle-com%2540kaggle-161607.iam.gserviceaccount.com%252F20240821%252Fauto%252Fstorage%252Fgoog4_request%26X-Goog-Date%3D20240821T111951Z%26X-Goog-Expires%3D259200%26X-Goog-SignedHeaders%3Dhost%26X-Goog-Signature%3D433de7d8ff78c09dcf17ec686528703dfa64adc2d74c62fdcd78a74eb44976ddb9ac3dbccd7f072896edd3408fe192f50d75a1ef0565e0ae027ce5d0d62405e2d18b8bebcf275bfc01d5d472e107c9ce7723764583db05240b60e3d9d2e516d2441eb48c13971cde007e6a898c44defe5fa444ca7a11695ab1e3d3edd3e31b27a4fb94586f86fa26550e805e5feeef861ce8aeebf0072b8ceded238f92bd1ca99367cf4add4ddb628ae433a4e269295e69163ea5cda2092cf687e4a768409d8f5f28f1a3da3bcdc0c05aa732d8231422022355ec2ec6c6ca57b72d164f5267d00846d844ef9212af0455123c029a12757cff8a8fe23d6c41de8bf6537102b776'

KAGGLE_INPUT_PATH='/kaggle/input'
KAGGLE_WORKING_PATH='/kaggle/working'
KAGGLE_SYMLINK='kaggle'

!umount /kaggle/input/ 2> /dev/null
shutil.rmtree('/kaggle/input', ignore_errors=True)
os.makedirs(KAGGLE_INPUT_PATH, 0o777, exist_ok=True)
os.makedirs(KAGGLE_WORKING_PATH, 0o777, exist_ok=True)

try:
  os.symlink(KAGGLE_INPUT_PATH, os.path.join("..", 'input'), target_is_directory=True)
except FileExistsError:
  pass
try:
  os.symlink(KAGGLE_WORKING_PATH, os.path.join("..", 'working'), target_is_directory=True)
except FileExistsError:
  pass

# Download every "<directory>:<percent-encoded URL>" entry of
# DATA_SOURCE_MAPPING into a temporary file and unpack it (zip or tar) into
# KAGGLE_INPUT_PATH/<directory>. A failed download is reported and skipped.
for data_source_mapping in DATA_SOURCE_MAPPING.split(','):
    # Split on the first colon only, so a literal ':' in the URL part cannot
    # break the two-value unpacking.
    directory, download_url_encoded = data_source_mapping.split(':', 1)
    download_url = unquote(download_url_encoded)
    filename = urlparse(download_url).path
    destination_path = os.path.join(KAGGLE_INPUT_PATH, directory)
    try:
        with urlopen(download_url) as fileres, NamedTemporaryFile() as tfile:
            content_length = fileres.headers['content-length']
            print(f'Downloading {directory}, {content_length} bytes compressed')
            # The header may be absent (None); without a known total the
            # progress bar stays empty instead of raising on int(None).
            total_length = int(content_length) if content_length else 0
            dl = 0
            data = fileres.read(CHUNK_SIZE)
            while len(data) > 0:
                dl += len(data)
                tfile.write(data)
                done = min(50, int(50 * dl / total_length)) if total_length else 0
                sys.stdout.write(f"\r[{'=' * done}{' ' * (50-done)}] {dl} bytes downloaded")
                sys.stdout.flush()
                data = fileres.read(CHUNK_SIZE)
            # Push buffered bytes to disk: the tar branch re-opens the file by
            # name and would otherwise risk reading a truncated archive.
            tfile.flush()
            if filename.endswith('.zip'):
                with ZipFile(tfile) as zfile:
                    zfile.extractall(destination_path)
            else:
                # Bind the archive to its own name; reusing `tarfile` here
                # would overwrite the imported module.
                with tarfile.open(tfile.name) as tar_archive:
                    tar_archive.extractall(destination_path)
            print(f'\nDownloaded and uncompressed: {directory}')
    except HTTPError:
        print(f'Failed to load (likely expired) {download_url} to path {destination_path}')
        continue
    except OSError:
        print(f'Failed to load {download_url} to path {destination_path}')
        continue

print('Data source import complete.')


PermissionError: [Errno 13] Permission denied: '/kaggle'

Credits:

*Dataset* : [link](https://www.kaggle.com/datasets/muhammadroshaanriaz/time-wasters-on-social-media/data)

*Notebooks used as a reference* :
* [muhammadroshaanriaz](https://www.kaggle.com/code/muhammadroshaanriaz/eda-digital-time-sink-the-social-media-chronicles)
* [Devra AI](https://www.kaggle.com/code/devraai/social-media-usage-and-productivity-analysis)

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

Steps:
1. EDA
2. Preprocessing
3. Model training
4. Model evaluation

In [None]:
# Location of the dataset CSV under /kaggle/input. Despite the name, this is a
# local filesystem path rather than a URL.
data_url = "/kaggle/input/time-wasters-on-social-media/Time-Wasters on Social Media.csv"

In [None]:
# Load the CSV into the DataFrame that the EDA and modelling cells below use.
df = pd.read_csv(data_url)

# 1. EDA

In [None]:
# Preview the first rows of the dataset.
df.head()

In [None]:
# Per-column dtype and non-null count overview.
df.info()

In [None]:
# Summary statistics, transposed so that each described column becomes a row.
df.describe().T

## Distribution of age

In [None]:
# Histogram of 'Age' (40 bins) with a KDE curve overlaid.
fig, ax = plt.subplots(figsize=(12, 6))
sns.histplot(data=df, x='Age', bins=40, kde=True, ax=ax)
ax.set_title('Age Distribution')
plt.show()

## Gender distribution

In [None]:
# Number of rows per 'Gender' value.
fig, ax = plt.subplots(figsize=(12, 6))
sns.countplot(data=df, x='Gender', ax=ax)
ax.set_title('Gender Distribution')
plt.show()

## Distribution of location

In [None]:
# Number of rows per 'Location' value.
fig, ax = plt.subplots(figsize=(12, 6))
sns.countplot(data=df, x='Location', ax=ax)
ax.set_title('Location Distribution')
plt.show()

## Income distribution

In [None]:
# Histogram of 'Income' (40 bins) with a KDE curve overlaid.
fig, ax = plt.subplots(figsize=(12, 6))
sns.histplot(data=df, x='Income', bins=40, kde=True, ax=ax)
ax.set_title('Income Distribution')
plt.show()

## Distribution of debt

In [None]:
# Number of rows per 'Debt' value (the column later used as the model target).
fig, ax = plt.subplots(figsize=(12, 6))
sns.countplot(data=df, x="Debt", ax=ax)
ax.set_title('Debt count')
plt.show()

## Property ownership

In [None]:
# Number of rows per 'Owns Property' value.
fig, ax = plt.subplots(figsize=(12, 6))
sns.countplot(data=df, x='Owns Property', ax=ax)
ax.set_title('Property Ownership')
plt.show()

## Distribution of platforms

In [None]:
# Number of rows per 'Platform' value.
fig, ax = plt.subplots(figsize=(12, 6))
sns.countplot(data=df, x='Platform', ax=ax)
ax.set_title('Platform Usage Distribution')
plt.show()

## Distribution of Frequency

In [None]:
# Number of rows per 'Frequency' value.
fig, ax = plt.subplots(figsize=(12, 6))
sns.countplot(data=df, x='Frequency', ax=ax)
ax.set_title('Distribution of Frequency')
plt.show()

## Total Time Spent on Social Media

In [None]:
# Histogram of 'Total Time Spent' (40 bins) with a KDE curve overlaid.
fig, ax = plt.subplots(figsize=(12, 6))
sns.histplot(data=df, x='Total Time Spent', bins=40, kde=True, ax=ax)
ax.set_title('Total Time Spent on Social Media')
plt.show()

## Bivariate Analysis

## Relationship between Total Time Spent and Productivity Loss

In [None]:
# One point per row: 'Total Time Spent' on x against 'ProductivityLoss' on y.
fig, ax = plt.subplots(figsize=(12, 6))
sns.scatterplot(data=df, x='Total Time Spent', y='ProductivityLoss', ax=ax)
ax.set_title('Total Time Spent vs Productivity Loss')
plt.show()

## Satisfaction level based on the platform

In [None]:
# Box plot of 'Satisfaction', one box per 'Platform' value.
fig, ax = plt.subplots(figsize=(12, 6))
sns.boxplot(data=df, x='Platform', y='Satisfaction', ax=ax)
ax.set_title('Satisfaction Levels by Platform')
plt.show()

## Correlation heatmap

In [None]:
# Correlations are only computed over the numeric columns.
numeric_df = df.select_dtypes(include=[np.number])
correlations = numeric_df.corr()

# Annotated heatmap of the pairwise correlation matrix (two decimals per cell).
fig, ax = plt.subplots(figsize=(14, 10))
sns.heatmap(correlations, annot=True, cmap='coolwarm', fmt='.2f', ax=ax)
ax.set_title('Correlation Heatmap')
plt.show()

## Finding the biggest productivity loss by time of the day

In [None]:
# Mean 'ProductivityLoss' for each 'Frequency' value, as a two-column frame.
frequency_productivity_loss = (
    df.groupby('Frequency')['ProductivityLoss']
    .mean()
    .reset_index()
)

In [None]:
# Order the groups from the largest to the smallest mean productivity loss.
frequency_productivity_loss = frequency_productivity_loss.sort_values(
    'ProductivityLoss',
    ascending=False,
)

In [None]:
# Print the sorted table, then draw it as a bar chart.
print(frequency_productivity_loss)

bar_colors = ['red', 'orange', 'yellow', 'green']
fig, ax = plt.subplots(figsize=(12, 6))
ax.bar(
    frequency_productivity_loss['Frequency'],
    frequency_productivity_loss['ProductivityLoss'],
    color=bar_colors,
)
ax.set_xlabel('Frequency')
ax.set_ylabel('Average Productivity Loss')
ax.set_title('Average Productivity Loss by Time of Day')
plt.show()

# 2. Preprocessing


In [None]:
# Disabled experiment, kept for reference: three candidate feature-column sets.
# It is written as `#` comments rather than a bare triple-quoted string because
# a string expression at the end of a cell is echoed as the cell's output.
#
# columns_hand_picked = ['Age', 'Gender', 'Income', 'Owns Property', 'Profession', 'Demographics', 'Platform',
#        'Total Time Spent', 'Frequency', 'ProductivityLoss']
# columns_w_drops = df.columns.drop(['UserID', 'Debt', 'Satisfaction', 'Self Control', 'Addiction Level'])
# all_columns = df.columns.drop(['UserID', "Debt"])
#
# columns_types = [columns_hand_picked, columns_w_drops, all_columns]
#
# y_columns = ["Debt"]

In [None]:
# Name of the target column.
y_columns = ["Debt"]

# Features: the int64 columns only, minus columns that must not be predictors.
# 'UserID' is presumably a row identifier (TODO confirm), and the target would
# leak into the features if it is integer-coded. The earlier draft feature
# lists in this notebook exclude 'UserID' and 'Debt' as well.
# errors='ignore' keeps this a no-op for any listed column that is not int64.
X_int = (
    df.select_dtypes(include=['int64'])
    .drop(columns=['UserID', *y_columns], errors='ignore')
)

In [None]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
# Disabled experiment, kept for reference: one-hot encoding of each candidate
# column set. It is written as `#` comments rather than a bare triple-quoted
# string because a string expression at the end of a cell is echoed as the
# cell's output.
#
# ct = ColumnTransformer(transformers=[("encoder", OneHotEncoder(), [0])] , remainder="passthrough")
#
# X = [];
#
# for columns_type in columns_types:
#     X_tmp = df[columns_type];
#     X_tmp = ct.fit_transform(pd.get_dummies(X_tmp))
#     X.append(np.array(X_tmp))
#
# y = df[y_columns]

# 3. Model training

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

In [None]:
X = X_int
# scikit-learn classifiers expect a 1-D target. df[y_columns] is a one-column
# DataFrame, which triggers a DataConversionWarning in fit(); squeeze it down
# to a Series. With more than one target column the frame is left unchanged.
y = df[y_columns].squeeze(axis=1)

# Hold out 30% of the rows for evaluation; the fixed seed makes the split
# reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# A high iteration cap, presumably so the solver can converge on unscaled
# features.
model = LogisticRegression(max_iter=10000)
model.fit(X_train, y_train)

In [None]:
# Disabled experiment, kept for reference: train and score one model per
# candidate feature matrix. It is written as `#` comments rather than a bare
# triple-quoted string because a string expression at the end of a cell is
# echoed as the cell's output.
#
# for X_type in X:
#
#     X_train, X_test, y_train, y_test = train_test_split(X_type, y, test_size=0.3, random_state=42)
#
#     model = LogisticRegression(max_iter=10000)
#     model.fit(X_train, y_train)
#
#     y_pred = model.predict(X_test)
#
#     accuracy = accuracy_score(y_test, y_pred)
#
#     accuracy

# 4. Model evaluation

In [None]:
# Score the fitted model on the held-out split. The bare name on the last line
# makes the notebook display the accuracy value.
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
accuracy