# In [2]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np

# --- Preprocessing ---
# Load the raw dataset; the path is resolved against the current working directory.
df = pd.read_csv("social_media_vs_productivity.csv")

# Convert the boolean flag columns to integers in a single cast.
df = df.astype({
    'uses_focus_apps': int,
    'has_digital_wellbeing_enabled': int,
})

# Columns screened for outliers with the IQR method.
# Add any new numerical features here.
features_to_check = [
    'daily_social_media_time',
    'sleep_hours',
    'stress_level',
    'number_of_notifications',
    'age',
]

def remove_outliers_iqr(data: pd.DataFrame, features, *, multiplier=1.5,
                        keep_missing=False) -> pd.DataFrame:
    """Return a copy of *data* without rows outside the IQR fences.

    For each column in *features*, rows are kept only when the value lies in
    ``[Q1 - multiplier * IQR, Q3 + multiplier * IQR]`` (bounds inclusive).
    Columns are processed in order, so the quartiles of a later column are
    computed on the rows that survived the earlier columns.

    Args:
        data: Input frame; it is not modified.
        features: Iterable of column names to screen.
        multiplier: Width of the fences in units of IQR (1.5 by default).
        keep_missing: When False (the default), rows whose value in a
            screened column is NaN are dropped, because NaN never satisfies
            the range comparison. Pass True to keep those rows, e.g. so that
            a later interpolation step can fill them.

    Returns:
        A new DataFrame holding the surviving rows with their original index.

    Raises:
        KeyError: If a name in *features* is not a column of *data*.
    """
    cleaned = data.copy()
    for column in features:
        values = cleaned[column]
        q1 = values.quantile(0.25)
        q3 = values.quantile(0.75)
        reach = multiplier * (q3 - q1)
        # `between` is inclusive on both ends and evaluates to False for NaN.
        within_fences = values.between(q1 - reach, q3 + reach)
        if keep_missing:
            within_fences = within_fences | values.isna()
        cleaned = cleaned[within_fences]
    return cleaned

# Drop IQR outliers; take an explicit copy so the column assignment below
# operates on an independent frame.
df_clean = remove_outliers_iqr(df, features_to_check).copy()

# Interpolate numeric columns only: calling DataFrame.interpolate() on a frame
# that still contains object-dtype (categorical text) columns is deprecated in
# recent pandas releases, and those columns cannot be interpolated anyway.
numeric_columns = df_clean.select_dtypes(include="number").columns
df_clean[numeric_columns] = df_clean[numeric_columns].interpolate()

# Remove rows that still have missing values after interpolation.
df_clean = df_clean.dropna()

# One-hot encode categorical variables; add further categorical columns here.
# Note: the encoded source columns ('job_type', 'gender') are replaced by
# their indicator columns in the result.
df_clean = pd.get_dummies(df_clean, columns=['job_type', 'gender'], drop_first=False)

data = df_clean.copy()

# Histograms
# 2x2 grid with one distribution per panel, drawn from the cleaned `data` frame.
fig, axes = plt.subplots(nrows=2, ncols=2, figsize=(10,8))
plt.subplots_adjust(left=0.1, right=1, top=0.9, bottom=0.1, wspace=0.5, hspace=0.5)

# Each entry binds a panel to the column it shows AND its labels, so the title
# and x-label always describe the data actually plotted on that panel.
# (Previously the sleep and productivity labels were attached to each other's
# panels.)
histogram_panels = [
    (axes[0,0], "daily_social_media_time", "goldenrod",
     "Frequency of Participants' Media Time", "Daily Media Time of Participants"),
    (axes[0,1], "actual_productivity_score", "darkorchid",
     "Frequency of Participants' Productivity Scores", "Participants' Productivity Scores"),
    (axes[1,0], "sleep_hours", "lightskyblue",
     "Frequency of Participants' Hours of Sleep", "Nightly Sleep Hours"),
    (axes[1,1], "age", "indianred",
     "Frequency of Participants' Ages", "Ages of Participants"),
]

for panel, column, color, title, xlabel in histogram_panels:
    sns.histplot(data[column], bins=30, kde=True, color=color, ax=panel)
    panel.set_title(title)
    panel.set_xlabel(xlabel)
    panel.set_ylabel("Frequency")

# Figure-level labels, then save to disk before showing.
fig.supylabel("Frequencies")
fig.supxlabel("Independent Variables")
fig.suptitle("Frequencies of Participant Features")
plt.savefig('Histogram[Seaborn].png')
plt.show()

# Bar Chart
# 2x2 grid: mean media time and stress level, grouped by social platform
# (top row) and by job type (bottom row).
fig, ax = plt.subplots(nrows=2, ncols=2, figsize=(10,8))
plt.subplots_adjust(left=0.1, right=1, top=0.9, bottom=0.1, wspace=0.4, hspace=0.4)

# `data` has been one-hot encoded, which replaces the original 'job_type'
# column with indicator columns, so plotting x="job_type" on `data` fails.
# Re-attach the raw labels from `df` for the rows that survived cleaning
# (the row index is preserved through filtering, dropna and get_dummies).
if "job_type" in data.columns:
    bar_data = data
else:
    bar_data = data.assign(job_type=df.loc[data.index, "job_type"])

sns.barplot(x="social_platform_preference", y="daily_social_media_time", data=bar_data, palette='rocket', ax=ax[0,0])
ax[0,0].set_title("Daily Media Time")
ax[0,0].set_xlabel("Popular Social Media Platforms")
ax[0,0].set_ylabel("Media Time Based on Social Platform")

sns.barplot(x="social_platform_preference", y="stress_level", data=bar_data, palette='flare', ax=ax[0,1])
ax[0,1].set_title("Stress Levels")
ax[0,1].set_xlabel("Popular Social Media Platforms")
ax[0,1].set_ylabel("Stress Level Based on Social Platform")

sns.barplot(x="job_type", y="daily_social_media_time", data=bar_data, palette='mako', ax=ax[1,0])
ax[1,0].set_title("Daily Media Time")
ax[1,0].set_xlabel("Participants' Job Titles")
ax[1,0].set_ylabel("Media Time based on Occupations")

sns.barplot(x="job_type", y="stress_level", data=bar_data, palette='crest', ax=ax[1,1])
ax[1,1].set_title("Stress Level")
ax[1,1].set_xlabel("Participants' Job Titles")
ax[1,1].set_ylabel("Stress Level Based on Occupations")

# Figure-level labels, then save to disk before showing.
fig.supylabel("Media Time & Stress Levels")
fig.supxlabel("Categorical Values")
fig.suptitle("Correlation to Media Time & Stress to Occupations and Social Platforms")

plt.savefig('BarChart[Seaborn].png')
plt.show()


# Box Plot
# 2x2 grid of productivity-score distributions, one grouping variable per panel.
# NOTE: these panels read from the raw `df`, not from the cleaned `data` frame.
fig, axes = plt.subplots(nrows=2, ncols=2, figsize=(10,8))
plt.subplots_adjust(left=0.1, right=1, top=0.9, bottom=0.1, wspace=0.5, hspace=0.5)

# (panel, grouping column, palette, title, x-axis label)
box_panels = [
    (axes[0,0], "job_type", "BuGn",
     "Productivity Based on Occupations", "Participants' Occupations"),
    (axes[0,1], "stress_level", "PuBu",
     "Productivity Based on Stress Levels", "Participants' Stress Levels"),
    (axes[1,0], "gender", "YlOrBr",
     "Productivity Based on Gender", "Participants' Genders"),
    (axes[1,1], "social_platform_preference", "Reds",
     "Productivity Based on Social Platforms", "Participants' Platform Preferences"),
]

for panel, grouping, palette_name, title, xlabel in box_panels:
    sns.boxplot(data=df, x=grouping, y="actual_productivity_score", palette=palette_name, ax=panel)
    panel.set_title(title)
    panel.set_xlabel(xlabel)
    panel.set_ylabel("Actual Productivity Score")

# Figure-level labels, then save to disk before showing.
fig.supylabel("Productivity Scores")
fig.supxlabel("Independent Variables")
fig.suptitle("Productivity Scores of Participant Features")
plt.savefig('BoxPlot[Seaborn].png')
plt.show()

# Cell output: FileNotFoundError: [Errno 2] No such file or directory: 'social_media_vs_productivity.csv'