# Social Media Wellbeing Analysis




Project Goal: To understand the impact social media has on wellbeing 

Group 2 Authors: Anisa, Alona, Arpha, Kanyin, Lola, Mitesh, Volodymyr

Dataset: Socialmedia_dataset.csv (original) 

## Table of Contents

## 1. Setup & Imports

In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns 
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from scipy import stats

#### First step of EDA
1. Load data
2. Understand the data, data type and for any missing/duplicate values
3. Distribution and basic statistics (mean, median, standard dev.)
4. Save data for further analysis

   

In [4]:
df = pd.read_csv("../data/socialmedia_dataset_orginal.csv")
df

Unnamed: 0,User_ID,Age,Gender,Daily_Screen_Time(hrs),Sleep_Quality(1-10),Stress_Level(1-10),Days_Without_Social_Media,Exercise_Frequency(week),Social_Media_Platform,Happiness_Index(1-10)
0,U001,44,Male,3.1,7.0,6.0,2.0,5.0,Facebook,10.0
1,U002,30,Other,5.1,7.0,8.0,5.0,3.0,LinkedIn,10.0
2,U003,23,Other,7.4,6.0,7.0,1.0,3.0,YouTube,6.0
3,U004,36,Female,5.7,7.0,8.0,1.0,1.0,TikTok,8.0
4,U005,34,Female,7.0,4.0,7.0,5.0,1.0,X (Twitter),8.0
...,...,...,...,...,...,...,...,...,...,...
495,U496,23,Male,6.9,5.0,7.0,4.0,2.0,X (Twitter),10.0
496,U497,43,Female,5.6,7.0,6.0,5.0,2.0,Facebook,9.0
497,U498,41,Male,7.7,5.0,7.0,2.0,2.0,LinkedIn,8.0
498,U499,23,Male,4.2,9.0,7.0,0.0,2.0,Facebook,9.0


In [5]:
#checking first few rows to ensure data set has loaded correctly 
df.head()

Unnamed: 0,User_ID,Age,Gender,Daily_Screen_Time(hrs),Sleep_Quality(1-10),Stress_Level(1-10),Days_Without_Social_Media,Exercise_Frequency(week),Social_Media_Platform,Happiness_Index(1-10)
0,U001,44,Male,3.1,7.0,6.0,2.0,5.0,Facebook,10.0
1,U002,30,Other,5.1,7.0,8.0,5.0,3.0,LinkedIn,10.0
2,U003,23,Other,7.4,6.0,7.0,1.0,3.0,YouTube,6.0
3,U004,36,Female,5.7,7.0,8.0,1.0,1.0,TikTok,8.0
4,U005,34,Female,7.0,4.0,7.0,5.0,1.0,X (Twitter),8.0


In [6]:
#checking the data set as a whole and where we have any missing values. In this case, we do not have missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 500 entries, 0 to 499
Data columns (total 10 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   User_ID                    500 non-null    object 
 1   Age                        500 non-null    int64  
 2   Gender                     500 non-null    object 
 3   Daily_Screen_Time(hrs)     500 non-null    float64
 4   Sleep_Quality(1-10)        500 non-null    float64
 5   Stress_Level(1-10)         500 non-null    float64
 6   Days_Without_Social_Media  500 non-null    float64
 7   Exercise_Frequency(week)   500 non-null    float64
 8   Social_Media_Platform      500 non-null    object 
 9   Happiness_Index(1-10)      500 non-null    float64
dtypes: float64(6), int64(1), object(3)
memory usage: 39.2+ KB


In [7]:
# Checking for 'is not a number' value, no boolean returned to confirm no missing values.  
df.isna().sum()

User_ID                      0
Age                          0
Gender                       0
Daily_Screen_Time(hrs)       0
Sleep_Quality(1-10)          0
Stress_Level(1-10)           0
Days_Without_Social_Media    0
Exercise_Frequency(week)     0
Social_Media_Platform        0
Happiness_Index(1-10)        0
dtype: int64

In [8]:
df.duplicated()

0      False
1      False
2      False
3      False
4      False
       ...  
495    False
496    False
497    False
498    False
499    False
Length: 500, dtype: bool

In [9]:
#basic overview of distribution to check for standard deviation, mean, min and max values

df.describe()

Unnamed: 0,Age,Daily_Screen_Time(hrs),Sleep_Quality(1-10),Stress_Level(1-10),Days_Without_Social_Media,Exercise_Frequency(week),Happiness_Index(1-10)
count,500.0,500.0,500.0,500.0,500.0,500.0,500.0
mean,32.988,5.53,6.304,6.618,3.134,2.448,8.376
std,9.960637,1.734877,1.529792,1.542996,1.858751,1.428067,1.524228
min,16.0,1.0,2.0,2.0,0.0,0.0,4.0
25%,24.0,4.3,5.0,6.0,2.0,1.0,7.0
50%,34.0,5.6,6.0,7.0,3.0,2.0,9.0
75%,41.0,6.7,7.0,8.0,5.0,3.0,10.0
max,49.0,10.8,10.0,10.0,9.0,7.0,10.0


In [10]:
#checking value counts for this chosen variable, just to provide some insight prior to further analysis
df["Happiness_Index(1-10)"].value_counts()


Happiness_Index(1-10)
10.0    162
8.0     106
9.0      94
7.0      76
6.0      39
5.0      16
4.0       7
Name: count, dtype: int64

In [11]:
#saving data after initial exploration and confirming no missing values or duplication 
df1= df.to_csv('socialmedia_clean.csv',index=False)