# Correlation between Social Media and Mental Health

-  Datasource: https://www.kaggle.com/code/souvikahmed071/correlation-between-sm-and-mental-health/notebook
-  Description of the dataset: 
    - Columns: Age, Gender, Relationship Status, Occupation Status, Affiliated Organizations, Social Media platforms used, time spent on social media (in hours), and 12 Likert-scale questions measuring either the frequency or the intensity of various aspects of mental health (1 = low frequency/intensity, 5 = high frequency/intensity).
    - Number of observations: 481

### Import libraries, load and check the dataset

In [5]:
# import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

## for building linear model
from sklearn.linear_model import LinearRegression

In [6]:
# Read the raw survey export into `dat` and print its (rows, columns) shape.
# Author's note from the original run: 27 cols & 481 obs.
dat = pd.read_csv(filepath_or_buffer="Resource/corr_socialMedia_mentalHealth.csv")
print(dat.shape)

FileNotFoundError: [Errno 2] No such file or directory: 'Resource/corr_socialMedia_mentalHealth.csv'

In [8]:
# Lift the column display limit so wide frames are shown in full,
# then preview the first two rows.
pd.options.display.max_columns = None
dat.head(n=2)

NameError: name 'dat' is not defined

### Preprocessing & cleaning

In [9]:
# Per-column flag: does the column contain any missing value?
# (Author's note: only the affiliated-organizations column was True.)
dat.isnull().any()

# Per-column count of missing values; as the last expression, this is what the cell displays.
# (Author's note: 30 nulls in affiliated organizations, so those rows are dropped next.)
dat.isna().sum()

NameError: name 'dat' is not defined

In [10]:
# Remove every row that has at least one missing value.
# This modifies `dat` in place; no separate dataframe is created.
dat.dropna(axis=0, how='any', inplace=True)

NameError: name 'dat' is not defined

In [11]:
# Re-check for missing values after the drop (only the last expression below is displayed).
dat.isnull().any()

# Shape after the drop. Author's note: 30 rows removed, leaving 451 observations.
dat.shape

NameError: name 'dat' is not defined

In [12]:
# Map each verbose survey-question header to a short column name.
# Questions 9-20 are tagged with the mental-health condition they are scored under.
rename_map = {
    'Timestamp': 'timestamp',
    '1. What is your age?': 'age',
    '2. Gender': 'sex',
    '3. Relationship Status': 'relation_status',
    '4. Occupation Status': 'occupation',
    '5. What type of organizations are you affiliated with?': 'affiliations',
    '6. Do you use social media?': 'sm_user',
    '7. What social media platforms do you commonly use?': 'platforms',
    '8. What is the average time you spend on social media every day?': 'time_spent',
    '9. How often do you find yourself using Social media without a specific purpose?': 'ADHD1',
    '10. How often do you get distracted by Social media when you are busy doing something?': 'ADHD2',
    "11. Do you feel restless if you haven't used Social media in a while?": 'Anxiety1',
    '12. On a scale of 1 to 5, how easily distracted are you?': 'ADHD3',
    '13. On a scale of 1 to 5, how much are you bothered by worries?': 'Anxiety2',
    '14. Do you find it difficult to concentrate on things?': 'ADHD4',
    '15. On a scale of 1-5, how often do you compare yourself to other successful people through the use of social media?': 'Self_Esteem1',
    '16. Following the previous question, how do you feel about these comparisons, generally speaking?': 'Self_Esteem2',
    '17. How often do you look to seek validation from features of social media?': 'Self_Esteem3',
    '18. How often do you feel depressed or down?': 'Depression1',
    '19. On a scale of 1 to 5, how frequently does your interest in daily activities fluctuate?': 'Depression2',
    '20. On a scale of 1 to 5, how often do you face issues regarding sleep?': 'Depression3',
}

# Apply the renaming in place, then list the resulting column names.
dat.rename(rename_map, axis='columns', inplace=True)
cols = list(dat.columns)
cols

NameError: name 'dat' is not defined

In [13]:
# Give every observation a sequential ID starting at 1, so rows can be matched
# across the four per-condition datasets created later.
n_obs = len(dat)
dat['ID'] = range(1, n_obs + 1)
dat.head(n=2)

NameError: name 'dat' is not defined

In [14]:
# Column order: ID and respondent info first, then the survey items
# grouped by condition (ADHD, Anxiety, Self-Esteem, Depression).
new_cols = [
    'ID', 'timestamp', 'age', 'sex', 'relation_status', 'occupation',
    'affiliations', 'sm_user', 'platforms', 'time_spent',
    'ADHD1', 'ADHD2', 'ADHD3', 'ADHD4',
    'Anxiety1', 'Anxiety2',
    'Self_Esteem1', 'Self_Esteem2', 'Self_Esteem3',
    'Depression1', 'Depression2', 'Depression3',
]

# Keep only these columns, in this order, and preview the result.
dat = dat[new_cols]
dat.head(n=2)

NameError: name 'dat' is not defined

In [15]:
# Show the column dtypes before conversion.
dat.info()

# Column-wise conversions:
#   timestamp -> datetime objects (it is stored as strings in the dataset)
#   age       -> integer
for column, convert in (
    ('timestamp', pd.to_datetime),
    ('age', lambda values: values.astype(int)),
):
    dat[column] = convert(dat[column])

# Show the dtypes again to confirm the conversions.
dat.info()

NameError: name 'dat' is not defined

In [16]:
# Smallest and largest age in the data, displayed as a (min, max) pair.
age_values = dat['age']
ageMin, ageMax = age_values.min(), age_values.max()
(ageMin, ageMax)

NameError: name 'dat' is not defined

In [17]:
# Inspect the raw gender labels. Author's note on the values seen:
# ['Male', 'Female', 'Nonbinary', 'Non-binary', 'unsure', 'Non binary', 'There are others???']
dat['sex'].unique()

# Drop observations whose answer was 'There are others???'.
dat.drop(dat.loc[dat['sex'] == 'There are others???'].index, inplace=True)

# Spellings that are collapsed into the single category 'Others'.
values_to_replace = ['Nonbinary', 'Non-binary', 'unsure', 'Non binary']

# Assign the result back instead of calling replace(inplace=True) on dat['sex']:
# the in-place call acts on a temporary Series, which warns on pandas 2.x and
# leaves `dat` unchanged when Copy-on-Write is enabled.
dat['sex'] = dat['sex'].replace(values_to_replace, 'Others')

NameError: name 'dat' is not defined

In [18]:
# Inspect the distinct relationship-status labels.
dat['relation_status'].unique()

# Shorten "In a relationship" to "Relationship" so it matches the style of the other labels.
# Assign the result back instead of calling replace(inplace=True) on the selected
# column: the in-place call acts on a temporary Series, which warns on pandas 2.x
# and leaves `dat` unchanged when Copy-on-Write is enabled.
dat['relation_status'] = dat['relation_status'].replace('In a relationship', 'Relationship')

NameError: name 'dat' is not defined

In [19]:
# Relabel the time_spent answers with compact range labels.
dat['time_spent'].unique()

# Original survey wording -> compact label.
category_mapping = {
    'Less than an Hour': '0-1 hour',
    'Between 1 and 2 hours': '1-2 hours',
    'Between 2 and 3 hours': '2-3 hours',
    'Between 3 and 4 hours': '3-4 hours',
    'Between 4 and 5 hours': '4-5 hours',
    'More than 5 hours': '5+ hours'
}

# Replace old categories with new categories.
# Assign the result back instead of calling replace(inplace=True) on the selected
# column: the in-place call acts on a temporary Series, which warns on pandas 2.x
# and leaves `dat` unchanged when Copy-on-Write is enabled.
dat['time_spent'] = dat['time_spent'].replace(category_mapping)

# Confirm that only the new labels remain.
dat['time_spent'].unique()

NameError: name 'dat' is not defined

In [20]:
# Preview the first three rows of the cleaned data.
dat.head(n=3)

NameError: name 'dat' is not defined

### Create 4 new csv files (ADHD, Anxiety,Self_Esteem, Depression) -- WARNING: this section should NOT be run again since new csv files were already created

In [96]:
# ADHD subset: ID and respondent-info columns followed by the four ADHD items.
adhd_cols_to_keep = [
    'ID', 'timestamp', 'age', 'sex', 'relation_status', 'occupation',
    'affiliations', 'sm_user', 'platforms', 'time_spent',
    'ADHD1', 'ADHD2', 'ADHD3', 'ADHD4',
]

# Select those columns from `dat` and write them to adhd.csv without the index.
adhd = dat.loc[:, adhd_cols_to_keep]
adhd.to_csv('adhd.csv', index=False)

In [97]:
# Anxiety subset: ID and respondent-info columns followed by the two Anxiety items.
anxiety_cols_to_keep = [
    'ID', 'timestamp', 'age', 'sex', 'relation_status', 'occupation',
    'affiliations', 'sm_user', 'platforms', 'time_spent',
    'Anxiety1', 'Anxiety2',
]

# Select those columns from `dat` and write them to anxiety.csv without the index.
anxiety = dat.loc[:, anxiety_cols_to_keep]
anxiety.to_csv('anxiety.csv', index=False)

In [98]:
# Self-esteem subset: ID and respondent-info columns followed by the three Self_Esteem items.
selfEsteem_cols_to_keep = [
    'ID', 'timestamp', 'age', 'sex', 'relation_status', 'occupation',
    'affiliations', 'sm_user', 'platforms', 'time_spent',
    'Self_Esteem1', 'Self_Esteem2', 'Self_Esteem3',
]

# Select those columns from `dat` and write them to self_esteem.csv without the index.
self_esteem = dat.loc[:, selfEsteem_cols_to_keep]
self_esteem.to_csv('self_esteem.csv', index=False)

In [99]:
# Depression subset: ID and respondent-info columns followed by the three Depression items.
depression_cols_to_keep = [
    'ID', 'timestamp', 'age', 'sex', 'relation_status', 'occupation',
    'affiliations', 'sm_user', 'platforms', 'time_spent',
    'Depression1', 'Depression2', 'Depression3',
]

# Select those columns from `dat` and write them to depression.csv without the index.
depression = dat.loc[:, depression_cols_to_keep]
depression.to_csv('depression.csv', index=False)

### Explorative Analysis

In [18]:
# Work on a separate copy, `dat1`, for the exploratory analysis (per-condition
# score columns are added to it below). `dat` was used to create the four csv
# files, so copying keeps the original data unchanged.
dat1 = dat.copy(deep=True)
dat1.head(n=2)

Unnamed: 0,ID,timestamp,age,sex,relation_status,occupation,affiliations,sm_user,platforms,time_spent,ADHD1,ADHD2,ADHD3,ADHD4,Anxiety1,Anxiety2,Self_Esteem1,Self_Esteem2,Self_Esteem3,Depression1,Depression2,Depression3
0,1,2022-04-18 19:18:00,21,Male,Relationship,University Student,University,Yes,"Facebook, Twitter, Instagram, YouTube, Discord...",2-3 hours,5,3,5,5,2,2,2,3,2,5,4,5
1,2,2022-04-18 19:19:00,21,Female,Single,University Student,University,Yes,"Facebook, Twitter, Instagram, YouTube, Discord...",5+ hours,4,3,4,4,2,5,5,1,1,5,4,5


In [19]:
# Self_Esteem question 2 has a scale-rating issue (see details here,
# https://www.kaggle.com/code/souvikahmed071/correlation-between-sm-and-mental-health/notebook),
# so remove the 'Self_Esteem2' column from dat1 before summing the scores for the four conditions.
dat1.drop(columns='Self_Esteem2', inplace=True)

In [20]:
# Survey items that make up each condition's summed score.
groups = {
    'ADHD': ['ADHD1', 'ADHD2', 'ADHD3', 'ADHD4'],
    'Anxiety': ['Anxiety1', 'Anxiety2'],
    'SelfEsteem': ['Self_Esteem1', 'Self_Esteem3'],
    'Depression': ['Depression1', 'Depression2', 'Depression3'],
}

# Add one '<condition>_score' column per condition: the row-wise sum of its items.
score_cols = []
for condition in groups:
    score_col = f'{condition}_score'
    dat1[score_col] = dat1[groups[condition]].sum(axis='columns')
    score_cols.append(score_col)

# total_score is the row-wise sum of the four condition scores.
dat1['total_score'] = dat1[score_cols].sum(axis='columns')

# Preview the frame with the new score columns.
dat1.head(n=2)

Unnamed: 0,ID,timestamp,age,sex,relation_status,occupation,affiliations,sm_user,platforms,time_spent,ADHD1,ADHD2,ADHD3,ADHD4,Anxiety1,Anxiety2,Self_Esteem1,Self_Esteem3,Depression1,Depression2,Depression3,ADHD_score,Anxiety_score,SelfEsteem_score,Depression_score,total_score
0,1,2022-04-18 19:18:00,21,Male,Relationship,University Student,University,Yes,"Facebook, Twitter, Instagram, YouTube, Discord...",2-3 hours,5,3,5,5,2,2,2,2,5,4,5,18,4,4,14,40
1,2,2022-04-18 19:19:00,21,Female,Single,University Student,University,Yes,"Facebook, Twitter, Instagram, YouTube, Discord...",5+ hours,4,3,4,4,2,5,5,1,5,4,5,15,7,6,14,42


In [21]:
# Write dat1 (the data plus the summed score per mental health condition and
# the total score) to 'dat1.csv', without the index column.
dat1.to_csv(path_or_buf='dat1.csv', index=False)

### Create 4 new csv files (ADHD, Anxiety,Self_Esteem, Depression) -- WARNING: this section should NOT be run again since new csv files were already created

In [22]:
# ADHD subset: ID and respondent-info columns, the four ADHD items, and their summed score.
adhd_cols_to_keep = [
    'ID', 'timestamp', 'age', 'sex', 'relation_status', 'occupation',
    'affiliations', 'sm_user', 'platforms', 'time_spent',
    'ADHD1', 'ADHD2', 'ADHD3', 'ADHD4',
    'ADHD_score',
]

# Select those columns from `dat1` and write them to adhd.csv without the index.
adhd = dat1.loc[:, adhd_cols_to_keep]
adhd.to_csv('adhd.csv', index=False)

In [23]:
# Anxiety subset: ID and respondent-info columns, the two Anxiety items, and their summed score.
anxiety_cols_to_keep = [
    'ID', 'timestamp', 'age', 'sex', 'relation_status', 'occupation',
    'affiliations', 'sm_user', 'platforms', 'time_spent',
    'Anxiety1', 'Anxiety2',
    'Anxiety_score',
]

# Select those columns from `dat1` and write them to anxiety.csv without the index.
anxiety = dat1.loc[:, anxiety_cols_to_keep]
anxiety.to_csv('anxiety.csv', index=False)

In [25]:
# Self-esteem subset: ID and respondent-info columns, the two retained
# Self_Esteem items, and their summed score.
selfEsteem_cols_to_keep = [
    'ID', 'timestamp', 'age', 'sex', 'relation_status', 'occupation',
    'affiliations', 'sm_user', 'platforms', 'time_spent',
    'Self_Esteem1', 'Self_Esteem3',
    'SelfEsteem_score',
]

# Select those columns from `dat1` and write them to self_esteem.csv without the index.
self_esteem = dat1.loc[:, selfEsteem_cols_to_keep]
self_esteem.to_csv('self_esteem.csv', index=False)

In [26]:
# Depression subset: ID and respondent-info columns, the three Depression items,
# and their summed score.
depression_cols_to_keep = [
    'ID', 'timestamp', 'age', 'sex', 'relation_status', 'occupation',
    'affiliations', 'sm_user', 'platforms', 'time_spent',
    'Depression1', 'Depression2', 'Depression3',
    'Depression_score',
]

# Select those columns from `dat1` and write them to depression.csv without the index.
depression = dat1.loc[:, depression_cols_to_keep]
depression.to_csv('depression.csv', index=False)

In [None]:
# install Pygal for interactive bar or box plots between time_spent (independent var) vs. other dependent variables
# time_spent vs. ADHD_score
# time_spent vs. Anxiety_score
# time_spent vs. SelfEsteem_score
# time_spent vs. Depression_score


In [7]:
# correlation plots



# A few examples to review monday 



# Correlation matrix between daily social-media time and the four summed scores.
# 'time_spent' holds ordered text bins ('0-1 hour' ... '5+ hours'), which
# DataFrame.corr() cannot use directly, so encode each bin as its rank
# (0 = least time, 5 = most time) before correlating.
time_bins = ['0-1 hour', '1-2 hours', '2-3 hours', '3-4 hours', '4-5 hours', '5+ hours']
time_rank = {label: rank for rank, label in enumerate(time_bins)}
score_columns = ['ADHD_score', 'Anxiety_score', 'SelfEsteem_score', 'Depression_score']

corr_input = dat1[score_columns].copy()
# astype(str) lets the lookup work whether the column holds plain strings or a
# Categorical; labels outside time_bins become NaN and are skipped pairwise by corr().
corr_input.insert(0, 'time_spent', dat1['time_spent'].astype(str).map(time_rank))
corr_matrix = corr_input.corr()

# Plotting heatmap
plt.figure(figsize=(10, 8))
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', fmt=".2f", linewidths=0.5)
plt.title('Correlation Matrix')
plt.show()


NameError: name 'dat1' is not defined

In [21]:
# Individual ADHD, Anxiety and Depression items to correlate with one another.
selected_columns = ['ADHD1', 'ADHD2', 'ADHD3', 'ADHD4', 'Anxiety1', 'Anxiety2', 'Depression1', 'Depression2', 'Depression3']

# Create a subset DataFrame with selected columns
subset_data = dat1[selected_columns]

# Calculate the correlation matrix using Pearson correlation
correlation_matrix = subset_data.corr()

# Create a heatmap to visualize the correlation matrix
plt.figure(figsize=(10, 8))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt=".2f", linewidths=.5)
labels = [label.replace(" ", "\n") for label in correlation_matrix.index]
# Heatmap cell i spans [i, i + 1] on the axis, so its centre is at i + 0.5;
# ticks at 0, 1, 2, ... would put every label on a cell edge.
plt.xticks(ticks=np.arange(len(labels)) + 0.5, labels=labels, rotation=0, ha='center')
plt.title('Correlation Between Social Media Usage and Mental Health')
plt.show()


#Notebook
#https://www.kaggle.com/code/gbiplesh/data-analysis-social-media-and-mental-health/notebook

NameError: name 'dat1' is not defined

In [None]:
# scipy.stats supplies pearsonr for the p-values below; imported here because
# the notebook's import cell does not define `st`.
from scipy import stats as st

# Individual ADHD, Anxiety and Depression items to correlate with one another.
selected_columns = ['ADHD1', 'ADHD2', 'ADHD3', 'ADHD4', 'Anxiety1', 'Anxiety2', 'Depression1', 'Depression2', 'Depression3']

# Create a subset DataFrame with selected columns
subset_data = dat1[selected_columns]

# Calculate the correlation matrix using Pearson correlation
correlation_matrix = subset_data.corr()

labels = [label.replace(" ", "\n") for label in selected_columns]
# Heatmap cell i spans [i, i + 1] on each axis, so its centre is at i + 0.5;
# ticks at 0, 1, 2, ... would put every label on a cell edge.
tick_positions = np.arange(len(labels)) + 0.5

# Create a heatmap to visualize the correlation matrix
plt.figure(figsize=(10, 8))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt=".2f", linewidths=.5)
plt.xticks(ticks=tick_positions, labels=labels, rotation=0, ha='center')
plt.yticks(ticks=tick_positions, labels=labels, rotation=0, va='center')
plt.title('Pearson Correlation Coefficients')
plt.show()

# Calculate the p-values matrix (pearsonr returns (statistic, p-value); keep the p-value)
p_values_matrix = subset_data.corr(method=lambda x, y: st.pearsonr(x, y)[1])
# corr() with a callable always writes 1 on the diagonal, which would read as
# "p = 1.00" for each item against itself; blank those cells (NaN) instead.
p_values_matrix = p_values_matrix.mask(np.eye(len(p_values_matrix), dtype=bool))

# Create a heatmap to visualize the p-values (three decimals so small p-values are not all shown as 0.00)
plt.figure(figsize=(10, 8))
sns.heatmap(p_values_matrix, annot=True, cmap='coolwarm', fmt=".3f", linewidths=.5)
plt.xticks(ticks=tick_positions, labels=labels, rotation=0, ha='center')
plt.yticks(ticks=tick_positions, labels=labels, rotation=0, va='center')
plt.title('P-values')
plt.show()


# Source: ChatGPT (for plotting correlations with seaborn)

In [None]:
# linear regression models

In [None]:
# frequency of social media usage

# Question 7: What social media platforms do you commonly use?
# Each answer is a ', '-separated list: split it, stack to one platform per row, then count.
platform_counts = dat['platforms'].str.split(', ', expand=True).stack().value_counts()
platform_counts.plot(kind='bar')
plt.title('Social Media Platform Usage')
plt.xlabel('Social Media Platform')
plt.ylabel('Count')
plt.show()

# Question 8: What is the average time you spend on social media every day?
# Convert time_spent to categorical data to ensure correct ordering
time_order = ['0-1 hour', '1-2 hours', '2-3 hours', '3-4 hours', '4-5 hours', '5+ hours']
dat['time_spent'] = pd.Categorical(dat['time_spent'], categories=time_order, ordered=True)

# A Categorical column has no arithmetic mean (calling .mean() raises TypeError),
# so summarise through the integer position of each bin in time_order.
# A code of -1 marks a value that is missing or not listed in time_order.
time_codes = dat['time_spent'].cat.codes
time_codes = time_codes[time_codes >= 0]
if time_codes.empty:
    average_time_mean = float('nan')
    average_time_median = None
else:
    # Mean bin position, e.g. 2.5 lies halfway between '2-3 hours' and '3-4 hours'.
    average_time_mean = time_codes.mean()
    # interpolation='lower' keeps the median on an actual bin when the count is even.
    average_time_median = time_order[int(time_codes.quantile(0.5, interpolation='lower'))]

# sort_index() keeps the bars in time_order rather than ordered by count.
average_counts = dat['time_spent'].value_counts().sort_index()
average_counts.plot(kind='barh')
plt.title('Average Time Spend in a Day')
plt.xlabel('Count')
plt.ylabel('Time Spent')
plt.show()

print(f"Mean time-spent bin position (0 = '{time_order[0]}', {len(time_order) - 1} = '{time_order[-1]}'): {average_time_mean:.2f}")
print(f"Median time-spent bin: {average_time_median}")


#Notebook 
#https://www.kaggle.com/code/gbiplesh/data-analysis-social-media-and-mental-health/notebook


In [None]:
# scatter plot link
#https://www.kaggle.com/code/serenahinton/social-media-and-mental-health-analysis

In [None]:
#last update to notebook
# I'll add Jupyter notebooks for the other csv files to avoid making this one messier; then we can integrate them
# Katherine-pushing 4/6/2024 - 8:07pm