In [63]:
# importing needed programs
import pandas as pd
import heapq

In [9]:
# uploading dataset
df = pd.read_csv("anxiety_depression_data.csv")
df.head()

Unnamed: 0,Age,Gender,Education_Level,Employment_Status,Sleep_Hours,Physical_Activity_Hrs,Social_Support_Score,Anxiety_Score,Depression_Score,Stress_Level,...,Chronic_Illnesses,Medication_Use,Therapy,Meditation,Substance_Use,Financial_Stress,Work_Stress,Self_Esteem_Score,Life_Satisfaction_Score,Loneliness_Score
0,56,Male,Bachelor's,Unemployed,6.0,0.4,3,4,2,9,...,0,,0,1,,4,3,7,5,1
1,69,Female,Bachelor's,Retired,8.8,2.8,6,18,7,6,...,0,,1,0,,1,4,7,4,6
2,46,Female,Master's,Employed,5.3,1.6,5,5,13,8,...,0,,0,1,,8,7,8,1,1
3,32,Female,High School,Unemployed,8.8,0.5,4,6,3,4,...,1,,0,0,,7,4,8,4,4
4,60,Female,Bachelor's,Retired,7.2,0.7,2,7,15,3,...,0,,1,1,Frequent,8,9,5,7,7


In [10]:
# begin cleaning data
# checking for missing values
df.isnull().sum()

Age                                0
Gender                             0
Education_Level                    0
Employment_Status                  0
Sleep_Hours                        0
Physical_Activity_Hrs              0
Social_Support_Score               0
Anxiety_Score                      0
Depression_Score                   0
Stress_Level                       0
Family_History_Mental_Illness      0
Chronic_Illnesses                  0
Medication_Use                   747
Therapy                            0
Meditation                         0
Substance_Use                    834
Financial_Stress                   0
Work_Stress                        0
Self_Esteem_Score                  0
Life_Satisfaction_Score            0
Loneliness_Score                   0
dtype: int64

In [11]:
# dropping columns with null values and unrelated to research
df = df.drop(columns=['Medication_Use', 'Substance_Use', 'Family_History_Mental_Illness', 'Age', 'Gender', 'Therapy', 'Meditation'])
df.head()

Unnamed: 0,Education_Level,Employment_Status,Sleep_Hours,Physical_Activity_Hrs,Social_Support_Score,Anxiety_Score,Depression_Score,Stress_Level,Chronic_Illnesses,Financial_Stress,Work_Stress,Self_Esteem_Score,Life_Satisfaction_Score,Loneliness_Score
0,Bachelor's,Unemployed,6.0,0.4,3,4,2,9,0,4,3,7,5,1
1,Bachelor's,Retired,8.8,2.8,6,18,7,6,0,1,4,7,4,6
2,Master's,Employed,5.3,1.6,5,5,13,8,0,8,7,8,1,1
3,High School,Unemployed,8.8,0.5,4,6,3,4,1,7,4,8,4,4
4,Bachelor's,Retired,7.2,0.7,2,7,15,3,0,8,9,5,7,7


In [13]:
# checking for unique values to find typos
for col in df.columns:
    print(col)
    print(df[col].unique())
    print()

Education_Level
["Bachelor's" "Master's" 'High School' 'Other' 'PhD']

Employment_Status
['Unemployed' 'Retired' 'Employed' 'Student']

Sleep_Hours
[ 6.   8.8  5.3  7.2  4.4  7.1  3.3  7.4  7.8  5.6  6.7  9.4  5.7  4.9
  8.7  6.5  8.   8.5  5.4  5.8  8.3  9.   7.3  5.2  9.5  7.   6.6  2.7
  6.1  6.9  4.8  6.2  9.1  8.9  6.3  6.4  6.8  7.7  4.2  5.5  4.7  4.6
  5.9  5.1  3.8 10.5  5.   7.9  9.2  3.6  8.4  3.9  9.8  7.5  3.4  3.1
  2.4  8.6 12.4  7.6  9.3  4.   2.   4.1  8.2 10.1  4.5  8.1  2.3  3.7
  2.8  3.5  2.1  4.3  9.9 10.8  9.6 10.   9.7  2.6  3.2 10.3 10.2 11.4
  3. ]

Physical_Activity_Hrs
[ 0.4  2.8  1.6  0.5  0.7  1.3  6.8  2.5  6.3  1.1  4.6  1.5  0.6  1.2
  0.3  0.   0.1  7.6  2.2  3.7  2.6  1.8  2.9  7.4  1.7  0.8  2.1  3.4
  1.   3.9  4.1  0.2  3.   3.8  2.   6.1  5.3  1.9  7.   5.1  3.3  0.9
  5.4  1.4  2.7  7.2  3.6  4.   2.4  5.2  4.9  6.6  2.3  4.2  4.8  8.4
  4.3  3.5  5.5  5.7  5.6  3.2  8.1  7.5  5.9  7.7  7.1  8.9  4.4  8.2
  3.1  4.5  6.5  6.4 13.4  5.8 10.8  9.3 

In [18]:
# looking for outliers using IQR
for col in df.select_dtypes(include=['float', 'int']).columns:
    Q1 = df[col].quantile(0.25)
    Q3 = df[col].quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR
    outliers = df[(df[col] < lower_bound) | (df[col] > upper_bound)]
    if not outliers.empty:
        print(col)
        print(outliers[col].values)
        print()

Sleep_Hours
[12.4  2.   2.1 10.8  2.1 11.4]

Physical_Activity_Hrs
[ 6.8  6.3  7.6  7.4  6.1  7.   7.2  6.8  7.   6.6  8.4  8.1  7.5  5.9
  7.7  6.8  7.1  8.9  8.2  6.5  6.4 13.4  6.8  5.9  6.5 10.8  7.6  9.3
  6.8  6.8  6.3  6.8  7.4  8.2  6.1  7.5  6.2  8.7 12.1  7.3  7.4  8.1
  7.2  6.6  6.3  8.3  6.9  9.6  7.8  8.5 14.7  6.7  7.6  6.7  8.   8.6
 11.2  6.4 10.9  6.1  8.3  6.8  8.8  6.7  5.9  6.1  6.5  6.6  7.7  6.
  6.   6.7  7.5  9.  15.1]



In [19]:
# viewing cleaned data summary
df.describe()

Unnamed: 0,Sleep_Hours,Physical_Activity_Hrs,Social_Support_Score,Anxiety_Score,Depression_Score,Stress_Level,Chronic_Illnesses,Financial_Stress,Work_Stress,Self_Esteem_Score,Life_Satisfaction_Score,Loneliness_Score
count,1200.0,1200.0,1200.0,1200.0,1200.0,1200.0,1200.0,1200.0,1200.0,1200.0,1200.0,1200.0
mean,6.469,2.00575,5.055,10.47,10.674167,5.000833,0.2675,4.9925,4.889167,5.0625,5.12,4.959167
std,1.52955,2.037818,2.652893,5.911138,5.632889,2.538281,0.44284,2.590953,2.547016,2.531587,2.56991,2.566383
min,2.0,0.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0
25%,5.4,0.6,3.0,5.0,6.0,3.0,0.0,3.0,3.0,3.0,3.0,3.0
50%,6.4,1.4,5.0,10.5,11.0,5.0,0.0,5.0,5.0,5.0,5.0,5.0
75%,7.5,2.7,7.0,16.0,15.0,7.0,1.0,7.0,7.0,7.0,7.0,7.0
max,12.4,15.1,9.0,20.0,20.0,9.0,1.0,9.0,9.0,9.0,9.0,9.0


In [21]:
# storing data in lists for comparison
chronic_illness = df[df['Chronic_Illnesses'] == 1]
no_chronic_illness = df[df['Chronic_Illnesses'] == 0]
chronic_illness_list = chronic_illness.drop(columns='Chronic_Illnesses').values.tolist()
no_chronic_illness_list = no_chronic_illness.drop(columns='Chronic_Illnesses').values.tolist()

In [24]:
# viewing lists
print("With chronic illness:")
print(chronic_illness_list)

With chronic illness:
[['High School', 'Unemployed', 8.8, 0.5, 4, 6, 3, 4, 7, 4, 8, 4, 4], ["Master's", 'Unemployed', 3.3, 0.5, 4, 4, 10, 4, 9, 1, 4, 1, 4], ["Bachelor's", 'Student', 7.4, 6.8, 8, 1, 15, 8, 5, 4, 3, 1, 9], ['High School', 'Employed', 8.0, 0.3, 3, 13, 18, 5, 3, 8, 5, 4, 2], ['Other', 'Student', 5.8, 0.1, 2, 11, 13, 3, 9, 1, 9, 2, 3], ["Bachelor's", 'Unemployed', 6.0, 2.6, 7, 18, 3, 8, 5, 5, 6, 4, 3], ['Other', 'Employed', 6.1, 4.6, 8, 16, 17, 6, 9, 4, 3, 1, 1], ["Bachelor's", 'Employed', 4.8, 0.8, 8, 18, 7, 9, 2, 6, 9, 5, 5], ['High School', 'Employed', 9.1, 1.0, 2, 1, 16, 4, 1, 8, 8, 7, 1], ["Master's", 'Student', 6.8, 3.0, 2, 1, 20, 4, 2, 1, 7, 7, 2], ["Bachelor's", 'Employed', 4.8, 0.4, 8, 9, 16, 2, 7, 1, 3, 8, 4], ["Bachelor's", 'Employed', 7.8, 1.5, 2, 17, 14, 5, 1, 6, 4, 3, 1], ["Master's", 'Unemployed', 5.6, 2.0, 6, 15, 11, 2, 2, 6, 8, 7, 3], ['Other', 'Retired', 6.2, 3.9, 8, 1, 10, 3, 7, 3, 6, 5, 2], ['PhD', 'Student', 6.1, 0.6, 1, 18, 13, 4, 8, 2, 5, 7, 4], ["Ma

In [25]:
print("Without chronic illness:")
print(no_chronic_illness_list)

Without chronic illness:
[["Bachelor's", 'Unemployed', 6.0, 0.4, 3, 4, 2, 9, 4, 3, 7, 5, 1], ["Bachelor's", 'Retired', 8.8, 2.8, 6, 18, 7, 6, 1, 4, 7, 4, 6], ["Master's", 'Employed', 5.3, 1.6, 5, 5, 13, 8, 8, 7, 8, 1, 1], ["Bachelor's", 'Retired', 7.2, 0.7, 2, 7, 15, 3, 8, 9, 5, 7, 7], ['Other', 'Student', 4.4, 2.8, 7, 15, 1, 1, 1, 7, 1, 4, 6], ["Master's", 'Student', 7.1, 1.3, 5, 1, 3, 7, 5, 1, 4, 1, 8], ['High School', 'Employed', 7.8, 2.5, 9, 11, 6, 3, 3, 9, 8, 5, 8], ['Other', 'Retired', 5.6, 6.3, 9, 18, 20, 1, 9, 4, 3, 7, 5], ['High School', 'Employed', 6.7, 1.1, 3, 14, 15, 5, 9, 5, 5, 9, 6], ['PhD', 'Employed', 9.4, 1.3, 3, 11, 7, 2, 4, 1, 2, 8, 5], ['PhD', 'Student', 5.7, 4.6, 5, 10, 5, 3, 4, 2, 6, 2, 6], ["Bachelor's", 'Employed', 4.9, 1.5, 5, 17, 4, 6, 8, 4, 6, 9, 1], ["Bachelor's", 'Student', 8.7, 2.5, 2, 4, 18, 3, 7, 4, 8, 4, 4], ["Bachelor's", 'Retired', 9.4, 0.6, 1, 15, 3, 3, 5, 6, 8, 5, 9], ['PhD', 'Retired', 6.5, 1.2, 8, 16, 17, 6, 6, 6, 1, 5, 4], ['High School', 'Unempl

In [32]:
# creating heaps to compare stress and mood data
stress_cols = ['Anxiety_Score', 'Depression_Score', 'Stress_Level', 'Financial_Stress', 'Work_Stress', 'Loneliness_Score']
chronic_illness_heap = []
no_chronic_illness_heap = []
# loop to add scores together in heaps
for index, row in df.iterrows():
    stress_score = row[stress_cols].sum()
    if row['Chronic_Illnesses'] == 1:
        heapq.heappush(chronic_illness_heap, (-stress_score))
    else:
        heapq.heappush(no_chronic_illness_heap, (-stress_score))
print("Top 5 Highest Stress Scores with Chronic Illness:")
for _ in range(5):
    score = heapq.heappop(chronic_illness_heap)
    print(f"Stress score: {-score}")
print("Top 5 Highest Stress Scores without Chronic Illness:")
for _ in range(5):
    score = heapq.heappop(no_chronic_illness_heap)
    print(f" Stress score: {-score}")

Top 5 Highest Stress Scores with Chronic Illness:
Stress score: 64
Stress score: 61
Stress score: 60
Stress score: 60
Stress score: 60
Top 5 Highest Stress Scores without Chronic Illness:
 Stress score: 66
 Stress score: 66
 Stress score: 65
 Stress score: 65
 Stress score: 64


In [34]:
# creating heaps to compare quality of life data
quality_of_life_cols = ['Sleep_Hours', 'Physical_Activity_Hrs', 'Social_Support_Score', 'Self_Esteem_Score', 'Life_Satisfaction_Score']
chronic_illness_heap = []
no_chronic_illness_heap = []
# loop to add scores together in heaps
for index, row in df.iterrows():
    quality_of_life_score = row[quality_of_life_cols].sum()
    if row['Chronic_Illnesses'] == 1:
        heapq.heappush(chronic_illness_heap, (quality_of_life_score))
    else:
        heapq.heappush(no_chronic_illness_heap, (quality_of_life_score))
print("5 Lowest Quality of Life Scores with Chronic Illness:")
for _ in range(5):
    score = heapq.heappop(chronic_illness_heap)
    print(f"Quality of Life score: {score}")
print("5 Lowest Quality of Life Scores without Chronic Illness:")
for _ in range(5):
    score = heapq.heappop(no_chronic_illness_heap)
    print(f"Quality of Life score: {score}")

5 Lowest Quality of Life Scores with Chronic Illness:
Quality of Life score: 11.8
Quality of Life score: 12.6
Quality of Life score: 12.7
Quality of Life score: 12.8
Quality of Life score: 14.100000000000001
5 Lowest Quality of Life Scores without Chronic Illness:
Quality of Life score: 9.2
Quality of Life score: 10.1
Quality of Life score: 10.8
Quality of Life score: 11.6
Quality of Life score: 12.0


In [50]:
# getting columns to use for averages
print(df.columns.tolist())

['Education_Level', 'Employment_Status', 'Sleep_Hours', 'Physical_Activity_Hrs', 'Social_Support_Score', 'Anxiety_Score', 'Depression_Score', 'Stress_Level', 'Chronic_Illnesses', 'Financial_Stress', 'Work_Stress', 'Self_Esteem_Score', 'Life_Satisfaction_Score', 'Loneliness_Score']


In [62]:
# calculating averages using lists
column_names = ['Sleep_Hours', 'Physical_Activity_Hrs', 'Social_Support_Score', 'Anxiety_Score', 'Depression_Score', 'Stress_Level', 'Chronic_Illnesses', 'Financial_Stress', 'Work_Stress', 'Self_Esteem_Score', 'Life_Satisfaction_Score', 'Loneliness_Score']
def calculate_averages(data):
# exclude first two nonnumeric columns
    columns_to_use = [0] * (len(data[0]) - 2)
    num_rows = len(data)
# loop through rows
    for row in data:
        for i in range(2, len(row)):
            columns_to_use[i-2] += row[i]
#calculate average
    averages = [sum_value / num_rows for sum_value in columns_to_use]
    return averages
chronic_illness_average = calculate_averages(chronic_illness_list)
no_chronic_illness_average = calculate_averages(no_chronic_illness_list)
print("Averages with chronic illness:")
for col, avg in zip(column_names, chronic_illness_average):
    print(f"{col}: {avg:.2f}")
print("Averages without chronic illness:")
for col, avg in zip(column_names, no_chronic_illness_average):
    print(f"{col}: {avg:.2f}")

Averages with chronic illness:
Sleep_Hours: 6.33
Physical_Activity_Hrs: 2.03
Social_Support_Score: 4.98
Anxiety_Score: 10.52
Depression_Score: 10.93
Stress_Level: 4.91
Chronic_Illnesses: 4.90
Financial_Stress: 4.89
Work_Stress: 5.06
Self_Esteem_Score: 5.02
Life_Satisfaction_Score: 4.92
Averages without chronic illness:
Sleep_Hours: 6.52
Physical_Activity_Hrs: 2.00
Social_Support_Score: 5.08
Anxiety_Score: 10.45
Depression_Score: 10.58
Stress_Level: 5.03
Chronic_Illnesses: 5.03
Financial_Stress: 4.89
Work_Stress: 5.06
Self_Esteem_Score: 5.15
Life_Satisfaction_Score: 4.97


In [66]:
# finding mode using dictionary
def calculate_mode(data, column_index):
# create dictionary
    value_count = {}
# loop through columns to count values
    for entry in data:
        value = entry[column_index]
        if value in value_count:
            value_count[value] += 1
        else:
            value_count[value] = 1
# get most occuring value
    mode_value = None
    mode_count = 0
    for value, count in value_count.items():
        if count > mode_count:
            mode_value = value
            mode_count = count
    return mode_value, mode_count
# calculate modes
chronic_illness_education_mode, chronic_illness_education_count = calculate_mode(chronic_illness_list, 0)
chronic_illness_employment_mode, chronic_illness_employment_count = calculate_mode(chronic_illness_list, 1)
no_chronic_illness_education_mode, no_chronic_illness_education_count = calculate_mode(no_chronic_illness_list, 0)
no_chronic_illness_employment_mode, no_chronic_illness_employment_count = calculate_mode(no_chronic_illness_list, 1)
print(f"Education mode with chronic illness: {chronic_illness_education_mode}, {chronic_illness_education_count}")
print(f"Employment mode with chronic illness: {chronic_illness_employment_mode}, {chronic_illness_employment_count}")
print(f"Education mode without chronic illness: {no_chronic_illness_education_mode}, {no_chronic_illness_education_count}")
print(f"Employment mode without chronic illness: {no_chronic_illness_employment_mode}, {no_chronic_illness_employment_count}")

Education mode with chronic illness: High School, 71
Employment mode with chronic illness: Unemployed, 87
Education mode without chronic illness: PhD, 193
Employment mode without chronic illness: Employed, 238
