In [1]:
import pandas as pd
from scipy import stats
import numpy as np

file_path = "MentalHealthSurvey.csv"
df = pd.read_csv(file_path)
df.columns = df.columns.str.strip()

print("Data loaded and columns cleaned:")
print(df.info())

Data loaded and columns cleaned:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 87 entries, 0 to 86
Data columns (total 21 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   gender                    87 non-null     object
 1   age                       87 non-null     int64 
 2   university                87 non-null     object
 3   degree_level              87 non-null     object
 4   degree_major              87 non-null     object
 5   academic_year             87 non-null     object
 6   cgpa                      87 non-null     object
 7   residential_status        87 non-null     object
 8   campus_discrimination     87 non-null     object
 9   sports_engagement         87 non-null     object
 10  average_sleep             87 non-null     object
 11  study_satisfaction        87 non-null     int64 
 12  academic_workload         87 non-null     int64 
 13  academic_pressure         87 non-null     int64 


In [2]:
df_low_sleep = df[df['average_sleep'].isin(['4-6 hrs', '2-4 hrs'])]

high_concern = df_low_sleep[df_low_sleep['financial_concerns'] >= 4]['depression']
low_concern = df_low_sleep[df_low_sleep['financial_concerns'] <= 3]['depression']

print("\n--- Hypothesis 1: Low-Sleep Group Depression Comparison ---")
print(f"High Concern (n={len(high_concern)}): Mean Depression = {high_concern.mean():.2f}")
print(f"Low Concern (n={len(low_concern)}): Mean Depression = {low_concern.mean():.2f}")


--- Hypothesis 1: Low-Sleep Group Depression Comparison ---
High Concern (n=33): Mean Depression = 3.61
Low Concern (n=22): Mean Depression = 3.18


In [3]:
alpha = 0.05
# T-test (alternative='greater' for one-sided test)
t_stat_1, p_value_1 = stats.ttest_ind(high_concern, low_concern, equal_var=False, alternative='greater')

print(f"\nT-statistic: {t_stat_1:.3f}")
print(f"P-value: {p_value_1:.3f}")

if p_value_1 < alpha:
    print("Conclusion: Reject H0. The mean depression score is significantly higher for the high-concern group within the low-sleep subset.")
else:
    print("Conclusion: Fail to Reject H0. The difference is not statistically significant.")


T-statistic: 1.260
P-value: 0.107
Conclusion: Fail to Reject H0. The difference is not statistically significant.


In [4]:
df_year1 = df[df['academic_year'] == '1st year']
df_year4 = df[df['academic_year'] == '4th year']

corr1, p_corr1 = stats.pearsonr(df_year1['anxiety'], df_year1['isolation'])

corr4, p_corr4 = stats.pearsonr(df_year4['anxiety'], df_year4['isolation'])

print("\n--- Hypothesis 2: Anxiety-Isolation Correlation by Academic Year ---")
print(f"1st Year (n={len(df_year1)}): Correlation r = {corr1:.3f}, P-value = {p_corr1:.3f}")
print(f"4th Year (n={len(df_year4)}): Correlation r = {corr4:.3f}, P-value = {p_corr4:.3f}")


--- Hypothesis 2: Anxiety-Isolation Correlation by Academic Year ---
1st Year (n=34): Correlation r = 0.731, P-value = 0.000
4th Year (n=10): Correlation r = 0.424, P-value = 0.222


In [5]:
alpha = 0.05

print("\n--- Correlation Interpretation ---")
if p_corr1 < alpha and p_corr4 < alpha:
    print("Both correlations are statistically significant.")
    print(f"Absolute Correlation Difference: |r4 - r1| = {abs(corr4 - corr1):.3f}")
    if abs(corr4 - corr1) > 0.2:
        print("Insight: The strength of the relationship shows a notable change across academic years.")
    else:
        print("Insight: The relationship strength is similar across academic years.")
else:
    print("Conclusion: At least one group's correlation is not statistically significant.")


--- Correlation Interpretation ---
Conclusion: At least one group's correlation is not statistically significant.


In [6]:
df['uses_online_entertainment'] = df['stress_relief_activities'].apply(lambda x: 'Yes' if 'Online Entertainment' in x else 'No')

df['study_satisfaction_level'] = df['study_satisfaction'].apply(lambda x: 'High' if x >= 4 else 'Low')

contingency_table = pd.crosstab(df['study_satisfaction_level'], df['uses_online_entertainment'])

print("\n--- Hypothesis 3: Study Satisfaction vs Online Entertainment Contingency Table ---")
print(contingency_table)


--- Hypothesis 3: Study Satisfaction vs Online Entertainment Contingency Table ---
uses_online_entertainment  No  Yes
study_satisfaction_level          
High                       33   25
Low                        17   12


In [7]:
chi2, p_value_3, dof, expected = stats.chi2_contingency(contingency_table)

alpha = 0.05
print(f"\nChi-squared Statistic: {chi2:.3f}, Degrees of Freedom: {dof}, P-value: {p_value_3:.3f}")

if p_value_3 < alpha:
    print("Conclusion: Reject H0. There is a statistically significant association between the two variables.")
else:
    print("Conclusion: Fail to Reject H0. No statistically significant association was found.")


Chi-squared Statistic: 0.000, Degrees of Freedom: 1, P-value: 1.000
Conclusion: Fail to Reject H0. No statistically significant association was found.
