<a href="https://www.kaggle.com/code/mustafadagteki/sql-analyzing-students-mental-health?scriptVersionId=168191356" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

## Analyzing Students Mental Health

Does going to university in a different country affect your mental health? A Japanese international university surveyed its students in 2018 and published a study the following year that was approved by several ethical and regulatory boards.

The study found that international students have a higher risk of mental health difficulties than the general population, and that social connectedness (belonging to a social group) and acculturative stress (stress associated with joining a new culture) are predictive of depression.


Explore the `students` data with PostgreSQL-style SQL (the queries in this notebook are run through DuckDB) to find out whether you would come to a similar conclusion for international students, and to see whether the length of stay is a contributing factor.

Here is a data description of the columns you may find helpful.

| Field Name    | Description                                      |
| ------------- | ------------------------------------------------ |
| `inter_dom`     | Type of student (international or domestic)     |
| `japanese_cate` | Japanese language proficiency                    |
| `english_cate`  | English language proficiency                     |
| `academic`      | Current academic level (undergraduate or graduate) |
| `age`           | Current age of student                           |
| `stay`          | Current length of stay in years                  |
| `todep`         | Total score of depression (PHQ-9 test)           |
| `tosc`          | Total score of social connectedness (SCS test)   |
| `toas`          | Total score of acculturative stress (ASISS test) |

In [None]:
pip install duckdb


In [None]:
# import necessary libraries
# We will use duckdb for sql queries
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.lines as mlines
import seaborn as sns
import duckdb

In [None]:
# Read the student survey CSV from the Kaggle input directory into a DataFrame.
# The SQL cells below refer to this variable by name (`FROM students`).
students = pd.read_csv(
    "/kaggle/input/portfolio-projects-data/students.csv"
)

In [None]:
# Preview: fetch the first five rows of `students` to see the available columns.
query1 = """SELECT * 
            FROM students
            LIMIT 5;"""
preview_relation = duckdb.query(query1)
result1 = preview_relation.to_df()
result1

In [None]:
# Count the rows in which inter_dom, region, gender and academic are ALL NULL
# at the same time (the predicates are joined with AND, so a row with only
# some of these columns missing is not counted).
query2 = """
		/* Checking for multiple null values */
		SELECT COUNT(*)
		FROM students
		WHERE inter_dom IS NULL
			AND region IS NULL
			AND GENDER IS NULL
			AND academic IS NULL;"""
null_rows_relation = duckdb.query(query2)
result2 = null_rows_relation.to_df()
result2

In [None]:
# Distribution of students by region, gender and academic level.
#
# - `region_total` / `global_total` pre-compute the head count per region and
#   overall; both skip rows where region, gender or academic is NULL.
# - The main SELECT applies the same NULL filter, so a NULL in region, gender
#   or academic in the grouped output can only come from a ROLLUP subtotal row.
# - The gender CASE tests the most specific condition first: the grand-total
#   row has NULL region, gender AND academic, so it has to be matched before
#   the per-region subtotal test (NULL gender and academic only). In the
#   opposite order the 'Global Total' label could never be produced.
# - Only the first ten rows of the ordered result are returned (LIMIT 10).
#
# NOTE(review): r.Total_by_Region and g.Total_Global are listed in GROUP BY
# outside the ROLLUP, so they remain grouping keys at every rollup level,
# including the top one - confirm this is the intended shape of the totals.
query3 = """WITH region_total AS (
                SELECT 
                    region,
                    COUNT(*) AS Total_by_Region
                FROM students
                WHERE region IS NOT NULL AND gender IS NOT NULL AND academic IS NOT NULL
                GROUP BY region
            ),
            global_total AS (
                SELECT 
                    COUNT(*) AS Total_Global
                FROM students
                WHERE region IS NOT NULL AND gender IS NOT NULL AND academic IS NOT NULL
            )
            SELECT 
                s.region,
                CASE
                    WHEN (s.region IS NULL AND s.gender IS NULL AND s.academic IS NULL) THEN 'Global Total'
                    WHEN (s.gender IS NULL AND s.academic IS NULL) THEN 'Total for Region'
                    ELSE s.gender
                END AS gender,
                CASE 
                    WHEN s.academic IS NULL THEN 'All Academics'
                    ELSE s.academic 
                END AS academic,
                COUNT(*) AS Total_Students,
                ROUND(COUNT(*) * 100.0 / r.Total_by_Region, 2) AS Percentage_by_Region,
                ROUND(COUNT(*) * 100.0 / g.Total_Global, 2) AS Global_Percentage
            FROM students s
            JOIN region_total r ON s.region = r.region
            CROSS JOIN global_total g
            WHERE s.region IS NOT NULL AND s.gender IS NOT NULL AND s.academic IS NOT NULL
            GROUP BY ROLLUP (s.region, s.gender, s.academic), r.Total_by_Region, g.Total_Global
            ORDER BY s.region, s.gender, s.academic
            LIMIT 10;"""
result3 = duckdb.query(query3).to_df()
result3


In [None]:
# Profile of domestic vs. international students, broken down by gender:
# count of non-NULL gender values, mean age, mean length of stay and the means
# of the `japanese` and `english` columns. Rows with a NULL inter_dom are
# dropped; ROLLUP adds a subtotal row per inter_dom value and one overall row.
# Output is sorted by the first two select-list columns (origin, gender).
query4 = """/* Find the distribution of International and domestic students */
		SELECT 
			CASE
				WHEN inter_dom = 'Dom' THEN 'Domestic'
				WHEN inter_dom = 'Inter' THEN 'International' END AS origin,
			gender,
			COUNT(gender) AS no_of_students,
			ROUND(AVG(age),2) AS avg_age,
			ROUND(AVG(stay),2) AS avg_stay_year,
			ROUND(AVG(japanese),2) AS avg_japanese_lang,
			ROUND(AVG(english),2) AS avg_english_lang	
		FROM students
		WHERE inter_dom IS NOT NULL
		GROUP BY ROLLUP (inter_dom, gender)
		ORDER BY 1,2;"""

origin_profile_relation = duckdb.query(query4)
result4 = origin_profile_relation.to_df()
result4

In [None]:
# Average test scores of domestic vs. international students, broken down by
# gender: mean todep (depression), tosc (social connectedness) and toas
# (acculturative stress), plus the count of non-NULL gender values.
# Rows with a NULL inter_dom are dropped; ROLLUP adds a subtotal row per
# inter_dom value and one overall row. Output is sorted by the first two
# select-list columns (origin, gender).
# The SQL header comment previously repeated query4's "distribution" wording;
# it now describes what this query actually computes.
query5 ="""/* Find the average depression, social connectedness and acculturative stress scores of International and domestic students */
			SELECT 
				CASE
					WHEN inter_dom = 'Dom' THEN 'Domestic'
					WHEN inter_dom = 'Inter' THEN 'International' END AS origin,
				gender,
				COUNT(gender) AS no_of_students,
				ROUND(AVG(todep),2) AS avg_score_of_depression,
				ROUND(AVG(tosc),2) AS avg_score_of_social_connect,
				ROUND(AVG(toas),2) AS avg_score_of_acculturative_stress
			FROM students
			WHERE inter_dom IS NOT NULL
			GROUP BY ROLLUP (inter_dom, gender)
			ORDER BY 1,2;"""

result5 = duckdb.query(query5).to_df()
result5

In [None]:
# International students only (inter_dom = 'Inter'): mean todep, tosc and toas
# per distinct length of stay, rounded to two decimals, longest stay first.
# The resulting frame (stay, average_phq, average_scs, average_as) feeds the
# plotting cells below.
query6 = """/* Find the average scores by length of stay for international students, and view them in descending order */
               SELECT 
               stay,
               ROUND(AVG(todep), 2) AS average_phq, 
               ROUND(AVG(tosc), 2) AS average_scs, 
               ROUND(AVG(toas), 2) AS average_as
               FROM students
               WHERE inter_dom = 'Inter'
               GROUP BY stay
               ORDER BY stay DESC;"""

scores_by_stay_relation = duckdb.query(query6)
result6 = scores_by_stay_relation.to_df()
result6

In [None]:
# Three side-by-side panels, one per averaged score from `result6`, each
# plotted against length of stay. Passing both x and y to histplot yields a
# bivariate (2-D) histogram.
sns.set()
fig, axs = plt.subplots(1, 3, figsize=(18, 6))

# (column in result6, colour, panel title) for each panel, left to right.
histogram_panels = [
    ("average_phq", "purple", "PHQ Histogram"),
    ("average_scs", "green", "SCS Histogram"),
    ("average_as", "blue", "AS Histogram"),
]

# Pair each axis with its own spec so every panel gets its own title; the
# SCS title used to be written onto axs[0], which overwrote the PHQ title and
# left the middle panel untitled.
for ax, (score_column, colour, title) in zip(axs, histogram_panels):
    sns.histplot(data=result6, x="stay", y=score_column, color=colour, ax=ax)
    ax.set_title(title)

plt.tight_layout()
plt.show()

In [None]:
# Overlay a scatter + fitted regression line for each averaged score against
# length of stay on a single set of axes, with a hand-built colour legend.
sns.set(style="ticks")

# (column in result6, colour, legend label), drawn in this order.
score_series = [
    ("average_phq", "purple", "PHQ"),
    ("average_scs", "green", "SCS"),
    ("average_as", "blue", "AS"),
]

for score_column, line_colour, _ in score_series:
    sns.regplot(data=result6, x="stay", y=score_column, color=line_colour)

# regplot is called without labels, so build proxy line artists for the legend.
legend_handles = [
    mlines.Line2D([], [], color=line_colour, label=series_label)
    for _, line_colour, series_label in score_series
]
plt.legend(handles=legend_handles, loc='upper right')

plt.xlabel('Length of Stay')
plt.ylabel('Average Score')
plt.title('PHQ, SCS, AS by Length of Stay')
plt.tight_layout()
plt.show()