In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

### Preprocessing

1) Read the student_mental_health.csv file into a variable called smh and review the dataframe. <br>

In [13]:
smh = pd.read_csv("student_mental_health.csv")
smh.head()

Unnamed: 0,Timestamp,Choose your gender,Age,What is your course?,Your current year of Study,What is your CGPA?,Marital status,Do you have Depression?,Do you have Anxiety?,Do you have Panic attack?,Did you seek any specialist for a treatment?
0,8/7/2020 12:02,Female,18.0,Engineering,year 1,3.00 - 3.49,No,Yes,No,Yes,No
1,8/7/2020 12:04,Male,21.0,Islamic education,year 2,3.00 - 3.49,No,No,Yes,No,No
2,8/7/2020 12:05,Male,19.0,BIT,Year 1,3.00 - 3.49,No,Yes,Yes,Yes,No
3,8/7/2020 12:06,Female,22.0,Laws,year 3,3.00 - 3.49,Yes,Yes,No,No,No
4,8/7/2020 12:13,Male,23.0,Mathemathics,year 4,3.00 - 3.49,No,No,No,No,No


2) Drop the Timestamp column and change some column names: <br>
 ["Choose your gender", "What is your course?", "What is your CGPA", "Do you have Depression", "Do you have Anxiety?", "Do you have Panic attack?", "Did you seek any specialist for a treatment?"] to ["Sex", "Major", "GPA", "Depression", "Anxiety", "Panic_attack", "Treatment"]

In [14]:
smh.drop('Timestamp', axis=1, inplace = True) 
smh.head()

# parameter inplace = True will modify original dataframe smh, so when smh is called again, it will show that Timestamp is dropped.
# inplace = False does not modify the original dataframe df, so the result will have to be reassigned to smh, or to a new variable.

smh.rename(columns = {'Choose your gender': 'Sex', 'What is your course?': 'Major', 'What is your CGPA?': 'GPA','Do you have Depression?': 'Depression','Do you have Anxiety?':'Anxiety','Do you have Panic attack?':'Panic_attack','Did you seek any specialist for a treatment?': 'Treatment'}, inplace = True)
smh.head()

Unnamed: 0,Sex,Age,Major,Your current year of Study,GPA,Marital status,Depression,Anxiety,Panic_attack,Treatment
0,Female,18.0,Engineering,year 1,3.00 - 3.49,No,Yes,No,Yes,No
1,Male,21.0,Islamic education,year 2,3.00 - 3.49,No,No,Yes,No,No
2,Male,19.0,BIT,Year 1,3.00 - 3.49,No,Yes,Yes,Yes,No
3,Female,22.0,Laws,year 3,3.00 - 3.49,Yes,Yes,No,No,No
4,Male,23.0,Mathemathics,year 4,3.00 - 3.49,No,No,No,No,No


3) Convert the Columns with "Yes"/"No" entries into Boolean columns.

In [15]:
boolean_columns = ['Marital status','Depression','Anxiety','Panic_attack','Treatment']
for column in boolean_columns:
    smh[column] = smh[column].map({'Yes': True, 'No': False})

smh.head()


Unnamed: 0,Sex,Age,Major,Your current year of Study,GPA,Marital status,Depression,Anxiety,Panic_attack,Treatment
0,Female,18.0,Engineering,year 1,3.00 - 3.49,False,True,False,True,False
1,Male,21.0,Islamic education,year 2,3.00 - 3.49,False,False,True,False,False
2,Male,19.0,BIT,Year 1,3.00 - 3.49,False,True,True,True,False
3,Female,22.0,Laws,year 3,3.00 - 3.49,True,True,False,False,False
4,Male,23.0,Mathemathics,year 4,3.00 - 3.49,False,False,False,False,False


4) Converting the "Major" column to all uppercase for standardization.

In [16]:
# Uppercase all strings in column Major

smh['Major'] = smh['Major'].str.upper()

smh.to_csv('smh 2.csv', index=False)

smh.head()

Unnamed: 0,Sex,Age,Major,Your current year of Study,GPA,Marital status,Depression,Anxiety,Panic_attack,Treatment
0,Female,18.0,ENGINEERING,year 1,3.00 - 3.49,False,True,False,True,False
1,Male,21.0,ISLAMIC EDUCATION,year 2,3.00 - 3.49,False,False,True,False,False
2,Male,19.0,BIT,Year 1,3.00 - 3.49,False,True,True,True,False
3,Female,22.0,LAWS,year 3,3.00 - 3.49,True,True,False,False,False
4,Male,23.0,MATHEMATHICS,year 4,3.00 - 3.49,False,False,False,False,False


5) Create a new column called "STEM" in the smh dataframe which is True if the student's major is in 
['BCS',
 'ENGINEERING',
 'BIT',
 'PSYCHOLOGY',
 'IT',
 'MATHEMATICS',
 'BIOTECHNOLOGY']
or has the string "SCIENCE" in it, and False otherwise.

In [18]:

smh['Major'] = smh['Major'].replace('MATHEMATHICS', 'MATHEMATICS')  # to correct spelling error in the dataset

science_majors = ['BCS', 'ENGINEERING', 'BIT', 'PSYCHOLOGY', 'IT', 'MATHEMATICS', 'BIOTECHNOLOGY']

smh['STEM'] = smh['Major'].str.contains('SCIENCE') | smh['Major'].isin (science_majors)

STEM_students = smh['STEM'].sum()

print('The number of STEM students is',STEM_students)


The number of STEM students is 57


6) Create a new column called "Exhibits_disorder" in the smh dataframe which has a True value if any of the values "Depression", "Anxiety", or "Panic_attack" are True, and False otherwise.

In [19]:
smh['Exhibits_disorder'] = smh['Depression'] | smh['Anxiety'] | smh['Panic_attack']  

smh.head()

Unnamed: 0,Sex,Age,Major,Your current year of Study,GPA,Marital status,Depression,Anxiety,Panic_attack,Treatment,STEM,Exhibits_disorder
0,Female,18.0,ENGINEERING,year 1,3.00 - 3.49,False,True,False,True,False,True,True
1,Male,21.0,ISLAMIC EDUCATION,year 2,3.00 - 3.49,False,False,True,False,False,False,True
2,Male,19.0,BIT,Year 1,3.00 - 3.49,False,True,True,True,False,True,True
3,Female,22.0,LAWS,year 3,3.00 - 3.49,True,True,False,False,False,False,True
4,Male,23.0,MATHEMATICS,year 4,3.00 - 3.49,False,False,False,False,False,True,False


7) Create a new column called "N_disorder" in the smh dataframe which has the value 0, 1, 2, or 3 based on how many of "Depression", "Anxiety", or "Panic_attack" columns are True.

In [20]:
smh['N_disorder'] = (smh[['Depression','Anxiety', 'Panic_attack']].sum(axis = 1)).astype(int)

smh.head()

#smh.to_csv('smh.csv', index=False)

Unnamed: 0,Sex,Age,Major,Your current year of Study,GPA,Marital status,Depression,Anxiety,Panic_attack,Treatment,STEM,Exhibits_disorder,N_disorder
0,Female,18.0,ENGINEERING,year 1,3.00 - 3.49,False,True,False,True,False,True,True,2
1,Male,21.0,ISLAMIC EDUCATION,year 2,3.00 - 3.49,False,False,True,False,False,False,True,1
2,Male,19.0,BIT,Year 1,3.00 - 3.49,False,True,True,True,False,True,True,3
3,Female,22.0,LAWS,year 3,3.00 - 3.49,True,True,False,False,False,False,True,1
4,Male,23.0,MATHEMATICS,year 4,3.00 - 3.49,False,False,False,False,False,True,False,0


### Exploration

8) Determine how many students and questions they answered.

In [21]:
row_count, column_count = smh.shape

print ('Number of students is', row_count)
print ('Number of questions answered is', column_count - 3)  # minus 3 columns added

Number of students is 101
Number of questions answered is 10


9) How many unique majors are there?

In [22]:
unique_majors = smh['Major'].nunique()

print('Number of unique majors is', unique_majors)

Number of unique majors is 42


10) How many of each sex are there in the dataset? What percentage of the dataset are Females and Males? <br>

In [25]:
counts = smh['Sex'].value_counts()  # student count for each gender

total_students = len(smh['Sex'])

percentage = (counts/total_students)*100  #percentage of student in each gender

gender_stat = pd.DataFrame ({'Sex' : counts.index, 'Count' : counts, 'Percentage': percentage})  #create dataframe to display result

gender_stat.reset_index(drop=True, inplace=True)

gender_stat['Percentage'] = gender_stat['Percentage'].round(2)

gender_stat

Unnamed: 0,Sex,Count,Percentage
0,Female,75,74.26
1,Male,26,25.74


11) What percentage of students are married?
<br> 
What percentage of married students have depression?

In [26]:
married_percentage = smh['Marital status'].mean()*100

married_percentage = f"{married_percentage:.2f}%"

married = smh['Marital status'].sum()
married_depress = (smh['Depression'] & smh['Marital status']).sum()

m_d_percentage = f'{((married_depress/married) *100):.2f}%'


print('The percentage of students that are married is', married_percentage)
print ('The percentage of married students that have depression is',m_d_percentage)


The percentage of students that are married is 15.84%
The percentage of married students that have depression is 100.00%


12) What percentage of students are taking a STEM major? <br>
What percentage of students that are STEM majors exhbit a disorder? What's the mean for N_disorder?
<br>
What percentage of students that are non-STEM majors exhibit a disorder? What's the mean for N_disorder?
<br>
<br>
Is there a significant/statistical difference between these two groups?

In [27]:
STEM_percentage = smh['STEM'].mean()*100
STEM_percentage = f'{STEM_percentage:.2f}%'

STEM_student = smh['STEM'].sum()

STEM_disorder = (smh['Exhibits_disorder'] & smh['STEM']).sum()

S_d_percentage = (STEM_disorder/STEM_student) *100

S_d_percentage_format = f'{((STEM_disorder/STEM_student) *100):.2f}%'

STEM_disorder_count = smh[(smh['STEM'] == True) & (smh['Exhibits_disorder'] == True)]  # filter for STEM students with disorder

ave_STEM_disorder = STEM_disorder_count['N_disorder'].mean()

ave_STEM_disorder = f'{ave_STEM_disorder:.2f}'

#-------------------------------------------------------------------------------------------

NonS_d_percentage = f'{(100 - S_d_percentage):.2f}%'

NonSTEM_disorder_count = smh[(smh['STEM'] == False) & (smh['Exhibits_disorder'] == True)]

ave_NonSTEM_disorder = NonSTEM_disorder_count['N_disorder'].mean()

ave_NonSTEM_disorder =  f'{ave_NonSTEM_disorder:.2f}'

print('Percentage of students taking a STEM major is',STEM_percentage)
print('Percentage of STEM students that exhibit disorder is',S_d_percentage_format,'; these students have on average',ave_STEM_disorder,'disorders')

print('Percentage of non-STEM students that exhibit disorder is',NonS_d_percentage,'; these students have on average',ave_NonSTEM_disorder,'disorders')



Percentage of students taking a STEM major is 56.44%
Percentage of STEM students that exhibit disorder is 63.16% ; these students have on average 1.64 disorders
Percentage of non-STEM students that exhibit disorder is 36.84% ; these students have on average 1.54 disorders


13) What percentage of students exhibiting a disorder has sought treatment?

In [28]:
seek_treatment = (smh['Treatment'].mean())*100

seek_treatment = f'{seek_treatment:.2f}%' 

print ('The percentage of students with disorders that seek treatment is',seek_treatment)
print("This is a very low percentage, indicating that either students don't take these disorders serious enough, or not having resources/knowledge to seek treatment.")

The percentage of students with disorders that seek treatment is 5.94%
This is a very low percentage, indicating that either students don't take these disorders serious enough, or not having resources/knowledge to seek treatment.


14) For the Age column, find each of the following:
<ol>
    <li>Mean</li>
    <li>Mode</li>
    <li>Median</li>
    <li>Standard deviation</li>
</ol>

In [18]:
ave_age = smh['Age'].mean()
common_age = smh['Age'].mode()
mid_age = smh['Age'].median()
std_dev = smh['Age'].std()

age_stat = pd.DataFrame ({'Mean' : ave_age, 'Mode': common_age, 'Media': mid_age, 'Std.Dev': std_dev })

age_stat

Unnamed: 0,Mean,Mode,Media,Std.Dev
0,20.53,18.0,19.0,2.49628


15) Convert the GPA column into a numerical column. 
<ol>
    <li>For each bin, take the average between the two endpoints and replace the values in it.
        For example: "3.50 - 4.00" should have a value of 3.75 instead.
    </li>
    <li> Compute the correlation coefficient between this new numerical column and the N_disorder column.</li>
    
</ol>


In [19]:
#15.1
#splitting values in GPA with delimiter "-"
smh[['minGPA','maxGPA']]=smh['GPA'].str.split('-', expand = True) 

#converting the GPA columns to numeric
smh['minGPA'] = pd.to_numeric(smh['minGPA'])
smh['maxGPA'] = pd.to_numeric(smh['maxGPA'])

#calculate average and replace GPA
smh['GPA'] = (smh['minGPA']+smh['maxGPA'])/2

#drop the helper columns
smh = smh.drop(['minGPA', 'maxGPA'], axis=1)  

smh.head()

Unnamed: 0,Sex,Age,Major,Your current year of Study,GPA,Marital status,Depression,Anxiety,Panic_attack,Treatment,STEM,Exhibits_disorder,N_disorder
0,Female,18.0,ENGINEERING,year 1,3.245,False,True,False,True,False,True,True,2
1,Male,21.0,ISLAMIC EDUCATION,year 2,3.245,False,False,True,False,False,False,True,1
2,Male,19.0,BIT,Year 1,3.245,False,True,True,True,False,True,True,3
3,Female,22.0,LAWS,year 3,3.245,True,True,False,False,False,False,True,1
4,Male,23.0,MATHEMATICS,year 4,3.245,False,False,False,False,False,True,False,0


In [21]:
#15.2

corr_GPA_disorder = smh['GPA'].corr(smh['N_disorder'])
corr_GPA_disorder = f'{corr_GPA_disorder:.2f}'
print('The correlation coefficient of the GPA and number of disorder is',corr_GPA_disorder)
print('This means there is a week positive correlation between GPA and disorder.')

The correlation coefficient of the GPA and number of disorder is 0.13
This means there is a week positive correlation between GPA and disorder.
