In [32]:
# Import dependencies
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler,OneHotEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
import pandas as pd
from scipy import stats
# import tensorflow as tf

In [33]:
# Read the survey CSV from the working directory into a DataFrame and
# preview its first five rows.
mental_df = pd.read_csv(filepath_or_buffer='./database.csv')
mental_df.head(n=5)

Unnamed: 0,response_id,Volunteered For Arts/Culture/Heritage,Interested in Arts & Culture,Interested in Science & Technology,Interested in Research and independent study,Interested in Sports,"Interested in Other competitions (eg case, debates)",Interested in Entrepreneurship,Interested in Volunteering,Interested in Others,...,Housing Type,How many events have you Volunteered in ?,Q2-How many events have you Participated in ?,Q3-How many activities are you Interested in ?,Q4-How many activities are you Passionate about ?,Q5-What are your levels of stress ?,Q6-How Satisfied You are with your Student Life ?,Q7-How much effort do you make to interact with others ?,Q8-About How events are you aware about ?,Q9-What is an ideal student life ?
0,1,,No,Yes,Yes,No,Yes,No,No,No,...,Residences,0,1,3,1,1,2,2.0,2.0,
1,2,,Yes,No,No,Yes,No,No,No,No,...,Out of Campus,0,1,2,3,1,2,2.0,3.0,Friends+CCas+good result
2,3,No,No,No,No,Yes,No,No,No,No,...,Halls,3,1,1,5,2,2,2.0,2.0,just want everything to go smooth. serious
3,4,Yes,No,Yes,Yes,No,No,No,Yes,No,...,Halls,3,4,3,3,7,1,1.0,1.0,
4,5,No,No,Yes,Yes,No,No,Yes,Yes,No,...,Out of Campus,4,3,4,5,4,2,2.0,2.0,a mixture of both academic and non-academic


In [34]:
# Report the frame's dimensions as (rows, columns).
mental_df.shape

(2958, 35)

In [35]:
# Encode the categorical survey answers numerically in every column:
# 'Yes' -> 1, 'Maybe' -> 0, 'No' -> -1.
answer_codes = {'Yes': 1, 'No': -1, 'Maybe': 0}
mental_df = mental_df.replace(to_replace=answer_codes)


In [36]:
# Show the frame after the Yes/No/Maybe answers were mapped to 1/-1/0.
mental_df

Unnamed: 0,response_id,Volunteered For Arts/Culture/Heritage,Interested in Arts & Culture,Interested in Science & Technology,Interested in Research and independent study,Interested in Sports,"Interested in Other competitions (eg case, debates)",Interested in Entrepreneurship,Interested in Volunteering,Interested in Others,...,Housing Type,How many events have you Volunteered in ?,Q2-How many events have you Participated in ?,Q3-How many activities are you Interested in ?,Q4-How many activities are you Passionate about ?,Q5-What are your levels of stress ?,Q6-How Satisfied You are with your Student Life ?,Q7-How much effort do you make to interact with others ?,Q8-About How events are you aware about ?,Q9-What is an ideal student life ?
0,1,,-1,1,1,-1,1,-1,-1,-1,...,Residences,0,1,3,1,1,2,2.0,2.0,
1,2,,1,-1,-1,1,-1,-1,-1,-1,...,Out of Campus,0,1,2,3,1,2,2.0,3.0,Friends+CCas+good result
2,3,-1.0,-1,-1,-1,1,-1,-1,-1,-1,...,Halls,3,1,1,5,2,2,2.0,2.0,just want everything to go smooth. serious
3,4,1.0,-1,1,1,-1,-1,-1,1,-1,...,Halls,3,4,3,3,7,1,1.0,1.0,
4,5,-1.0,-1,1,1,-1,-1,1,1,-1,...,Out of Campus,4,3,4,5,4,2,2.0,2.0,a mixture of both academic and non-academic
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2953,2954,,1,1,1,1,-1,-1,1,-1,...,Out of Campus,0,1,5,8,3,2,,,The ideal student life is about able to balanc...
2954,2955,-1.0,-1,1,1,1,1,-1,1,-1,...,Out of Campus,1,1,5,2,1,2,,,Work-life balance with time for leisure activi...
2955,2956,,1,-1,-1,-1,-1,-1,-1,-1,...,Out of Campus,0,1,1,2,1,3,,,
2956,2957,-1.0,1,1,1,1,-1,1,1,-1,...,Out of Campus,3,2,6,2,3,3,,,


In [37]:
# Check for low response rate columns (< 70% response rate).
# The rate is measured against the actual number of rows in mental_df. The
# previous hard-coded denominator (1433) did not match this dataset's row
# count, so the 70% cutoff was not actually being applied.
min_response_rate = .7
total_responses = len(mental_df)
low_response_columns = []
for column in mental_df.columns:
    # count() tallies the non-null entries of the column; the leading
    # truthiness check avoids a division by zero on an empty frame.
    if total_responses and (mental_df[column].count() / total_responses) < min_response_rate:
        low_response_columns.append(column)

low_response_columns

[]

In [38]:
# Remove every column flagged as low-response, then report the resulting
# (rows, columns) shape.
mental_df = mental_df.drop(columns=low_response_columns)
mental_df.shape

(2958, 35)

In [39]:
# Find columns with many distinct answers (more than 10 unique values),
# which indicates long-form responses or irrelevant data.
many_distinct_columns = [
    name for name in mental_df.columns if mental_df[name].nunique() > 10
]

# Take these two columns back out of the list so the later bulk drop skips them.
for kept_name in ('Q9-What is an ideal student life ?', 'Primary Programme'):
    many_distinct_columns.remove(kept_name)
#many_distinct_columns.remove('Career ')
#many_distinct_columns.remove('Housing Type')

many_distinct_columns

['response_id',
 'Department',
 'How many events have you Volunteered in ?',
 'Q4-How many activities are you Passionate about ?']

In [40]:
# Remove the free-text 'Q9-What is an ideal student life ?' column from the DataFrame
free_text_columns = ['Q9-What is an ideal student life ?']
mental_df = mental_df.drop(columns=free_text_columns)

In [41]:
# Remove the columns collected in many_distinct_columns, then report the
# resulting (rows, columns) shape.
mental_df = mental_df.drop(columns=many_distinct_columns)
mental_df.shape

(2958, 30)

In [42]:
# Fix gender and career data to be consistent.
# Each mapping goes from a raw spelling found in the survey to its canonical
# label; values that are not listed are left untouched.
gender_fixes = {
    'male': 'Male',
    'm': 'Male',
    'M': 'Male',
    'f': 'Female',  # was mapped to 'Male', which mislabelled these respondents
    'F': 'Female',
}
career_fixes = {
    'undergraduate': 'UGRD',
    'undergraduation': 'UGRD',
    'graduate': 'GRAD',
    # NOTE(review): 'graduation' maps to 'UGRD' while 'graduate' maps to
    # 'GRAD'; kept exactly as before — confirm this is intended.
    'graduation': 'UGRD',
}
mental_df['Gender'] = mental_df['Gender'].replace(gender_fixes)
mental_df['Career'] = mental_df['Career'].replace(career_fixes)


In [43]:
# Tally how many rows carry each gender label and display the tally
gender_column = mental_df['Gender']
gender_counts = gender_column.value_counts()
gender_counts

Female    1899
Male      1059
Name: Gender, dtype: int64

In [44]:
# Pick out the gender labels that occur fewer than 20 times
is_rare = gender_counts < 20
replace_genders = gender_counts.loc[is_rare].index.tolist()

# Fold each of those rare labels into a single "Other" bucket
for gender in replace_genders:
    mental_df['Gender'] = mental_df['Gender'].replace(to_replace=gender, value="Other")


In [45]:
# Print out gender counts again, now that rare labels have been replaced with "Other"
gender_counts = mental_df['Gender'].value_counts()
gender_counts

Female    1899
Male      1059
Name: Gender, dtype: int64

In [48]:
# Show the frame after the column drops and the Gender/Career clean-up.
mental_df

Unnamed: 0,Volunteered For Arts/Culture/Heritage,Interested in Arts & Culture,Interested in Science & Technology,Interested in Research and independent study,Interested in Sports,"Interested in Other competitions (eg case, debates)",Interested in Entrepreneurship,Interested in Volunteering,Interested in Others,Are you stressed about adjustment in new environment,...,Year of Study,Primary Programme,Gender,Housing Type,Q2-How many events have you Participated in ?,Q3-How many activities are you Interested in ?,Q5-What are your levels of stress ?,Q6-How Satisfied You are with your Student Life ?,Q7-How much effort do you make to interact with others ?,Q8-About How events are you aware about ?
0,,-1,1,1,-1,1,-1,-1,-1,-1,...,2,Bachelor of Science,Female,Residences,1,3,1,2,2.0,2.0
1,,1,-1,-1,1,-1,-1,-1,-1,-1,...,1,Bachelor of Engineering,Female,Out of Campus,1,2,1,2,2.0,3.0
2,-1.0,-1,-1,-1,1,-1,-1,-1,-1,-1,...,2,Bachelor of Science,Male,Halls,1,1,2,2,2.0,2.0
3,1.0,-1,1,1,-1,-1,-1,1,-1,1,...,2,Bachelor of Engineering,Male,Halls,4,3,7,1,1.0,1.0
4,-1.0,-1,1,1,-1,-1,1,1,-1,-1,...,3,Bachelor of Engineering,Female,Out of Campus,3,4,4,2,2.0,2.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2953,,1,1,1,1,-1,-1,1,-1,-1,...,1,Bachelor of Science,Female,Out of Campus,1,5,3,2,,
2954,-1.0,-1,1,1,1,1,-1,1,-1,-1,...,2,"Bach. of Med.,Bach. of Surg.",Male,Out of Campus,1,5,1,2,,
2955,,1,-1,-1,-1,-1,-1,-1,-1,-1,...,1,Master of Science (Track 1),Female,Out of Campus,1,1,1,3,,
2956,-1.0,1,1,1,1,-1,1,1,-1,-1,...,4,Doctor of Philosophy (FoE),Male,Out of Campus,2,6,3,3,,


In [49]:
# Discard every row that still has a missing value in any column.
mental_df = mental_df.dropna(axis=0, how='any')

In [50]:
# Check how many rows and columns remain after dropping rows with missing values.
mental_df.shape

(1678, 30)

In [51]:
# Strip commas out of every string value (e.g. programme names such as
# "Bach. of Med.,Bach. of Surg.").
# regex=True makes replace() match the comma as a substring; without it only
# cells exactly equal to "," would be replaced. The result is assigned back
# because replace() returns a new frame rather than modifying mental_df.
mental_df = mental_df.replace(",", "", regex=True)
mental_df

Unnamed: 0,Volunteered For Arts/Culture/Heritage,Interested in Arts & Culture,Interested in Science & Technology,Interested in Research and independent study,Interested in Sports,"Interested in Other competitions (eg case, debates)",Interested in Entrepreneurship,Interested in Volunteering,Interested in Others,Are you stressed about adjustment in new environment,...,Year of Study,Primary Programme,Gender,Housing Type,Q2-How many events have you Participated in ?,Q3-How many activities are you Interested in ?,Q5-What are your levels of stress ?,Q6-How Satisfied You are with your Student Life ?,Q7-How much effort do you make to interact with others ?,Q8-About How events are you aware about ?
2,-1.0,-1,-1,-1,1,-1,-1,-1,-1,-1,...,2,Bachelor of Science,Male,Halls,1,1,2,2,2.0,2.0
3,1.0,-1,1,1,-1,-1,-1,1,-1,1,...,2,Bachelor of Engineering,Male,Halls,4,3,7,1,1.0,1.0
4,-1.0,-1,1,1,-1,-1,1,1,-1,-1,...,3,Bachelor of Engineering,Female,Out of Campus,3,4,4,2,2.0,2.0
5,-1.0,1,1,-1,1,-1,-1,-1,-1,-1,...,1,Bachelor of Science,Female,Residential Colleges,2,3,2,3,3.0,1.0
6,-1.0,1,-1,-1,1,-1,1,1,-1,-1,...,3,Bachelor of Business Admin,Female,Out of Campus,2,4,3,2,2.0,2.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2927,1.0,1,1,1,1,-1,1,-1,-1,-1,...,3,Bachelor of Science,Male,Halls,3,5,1,2,2.0,2.0
2928,-1.0,-1,1,1,1,-1,-1,1,-1,1,...,3,Bachelor of Engineering,Male,Halls,3,4,5,2,2.0,2.0
2929,-1.0,1,-1,-1,-1,1,-1,1,-1,-1,...,4,Bachelor of Engineering,Male,Halls,1,3,1,2,2.0,2.0
2933,-1.0,-1,-1,-1,-1,-1,1,-1,-1,-1,...,1,Bachelor of Science (SDE),Male,Halls,2,1,1,2,2.0,2.0


In [52]:
# Write the cleaned frame to cleaned_data.csv; index=False leaves the row index out of the file.
mental_df.to_csv('cleaned_data.csv', index=False)

In [53]:
# Print a rough "create table" skeleton: a header line, then one line per
# column with spaces turned into underscores and the name upper-cased.
# NOTE(review): the "=" and "%" tokens look like placeholders and no closing
# parenthesis is printed, so the output is not runnable SQL as-is — confirm
# how it is consumed.
print("create table cleaned_data (")
for column in mental_df.columns:
    sql_name = column.replace(" ", "_").upper()
    print(f"= {sql_name} % varchar,")

create table cleaned_data (
= VOLUNTEERED_FOR_ARTS/CULTURE/HERITAGE % varchar,
= INTERESTED_IN_ARTS_&_CULTURE % varchar,
= INTERESTED_IN_SCIENCE_&_TECHNOLOGY % varchar,
= INTERESTED_IN_RESEARCH_AND_INDEPENDENT_STUDY % varchar,
= INTERESTED_IN_SPORTS % varchar,
= INTERESTED_IN_OTHER_COMPETITIONS_(EG_CASE,_DEBATES) % varchar,
= INTERESTED_IN_ENTREPRENEURSHIP % varchar,
= INTERESTED_IN_VOLUNTEERING % varchar,
= INTERESTED_IN_OTHERS % varchar,
= ARE_YOU_STRESSED_ABOUT_ADJUSTMENT_IN_NEW_ENVIRONMENT_ % varchar,
= ARE_YOU_WORRIED_ABOUT_ACADEMIC_ISSUES % varchar,
= ARE_YOU_STRESSED_ABOUT_FINANCIAL_ISSUES_ % varchar,
= ARE_YOU_STRESSED_ABOUT_YOUR_FAMILY % varchar,
= ARE_YOU_ANXIOUS_ABOUT_FRIENDSHIPS_ % varchar,
= ARE_YOU_HAPPY_IN_YOUR_ROMANTIC_RELATIONSHIP_ % varchar,
= ARE_YOU_STRESSED_ABOUT_YOUR_HEALTH_AND_WELLBEING_ % varchar,
= ARE_YOU_STRESSED_FOR_CAREER_ % varchar,
= DO_YOU_INVOLVE_IN_CLUBS,ACTIVITIES_AND_OTHER_EVENTS_IN_COLLEGE_ % varchar,
= STRESSED_ABOUT_OTHERS_ % varchar,
= CAREER % v

Bad pipe message: %s [b'\x90\xe9\x9d9}\xa4P\x8a$\x08sq\x13\x8e\xd0\xa2\xd3\t /(\x17\x13;iXJ\x95\x10\x1b']
Bad pipe message: %s [b"\xfa{\x9d\xc6\xd1\xaeiZi\xc0\xb1\xf5\xa9{xC\x90\xf4 z\x9a\xafLg'i\x82R\xa7\xf1\xf0\x8a?\x11\x96>\x14c\xac\xd13\x1d\xd9\x8dN~\xf7\xc0KR[\x00\x08\x13\x02\x13\x03\x13\x01\x00\xff\x01\x00\x00\x8f\x00\x00\x00\x0e\x00\x0c\x00\x00\t127.0.0.1\x00\x0b\x00\x04\x03\x00\x01\x02\x00\n\x00\x0c\x00\n\x00\x1d\x00\x17\x00\x1e"]
Bad pipe message: %s [b'\xcb\xe3\x12\x05\xe8\x8bh8\xee[\xae\x0788\nT\xceq\x00\x00\xa6\xc0,\xc00\x00\xa3\x00\x9f\xcc\xa9\xcc\xa8\xcc\xaa\xc0\xaf\xc0\xad\xc0\xa3\xc0\x9f\xc0]\xc0a\xc0W\xc0S\xc0+']
Bad pipe message: %s [b"\x00\xa2\x00\x9e\xc0\xae\xc0\xac\xc0\xa2\xc0\x9e\xc0\\\xc0`\xc0V\xc0R\xc0$\xc0(\x00k\x00j\xc0s\xc0w\x00\xc4\x00\xc3\xc0#\xc0'\x00g\x00@\xc0r\xc0v\x00\xbe\x00\xbd\xc0\n\xc0\x14\x009\x008\x00\x88\x00\x87\xc0\t\xc0\x13\x003\x002\x00\x9a\x00\x99\x00E\x00D\xc0\x07\xc0\x11\xc0\x08\xc0\x12\x00\x16\x00\x13\x00\x9d\xc0\xa1\xc0\x9d\xc0Q\x00\x9c\x