# Data Preprocessing

# Import Libraries

In [2]:
import pandas as pd
import numpy as np
import random


# Read Data 

In [3]:
# NOTE: absolute, machine-specific location of the raw survey CSV; adjust it when running elsewhere.
RAW_DATASET_CSV = r"C:\Nishanthi\Hope AI\Projects\Depression Detection\Ordered_Formate\1.Data Collection\Mental Health Dataset.csv"
df = pd.read_csv(filepath_or_buffer=RAW_DATASET_CSV)


In [4]:
# Preview the first five rows of the freshly loaded frame.
df.head(n=5)

Unnamed: 0,Timestamp,Gender,Country,Occupation,self_employed,family_history,treatment,Days_Indoors,Growing_Stress,Changes_Habits,Mental_Health_History,Mood_Swings,Coping_Struggles,Work_Interest,Social_Weakness,mental_health_interview,care_options
0,2014-08-27 11:29:31,Female,United States,Corporate,,No,Yes,1-14 days,Yes,No,Yes,Medium,No,No,Yes,No,Not sure
1,2014-08-27 11:31:50,Female,United States,Corporate,,Yes,Yes,1-14 days,Yes,No,Yes,Medium,No,No,Yes,No,No
2,2014-08-27 11:32:39,Female,United States,Corporate,,Yes,Yes,1-14 days,Yes,No,Yes,Medium,No,No,Yes,No,Yes
3,2014-08-27 11:37:59,Female,United States,Corporate,No,Yes,Yes,1-14 days,Yes,No,Yes,Medium,No,No,Yes,Maybe,Yes
4,2014-08-27 11:43:36,Female,United States,Corporate,No,Yes,Yes,1-14 days,Yes,No,Yes,Medium,No,No,Yes,No,Yes


# Describe Data

In [5]:
# Per-column summary; the output below reports count / unique / top / freq for every column.
df.describe()

Unnamed: 0,Timestamp,Gender,Country,Occupation,self_employed,family_history,treatment,Days_Indoors,Growing_Stress,Changes_Habits,Mental_Health_History,Mood_Swings,Coping_Struggles,Work_Interest,Social_Weakness,mental_health_interview,care_options
count,292364,292364,292364,292364,287162,292364,292364,292364,292364,292364,292364,292364,292364,292364,292364,292364,292364
unique,734,2,35,5,2,2,2,5,3,3,3,3,2,3,3,3,3
top,2014-08-27 12:31:41,Male,United States,Housewife,No,No,Yes,1-14 days,Maybe,Yes,No,Medium,No,No,Maybe,No,No
freq,780,239850,171308,66351,257994,176832,147606,63548,99985,109523,104018,101064,154328,105843,103393,232166,118886


# Shape of Dataset

In [6]:
# (number of rows, number of columns) of the loaded frame.
df.shape

(292364, 17)

# Check Number of Null Values present in Dataset

In [7]:
# Count the missing entries in each column.
df.isnull().sum()

Timestamp                     0
Gender                        0
Country                       0
Occupation                    0
self_employed              5202
family_history                0
treatment                     0
Days_Indoors                  0
Growing_Stress                0
Changes_Habits                0
Mental_Health_History         0
Mood_Swings                   0
Coping_Struggles              0
Work_Interest                 0
Social_Weakness               0
mental_health_interview       0
care_options                  0
dtype: int64

# Cleaning Dataset

# Replace NaN / null values with the string 'None'

In [8]:
# Substitute every missing entry with the literal string 'None'.
df = df.fillna(value='None')

# Remove rows whose target ('Growing_Stress') is the uncertain value 'Maybe', so the model is trained only on clearly labeled data

In [9]:
# Keep only the rows whose Growing_Stress label is not 'Maybe'.
# .copy() detaches the result from the original frame, so the column
# assignments made in later cells operate on an independent DataFrame
# rather than on a slice (avoids pandas' SettingWithCopyWarning).
df = df.loc[df['Growing_Stress'] != 'Maybe'].copy()

# Extract Month from Timestamp column


In [10]:
# Derive the calendar month from Timestamp, then discard the raw Timestamp column.
# The timestamps shown by df.head() are year-first (e.g. "2014-08-27 11:29:31"),
# so no dayfirst hint is passed: that hint contradicts the data's layout.
# assign() builds a new frame instead of writing a column into a filtered slice.
df = (
    df.assign(Month=pd.to_datetime(df['Timestamp']).dt.month)
      .drop(columns=['Timestamp'])
)

  df['Month'] = pd.to_datetime(df['Timestamp'], dayfirst=True).dt.month


# Dropping duplicates

In [11]:
# Discard rows that exactly repeat an earlier row, then report the resulting size.
df = df.drop_duplicates(keep='first')
df.shape

(73925, 17)

# Dropping the 'Country' column as it is not required for modeling

In [12]:
# Remove the Country column from the frame.
df = df.drop('Country', axis=1)

# Write the cleaned frame to disk without the index column.
df.to_csv('Cleaned_dataset.csv', index=False)

# Encoding 'Days_Indoors' categorical values as a random integer number of days drawn from each category's range

In [13]:
# List the distinct Days_Indoors categories (sorted).
np.unique(df['Days_Indoors'].to_numpy())

array(['1-14 days', '15-30 days', '31-60 days', 'Go out Every day',
       'More than 2 months'], dtype=object)

In [14]:
# Encode a 'Days_Indoors' category as an approximate number of days
def Days_Indoors_encode(value, rng=None):
    """Turn a ``Days_Indoors`` category label into an integer day count.

    ``'Go out Every day'`` maps to 0. Every other known category maps to a
    random integer drawn uniformly from that category's range (bounds
    inclusive): 1-14, 15-30, 31-60, or 61-180 for ``'More than 2 months'``.

    Parameters
    ----------
    value : str
        One of the five category labels present in the dataset.
    rng : numpy.random.Generator, optional
        Source of randomness. When omitted, NumPy's global random state is
        used (as before), so ``np.random.seed`` still controls the draw.

    Returns
    -------
    int
        The encoded number of days.

    Raises
    ------
    ValueError
        If ``value`` is not a recognised category. Previously a debug message
        was printed and ``None`` was returned, silently placing missing
        values into the feature column.
    """
    # (low, high) pairs with an exclusive upper bound, as randint/integers expect.
    day_ranges = {
        '1-14 days': (1, 15),
        '15-30 days': (15, 31),
        '31-60 days': (31, 61),
        'More than 2 months': (61, 181),  # capped at roughly six months
    }

    if value == 'Go out Every day':
        return 0  # no days spent indoors

    if value not in day_ranges:
        raise ValueError(f"Unrecognised Days_Indoors category: {value!r}")

    low, high = day_ranges[value]
    draw = np.random.randint if rng is None else rng.integers
    return int(draw(low, high))

In [15]:
# Seed NumPy's global generator so the random day counts are identical on every run.
np.random.seed(42)

# Encode the column element by element with Series.map instead of an index-based Python loop.
df['Days_Indoors'] = df['Days_Indoors'].map(Days_Indoors_encode)
df.head()

Unnamed: 0,Gender,Occupation,self_employed,family_history,treatment,Days_Indoors,Growing_Stress,Changes_Habits,Mental_Health_History,Mood_Swings,Coping_Struggles,Work_Interest,Social_Weakness,mental_health_interview,care_options,Month
0,Female,Corporate,,No,Yes,12,Yes,No,Yes,Medium,No,No,Yes,No,Not sure,8
1,Female,Corporate,,Yes,Yes,12,Yes,No,Yes,Medium,No,No,Yes,No,No,8
2,Female,Corporate,,Yes,Yes,12,Yes,No,Yes,Medium,No,No,Yes,No,Yes,8
3,Female,Corporate,No,Yes,Yes,8,Yes,No,Yes,Medium,No,No,Yes,Maybe,Yes,8
4,Female,Corporate,No,Yes,Yes,7,Yes,No,Yes,Medium,No,No,Yes,No,Yes,8


# Apply Onehot Encoding to Occupation Column

In [16]:
# One-hot encode Occupation; the first category's indicator column is dropped.
df = pd.get_dummies(data=df, columns=['Occupation'], drop_first=True)

In [17]:
# Collect the indicator columns created for Occupation (names prefixed with 'Occupation_').
onehot_columns = df.columns[df.columns.str.startswith('Occupation_')].tolist()

# Cast just those indicator columns to integers (0/1); all other columns are left alone.
df = df.astype({name: int for name in onehot_columns})
df.head(4)

Unnamed: 0,Gender,self_employed,family_history,treatment,Days_Indoors,Growing_Stress,Changes_Habits,Mental_Health_History,Mood_Swings,Coping_Struggles,Work_Interest,Social_Weakness,mental_health_interview,care_options,Month,Occupation_Corporate,Occupation_Housewife,Occupation_Others,Occupation_Student
0,Female,,No,Yes,9,Yes,No,Yes,Medium,No,No,Yes,No,Not sure,8,1,0,0,0
1,Female,,Yes,Yes,2,Yes,No,Yes,Medium,No,No,Yes,No,No,8,1,0,0,0
2,Female,,Yes,Yes,10,Yes,No,Yes,Medium,No,No,Yes,No,Yes,8,1,0,0,0
3,Female,No,Yes,Yes,6,Yes,No,Yes,Medium,No,No,Yes,Maybe,Yes,8,1,0,0,0


# Mapping categorical values in selected columns to numerical representations

In [18]:
# Columns whose text categories are replaced by the integer codes in `mapping`.
column_name = ['Gender', 'self_employed', 'Changes_Habits', 'Mental_Health_History', 'Mood_Swings', 'Work_Interest', 'Social_Weakness', 'mental_health_interview', 'care_options']
mapping = {'No':0, 'Maybe': 1, 'Yes':2, 'Not sure':1, 'Low':0, 'Medium':1, 'High':2,'None':1,'Female':0, 'Male':1}
# DataFrame.applymap is deprecated in recent pandas, so map each column with Series.map.
# Values that are not keys of `mapping` are passed through unchanged.
df[column_name] = df[column_name].apply(lambda col: col.map(lambda x: mapping.get(x, x)))


  df[column_name] = df[column_name].applymap(lambda x: mapping.get(x, x))


In [19]:
# Yes/No columns (including the Growing_Stress target) encoded as 1/0.
column_name = ['family_history', 'treatment',
       'Growing_Stress','Coping_Struggles']
mapping = {'No':0, 'Yes':1}
# DataFrame.applymap is deprecated in recent pandas, so map each column with Series.map.
# Values that are not keys of `mapping` are passed through unchanged.
df[column_name] = df[column_name].apply(lambda col: col.map(lambda x: mapping.get(x, x)))
df.head(4)

  df[column_name] = df[column_name].applymap(lambda x: mapping.get(x, x))


Unnamed: 0,Gender,self_employed,family_history,treatment,Days_Indoors,Growing_Stress,Changes_Habits,Mental_Health_History,Mood_Swings,Coping_Struggles,Work_Interest,Social_Weakness,mental_health_interview,care_options,Month,Occupation_Corporate,Occupation_Housewife,Occupation_Others,Occupation_Student
0,0,1,0,1,9,1,0,2,1,0,0,2,0,1,8,1,0,0,0
1,0,1,1,1,2,1,0,2,1,0,0,2,0,0,8,1,0,0,0
2,0,1,1,1,10,1,0,2,1,0,0,2,0,2,8,1,0,0,0
3,0,0,1,1,6,1,0,2,1,0,0,2,1,2,8,1,0,0,0


# Save Preprocessed Data

In [20]:
# Write the fully encoded frame to disk without the index column.
df.to_csv(path_or_buf='Preprocessed_data.csv', index=False)