# Load Packages and Data

In [1]:
import datetime
import pandas as pd

Said in the last session that Pandas can pretty much do everything which shows here

Will be going over data cleaning, using ...

In [2]:
data = pd.read_csv('data_raw.csv')

data

Unnamed: 0,Survey Date,What is your age?,What is your gender?,Are you self-employed?,How many years have you worked for your current organization (or been self employed)?,How many employees does your company or organization have?,Do you currently have a mental health disorder?,"If yes, what condition(s) have you been diagnosed with?"
0,01/01/2022,39,Male,0,13.50,26-100,No,
1,01/01/2022,28,Male,0,15.49,26-100,Yes,"Mood Disorder (Depression, Bipolar Disorder, etc)"
2,01/01/2022,26,Male,0,19.41,2025-06-01 00:00:00,No,
3,01/01/2022,32,non-binary,0,6.90,100-500,Yes,"Mood Disorder (Depression, Bipolar Disorder, etc)"
4,01/01/2022,34,Male,1,9.92,,Yes,"Mood Disorder (Depression, Bipolar Disorder, e..."
...,...,...,...,...,...,...,...,...
1428,10/01/2022,39,male,0,3.37,More than 1000,No,
1429,10/01/2022,28,Male,0,10.65,100-500,Yes,"Anxiety Disorder (Generalized, Social, Phobia,..."
1430,10/01/2022,36,Male,0,9.18,100-500,Yes,"Mood Disorder (Depression, Bipolar Disorder, etc)"
1431,10/01/2022,41,male,0,10.44,More than 1000,Yes,Attention Deficit Hyperactivity Disorder


In [3]:
data.dtypes

Survey Date                                                                               object
What is your age?                                                                         object
What is your gender?                                                                      object
Are you self-employed?                                                                     int64
How many years have you worked for your current organization (or been self employed)?    float64
How many employees does your company or organization have?                                object
Do you currently have a mental health disorder?                                           object
If yes, what condition(s) have you been diagnosed with?                                   object
dtype: object

1) Names are ugly (explain why there is a problem)
2) Notice errors (date in bad format, dates in number of employees, gender has multiple encodings for male)
3) NULLS
4) Wrong format with dtypes

# Rename Columns

In [4]:
columns = ['survey_date', 
           'age', 
           'gender', 
           'self_employed',
           'years_employed',
           'number_of_employees', 
           'has_mental_health', 
           'conditions']

data.columns = columns

In [5]:
data

Unnamed: 0,survey_date,age,gender,self_employed,years_employed,number_of_employees,has_mental_health,conditions
0,01/01/2022,39,Male,0,13.50,26-100,No,
1,01/01/2022,28,Male,0,15.49,26-100,Yes,"Mood Disorder (Depression, Bipolar Disorder, etc)"
2,01/01/2022,26,Male,0,19.41,2025-06-01 00:00:00,No,
3,01/01/2022,32,non-binary,0,6.90,100-500,Yes,"Mood Disorder (Depression, Bipolar Disorder, etc)"
4,01/01/2022,34,Male,1,9.92,,Yes,"Mood Disorder (Depression, Bipolar Disorder, e..."
...,...,...,...,...,...,...,...,...
1428,10/01/2022,39,male,0,3.37,More than 1000,No,
1429,10/01/2022,28,Male,0,10.65,100-500,Yes,"Anxiety Disorder (Generalized, Social, Phobia,..."
1430,10/01/2022,36,Male,0,9.18,100-500,Yes,"Mood Disorder (Depression, Bipolar Disorder, etc)"
1431,10/01/2022,41,male,0,10.44,More than 1000,Yes,Attention Deficit Hyperactivity Disorder


# Convert Data Types

In [6]:
data['survey_date'] =  pd.to_datetime(data['survey_date'], format='%d/%m/%Y')
data['self_employed'] = data['self_employed'].astype('bool')

In [7]:
# data['age'] = data['age'].astype(int)

In [8]:
data

Unnamed: 0,survey_date,age,gender,self_employed,years_employed,number_of_employees,has_mental_health,conditions
0,2022-01-01,39,Male,False,13.50,26-100,No,
1,2022-01-01,28,Male,False,15.49,26-100,Yes,"Mood Disorder (Depression, Bipolar Disorder, etc)"
2,2022-01-01,26,Male,False,19.41,2025-06-01 00:00:00,No,
3,2022-01-01,32,non-binary,False,6.90,100-500,Yes,"Mood Disorder (Depression, Bipolar Disorder, etc)"
4,2022-01-01,34,Male,True,9.92,,Yes,"Mood Disorder (Depression, Bipolar Disorder, e..."
...,...,...,...,...,...,...,...,...
1428,2022-01-10,39,male,False,3.37,More than 1000,No,
1429,2022-01-10,28,Male,False,10.65,100-500,Yes,"Anxiety Disorder (Generalized, Social, Phobia,..."
1430,2022-01-10,36,Male,False,9.18,100-500,Yes,"Mood Disorder (Depression, Bipolar Disorder, etc)"
1431,2022-01-10,41,male,False,10.44,More than 1000,Yes,Attention Deficit Hyperactivity Disorder


# Null Values

In [9]:
data.isnull() \
    .sum() \
    .sort_values(ascending = False)

conditions             865
number_of_employees    287
gender                   3
survey_date              0
age                      0
self_employed            0
years_employed           0
has_mental_health        0
dtype: int64

In [10]:
data[data['number_of_employees'].isnull()]

Unnamed: 0,survey_date,age,gender,self_employed,years_employed,number_of_employees,has_mental_health,conditions
4,2022-01-01,34,Male,True,9.92,,Yes,"Mood Disorder (Depression, Bipolar Disorder, e..."
7,2022-01-01,39,Male,True,12.52,,No,
33,2022-01-01,37,Male,True,17.47,,Yes,Depression
34,2022-01-01,25,Male,True,10.51,,No,
36,2022-01-01,43,M,True,20.98,,No,
...,...,...,...,...,...,...,...,...
1413,2022-01-10,39,Male,True,4.59,,Yes,"Mood Disorder (Depression, Bipolar Disorder, e..."
1419,2022-01-10,26,M,True,10.10,,Maybe,
1420,2022-01-10,29,male,True,2.54,,No,
1426,2022-01-10,31,male,True,1.99,,No,


In [None]:
# COULD REMOVE ROWS FOR GENDER, COLUMN FOR CONDITIONS SINCE MAJORITY NULL AND KEEP NUMBER OF EMPLOYEES (INPUTE SELF EMPLOYED THERE)

# Detect Outliers with Functions

Functions must be indented using the TAB key, otherwise they will not work.

"def" for define

Format: 
```
def function_name(input1, ..., inputx):
  {code}
  return {output}
```

In [11]:
def multiply(x, y):
    result = x * y
    return result

In [12]:
multiply(5, 10)

50

In [13]:
data.groupby('survey_date') \
    .size() \
    .sort_values(ascending = False)

survey_date
2022-01-02    144
2022-01-03    144
2022-01-04    143
2022-01-05    143
2022-01-06    143
2022-01-07    143
2022-01-08    143
2022-01-09    143
2022-01-10    143
2022-01-01    138
2025-06-01      5
2023-05-01      1
dtype: int64

In [14]:
def my_counter(column):
    result = data.groupby(column, dropna = False) \
                 .size() \
                 .sort_values(ascending = False)
    return result

In [15]:
my_counter(['survey_date'])

survey_date
2022-01-02    144
2022-01-03    144
2022-01-04    143
2022-01-05    143
2022-01-06    143
2022-01-07    143
2022-01-08    143
2022-01-09    143
2022-01-10    143
2022-01-01    138
2025-06-01      5
2023-05-01      1
dtype: int64

# Fixing Outliers with Conditional Filtering

1) Null the value
2) remove the row
3) remove the column
4) fill with value (previous, mean, median, etc..) but only if appropriate

## Survey Date

In [16]:
data[(data['survey_date'] == '2025-06-01') | (data['survey_date'] == '2023-05-01')]

Unnamed: 0,survey_date,age,gender,self_employed,years_employed,number_of_employees,has_mental_health,conditions
31,2025-06-01,32,Female,False,19.21,2025-06-01 00:00:00,Yes,"Anxiety Disorder (Generalized, Social, Phobia,..."
37,2025-06-01,35,Female,False,5.23,500-1000,No,
45,2025-06-01,40,Male,True,10.1,,No,
53,2023-05-01,22,Male,True,9.34,,Maybe,
88,2025-06-01,31,Female,False,6.33,2025-06-01 00:00:00,Maybe,
140,2025-06-01,28,female,False,4.67,2025-06-01 00:00:00,Maybe,


In [17]:
data['survey_date'][[30, 36, 44, 52, 87, 139]]

30    2022-01-01
36    2022-01-01
44    2022-01-01
52    2022-01-01
87    2022-01-01
139   2022-01-01
Name: survey_date, dtype: datetime64[ns]

In [18]:
data.loc[(data['survey_date'] == '2025-06-01') | (data['survey_date'] == '2023-05-01'), 'survey_date'] = '2022-01-01'

In [19]:
data[data.index == 31]

Unnamed: 0,survey_date,age,gender,self_employed,years_employed,number_of_employees,has_mental_health,conditions
31,2022-01-01,32,Female,False,19.21,2025-06-01 00:00:00,Yes,"Anxiety Disorder (Generalized, Social, Phobia,..."


## Age

In [20]:
my_counter('age')

age
30             94
31             82
29             78
28             74
35             74
32             72
34             69
33             69
26             64
27             63
37             59
39             55
38             54
36             49
25             44
24             42
40             36
22             32
44             31
42             29
43             29
45             27
23             24
41             24
46             22
21             15
47             14
49             13
55             12
48              9
50              9
52              7
54              7
51              7
20              6
56              5
57              4
63              4
19              4
53              3
59              2
61              2
thirty six      1
99              1
66              1
74              1
70              1
forty three     1
15              1
65              1
62              1
58              1
17              1
323             1
3               1
twenty

In [21]:
data[data['age'].isin(['thirty six', 'forty three', 'twenty nine'])]

Unnamed: 0,survey_date,age,gender,self_employed,years_employed,number_of_employees,has_mental_health,conditions
77,2022-01-01,twenty nine,M,False,2.14,26-100,No,
103,2022-01-01,forty three,Female,False,4.03,More than 1000,No,
152,2022-01-02,thirty six,Male,False,15.2,2023-05-01 00:00:00,Yes,"Anxiety Disorder (Generalized, Social, Phobia,..."


In [22]:
numbers = {
  'thirty six': 36,
  'forty three': 43, 
  'twenty nine': 29
}

In [23]:
data = data.replace({'age': numbers})

In [24]:
data['age'] = data['age'].astype(int)

In [25]:
data[(data['age'] <= 18) | (data['age'] >= 80)]

Unnamed: 0,survey_date,age,gender,self_employed,years_employed,number_of_employees,has_mental_health,conditions
325,2022-01-03,99,Other,False,2.44,2025-06-01 00:00:00,Yes,Traumatic Brain Injury
441,2022-01-04,17,male,False,9.4,2025-06-01 00:00:00,No,
631,2022-01-05,323,Male,False,19.65,100-500,No,
926,2022-01-07,3,Male,False,12.84,More than 1000,Maybe,
1227,2022-01-09,15,male,True,5.47,,No,


In [26]:
data.loc[(data['age'] <= 16) | (data['age'] >= 100), 'age'] = data['age'].mean().astype(int)

In [27]:
data[data.index == 631]

Unnamed: 0,survey_date,age,gender,self_employed,years_employed,number_of_employees,has_mental_health,conditions
631,2022-01-05,34,Male,False,19.65,100-500,No,


## Gender

In [28]:
data['gender'] = data['gender'].str.lower()

male = ['Male', 'male', 'M', 'm', 'Cis Male', 'man', 'ostensibly male, unsure what that really means', 'Mail', 'Make', 'male (cis)', 
        'cis male', 'maile', 'Malr', 'Cis Man', 'Mal', 'msle', 'male.', 'sex is male', 'malr', 'cis man', 'mail' ]
     
female = ['Female', 'female', 'F', 'f', 'Woman', 'Femake', 'Female (cis)', 'cis female', 'woman', 'femail', 
     'cis-female/femme', 'i identify as female.', 'cis-woman', 'cisgender female', 'female (props for making this a freeform field, though)', 
     'female/woman', 'female assigned at birth' ]

other = list(data['gender'].value_counts().index)[2:]

data.loc[data['gender'].isin(male), 'gender'] = 'male'
data.loc[data['gender'].isin(female), 'gender'] = 'female'
data.loc[data['gender'].isin(other), 'gender'] = 'others'

In [29]:
my_counter('gender')

gender
male      1040
female     321
others      69
NaN          3
dtype: int64

In [30]:
data[data['gender'].isnull()]

Unnamed: 0,survey_date,age,gender,self_employed,years_employed,number_of_employees,has_mental_health,conditions
318,2022-01-03,24,,False,21.07,More than 1000,Yes,"Mood Disorder (Depression, Bipolar Disorder, e..."
419,2022-01-03,26,,False,0.21,100-500,Yes,"Anxiety Disorder (Generalized, Social, Phobia,..."
496,2022-01-04,25,,False,5.56,26-100,Yes,"Anxiety Disorder (Generalized, Social, Phobia,..."


## Self Employed Flag

In [31]:
my_counter('self_employed')

self_employed
False    1146
True      287
dtype: int64

In [32]:
my_counter(['self_employed', 'number_of_employees'])

self_employed  number_of_employees
False          26-100                 292
True           NaN                    287
False          More than 1000         256
               100-500                248
               2025-06-01 00:00:00    210
               500-1000                80
               2023-05-01 00:00:00     60
dtype: int64

## Number of Employees

In [33]:
categories = ['26-100', '100-500', '500-1000']
data.loc[~(data['number_of_employees'].isin(categories)), 'number_of_employees'] = None

In [34]:
my_counter('number_of_employees')

number_of_employees
NaN         813
26-100      292
100-500     248
500-1000     80
dtype: int64

## Has Mental Health Flag

In [35]:
my_counter('has_mental_health')

has_mental_health
Yes      575
No       531
Maybe    327
dtype: int64

In [36]:
data[(data['has_mental_health'] == 'Yes') & (data['conditions'].isnull())]

Unnamed: 0,survey_date,age,gender,self_employed,years_employed,number_of_employees,has_mental_health,conditions
169,2022-01-02,45,female,False,7.33,500-1000,Yes,
461,2022-01-04,34,male,True,18.98,,Yes,
591,2022-01-05,21,male,False,22.2,,Yes,
632,2022-01-05,35,female,False,18.17,100-500,Yes,
781,2022-01-06,33,male,False,4.66,100-500,Yes,
808,2022-01-06,40,female,False,22.54,,Yes,
1395,2022-01-10,27,male,False,13.79,,Yes,


In [37]:
data['conditions'] = data['conditions'].str.split('|')

In [38]:
data_conditions = data.explode('conditions')