# Load Packages and Data

In [None]:
import pandas as pd

In the last session we said that pandas can do pretty much everything; this notebook shows that in practice.

In this session we will be going over data cleaning using pandas.

In [None]:
# Read the raw CSV into a DataFrame; the relative path is resolved against
# the notebook's working directory.
data = pd.read_csv('data_raw.csv')

# Display the frame for a first look at the raw values.
data

Data Types:
- Datetime: values that contain exclusively dates and times, stored in a format set up for date manipulation.
- Integer: whole numbers only; decimals are not allowed.
- Float: any number, including decimals.
- Boolean: a binary indicator flag (True/False), e.g. specifying 'Yes' or 'No'.
- Object: can contain words and strings, as well as numbers, dates and any other items.

Data Formats:
- Data Frames: provided by the pandas package, a Data Frame is a formatted n × m table (n rows by m columns) containing all of our rows and columns.
- Series or lists: simply one column of data, for example when we take a subset of a Data Frame. We can also specify our own lists for controlling loops and populating datasets.
- Dictionaries: similar to lists, but each value has a name, giving a key:value format.

In [None]:
data.dtypes

1) Names are ugly (explain why there is a problem)
2) Notice errors (date in bad format, dates in number of employees, gender has multiple encodings for male)
3) NULLS
4) Wrong format with dtypes

# Rename Columns

In [None]:
# Replacement column labels in snake_case. The assignment below is positional,
# so the labels must be listed in the same order as the frame's existing columns.
columns = [
    'survey_date',
    'age',
    'gender',
    'self_employed',
    'number_of_employees',
    'years_employed',
    'has_mental_health',
    'conditions',
]

data.columns = columns

In [None]:
data

# Convert Data Types

In [None]:
# Parse the day/month/year strings into proper datetime values.
parsed_dates = pd.to_datetime(data['survey_date'], format='%d/%m/%Y')
data['survey_date'] = parsed_dates

# NOTE(review): casting to bool maps every non-empty string and NaN to True —
# confirm the raw column holds 0/1 or True/False values before relying on it.
data['self_employed'] = data['self_employed'].astype(bool)

In [None]:
# NOTE(review): presumably left disabled because 'age' still contains
# spelled-out values (e.g. 'thirty six') at this point, which cannot be cast
# to int until they are replaced in the Age section further down.
# data['age'] = data['age'].astype(int)

In [None]:
data

# Detect Outliers with Functions

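The body of a function must be indented consistently (for example with the TAB key or four spaces); otherwise Python raises an `IndentationError` and the function will not work.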

"def" for define

Format: 
```
def function_name(input1, ..., inputx):
  {code}
  return {output}
```

In [None]:
def multiply(x, y):
    """Return the product of ``x`` and ``y``."""
    return x * y

In [None]:
multiply(5, 10)

In [None]:
# Frequency of each survey date, most common first.
(
    data['survey_date']
    .value_counts()
    .sort_values(ascending=False)
)

In [None]:
def my_counter(column, prop=False, frame=None):
    """Count how often each value (or combination of values) occurs.

    Parameters
    ----------
    column : str or list of str
        A single column label, or a list of labels to count value combinations.
    prop : bool, default False
        When True, return proportions instead of raw counts.
    frame : pandas.DataFrame, optional
        Frame to count in. When omitted, the notebook-level ``data`` frame is
        used, so existing calls keep working unchanged.

    Returns
    -------
    pandas.Series
        Counts (or proportions) sorted from most to least frequent. Missing
        values are counted as well (``dropna=False``).
    """
    # Fall back to the global frame only when no explicit frame is supplied.
    source = data if frame is None else frame
    counts = source[column].value_counts(normalize=prop, dropna=False)
    return counts.sort_values(ascending=False)

In [None]:
my_counter(['survey_date'])

# Fixing Outliers with Conditional Filtering

1) Null the value
2) fill with value (previous, mean, median, etc..) but only if appropriate

## Survey Date

In [None]:
# Select the rows carrying either of the two outlier survey dates.
outlier_dates = data['survey_date'].eq('2025-06-01') | data['survey_date'].eq('2023-05-01')
data[outlier_dates]

In [None]:
# Look up the survey date of specific rows by index label.
data.loc[[30, 36, 44, 52, 87, 139], 'survey_date']

In [None]:
# Overwrite the two outlier survey dates with a single replacement date.
is_outlier_date = (data['survey_date'] == '2025-06-01') | (data['survey_date'] == '2023-05-01')
data.loc[is_outlier_date, 'survey_date'] = '2022-01-01'

In [None]:
# Spot-check a single row by its index label.
# NOTE(review): label 31 is not among the labels inspected earlier
# (30, 36, 44, 52, 87, 139) — confirm this is the intended row.
data[data.index == 31]

## Age

In [None]:
my_counter('age')

In [None]:
data[data['age'].isin(['thirty six', 'forty three', 'twenty nine'])]

In [None]:
# Lookup table translating spelled-out ages into their integer values.
numbers = {'thirty six': 36, 'forty three': 43, 'twenty nine': 29}

In [None]:
# Swap the spelled-out ages for their numeric equivalents (only 'age' is touched).
data = data.assign(age=data['age'].replace(numbers))

In [None]:
data['age'] = data['age'].astype(int)

In [None]:
data[(data['age'] <= 18) | (data['age'] >= 80)]

In [None]:
# Ages of 16 or below, or 100 or above, are treated as invalid entries.
invalid_age = (data['age'] <= 16) | (data['age'] >= 100)

# Use the mean of the valid ages only, so the outliers being replaced do not
# skew the fill value.
valid_mean = data.loc[~invalid_age, 'age'].mean()

# Skip the fill when there is nothing to replace or no valid age to average;
# int() truncates the mean so the column stays integer-valued.
if invalid_age.any() and pd.notna(valid_mean):
    data.loc[invalid_age, 'age'] = int(valid_mean)

In [None]:
data[data.index == 631]

## Gender

In [None]:
my_counter('gender')

In [None]:
# Free-text answers that should be read as 'male'.
male = ['Male', 'male', 'M', 'm', 'Cis Male', 'man', 'ostensibly male, unsure what that really means', 'Mail', 'Make', 'male (cis)',
        'cis male', 'maile', 'Malr', 'Cis Man', 'Mal', 'msle', 'male.', 'sex is male', 'malr', 'cis man', 'mail']

# Free-text answers that should be read as 'female'.
female = ['Female', 'female', 'F', 'f', 'Woman', 'Femake', 'Female (cis)', 'cis female', 'woman', 'femail',
          'cis-female/femme', 'i identify as female.', 'cis-woman', 'cisgender female', 'female (props for making this a freeform field, though)',
          'female/woman', 'female assigned at birth']

data['gender'] = data['gender'].replace(male, 'male')
data['gender'] = data['gender'].replace(female, 'female')

# Every remaining non-null label becomes 'other'. The labels are selected
# explicitly rather than by frequency rank, so the result does not depend on
# 'male' and 'female' being the two most common values. Nulls are left as-is.
other = [label for label in data['gender'].dropna().unique() if label not in ('male', 'female')]
data.loc[data['gender'].isin(other), 'gender'] = 'other'

In [None]:
my_counter('gender')

In [None]:
data[data['gender'].isnull()]

## Self Employed Flag

In [None]:
my_counter('self_employed')

In [None]:
my_counter(['self_employed', 'number_of_employees'])

## Number of Employees

In [None]:
# The only size bands accepted as valid answers.
categories = ['26-100', '100-500', '500-1000', 'More than 1000']

# Blank out every value that is not one of the accepted bands.
not_a_category = ~data['number_of_employees'].isin(categories)
data.loc[not_a_category, 'number_of_employees'] = None

In [None]:
my_counter('number_of_employees')

## Years Employed

In [None]:
data.describe()

In [None]:
import numpy as np

# Round each value down to whole years, then store the result as integers.
whole_years = np.floor(data['years_employed'])
data['years_employed'] = whole_years.astype(int)

In [None]:
data.head(2)

## Has Mental Health Flag

In [None]:
my_counter('has_mental_health')

In [None]:
data[(data['has_mental_health'] == 'Yes') & (data['conditions'].isnull())]

# Null Values

Three options:
1) Fill the value (mean, previous, etc..) if possible and appropriate
2) Remove the row if not too many NULLs
3) Remove the column, often if too many NULLs to effectively analyse

In [None]:
# Number of missing values per column, worst column first.
(
    data
    .isnull()
    .sum()
    .sort_values(ascending=False)
)

In [None]:
data

In [None]:
# Keep only the rows where gender is present.
data = data[data['gender'].notna()]

In [None]:
# Remove the 'conditions' column entirely.
data = data.drop(columns='conditions')

In [None]:
# Self-employed respondents get an explicit label in the company-size column.
is_self_employed = data['self_employed'].eq(True)
data.loc[is_self_employed, 'number_of_employees'] = 'Self Employed'

In [None]:
# Re-count the missing values per column after the clean-up steps.
(
    data
    .isnull()
    .sum()
    .sort_values(ascending=False)
)

# Analysis using Seaborn, functions and Loops

In [None]:
# One row per (gender, has_mental_health) pair with its number of occurrences.
df = (
    data[['gender', 'has_mental_health']]
    .value_counts()
    .rename('count')
    .reset_index()
)

In [None]:
df

In [None]:
# Share of each answer within its own gender group.
gender_totals = df.groupby('gender')['count'].transform('sum')
df['proportion'] = df['count'] / gender_totals

In [None]:
df

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# A single plt.subplots call creates exactly one figure; calling plt.figure()
# twice would leave an empty extra figure in the cell output.
fig, ax = plt.subplots(figsize=(15, 5))

# Draw on the explicit axes instead of relying on pyplot's "current figure".
plot = sns.barplot(data=df, x='gender', y='proportion', hue='has_mental_health', ax=ax)
plot.set_xlabel('Gender')
plot.set_ylabel('Proportion')
plot.set_title('Proportion of those who have mental health by Gender')

In [None]:
# Print the first five multiples of 50.
for i in range(1, 6):
    print(50 * i)

In [None]:
data.columns

In [None]:
for column in ['gender', 'self_employed', 'number_of_employees']:
    # Count each (column value, has_mental_health) pair.
    df = (
        data[[column, 'has_mental_health']]
        .value_counts()
        .rename('count')
        .reset_index()
    )

    # Share of each answer within its own category of `column`.
    df['proportion'] = df['count'] / df.groupby(column)['count'].transform('sum')

    # One figure per column; calling plt.figure() twice would leave an empty
    # extra figure in the output on every iteration.
    fig, ax = plt.subplots(figsize=(15, 5))
    plot = sns.barplot(data=df, x=column, y='proportion', hue='has_mental_health', ax=ax)
    plot.set_xlabel(column)
    plot.set_ylabel('Proportion')
    plot.set_title('Proportion of those who have mental health by ' + column)

In [None]:
def my_plotter(column, legend_col):
    """Draw a grouped bar chart of ``legend_col`` proportions within each ``column`` value.

    Parameters
    ----------
    column : str
        Column of the notebook-level ``data`` frame shown on the x-axis.
    legend_col : str
        Column whose values become the coloured bars (the legend); the
        proportions are computed within each value of ``column``.

    Returns
    -------
    None
        The chart is drawn on a newly created figure.
    """
    # Count each (column value, legend value) pair.
    df = (
        data[[column, legend_col]]
        .value_counts()
        .rename('count')
        .reset_index()
    )

    # Normalise the counts so the bars of each x-axis category sum to 1.
    df['proportion'] = df['count'] / df.groupby(column)['count'].transform('sum')

    # Create exactly one figure; calling plt.figure() twice would leave an
    # empty extra figure in the output.
    fig, ax = plt.subplots(figsize=(15, 5))
    plot = sns.barplot(data=df, x=column, y='proportion', hue=legend_col, ax=ax)
    plot.set_xlabel(column)
    plot.set_ylabel('Proportion')
    plot.set_title('Proportion of ' + legend_col + ' by ' + column)

In [None]:
my_plotter('gender', 'has_mental_health')

In [None]:
for column in ['gender', 'self_employed']:
  my_plotter(column, 'number_of_employees')