In [27]:
#day11
import pandas as pd
data = [[1, 100], [2, 200], [3, 300]]
employee = pd.DataFrame(data, columns=['id', 'salary']).astype({'id':'int64', 'salary':'int64'})

# Reasoning
# Sort the employee by salary largest to smallest
# Find the first value that is not equal to the first salary | remove duplicates and get the nth salary (keep first indtance)
# returns a generate frame
def second_highest_salary(employee: pd.DataFrame) -> pd.DataFrame:
    # Creates a non duplicate dataframe of the salary column
    unique_salaries = employee["salary"].drop_duplicates()
    if len(unique_salaries) < 2:
        return pd.DataFrame({'SecondHighestSalary': [None]})
        
    unique_salaries = unique_salaries.sort_values(ascending=False)
    secondHighest = unique_salaries.iloc[1]
    return pd.DataFrame({'SecondHighestSalary': [secondHighest]})
    
# Panda series is a one dimentional labeled array - i.e., single column of dataframe
# Data frame is two dimensional structure labeled

second_highest_salary(employee)

Unnamed: 0,SecondHighestSalary
0,200


In [None]:
"""
SQL version
SELECT (
    SELECT DISTINCT salary
    FROM Employee ORDER BY salary DESC 
    LIMIT 1 OFFSET 1   #Limits to 1 row and skips the first row
) as SecondHighestSalary # Wraps the out and renames the result as second highest salary
"""

In [44]:
#day12 Not accepted

import pandas as pd
data = [[1, 'Joe', 70000, 1], [2, 'Jim', 90000, 1], [3, 'Henry', 80000, 2], [4, 'Sam', 60000, 2], [5, 'Max', 90000, 1]]
employee = pd.DataFrame(data, columns=['id', 'name', 'salary', 'departmentId']).astype({'id':'Int64', 'name':'object', 'salary':'Int64', 'departmentId':'Int64'})
data = [[1, 'IT'], [2, 'Sales']]
department = pd.DataFrame(data, columns=['id', 'name']).astype({'id':'Int64', 'name':'object'})

# Highest salary each department
# Group employees by department
# Order by distinct / get the maximum row
# Recreate the dataframe using the id for that department

# See this blog = https://saturncloud.io/blog/how-to-select-row-with-max-value-in-column-from-pandas-groupby-groups/
# Transform can be used to broadcast back the group function onto the dataframe
def department_highest_salary(employee: pd.DataFrame, department: pd.DataFrame) -> pd.DataFrame:
    # Use groupby to group the data by a specific column
    groups = employee.groupby('departmentId')
    # Get the idx of the max values of specific column. idx is a pandas series that contains row index of maximum value of each group
    idx = groups['salary'].idxmax()
    # loc selects rows with these indexes
    max_salaries = employee.loc[idx]

    # Do a lookup and replace the departmentId - use pandas repalce
    # using replace we need to generate a dictionary to map id to name
    # We will use a simple zip
    depMap = dict(zip(department['id'], department['name']))

    # Altenatively since we know that id is the primary key we can use set index to to dict on the dataframe
    depMap = department.set_index('id')['name'].to_dict()

    # Unfortunately the above doens't work in replcae since the data type is diffrent we will use map
    # ! will fail due to column type: max_salaries.replace({'departmentId': depMap})
    max_salaries['Department'] = max_salaries['departmentId'].map(depMap)
    
    # rename / reorder the data frame
    return pd.DataFrame({'Department': max_salaries['Department'], 'Employee': max_salaries['name'], 
                        'Salary': max_salaries['salary']})

department_highest_salary(employee, department)

Unnamed: 0,Department,Employee,Salary
1,IT,Jim,90000
2,Sales,Henry,80000


In [53]:
#day12 Using probper boolean filtering for duplicate max entries
# Accepted
import pandas as pd
data = [[1, 'Joe', 70000, 1], [2, 'Jim', 90000, 1], [3, 'Henry', 80000, 2], [4, 'Sam', 60000, 2], [5, 'Max', 90000, 1]]
employee = pd.DataFrame(data, columns=['id', 'name', 'salary', 'departmentId']).astype({'id':'Int64', 'name':'object', 'salary':'Int64', 'departmentId':'Int64'})
data = [[1, 'IT'], [2, 'Sales']]
department = pd.DataFrame(data, columns=['id', 'name']).astype({'id':'Int64', 'name':'object'})

"""
SQL
SELECT DEPT.name AS Department, EMP.name AS Employee, EMP.salary AS 

Salary FROM Department DEPT, Employee EMP WHERE

EMP.departmentId = DEPT.id AND (EMP.departmentId, salary) IN 

(SELECT departmentId, MAX (salary) FROM Employee GROUP BY 

departmentId)
"""
def department_highest_salary(employee: pd.DataFrame, department: pd.DataFrame) -> pd.DataFrame:

    # get all the max salaries
    max_salaries = employee.groupby('departmentId')['salary'].transform('max')

    # generate a boolean series where the salary is equal to the max
    df = employee[max_salaries == employee['salary']]

    # Generate the map to map id to department name
    depMap = department.set_index('id')['name'].to_dict()

    #Ensure the original dataframe is being modified - otherwise we get A value is trying to be set on a copy of a slice from a DataFrame.
    df.loc[:,'Department'] = df['departmentId'].map(depMap)
    
    # rename / reorder the data frame
    return pd.DataFrame({'Department': df['Department'], 'Employee': df['name'], 
                        'Salary': df['salary']})

department_highest_salary(employee, department)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.loc[:,'Department'] = df['departmentId'].map(depMap)


Unnamed: 0,Department,Employee,Salary
1,IT,Jim,90000
2,Sales,Henry,80000
4,IT,Max,90000
