# Data Analysis Functions

In [3]:
import pandas as pd

In [4]:
# Sample employee records, one list per column; the lists line up by position.
data = dict(
    Employee=['Alice', 'Bob', 'Charlie', 'David', 'Eve'],
    Department=['HR', 'IT', 'Finance', 'IT', 'HR'],
    Salary=[70000, 80000, 75000, 82000, 68000],
)

In [5]:
# Build the DataFrame from the column dict: each key becomes a column,
# and rows get the default 0..4 integer index.
df = pd.DataFrame(data)
df

Unnamed: 0,Employee,Department,Salary
0,Alice,HR,70000
1,Bob,IT,80000
2,Charlie,Finance,75000
3,David,IT,82000
4,Eve,HR,68000


In [7]:
# Structural summary: index range, per-column non-null counts and dtypes, memory usage
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5 entries, 0 to 4
Data columns (total 3 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Employee    5 non-null      object
 1   Department  5 non-null      object
 2   Salary      5 non-null      int64 
dtypes: int64(1), object(2)
memory usage: 252.0+ bytes


In [8]:
# Column labels, returned as a pandas Index
df.columns

Index(['Employee', 'Department', 'Salary'], dtype='object')

In [9]:
# keys() returns the same column labels as df.columns
df.keys()

Index(['Employee', 'Department', 'Salary'], dtype='object')

In [None]:
df.head() # first 5 rows (default)

Unnamed: 0,Employee,Department,Salary
0,Alice,HR,70000
1,Bob,IT,80000
2,Charlie,Finance,75000
3,David,IT,82000
4,Eve,HR,68000


In [11]:
df.head(2)  # First 2 rows (the argument overrides the default of 5)

Unnamed: 0,Employee,Department,Salary
0,Alice,HR,70000
1,Bob,IT,80000


In [12]:
df.tail(2)  # Last 2 rows (their original index labels, 3 and 4, are kept)

Unnamed: 0,Employee,Department,Salary
3,David,IT,82000
4,Eve,HR,68000


In [13]:
# Cell values as a 2-D NumPy array (no labels); dtype is object here
# because the columns mix strings and integers
df.values

array([['Alice', 'HR', 70000],
       ['Bob', 'IT', 80000],
       ['Charlie', 'Finance', 75000],
       ['David', 'IT', 82000],
       ['Eve', 'HR', 68000]], dtype=object)

In [None]:
df.size  # Total number of cells (rows x columns: 5 x 3 = 15)

15

In [None]:
df.shape    # (number of rows, number of columns)

(5, 3)

In [None]:
df.shape[0]  # Number of rows (first entry of the shape tuple)

5

In [18]:
df.shape[1]  # Number of columns (second entry of the shape tuple)

3

In [None]:
# .unique() - distinct values of a column, returned as a NumPy array
df['Department'].unique()

array(['HR', 'IT', 'Finance'], dtype=object)

In [22]:
# .nunique() - number of distinct values in the column
df['Department'].nunique()

3

In [None]:
# .value_counts() - how many rows carry each distinct value,
# listed from most to least frequent
df['Department'].value_counts()

Department
HR         2
IT         2
Finance    1
Name: count, dtype: int64

In [None]:
# .apply() - call a function on every value of a column
def calculate_bonus(salary: float, rate: float = 0.10) -> float:
    """Return the bonus owed on a salary.

    Parameters
    ----------
    salary : float
        Base salary the bonus is computed from.
    rate : float, default 0.10
        Fraction of the salary paid as bonus. The default reproduces the
        original fixed 10% rule, so ``Series.apply(calculate_bonus)``
        keeps working without extra arguments.

    Returns
    -------
    float
        ``salary * rate``.
    """
    return salary * rate

# Runs calculate_bonus once per salary; the result is a new float Series
# that is only displayed here, not stored back into df
df['Salary'].apply(calculate_bonus)

0    7000.0
1    8000.0
2    7500.0
3    8200.0
4    6800.0
Name: Salary, dtype: float64

In [27]:
# Create a new column 'Bonus' from the calculated values
def calculate_bonus(salary: float, rate: float = 0.10) -> float:
    """Return the bonus owed on a salary.

    Parameters
    ----------
    salary : float
        Base salary the bonus is computed from.
    rate : float, default 0.10
        Fraction of the salary paid as bonus. The default reproduces the
        original fixed 10% rule, so ``Series.apply(calculate_bonus)``
        keeps working without extra arguments.

    Returns
    -------
    float
        ``salary * rate``.
    """
    return salary * rate

# This assignment mutates df: every later cell sees the extra 'Bonus' column
df['Bonus'] = df['Salary'].apply(calculate_bonus)
df

Unnamed: 0,Employee,Department,Salary,Bonus
0,Alice,HR,70000,7000.0
1,Bob,IT,80000,8000.0
2,Charlie,Finance,75000,7500.0
3,David,IT,82000,8200.0
4,Eve,HR,68000,6800.0


In [None]:
# Create a new column 'Adjusted Salary' with newly calculated data using a lambda
# (each salary is multiplied by 1.05, i.e. raised by 5%)
df['Adjusted Salary'] = df['Salary'].apply(lambda x: x * 1.05)
df

Unnamed: 0,Employee,Department,Salary,Bonus,Adjusted Salary
0,Alice,HR,70000,7000.0,73500.0
1,Bob,IT,80000,8000.0,84000.0
2,Charlie,Finance,75000,7500.0,78750.0
3,David,IT,82000,8200.0,86100.0
4,Eve,HR,68000,6800.0,71400.0


In [29]:
# Upper-case every string cell; non-string cells (the numbers) pass through
# unchanged. The result is a new DataFrame - df itself is not modified.
# DataFrame.map is the element-wise replacement for the deprecated
# DataFrame.applymap (requires pandas >= 2.1), and isinstance is the
# idiomatic type check.
df.map(lambda cell: cell.upper() if isinstance(cell, str) else cell)

  df.applymap(lambda x: x.upper() if type(x) == str else x)


Unnamed: 0,Employee,Department,Salary,Bonus,Adjusted Salary
0,ALICE,HR,70000,7000.0,73500.0
1,BOB,IT,80000,8000.0,84000.0
2,CHARLIE,FINANCE,75000,7500.0,78750.0
3,DAVID,IT,82000,8200.0,86100.0
4,EVE,HR,68000,6800.0,71400.0


# Sorting

In [None]:
# Sort rows by salary, lowest first
df.sort_values(by='Salary')

Unnamed: 0,Employee,Department,Salary,Bonus,Adjusted Salary
4,Eve,HR,68000,6800.0,71400.0
0,Alice,HR,70000,7000.0,73500.0
2,Charlie,Finance,75000,7500.0,78750.0
1,Bob,IT,80000,8000.0,84000.0
3,David,IT,82000,8200.0,86100.0


In [None]:
# Sort rows by salary, highest first
df.sort_values(by='Salary', ascending=False)

Unnamed: 0,Employee,Department,Salary,Bonus,Adjusted Salary
3,David,IT,82000,8200.0,86100.0
1,Bob,IT,80000,8000.0,84000.0
2,Charlie,Finance,75000,7500.0,78750.0
0,Alice,HR,70000,7000.0,73500.0
4,Eve,HR,68000,6800.0,71400.0


In [32]:
# Sort on two keys: department first, then salary within each department
# (both low to high)
df.sort_values(by=['Department', 'Salary'])

Unnamed: 0,Employee,Department,Salary,Bonus,Adjusted Salary
2,Charlie,Finance,75000,7500.0,78750.0
4,Eve,HR,68000,6800.0,71400.0
0,Alice,HR,70000,7000.0,73500.0
1,Bob,IT,80000,8000.0,84000.0
3,David,IT,82000,8200.0,86100.0


In [33]:
# Two-key sort (department, then salary), kept in a variable so the
# sorted frame can be reused
sorted_df = df.sort_values(by=['Department', 'Salary'])
sorted_df

Unnamed: 0,Employee,Department,Salary,Bonus,Adjusted Salary
2,Charlie,Finance,75000,7500.0,78750.0
4,Eve,HR,68000,6800.0,71400.0
0,Alice,HR,70000,7000.0,73500.0
1,Bob,IT,80000,8000.0,84000.0
3,David,IT,82000,8200.0,86100.0


In [36]:
# Sort by department, then by salary within each department, and renumber
# the rows 0..n-1. reset_index(drop=True) discards the old index labels
# instead of keeping them as a column; chaining it avoids mutating
# sorted_df with inplace=True, so re-running the cell gives the same result.
sorted_df = (
    df.sort_values(['Department', 'Salary'])
      .reset_index(drop=True)
)
sorted_df

Unnamed: 0,Employee,Department,Salary,Bonus,Adjusted Salary
0,Charlie,Finance,75000,7500.0,78750.0
1,Eve,HR,68000,6800.0,71400.0
2,Alice,HR,70000,7000.0,73500.0
3,Bob,IT,80000,8000.0,84000.0
4,David,IT,82000,8200.0,86100.0
