## Data Manipulation with pandas

In [1]:
import pandas as pd

In [2]:
data = {
    'Name': ['John', 'Alex', 'Sara', 'Emily', 'Saint'],
    'Age': [25, 30, 22, 28, 35],
    'Salary': [50000, 60000, 45000, 55000, 70000],
    'Department': ['Data Eng', 'Analytics', 'IT', 'HR', 'Finance']
}
df = pd.DataFrame(data)

In [3]:
# Display basic info
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5 entries, 0 to 4
Data columns (total 4 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Name        5 non-null      object
 1   Age         5 non-null      int64 
 2   Salary      5 non-null      int64 
 3   Department  5 non-null      object
dtypes: int64(2), object(2)
memory usage: 292.0+ bytes


In [4]:
# Handling Missing values
df.fillna('Unknown', inplace=True)

In [5]:
df.head()

Unnamed: 0,Name,Age,Salary,Department
0,John,25,50000,Data Eng
1,Alex,30,60000,Analytics
2,Sara,22,45000,IT
3,Emily,28,55000,HR
4,Saint,35,70000,Finance


In [6]:
# Sorting and Filtering
high_salary = df[df['Salary'] > 50000].sort_values(by='Salary', ascending=False)

In [7]:
print('Employee with high salary: \n',high_salary)

Employee with high salary: 
     Name  Age  Salary Department
4  Saint   35   70000    Finance
1   Alex   30   60000  Analytics
3  Emily   28   55000         HR


In [8]:
# Grouping and Aggregation
avg_salary_per_dept = df.groupby("Department")['Salary'].mean()

In [9]:
print('Average salary per department:\n', avg_salary_per_dept) 

Average salary per department:
 Department
Analytics    60000.0
Data Eng     50000.0
Finance      70000.0
HR           55000.0
IT           45000.0
Name: Salary, dtype: float64


In [10]:
# Adding a new column
df['Bonus'] = df['Salary'] * 0.10 # 10% Bonus Calculation
print('Updated DataFrane with Bonus: \n', df)

Updated DataFrane with Bonus: 
     Name  Age  Salary Department   Bonus
0   John   25   50000   Data Eng  5000.0
1   Alex   30   60000  Analytics  6000.0
2   Sara   22   45000         IT  4500.0
3  Emily   28   55000         HR  5500.0
4  Saint   35   70000    Finance  7000.0


In [11]:
# Saving to CSV
df.to_csv('employee_data1.csv', index=True)

### Add a new column for “Experience (Years)” with random values.

In [12]:
df['Experience'] = [2, 2, 1, 3, 4]

In [13]:
df.head()

Unnamed: 0,Name,Age,Salary,Department,Bonus,Experience (Years)
0,John,25,50000,Data Eng,5000.0,2.0
1,Alex,30,60000,Analytics,6000.0,2.0
2,Sara,22,45000,IT,4500.0,1.0
3,Emily,28,55000,HR,5500.0,2.5
4,Saint,35,70000,Finance,7000.0,4.0


### Find the employee with the highest salary.

In [18]:
highest_salary = df['Salary'].max()
print('Highest Employee salary:\n',high_salary)

Highest Employee salary:
     Name  Age  Salary Department
4  Saint   35   70000    Finance
1   Alex   30   60000  Analytics
3  Emily   28   55000         HR


In [21]:
highest_sal_emp = df[df['Salary'] == df['Salary'].max()]
high_salary

Unnamed: 0,Name,Age,Salary,Department
4,Saint,35,70000,Finance
1,Alex,30,60000,Analytics
3,Emily,28,55000,HR


### Filter employees who are under 30 and earn above 50,000.

In [15]:
young_skilled_employee = df.loc[(df['Age'] < 30) & (df['Salary'] >50000)]

In [23]:
young_skilled_employee

Unnamed: 0,Name,Age,Salary,Department,Bonus,Experience (Years)
3,Emily,28,55000,HR,5500.0,2.5


In [25]:
young_high_earners = df[(df['Age'] < 30) & (df['Salary'] > 50000)]
young_high_earners

Unnamed: 0,Name,Age,Salary,Department,Bonus,Experience (Years)
3,Emily,28,55000,HR,5500.0,2.5
