# Task 1: Filtering Data with Multiple Conditions

**Problem:** Create a DataFrame with employee data and filter it to find employees who:

- Are between 25 and 40 years old (inclusive)
- Have a salary greater than $60,000
- Work in either the "Sales" or "Marketing" department



In [None]:
!pip install faker
import datetime



In [None]:
from faker import Faker
import pandas as pd
import numpy as np
import random

fake = Faker()

# Seed Faker and the stdlib RNG so repeated runs draw the same pseudo-random
# sequence (same seeding approach as the student-data cell further down).
Faker.seed(42)
random.seed(42)

def generate_employee_data(n_rows):
    """Build a DataFrame of ``n_rows`` synthetic employee records.

    Name, city, joining date, e-mail, phone and job title come from the
    module-level ``fake`` Faker instance; age, salary and department are drawn
    with the standard ``random`` module.

    Args:
        n_rows: Number of employee rows to generate.

    Returns:
        pandas.DataFrame with the columns Employee_ID, Name, Age, Salary, City,
        Joining_Date, Email, Phone, Department and Job_Title (in that order).
    """
    departments = ['Sales', 'IT', 'Marketing', 'HR', 'Finance']
    rows = range(n_rows)

    # The assignment order below is both the column order of the result and the
    # order in which random values are drawn, so keep it stable.
    columns = {}
    columns["Employee_ID"] = range(1, n_rows + 1)  # sequential IDs starting at 1
    columns["Name"] = [fake.name() for _ in rows]
    columns["Age"] = [random.randint(22, 65) for _ in rows]  # both bounds inclusive
    columns["Salary"] = [round(random.uniform(35000, 150000), 2) for _ in rows]
    columns["City"] = [fake.city() for _ in rows]
    columns["Joining_Date"] = [
        fake.date_between(start_date='-10y', end_date='today') for _ in rows
    ]
    columns["Email"] = [fake.email() for _ in rows]
    columns["Phone"] = [fake.phone_number() for _ in rows]
    columns["Department"] = [random.choice(departments) for _ in rows]
    columns["Job_Title"] = [fake.job() for _ in rows]

    return pd.DataFrame(columns)

# Synthesize the 100-row employee table used by the following cells.
df = generate_employee_data(n_rows=100)

# Preview the first ten rows, then report the (rows, columns) shape.
print(df.head(n=10))
print(f"\nDataFrame shape: {df.shape}")

# Persist the table to CSV, omitting the positional index column.
df.to_csv(path_or_buf='employees_data.csv', index=False)

   Employee_ID                  Name  Age     Salary               City  \
0            1        Robert Santana   42   62513.27       Berrychester   
1            2      Brooke Alexander   47   85585.35         Dariustown   
2            3          Kevin Fowler   40  117057.66        Georgemouth   
3            4           David Bruce   57   61966.76   Christopherville   
4            5            James Webb   30   73622.47      South Brianna   
5            6       Kenneth Andrews   34  137698.09  South Monicamouth   
6            7  Elizabeth Spence DDS   48   44266.43      West Kimberly   
7            8        Keith Jennings   64   52345.53   New Cynthiahaven   
8            9          Amber Obrien   46   79049.06       Port Michael   
9           10   Tanner Mitchell DDS   65   52572.61        East Curtis   

  Joining_Date                          Email                   Phone  \
0   2016-08-05    katrinaanderson@example.net     (456)576-6182x51286   
1   2021-10-30        guerra

In [None]:
# Task 1 filter. All three conditions must hold for a row to be kept:
#   * age between 25 and 40, inclusive on both ends
#   * salary strictly greater than $60,000 (exactly 60000 does NOT qualify)
#   * department is Sales or Marketing
filtered_data = df[
    df["Age"].between(25, 40)  # Series.between is inclusive on both ends by default
    & (df["Salary"] > 60000)
    & df["Department"].isin(["Sales", "Marketing"])
]

print(filtered_data)

    Employee_ID                Name  Age     Salary               City  \
4             5          James Webb   30   73622.47      South Brianna   
5             6     Kenneth Andrews   34  137698.09  South Monicamouth   
29           30        Susan Turner   35  119482.92          Paulmouth   
45           46         Daniel Kane   36  111244.97         Mooreburgh   
47           48        Scott Thomas   31   84290.47           Diazbury   
63           64         Patricia Le   31   60785.55    South Waynebury   
69           70         Parker Cain   33  131511.32           Westtown   
75           76  Courtney Rodriguez   29   88478.91   Lake Zacharybury   
94           95          Chad Scott   37  115177.20         Michaelton   

   Joining_Date                      Email                  Phone Department  \
4    2019-01-07        james84@example.org  001-695-299-5342x8364  Marketing   
5    2016-08-09    christina18@example.net        +1-792-868-7056      Sales   
29   2016-06-23    

In [None]:
# Label-based selection: with .loc both slice endpoints are included,
# so this returns the rows labelled 2, 3 and 4.
select_specific = df.loc[
    slice(2, 4),
    ["Name", "Age", "Salary"],
]

print(select_specific)

           Name  Age     Salary
2  Kevin Fowler   40  117057.66
3   David Bruce   57   61966.76
4    James Webb   30   73622.47


In [None]:
# NOTE(review): this cell repeats the selection made in the preceding cell.
# Pick the three columns first, then take the rows labelled 2 through 4
# (the .loc slice includes both endpoints).
select_specific = df[["Name", "Age", "Salary"]].loc[2:4]

print(select_specific)

           Name  Age     Salary
2  Kevin Fowler   40  117057.66
3   David Bruce   57   61966.76
4    James Webb   30   73622.47


In [None]:
# Position-based selection: first three rows, first two columns.
# NOTE(review): select_col is not used again in this cell.
select_col = df.iloc[:3, :2]

# Keep only employees strictly older than 30 ...
filter_first = df.loc[df['Age'].gt(30)]

# ... and narrow the result to the two columns of interest.
finally_filtered = filter_first[['Name', 'Department']]

print(finally_filtered)

                Name Department
0     Robert Santana      Sales
1   Brooke Alexander  Marketing
2       Kevin Fowler    Finance
3        David Bruce         HR
5    Kenneth Andrews      Sales
..               ...        ...
95      Adam Burgess    Finance
96     James Padilla    Finance
97      Sandra Drake    Finance
98    Scott Williams  Marketing
99     Thomas Atkins      Sales

[88 rows x 2 columns]


In [None]:
# Per-column count of missing values before any cleaning is applied.
missing_values = df.isnull().sum()

# Mean of the non-missing salaries, used as the imputation value.
mean_salary = df["Salary"].mean()

# Assign the filled column back to the frame. Calling
# df[col].fillna(..., inplace=True) operates on an intermediate object: it
# emits a warning in current pandas and has no effect on df from pandas 3.0.
df["Salary"] = df["Salary"].fillna(mean_salary)
df["Department"] = df["Department"].fillna("Unknown")

# Rows without an age are dropped (called on the frame itself, so in-place
# is still valid here).
df.dropna(subset=["Age"], inplace=True)

# Final view: only rows with no missing value in any remaining column.
new_df = df.dropna()

print(new_df)

    Employee_ID              Name  Age     Salary              City  \
0             1    Robert Santana   42   62513.27      Berrychester   
1             2  Brooke Alexander   47   85585.35        Dariustown   
2             3      Kevin Fowler   40  117057.66       Georgemouth   
3             4       David Bruce   57   61966.76  Christopherville   
4             5        James Webb   30   73622.47     South Brianna   
..          ...               ...  ...        ...               ...   
95           96      Adam Burgess   62   69682.01        Joshuastad   
96           97     James Padilla   39  113689.79      Prestonmouth   
97           98      Sandra Drake   55  127635.72       Morganburgh   
98           99    Scott Williams   53  126566.13          Holtstad   
99          100     Thomas Atkins   62  100082.24     Lauriechester   

   Joining_Date                        Email                  Phone  \
0    2016-08-05  katrinaanderson@example.net    (456)576-6182x51286   
1    

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df["Salary"].fillna(mean_salary, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df["Department"].fillna("Unknown", inplace=True)


In [None]:
from faker import Faker
import pandas as pd
import numpy as np
import random
from datetime import datetime

# Seed both random sources (stdlib `random` and Faker) before any student
# data is generated.
random.seed(42)

fake = Faker()
Faker.seed(42)

def generate_student_data(n_students):
    """Build a DataFrame of ``n_students`` synthetic student records.

    Name, e-mail, date of birth and enrollment date come from the module-level
    ``fake`` Faker instance; major, GPA, credits and scholarship flag are drawn
    with the standard ``random`` module.

    Args:
        n_students: Number of student rows to generate.

    Returns:
        pandas.DataFrame with the columns Student_ID, Name, Email,
        Date_of_Birth, Enrollment_Date, Major, GPA, Credits_Completed and
        Scholarship (in that order).
    """
    majors = [
        'Computer Science', 'Business', 'Engineering', 'Psychology',
        'Biology', 'Mathematics', 'English', 'History',
    ]
    rows = range(n_students)

    # The assignment order below is both the column order of the result and the
    # order in which random values are drawn, so keep it stable.
    data = {}
    # IDs look like STU000001: a 1-based counter zero-padded to six digits.
    data["Student_ID"] = [f"STU{str(i).zfill(6)}" for i in range(1, n_students + 1)]
    data["Name"] = [fake.name() for _ in rows]
    data["Email"] = [fake.email() for _ in rows]
    data["Date_of_Birth"] = [
        fake.date_of_birth(minimum_age=18, maximum_age=25) for _ in rows
    ]
    data["Enrollment_Date"] = [
        fake.date_between(start_date='-4y', end_date='today') for _ in rows
    ]
    data["Major"] = [random.choice(majors) for _ in rows]
    data["GPA"] = [round(random.uniform(2.0, 4.0), 2) for _ in rows]
    data["Credits_Completed"] = [random.randint(0, 150) for _ in rows]  # inclusive bounds
    data["Scholarship"] = [random.choice([True, False]) for _ in rows]

    return pd.DataFrame(data)

# NOTE: from here on `df` refers to the student table, not the employee table.
df = generate_student_data(n_students=100)

# Section banner: an 80-character rule above and below the title.
print("=" * 80, "TASK 4: STUDENT DATA TRANSFORMATION", "=" * 80, sep="\n")

TASK 4: STUDENT DATA TRANSFORMATION


In [None]:
# Dean's list: GPA of at least 3.7 AND at least 60 completed credits.
deans_list = df[df["GPA"].ge(3.7) & df["Credits_Completed"].ge(60)]

In [None]:
# Per-major summary: mean/max/min of GPA and of completed credits, plus the
# number of scholarship holders (summing the boolean flag counts the True
# values). reset_index() turns the "Major" group key back into a column.
statistics = (
    df.groupby("Major")
    .agg(
        dict(
            GPA=["mean", "max", "min"],
            Credits_Completed=["mean", "max", "min"],
            Scholarship="sum",
        )
    )
    .reset_index()
)