In [1]:
import numpy as np
import pandas as pd

In [2]:
#create an array
arr = np.array([1, 2, 3, 4, 5, 6, 7])
print(arr)

[1 2 3 4 5 6 7]


In [3]:
#array of zeros or ones
zeros = np.zeros((2, 3))
ones = np.ones((3, 2))
print(zeros)
print(ones)

[[0. 0. 0.]
 [0. 0. 0.]]
[[1. 1.]
 [1. 1.]
 [1. 1.]]


In [4]:
#numpy range and reshape
r_arr = np.arange(1, 20, 2)
rsh = np.arange(12).reshape(3, 4)

print("First output:", r_arr)
print("Second ouput:", rsh)

First output: [ 1  3  5  7  9 11 13 15 17 19]
Second ouput: [[ 0  1  2  3]
 [ 4  5  6  7]
 [ 8  9 10 11]]


In [5]:
#basic math
a = np.array([10, 20, 30, 40])
b = np.array([1, 2, 3, 4])

print(a + b)
print(a * b)

[11 22 33 44]
[ 10  40  90 160]


In [6]:
#creating series in pandas
series = pd.Series([10, 20, 30, 40, 50], copy="numbers")
print(series)

0    10
1    20
2    30
3    40
4    50
dtype: int64


In [7]:
#creating dataframe in pandas
data = {
    "Name": ["Alice", "Bob", "Chalies"],
    "Age": [25, 30, 40],
    "Salary": [50000, 65000, 80000]
}

df = pd.DataFrame(data)
df["Height"] = [25.50, 30.25, 38.27]
print(df)


      Name  Age  Salary  Height
0    Alice   25   50000   25.50
1      Bob   30   65000   30.25
2  Chalies   40   80000   38.27


In [8]:
#filtering data in pandas
df = df[df["Age"] >= 30]
print(df)

      Name  Age  Salary  Height
1      Bob   30   65000   30.25
2  Chalies   40   80000   38.27


In [9]:
#grouping data using pandas groupby
group_data = {
    "Department": ['IT', 'HR', 'IT', 'HR', 'Finance', 'HR', 'IT', 'Finance', 'HR', 'IT'],
    "Salary": [50000, 40000, 45000, 55000, 42000, 46000, 58000, 65000, 68000, 55000],
    "Bonus": [5000, 2000, 3000, 4000, 5500, 2500, 1500, 2000, 3000, 3500]
}

df = pd.DataFrame(group_data)
df.head(10)

Unnamed: 0,Department,Salary,Bonus
0,IT,50000,5000
1,HR,40000,2000
2,IT,45000,3000
3,HR,55000,4000
4,Finance,42000,5500
5,HR,46000,2500
6,IT,58000,1500
7,Finance,65000,2000
8,HR,68000,3000
9,IT,55000,3500


In [10]:
avg_group = df.groupby('Department')['Salary'].mean()
print(avg_group)

sum_group = df.groupby('Department')['Salary'].sum()
print(sum_group)

count_group = df.groupby('Department')['Salary'].count()
print(count_group)

Department
Finance    53500.0
HR         52250.0
IT         52000.0
Name: Salary, dtype: float64
Department
Finance    107000
HR         209000
IT         208000
Name: Salary, dtype: int64
Department
Finance    2
HR         4
IT         4
Name: Salary, dtype: int64


In [11]:
#merging data using pandas merge
data1 = {
    "EmployeeID": [1, 2, 3],
    "Salary": [50000, 40000, 30000],
    "Name": ["Alice", "John", "Janet"]
}

data2 = {
    "EmployeeID": [1, 3, 5],
    "Age": [25, 30, 32],
    "Gender": ["Female", "Male", "Female"]
}

data3 = {
    "EmployeeID": [1, 3, 6],
    "Height": [20.25, 23.45, 29.65],
    "Department": ["IT", "HR", "Finance"]
}

df1 = pd.DataFrame(data1)
df2 = pd.DataFrame(data2)
df3 = pd.DataFrame(data3)

merge_data = pd.merge(df1, df2, on="EmployeeID")
last_merge_data = pd.merge(merge_data, df3, on="EmployeeID")
print(last_merge_data)

   EmployeeID  Salary   Name  Age  Gender  Height Department
0           1   50000  Alice   25  Female   20.25         IT
1           3   30000  Janet   30    Male   23.45         HR


In [12]:
#handling missing values using pandas
df = pd.DataFrame({
    "Name": ["Alice", np.nan, "Chalie", np.nan, "Janet"],
    "Age": [np.nan, 20, 25, 30, np.nan],
    "Height": [25.50, 30.45, np.nan, 35.65, np.nan]
})


df.isnull().sum()

Name      2
Age       2
Height    2
dtype: int64

In [13]:
df.head()

Unnamed: 0,Name,Age,Height
0,Alice,,25.5
1,,20.0,30.45
2,Chalie,25.0,
3,,30.0,35.65
4,Janet,,


In [14]:
drop_null = df.dropna()
drop_null.head()

Unnamed: 0,Name,Age,Height


In [17]:
df_filled = df.fillna({
    "Name": "Unknown",
    "Age": df["Age"].mean(),
    "Height": df["Height"].median()
})

df_filled.head()

Unnamed: 0,Name,Age,Height
0,Alice,25.0,25.5
1,Unknown,20.0,30.45
2,Chalie,25.0,30.45
3,Unknown,30.0,35.65
4,Janet,25.0,30.45


employees = 'EmployeeID': [1, 2, 3, 4, 5, 6],
    'Name': ['Alice', 'Bob', 'Charlie', 'David', 'Eva', 'Frank'],
    'DepartmentID': [101, 102, 101, 103, 102, None],
    'Age': [25, 30, None, 40, 29, 33],
    'Salary': [50000, 60000, 55000, 70000, None, 62000]

departments = 'DepartmentID': [101, 102, 103],
    'DepartmentName': ['IT', 'HR', 'Finance'],
    'Location': ['Lagos', 'Abuja', 'Port Harcourt']

In [19]:
#simple example data using pandas
employees = {
    'EmployeeID': [1, 2, 3, 4, 5, 6],
    'Name': ['Alice', 'Bob', 'Charlie', 'David', 'Eva', 'Frank'],
    'DepartmentID': [101, 102, 101, 103, 102, np.nan],
    'Age': [25, 30, np.nan, 40, 29, 33],
    'Salary': [50000, 60000, 55000, 70000, np.nan, 62000]
}

departments = {
    'DepartmentID': [101, 102, 103],
    'DepartmentName': ['IT', 'HR', 'Finance'],
    'Location': ['Lagos', 'Abuja', 'Port Harcourt']
}

In [21]:
employees_df = pd.DataFrame(employees)
departments_df = pd.DataFrame(departments)

employees_df.to_csv("employees.csv", index=False)
departments_df.to_csv("departments.csv", index=False)

In [26]:
#load dataset
empdf = pd.read_csv("employees.csv")
dptdf = pd.read_csv("departments.csv")

empdf.head()
dptdf.head()

Unnamed: 0,DepartmentID,DepartmentName,Location
0,101,IT,Lagos
1,102,HR,Abuja
2,103,Finance,Port Harcourt


In [28]:
avg_salary = empdf.groupby("DepartmentID")['Salary'].mean()
avg_salary

DepartmentID
101.0    52500.0
102.0    60000.0
103.0    70000.0
Name: Salary, dtype: float64

In [29]:
merge_df = pd.merge(empdf, dptdf, on="DepartmentID")
merge_df

Unnamed: 0,EmployeeID,Name,DepartmentID,Age,Salary,DepartmentName,Location
0,1,Alice,101.0,25.0,50000.0,IT,Lagos
1,2,Bob,102.0,30.0,60000.0,HR,Abuja
2,3,Charlie,101.0,,55000.0,IT,Lagos
3,4,David,103.0,40.0,70000.0,Finance,Port Harcourt
4,5,Eva,102.0,29.0,,HR,Abuja


In [31]:
merge_df.isnull().sum()

EmployeeID        0
Name              0
DepartmentID      0
Age               1
Salary            1
DepartmentName    0
Location          0
dtype: int64

In [35]:
merge_df['Age'] = merge_df['Age'].fillna(merge_df["Age"].mean())
merge_df['Salary'] = merge_df["Salary"].fillna(merge_df['Salary'].median())
merge_df


Unnamed: 0,EmployeeID,Name,DepartmentID,Age,Salary,DepartmentName,Location
0,1,Alice,101.0,25.0,50000.0,IT,Lagos
1,2,Bob,102.0,30.0,60000.0,HR,Abuja
2,3,Charlie,101.0,31.0,55000.0,IT,Lagos
3,4,David,103.0,40.0,70000.0,Finance,Port Harcourt
4,5,Eva,102.0,29.0,57500.0,HR,Abuja


In [36]:
merge_df.isnull().sum()

EmployeeID        0
Name              0
DepartmentID      0
Age               0
Salary            0
DepartmentName    0
Location          0
dtype: int64