In [15]:
import numpy as np
import pandas as pd
# Seed NumPy's legacy global generator so the np.random draws made by later
# cells are repeatable when the notebook is run top to bottom.
SEED = 42
np.random.seed(SEED)

In [19]:
#----------------------------------------------------------PROBLEM 1------------------------------------------------------------
# Build a 20-row synthetic dataset of ages and incomes, then plant missing
# values and one extreme income for the later cells to work with.
#
# Both columns are created as float up front. NaN is a float, so assigning it
# into an integer column only works through implicit upcasting, which recent
# pandas versions deprecate; casting "age" explicitly (as "income" already was)
# keeps the resulting frame the same without relying on that behaviour.
data = {
    "age": np.random.randint(18, 60, size=20).astype(float),
    "income": np.random.randint(20000, 100000, size=20).astype(float)
}
df = pd.DataFrame(data)

# Inject NaNs at fixed row positions (the positions themselves are not random).
df.loc[[2, 5, 7], "income"] = np.nan
df.loc[[3, 6], "age"] = np.nan

df.loc[10, "income"] = 1000000  # planted extreme value, used to check for outliers


print("Synthetic Dataset with NaNs:\n", df)

Synthetic Dataset with NaNs:
      age     income
0   18.0    28392.0
1   42.0    50535.0
2   24.0        NaN
3    NaN    72256.0
4   41.0    55222.0
5   18.0        NaN
6    NaN    99575.0
7   41.0        NaN
8   28.0    30965.0
9   34.0    44538.0
10  25.0  1000000.0
11  52.0    28110.0
12  52.0    99309.0
13  50.0    47266.0
14  22.0    72992.0
15  59.0    26910.0
16  56.0    20206.0
17  58.0    43419.0
18  45.0    70636.0
19  24.0    70015.0


In [20]:
# Income summary statistics. Series.mean()/median() skip NaN entries; the
# weighted mean needs complete rows, so incomplete ones are dropped first.
income = df["income"]
mean_income = income.mean()
median_income = income.median()

# Keep only rows where both income and age are present.
valid_data = df.dropna(subset=["income", "age"])
weighted_mean_income = np.average(valid_data["income"], weights=valid_data["age"])

for caption, value in (
    ("(a) Mean Income:", mean_income),
    ("(b) Median Income:", median_income),
    ("(c) Age-Weighted Mean Income:", weighted_mean_income),
):
    print(caption, value)

(a) Mean Income: 109432.11764705883
(b) Median Income: 50535.0
(c) Age-Weighted Mean Income: 87409.83498349835


A weighted mean is more appropriate when some observations should influence the average more than others because they are more important, more reliable, or more representative. For instance, when calculating a student's overall grade, different exams count for different percentages of the final mark, so a weighted mean reflects actual performance better than a plain mean.

In [24]:
#----------------------------------------------------------PROBLEM 2------------------------------------------------------------
income_mean = df["income"].mean(skipna=True)
income_std = df["income"].std(skipna=True)

df["income_z"] = (df["income"] - income_mean) / income_std

outliers = df[df["income_z"].abs() > 3]

print("Standardized Income (z-score):\n", df[["income", "income_z"]])
print("\nNumber of Outliers:", outliers.shape[0])
print("Outlier Rows:\n", outliers)


Standardized Income (z-score):
        income  income_z
0     28392.0 -0.351182
1     50535.0 -0.255227
2         NaN       NaN
3     72256.0 -0.161100
4     55222.0 -0.234916
5         NaN       NaN
6     99575.0 -0.042715
7         NaN       NaN
8     30965.0 -0.340032
9     44538.0 -0.281215
10  1000000.0  3.859221
11    28110.0 -0.352404
12    99309.0 -0.043868
13    47266.0 -0.269393
14    72992.0 -0.157911
15    26910.0 -0.357604
16    20206.0 -0.386656
17    43419.0 -0.286064
18    70636.0 -0.168121
19    70015.0 -0.170812

Number of Outliers: 1
Outlier Rows:
      age     income  income_z age_bin
10  25.0  1000000.0  3.859221   25-35


In [23]:
#----------------------------------------------------------PROBLEM 3------------------------------------------------------------
bins = [18, 25, 35, 45, 60]
labels = ["18-25", "25-35", "35-45", "45-60"]

df["age_bin"] = pd.cut(df["age"], bins=bins, labels=labels, right=False)

result = df.groupby("age_bin").agg(
    count=("income", "count"),
    mean_income=("income", "mean"),
    median_income=("income", "median")
).reset_index()

print("Statistics by Age Bin:\n", result.sort_values("age_bin"))

Statistics by Age Bin:
   age_bin  count    mean_income  median_income
0   18-25      3   57133.000000        70015.0
1   25-35      3  358501.000000        44538.0
2   35-45      2   52878.500000        52878.5
3   45-60      7   47979.428571        43419.0


In [25]:
#----------------------------------------------------------PROBLEM 4------------------------------------------------------------
# 3x4 integer array holding 1..12, filled row by row.
arr = np.arange(1, 13).reshape((3, 4))
print("Original Array:\n", arr)

# Shape, Size, Transpose, Flatten
for caption, value in [
    ("\nShape:", arr.shape),
    ("Size:", arr.size),
    ("Transpose:\n", arr.T),
    ("Flattened:", arr.flatten()),
]:
    print(caption, value)

Original Array:
 [[ 1  2  3  4]
 [ 5  6  7  8]
 [ 9 10 11 12]]

Shape: (3, 4)
Size: 12
Transpose:
 [[ 1  5  9]
 [ 2  6 10]
 [ 3  7 11]
 [ 4  8 12]]
Flattened: [ 1  2  3  4  5  6  7  8  9 10 11 12]


In [26]:
# Negative Indexing
print("\nLast row using negative index:", arr[-1])

# arr[5] is an out-of-range integer index (not a slice), which raises IndexError.
# Only IndexError is caught so that unrelated failures -- e.g. a NameError when
# the cell defining `arr` has not been run -- are not misreported as this demo's
# expected error. The printed message text is left unchanged.
try:
    print(arr[5])
except IndexError as e:
    print("Error during slicing:", e)


Last row using negative index: [ 9 10 11 12]
Error during slicing: index 5 is out of bounds for axis 0 with size 3


In [27]:
# Arithmetic Operations
print("\nBroadcasting Example:\n", arr + 10)
print("Dot Product:\n", np.dot(arr, arr.T))


Broadcasting Example:
 [[11 12 13 14]
 [15 16 17 18]
 [19 20 21 22]]
Dot Product:
 [[ 30  70 110]
 [ 70 174 278]
 [110 278 446]]


In [28]:
# Linear Algebra Operations (square matrix required)
# 2x2 example matrix for the determinant and inverse.
square_arr = np.array([[1, 2],
                       [3, 4]])
det, inv = np.linalg.det(square_arr), np.linalg.inv(square_arr)

for caption, value in (
    ("\nSquare Matrix:\n", square_arr),
    ("Determinant:", det),
    ("Inverse:\n", inv),
):
    print(caption, value)


Square Matrix:
 [[1 2]
 [3 4]]
Determinant: -2.0000000000000004
Inverse:
 [[-2.   1. ]
 [ 1.5 -0.5]]
