In [1]:
import numpy as np

## *5-Numpy for Data Cleaning*

Identifying Missing Value

In [13]:
data = np.array([1, np.nan,3, np.nan,5])
data

array([ 1., nan,  3., nan,  5.])

In [17]:
has_missing = np.isnan(data)
has_missing

array([False,  True, False,  True, False])

Removing rows of column with missing values

In [34]:
data = np.array([[1, 2, 3], [4, np.nan, 6], [7, 8, 9]])
data
cleaned_data = data[~np.any(np.isnan(data), axis = 1)]
cleaned_data

array([[1., 2., 3.],
       [7., 8., 9.]])

## *6 - Numpy for statistical Analysis*

Data Transformation

In [44]:
data = np.array([10,20,30,40,50])
mean = np.mean(data)
centered_data = data - mean
centered_data

array([-20., -10.,   0.,  10.,  20.])

In [46]:
std_dev = np.std(data)
standarized_data = centered_data/std_dev
standarized_data

array([-1.41421356, -0.70710678,  0.        ,  0.70710678,  1.41421356])

In [48]:
np.log(data)

array([2.30258509, 2.99573227, 3.40119738, 3.68887945, 3.91202301])

Random Sampling And Generation

In [98]:
#Simple Random Sampling without replacement:
data = np.array([1, 2, 3, 4, 5, 6, 7, 8, 9, 10])
random_samples = np.random.choice(data, size = 5, replace = 'false')
random_samples


array([6, 1, 7, 4, 5])

In [134]:
data = np.array([1, 2, 3, 4, 5, 6, 7, 8, 9, 10])
random_samples = np.random.choice(data, size = (2,3), replace = 'false')
random_samples

array([[ 1, 10,  8],
       [ 5,  4,  4]])

In [140]:
num_samples = 1000
bootstrap_samples = np.random.choice(data, size=(num_samples, len(data)), replace=True)
bootstrap_samples


array([[ 8,  2,  9, ...,  1,  1, 10],
       [ 6, 10,  8, ...,  2,  4,  2],
       [ 3,  6,  9, ...,  9,  4,  6],
       ...,
       [ 1,  7,  7, ...,  9,  4, 10],
       [ 8, 10,  1, ...,  3,  5,  2],
       [ 5,  8,  5, ...,  4,  9,  6]])

**Generation**

In [143]:
np.random.randint(0,100)

71

In [147]:
mean = 0
std = 1
norm_val = np.random.normal(mean, std, 5)
norm_val

array([ 1.28687435,  0.23476354, -0.97353678,  0.9413084 ,  0.17542113])

In [151]:
 # Generates 5 random values from a standard normal distribution
mean = 0
std_dev = 1
normal_values = np.random.normal(mean, std_dev, 5)
print(normal_values)


[ 0.40502116 -0.05914448 -1.26963932  2.08999637  0.41916549]


In [159]:
 # Simulates 5 sets of 10 trials with a success probability of 0.5
n_trials = 10
probability = 0.5
binomial_values = np.random.binomial(n_trials, probability, 5)
print(binomial_values)


[6 6 7 6 5]


In [161]:
rate = 2.5
poisson_values = np.random.poisson(rate, 5)
print(poisson_values)


[2 4 4 1 4]


In [167]:
# Generates 5 random values following an exponential distribution with a scale parameter of 0.5
scale_parameter = 0.5
exponential_values = np.random.exponential(scale_parameter, 5)
print(exponential_values)


[0.23152467 1.99403709 0.8480773  0.74082179 0.32647104]


In [169]:
# Generates 5 random values following a log-normal distribution
mean_of_log = 0
std_dev_of_log = 0.5
lognormal_values = np.random.lognormal(mean_of_log, std_dev_of_log, 5)
print(lognormal_values)


[1.38796393 0.71815922 0.52587992 1.18957532 1.22893541]


In [171]:
# Simulates 5 sets of 10 multinomial trials with the given probabilities
n_trials = 10
probabilities = [0.2, 0.3, 0.5] # Probabilities of each outcome
multinomial_values = np.random.multinomial(n_trials, probabilities, 5)
print(multinomial_values)


[[2 4 4]
 [0 4 6]
 [1 4 5]
 [1 5 4]
 [3 2 5]]


## *7- Numpy for linear Algebra*`

In [180]:
A = np.array([[1,2], [3,4]])
A_inv =  np.linalg.inv(A)
A_inv

array([[-2. ,  1. ],
       [ 1.5, -0.5]])

In [184]:
A = np.array([[2, -1], [1, 1]])
eigenvalues, eigenvectors = np.linalg.eig(A)
print("eigenvalues:",eigenvalues)
print("eigenvectors:",eigenvectors)

eigenvalues: [1.5+0.8660254j 1.5-0.8660254j]
eigenvectors: [[0.35355339+0.61237244j 0.35355339-0.61237244j]
 [0.70710678+0.j         0.70710678-0.j        ]]


In [186]:
A = np.array([[1, 2], [3, 4], [5, 6]])

In [188]:
U, S, VT = np.linalg.svd(A)
U

array([[-0.2298477 ,  0.88346102,  0.40824829],
       [-0.52474482,  0.24078249, -0.81649658],
       [-0.81964194, -0.40189603,  0.40824829]])

In [190]:
S

array([9.52551809, 0.51430058])

In [192]:
VT

array([[-0.61962948, -0.78489445],
       [-0.78489445,  0.61962948]])

Solve Linear Equations

In [198]:
A = np.array([[2, 3], [4, 5]])
B = np.array([6, 7])

#Ax = B for x

x = np.linalg.solve(A, B)
x

array([-4.5,  5. ])

## *8-Advanced Numpy Techniques*

Masked Array

In [209]:
import numpy.ma as ma
# Temperature dataset with missing values (-999 represents missing values)
temperatures = np.array([22.5, 23.0, -999, 24.5, -999, 26.0, 27.2, -999, 28.5])
mean_temperature = np.mean(temperatures)
print("Mean Temperature (without handling missing values):", mean_temperature)

# Create a mask for missing values (-999)
mask = (temperatures == -999)

# Create a masked array
masked_temperatures = ma.masked_array(temperatures, mask=mask)
# Calculate the mean temperature (excluding missing values)
mean_temperature = ma.mean(masked_temperatures)
# Print the result = 25.28
print("Mean Temperature (excluding missing values):", mean_temperature)

Mean Temperature (without handling missing values): -316.14444444444445
Mean Temperature (excluding missing values): 25.28333333333333


Structured Arrays

In [213]:
#Define data types for fields
dt = np.dtype([('name', 'S20'), ('age', int), ('salary', float)])
# Create a structured array
employees = np.array([('Alice', 30, 50000.0), ('Bob', 25, 60000.0)], dtype=dt)
# Access the 'name' field of the first employee
print(employees['name'][0])
# Access the 'age' field of all employees
print(employees['age'])

b'Alice'
[30 25]
