Q3.Create a dataframe having at least 5 columns and 100 rows to store numeric data generated using a random
function. Replace 25% of the values by null values whose index positions are generated using random
function. Do the following:
a. Identify and count missing values in a dataframe.
b. Drop the column having more than 5 null values.
c. Identify the row label having maximum of the sum of all values in a row and drop that row.
d. Sort the data frame on the basis of the first column.
e. Remove all duplicates from the first column.
f. Find the correlation between first and second column and covariance between second and third column.
g. Detect the outliers and remove the rows having outliers.
h. Discretize second column and create 5 bins

In [1]:
import numpy as np
import pandas as pd

In [31]:
df = pd.DataFrame(np.random.randn(100, 4), columns=list('ABCD'))
print(df)

Unnamed: 0,A,B,C,D
0,0.015591,-1.479307,-0.367680,-0.312448
1,0.350199,-0.183949,0.004392,-0.398717
2,0.863621,-0.773861,1.811074,-0.827447
3,-1.937880,-0.694168,-0.463303,-0.431429
4,0.633002,0.667296,1.791493,-0.264167
...,...,...,...,...
95,0.506252,0.182678,-0.702553,0.571588
96,-0.012286,0.040046,1.898194,-1.492313
97,-0.242511,-0.580079,-0.821535,0.611981
98,-0.825759,-1.365934,-0.083021,1.116379


In [32]:
# Replace 25% of the values with NaN using random index positions
nan_indices = np.random.choice(df.size, int(0.25 * df.size), replace=False)
df.values.flat[nan_indices] = np.nan
df

Unnamed: 0,A,B,C,D
0,0.015591,-1.479307,-0.367680,-0.312448
1,0.350199,,0.004392,-0.398717
2,0.863621,-0.773861,1.811074,-0.827447
3,,,-0.463303,-0.431429
4,0.633002,0.667296,1.791493,
...,...,...,...,...
95,0.506252,0.182678,-0.702553,
96,,0.040046,1.898194,
97,,-0.580079,-0.821535,0.611981
98,-0.825759,,,1.116379


In [33]:
# a. Identify and count missing values
missing_values = df.isnull().sum()
print("Missing Values:\n", missing_values)

Missing Values:
 A    28
B    27
C    19
D    26
dtype: int64


In [34]:
# b. Drop the column having more than 5 null values
df_copy = df.copy()
df_copy = df_copy.dropna(thresh=len(df_copy) - 5, axis=1)
df_copy

0
1
2
3
4
...
95
96
97
98
99


In [35]:
df

Unnamed: 0,A,B,C,D
0,0.015591,-1.479307,-0.367680,-0.312448
1,0.350199,,0.004392,-0.398717
2,0.863621,-0.773861,1.811074,-0.827447
3,,,-0.463303,-0.431429
4,0.633002,0.667296,1.791493,
...,...,...,...,...
95,0.506252,0.182678,-0.702553,
96,,0.040046,1.898194,
97,,-0.580079,-0.821535,0.611981
98,-0.825759,,,1.116379


In [36]:
# c. Identify the row label having the maximum sum and drop that row
max_sum_row_label = df.sum(axis=1).idxmax()
df = df.drop(index=max_sum_row_label)
df

Unnamed: 0,A,B,C,D
0,0.015591,-1.479307,-0.367680,-0.312448
1,0.350199,,0.004392,-0.398717
2,0.863621,-0.773861,1.811074,-0.827447
3,,,-0.463303,-0.431429
4,0.633002,0.667296,1.791493,
...,...,...,...,...
95,0.506252,0.182678,-0.702553,
96,,0.040046,1.898194,
97,,-0.580079,-0.821535,0.611981
98,-0.825759,,,1.116379


In [38]:
# d. Sort the data frame based on the first column
df = df.sort_values(by='A')
df

Unnamed: 0,A,B,C,D
48,-2.400040,0.715728,-1.778283,1.160995
81,-2.120535,0.222039,-0.328310,
32,-1.458717,-0.658589,-1.612907,1.706667
57,-1.269570,-0.106541,1.810299,-0.392813
59,-1.187609,,,
...,...,...,...,...
74,,-0.904301,,
77,,-0.832773,1.176206,1.367595
80,,1.459261,1.440048,
96,,0.040046,1.898194,


In [40]:
# e. Remove duplicates from the first column
df = df.drop_duplicates(subset='A')
df

Unnamed: 0,A,B,C,D
48,-2.400040,0.715728,-1.778283,1.160995
81,-2.120535,0.222039,-0.328310,
32,-1.458717,-0.658589,-1.612907,1.706667
57,-1.269570,-0.106541,1.810299,-0.392813
59,-1.187609,,,
...,...,...,...,...
42,1.405216,-0.916618,0.534813,-0.061014
76,1.408545,1.062346,0.182215,-1.501721
14,1.757567,-0.367318,,0.113410
83,2.531210,0.445951,-0.820807,-1.468999


In [41]:
# f Find the correlation and covariance
correlation = df['A'].corr(df['B'])
covariance = df['A'].cov(df['B'])
print("Correlation between A and B:", correlation)
print("Covariance between B and C:", covariance)

Correlation between A and B: -0.10846341189901977
Covariance between B and C: -0.09690053432797642


In [47]:
# g. Detect outliers and remove rows
Q1 = df.quantile(0.25)
Q3 = df.quantile(0.75)
IQR = Q3 - Q1
df_out = df[~((df < (Q1 - 1.5 * IQR)) | (df > (Q3 + 1.5 * IQR))).any(axis=1)]
df_out    

Unnamed: 0,A,B,C,D
81,-2.120535,0.222039,-0.328310,
32,-1.458717,-0.658589,-1.612907,1.706667
57,-1.269570,-0.106541,1.810299,-0.392813
59,-1.187609,,,
55,-1.152310,0.580469,-0.466194,-0.647154
...,...,...,...,...
45,1.314738,1.100720,-1.449854,
42,1.405216,-0.916618,0.534813,-0.061014
76,1.408545,1.062346,0.182215,-1.501721
14,1.757567,-0.367318,,0.113410


In [50]:
# h. Discretize the second column and create 5 bins
df['B_bins'] = pd.cut(df['B'], bins=5)
df

Unnamed: 0,A,B,C,D,B_bins
81,-2.120535,0.222039,-0.328310,,"(-0.444, 0.39]"
32,-1.458717,-0.658589,-1.612907,1.706667,"(-1.278, -0.444]"
57,-1.269570,-0.106541,1.810299,-0.392813,"(-0.444, 0.39]"
59,-1.187609,,,,
55,-1.152310,0.580469,-0.466194,-0.647154,"(0.39, 1.223]"
...,...,...,...,...,...
45,1.314738,1.100720,-1.449854,,"(0.39, 1.223]"
42,1.405216,-0.916618,0.534813,-0.061014,"(-1.278, -0.444]"
76,1.408545,1.062346,0.182215,-1.501721,"(0.39, 1.223]"
14,1.757567,-0.367318,,0.113410,"(-0.444, 0.39]"
