In [224]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from sklearn.feature_selection import VarianceThreshold
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

In [226]:
# Toy employee records. np.nan marks the deliberately missing entries in
# every column so that imputation and row-dropping can be demonstrated.
data = dict(
    Name=['Alice', 'Bob', np.nan, 'David', np.nan, 'Eva', 'Frank', 'Grace', 'Harry', np.nan],
    Age=[25, np.nan, 28, 35, 22, 30, np.nan, 27, 29, 31],
    Salary=[50000, 54000, np.nan, 58000, 60000, 62000, np.nan, 65000, 67000, np.nan],
)

# Each dict key becomes a column; show the raw frame before any cleaning.
df = pd.DataFrame(data=data)
print(df)

    Name   Age   Salary
0  Alice  25.0  50000.0
1    Bob   NaN  54000.0
2    NaN  28.0      NaN
3  David  35.0  58000.0
4    NaN  22.0  60000.0
5    Eva  30.0  62000.0
6  Frank   NaN      NaN
7  Grace  27.0  65000.0
8  Harry  29.0  67000.0
9    NaN  31.0      NaN


In [228]:
# Replace missing ages with the mean of the observed ages (Series.mean skips
# NaN by default), then show the result.
age_mean = df['Age'].mean()
df['Age'] = df['Age'].fillna(value=age_mean)
print(df)

    Name     Age   Salary
0  Alice  25.000  50000.0
1    Bob  28.375  54000.0
2    NaN  28.000      NaN
3  David  35.000  58000.0
4    NaN  22.000  60000.0
5    Eva  30.000  62000.0
6  Frank  28.375      NaN
7  Grace  27.000  65000.0
8  Harry  29.000  67000.0
9    NaN  31.000      NaN


In [230]:
# Discard rows whose Name is missing; the surviving rows keep their original
# index labels. `df` is rebound to the returned frame rather than mutated via
# inplace=True, which offers no benefit and prevents method chaining.
df = df.dropna(subset=['Name'])
print(df)

    Name     Age   Salary
0  Alice  25.000  50000.0
1    Bob  28.375  54000.0
3  David  35.000  58000.0
5    Eva  30.000  62000.0
6  Frank  28.375      NaN
7  Grace  27.000  65000.0
8  Harry  29.000  67000.0


In [236]:
# Forward-fill: every missing salary takes the last non-missing value that
# appears above it in the current row order.
salary_filled = df['Salary'].ffill()
df['Salary'] = salary_filled
print("Clean Data", df, sep="\n")

Clean Data
    Name     Age   Salary
0  Alice  25.000  50000.0
1    Bob  28.375  54000.0
3  David  35.000  58000.0
5    Eva  30.000  62000.0
6  Frank  28.375  62000.0
7  Grace  27.000  65000.0
8  Harry  29.000  67000.0


In [238]:
# ID/Score records in which IDs 101, 102 and 103 each occur twice, used to
# demonstrate duplicate removal.
data = dict(
    ID=[101, 102, 103, 104, 105, 101, 102, 106, 107, 103],
    Score=[88, 92, 75, 85, 90, 88, 92, 95, 89, 75],
)

# Each dict key becomes a column of the frame.
df = pd.DataFrame(data=data)

In [192]:
# Keep only the first row encountered for each ID; later repeats are dropped
# and the surviving rows retain their original index labels.
df_unique = df.drop_duplicates(
    subset=['ID'],
    keep='first',
)

# Show the de-duplicated frame under a heading.
print("DataFrame without duplicates based on 'ID':", df_unique, sep="\n")

DataFrame without duplicates based on 'ID':
    ID  Score
0  101     88
1  102     92
2  103     75
3  104     85
4  105     90
7  106     95
8  107     89


In [194]:
# Sample dataset
data = {
    'Salary': [30000, 50000, 70000, 100000, 150000],
    'Department': ['HR', 'IT', 'Finance', 'IT', 'HR'],
    'Revenue': [1000, 1200, 5000, 8000, 15000]
}

df = pd.DataFrame(data)
print("Original DataFrame:")
print(df)

Original DataFrame:
   Salary Department  Revenue
0   30000         HR     1000
1   50000         IT     1200
2   70000    Finance     5000
3  100000         IT     8000
4  150000         HR    15000


In [196]:
# Min-max scale Salary: with the scaler's default settings the smallest
# salary maps to 0 and the largest to 1.
scaler = MinMaxScaler()
# The double brackets pass a one-column DataFrame, giving the 2-D input the
# scaler expects.
salary_scaled = scaler.fit_transform(df[['Salary']])
df['Normalized_Salary'] = salary_scaled
print(df)

   Salary Department  Revenue  Normalized_Salary
0   30000         HR     1000           0.000000
1   50000         IT     1200           0.166667
2   70000    Finance     5000           0.333333
3  100000         IT     8000           0.583333
4  150000         HR    15000           1.000000


In [198]:
# One-hot encode Department: the column is replaced by one indicator column
# per distinct value, each named Dept_<value>. All other columns pass through.
df_encoded = pd.get_dummies(
    data=df,
    columns=['Department'],
    prefix='Dept',
)
print(df_encoded)

   Salary  Revenue  Normalized_Salary  Dept_Finance  Dept_HR  Dept_IT
0   30000     1000           0.000000         False     True    False
1   50000     1200           0.166667         False    False     True
2   70000     5000           0.333333          True    False    False
3  100000     8000           0.583333         False    False     True
4  150000    15000           1.000000         False     True    False


In [200]:
# Log-transform Revenue to compress its wide range. np.log1p(x) evaluates
# log(1 + x) directly: it is the idiomatic spelling of np.log(x + 1), stays
# accurate for x close to 0, and still maps a revenue of 0 to 0.
df_encoded['Log_Revenue'] = np.log1p(df_encoded['Revenue'])
print(df_encoded)

   Salary  Revenue  Normalized_Salary  Dept_Finance  Dept_HR  Dept_IT  \
0   30000     1000           0.000000         False     True    False   
1   50000     1200           0.166667         False    False     True   
2   70000     5000           0.333333          True    False    False   
3  100000     8000           0.583333         False    False     True   
4  150000    15000           1.000000         False     True    False   

   Log_Revenue  
0     6.908755  
1     7.090910  
2     8.517393  
3     8.987322  
4     9.615872  


In [202]:
# Feature table for the selection / PCA / aggregation cells:
#   F1       - constant column (zero variance)
#   F2, F3   - varying columns; F3 is exactly twice F2
#   Category - grouping key
#   Score    - numeric value to aggregate
data = dict(
    F1=[1, 1, 1, 1, 1],
    F2=[1, 2, 3, 4, 5],
    F3=[2, 4, 6, 8, 10],
    Category=['A', 'B', 'A', 'B', 'A'],
    Score=[10, 20, 30, 40, 50],
)
df = pd.DataFrame(data=data)
print("Original Data:", df, sep="\n")

Original Data:
   F1  F2  F3 Category  Score
0   1   1   2        A     10
1   1   2   4        B     20
2   1   3   6        A     30
3   1   4   8        B     40
4   1   5  10        A     50


In [204]:
# Variance-based feature selection over the three numeric feature columns.
# With threshold=0.0 only constant columns are removed (F1 in this data).
features = df.loc[:, ['F1', 'F2', 'F3']]
selector = VarianceThreshold(threshold=0.0)
selected = selector.fit_transform(features)

# fit_transform returns a bare array, so recover the surviving column names
# from the selector's boolean support mask and rebuild a labelled frame.
kept_mask = selector.get_support()
selected_cols = features.columns[kept_mask]
df_selected = pd.DataFrame(data=selected, columns=selected_cols)

print("\nAfter Variance Threshold:", df_selected, sep="\n")


After Variance Threshold:
   F2  F3
0   1   2
1   2   4
2   3   6
3   4   8
4   5  10


In [206]:
# Standardise the selected features before PCA so that each contributes on
# the same scale.
scaler = StandardScaler()
scaled = scaler.fit_transform(df_selected)

# Project onto two principal components. F3 is exactly 2 * F2 in this data,
# so the standardised columns coincide: PC1 carries all of the variance and
# the PC2 values are only floating-point noise (~1e-16).
pca = PCA(n_components=2)
components = pca.fit_transform(scaled)
df_pca = pd.DataFrame(data=components, columns=['PC1', 'PC2'])

print("\nAfter PCA (2 components):", df_pca, sep="\n")


After PCA (2 components):
   PC1           PC2
0 -2.0  1.489520e-16
1 -1.0 -4.965068e-17
2  0.0  0.000000e+00
3  1.0  4.965068e-17
4  2.0  9.930137e-17


In [208]:
# Mean_Score starts out as a row-wise copy of Score; it only becomes a mean
# once the frame is averaged per Category below.
df['Mean_Score'] = df['Score']

# Average every numeric column within each Category. as_index=False keeps
# Category as an ordinary column instead of moving it into the index.
df_agg = df.groupby('Category', as_index=False).mean(numeric_only=True)

print("\nAfter Aggregation (Group by Category):", df_agg, sep="\n")


After Aggregation (Group by Category):
  Category   F1   F2   F3  Score  Mean_Score
0        A  1.0  3.0  6.0   30.0        30.0
1        B  1.0  3.0  6.0   30.0        30.0
