Q1. What is the difference between Ordinal Encoding and Label Encoding? Provide an example of when you
might choose one over the other.

In [1]:
import pandas as pd

# Sample dataset
data = {
    'Education_Level': ['High School', 'Associate\'s Degree', 'Bachelor\'s Degree', 'Master\'s Degree', 'Ph.D.'],
    'Salary': [40000, 50000, 60000, 80000, 100000],
}

df = pd.DataFrame(data)

# Ordinal Encoding
education_mapping = {'High School': 1, 'Associate\'s Degree': 2, 'Bachelor\'s Degree': 3, 'Master\'s Degree': 4, 'Ph.D.': 5}
df['Education_Level_Ordinal'] = df['Education_Level'].map(education_mapping)

# Label Encoding
df['Education_Level_Label'] = pd.factorize(df['Education_Level'])[0] + 1

# Display the result
print(df)


      Education_Level  Salary  Education_Level_Ordinal  Education_Level_Label
0         High School   40000                        1                      1
1  Associate's Degree   50000                        2                      2
2   Bachelor's Degree   60000                        3                      3
3     Master's Degree   80000                        4                      4
4               Ph.D.  100000                        5                      5


Q2. Explain how Target Guided Ordinal Encoding works and provide an example of when you might use it in
a machine learning project.

In [2]:
import pandas as pd

# Sample dataset
data = {
    'Credit_Score_Categories': ['Poor', 'Fair', 'Good', 'Excellent', 'Poor', 'Excellent'],
    'Default': [1, 0, 1, 0, 1, 0],  # 1: Default, 0: No Default
}

df = pd.DataFrame(data)

# Target Guided Ordinal Encoding
mean_target_by_category = df.groupby('Credit_Score_Categories')['Default'].mean().sort_values()
ordinal_mapping = {category: rank for rank, (category, _) in enumerate(mean_target_by_category.items())}

df['Credit_Score_Encoded'] = df['Credit_Score_Categories'].map(ordinal_mapping)

# Display the result
print(df)


  Credit_Score_Categories  Default  Credit_Score_Encoded
0                    Poor        1                     3
1                    Fair        0                     1
2                    Good        1                     2
3               Excellent        0                     0
4                    Poor        1                     3
5               Excellent        0                     0


Q3. Define covariance and explain why it is important in statistical analysis. How is covariance calculated?

In [3]:
import numpy as np

# Sample data
X = np.array([1, 2, 3, 4, 5])
Y = np.array([2, 3, 5, 4, 6])

# Calculate mean of X and Y
mean_X = np.mean(X)
mean_Y = np.mean(Y)

# Calculate covariance
covariance = np.sum((X - mean_X) * (Y - mean_Y)) / (len(X) - 1)

print("Covariance between X and Y:", covariance)


Covariance between X and Y: 2.25


Q4. For a dataset with the following categorical variables: Color (red, green, blue), Size (small, medium,
large), and Material (wood, metal, plastic), perform label encoding using Python's scikit-learn library.
Show your code and explain the output.

In [4]:
from sklearn.preprocessing import LabelEncoder
import pandas as pd

# Sample dataset
data = {
    'Color': ['red', 'green', 'blue', 'red', 'blue'],
    'Size': ['medium', 'small', 'large', 'medium', 'small'],
    'Material': ['wood', 'metal', 'plastic', 'metal', 'wood']
}

df = pd.DataFrame(data)

# Initialize LabelEncoder
label_encoder = LabelEncoder()

# Apply label encoding to each categorical column
df['Color_LabelEncoded'] = label_encoder.fit_transform(df['Color'])
df['Size_LabelEncoded'] = label_encoder.fit_transform(df['Size'])
df['Material_LabelEncoded'] = label_encoder.fit_transform(df['Material'])

# Display the result
print(df)


   Color    Size Material  Color_LabelEncoded  Size_LabelEncoded  \
0    red  medium     wood                   2                  1   
1  green   small    metal                   1                  2   
2   blue   large  plastic                   0                  0   
3    red  medium    metal                   2                  1   
4   blue   small     wood                   0                  2   

   Material_LabelEncoded  
0                      2  
1                      0  
2                      1  
3                      0  
4                      2  


Q5. Calculate the covariance matrix for the following variables in a dataset: Age, Income, and Education
level. Interpret the results.

In [5]:
import numpy as np
import pandas as pd

# Sample dataset
data = {
    'Age': [25, 30, 22, 35, 40],
    'Income': [50000, 60000, 40000, 70000, 80000],
    'Education_Level': [1, 2, 3, 2, 1],  # Assuming ordinal encoding: 1 - High School, 2 - Bachelor's, 3 - Master's
}

df = pd.DataFrame(data)

# Calculate the covariance matrix
covariance_matrix = np.cov(df, rowvar=False)

# Display the covariance matrix
print("Covariance Matrix:")
print(covariance_matrix)


Covariance Matrix:
[[ 5.33e+01  1.15e+05 -3.15e+00]
 [ 1.15e+05  2.50e+08 -7.50e+03]
 [-3.15e+00 -7.50e+03  7.00e-01]]


Q6. You are working on a machine learning project with a dataset containing several categorical
variables, including "Gender" (Male/Female), "Education Level" (High School/Bachelor's/Master's/PhD),
and "Employment Status" (Unemployed/Part-Time/Full-Time). Which encoding method would you use for
each variable, and why?

In [7]:
import pandas as pd
from category_encoders import BinaryEncoder

# Sample dataset
data = {
    'Gender': ['Male', 'Female', 'Male', 'Female', 'Male'],
    'Education_Level': ['Bachelor\'s', 'Master\'s', 'PhD', 'Bachelor\'s', 'High School'],
    'Employment_Status': ['Full-Time', 'Part-Time', 'Unemployed', 'Part-Time', 'Full-Time'],
}

df = pd.DataFrame(data)

# Binary encoding for 'Gender'
binary_encoder = BinaryEncoder(cols=['Gender'])
df_encoded = binary_encoder.fit_transform(df)

print("Encoded DataFrame:")
print(df_encoded)


# Ordinal encoding for 'Education_Level'
education_mapping = {'High School': 1, 'Bachelor\'s': 2, 'Master\'s': 3, 'PhD': 4}
df['Education_Level_Encoded'] = df['Education_Level'].map(education_mapping)

# One-hot encoding for 'Employment_Status'
df_encoded = pd.get_dummies(df, columns=['Employment_Status'], prefix='Employment')


Encoded DataFrame:
   Gender_0  Gender_1 Education_Level Employment_Status
0         0         1      Bachelor's         Full-Time
1         1         0        Master's         Part-Time
2         0         1             PhD        Unemployed
3         1         0      Bachelor's         Part-Time
4         0         1     High School         Full-Time


Q7. You are analyzing a dataset with two continuous variables, "Temperature" and "Humidity", and two
categorical variables, "Weather Condition" (Sunny/Cloudy/Rainy) and "Wind Direction" (North/South/
East/West). Calculate the covariance between each pair of variables and interpret the results.

In [9]:
import pandas as pd

# Sample dataset
data = {
    'Temperature': [25, 20, 22, 28, 30],
    'Humidity': [60, 70, 75, 50, 65],
    'Weather_Condition': ['Sunny', 'Cloudy', 'Rainy', 'Sunny', 'Cloudy'],
    'Wind_Direction': ['North', 'South', 'East', 'West', 'North'],
}

df = pd.DataFrame(data)

# Select only numeric columns for covariance calculation
numeric_columns = df.select_dtypes(include=['number'])
covariance_matrix = numeric_columns.cov()

# Display covariance matrix
print("Covariance Matrix:")
print(covariance_matrix)


Covariance Matrix:
             Temperature  Humidity
Temperature         17.0     -25.0
Humidity           -25.0      92.5
