***3.A Apply feature-scaling techniques like standardization and normalization to numerical features.***

In [1]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.impute import SimpleImputer
from sklearn import preprocessing # Note: The 'preprocessing' alias is not used in the provided code snippet

# Assuming the file '/content/test.csv' is available
df = pd.read_csv("/content/test (1).csv")

print("\nFirst 5 Rows of Dataset:",df.head())
print(" Dataset Shape:", df.shape)

# Select all numerical features (int and float)
numerical_cols = df.select_dtypes(include=np.number).columns.tolist()
# Drop 'Id' as it's an identifier
numerical_df = df[numerical_cols].drop('Id', axis=1)

# Feature selected for scaling/standardization (GrLivArea has no NaNs)
feature_name = 'GrLivArea'
feature = numerical_df[[feature_name]].values # Extract as 2D array

print("--- Initial Dataset Information ---")
print(f"Feature '{feature_name}' head:\n{feature[:5].flatten()}")
print(f"Feature '{feature_name}' min: {feature.min()}, max: {feature.max()}\n")


print("\n--- A. Standardizing a Feature (GrLivArea) ---")
# Create and fit scaler
standard_scaler = StandardScaler()
standardized_feature = standard_scaler.fit_transform(feature)
print("Standardized Feature (first 5 values):")
print(standardized_feature[:5].flatten())
print(f"Mean after standardization: {standardized_feature.mean():.4f}")
print(f"Standard Deviation after standardization: {standardized_feature.std():.4f}")


print("\n\n--- B. Rescaling a Feature (GrLivArea) ---")
# Create and fit scaler for [0, 1] range
minmax_scaler = MinMaxScaler(feature_range=(0, 1))
rescaled_feature = minmax_scaler.fit_transform(feature)
print("Rescaled Feature (first 5 values):")
print(rescaled_feature[:5].flatten())
print(f"Min value after rescaling: {rescaled_feature.min():.4f}")
print(f"Max value after rescaling: {rescaled_feature.max():.4f}")


print("\n\n--- C. Imputing Missing Numerical Values (LotFrontage) ---")
impute_feature_name = 'LotFrontage'
impute_feature = numerical_df[[impute_feature_name]].copy() # Use a copy

# Check initial missing values
initial_nans = impute_feature.isnull().sum().iloc[0]
print(f"Number of NaN in original '{impute_feature_name}': {initial_nans}")

# Create and fit imputer with 'mean' strategy
mean_imputer = SimpleImputer(strategy="mean")
imputed_feature = mean_imputer.fit_transform(impute_feature)

# Convert back to Series for comparison
imputed_series = pd.Series(imputed_feature.flatten(), name=impute_feature_name + '_Imputed')
print(f"Mean used for imputation: {mean_imputer.statistics_[0]:.2f}")
print(f"Number of NaN in imputed '{impute_feature_name}': {imputed_series.isnull().sum()}")
print(f"First 5 rows of imputed '{impute_feature_name}':")
print(imputed_series.head())


print("\n\n--- D. Deleting Observations with Missing Values ---")
# Drop rows where *any* numerical feature has a missing value
df_dropped_na = numerical_df.dropna()
print(f"Original number of observations (rows): {numerical_df.shape[0]}")
print(f"Number of observations after dropping NaNs: {df_dropped_na.shape[0]}")
print("First 5 rows of the dataset after dropping NaNs:")
print(df_dropped_na.head())


First 5 Rows of Dataset:      Id  MSSubClass MSZoning  LotFrontage  LotArea Street Alley LotShape  \
0  1461          20       RH         80.0    11622   Pave   NaN      Reg   
1  1462          20       RL         81.0    14267   Pave   NaN      IR1   
2  1463          60       RL         74.0    13830   Pave   NaN      IR1   
3  1464          60       RL         78.0     9978   Pave   NaN      IR1   
4  1465         120       RL         43.0     5005   Pave   NaN      IR1   

  LandContour Utilities  ... ScreenPorch PoolArea PoolQC  Fence MiscFeature  \
0         Lvl    AllPub  ...         120        0    NaN  MnPrv         NaN   
1         Lvl    AllPub  ...           0        0    NaN    NaN        Gar2   
2         Lvl    AllPub  ...           0        0    NaN  MnPrv         NaN   
3         Lvl    AllPub  ...           0        0    NaN    NaN         NaN   
4         HLS    AllPub  ...         144        0    NaN    NaN         NaN   

  MiscVal MoSold  YrSold  SaleType  SaleCo

## ***3.B Perform feature dummification to convert categorical variables into numerical representations.***

In [2]:
import numpy as np
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelBinarizer # Note: LabelBinarizer is imported but not used in the code
# The code primarily uses pandas.get_dummies and pandas.replace.

print("><><>< NITIN KUMAR BERA T072 ><><><")
print("3.B Perform feature dummificaƟon to convert categorical variables into numerical representaƟons.")

# --- 1. Load the Dataset ---
# Assuming the file '/content/test.csv' is available
df = pd.read_csv("/content/test (1).csv")

print("\nDataset Loaded:")
print(df.head(10))
print(" Dataset Shape:", df.shape)

# --- Initial Inspection ---
print("--- Initial Dataset Information ---")
print(df[['MSZoning', 'ExterQual']].head(10))
print("\nMSZoning missing values:", df['MSZoning'].isnull().sum())
print("ExterQual unique values:", df['ExterQual'].unique())


print("\n\n--- A. Encoding Nominal Categorical Feature (MSZoning) ---")
# Use pd.get_dummies on 'MSZoning' (a nominal feature)
# Note: NaN rows are dropped here for a clean demonstration of the encoding process
ms_zoning_nominal = df['MSZoning'].dropna()
ms_zoning_one_hot = pd.get_dummies(ms_zoning_nominal, prefix='MSZoning')

print("\nOne-Hot Encoded 'MSZoning' (using pandas.get_dummies):")
print(ms_zoning_one_hot.head(10))


print("\n\n--- B. Encoding Ordinal Categorical Feature (ExterQual) ---")
# Define a manual mapping dictionary to assign numerical values based on order
scale_mapper = {
    "Ex": 4, # Excellent
    "Gd": 3, # Good
    "TA": 2, # Typical/Average
    "Fa": 1  # Fair
}
# Create a copy of the column and apply the replace method to encode the order
exter_qual_ordinal = df['ExterQual'].copy()
exter_qual_ordinal_encoded = exter_qual_ordinal.replace(scale_mapper)

print("\nOrdinal Encoded 'ExterQual' (using pandas.replace):")
print(exter_qual_ordinal_encoded.head(10))


print("\n\n--- C. Imputing Missing Categorical Values (MSZoning) ---")
# We use SimpleImputer with the 'most_frequent' strategy, as shown in the text.
# Extract the column and reshape for SimpleImputer (requires a 2D array)
ms_zoning_column = df['MSZoning'].values.reshape(-1, 1)

# Use SimpleImputer with 'most_frequent' strategy
imputer = SimpleImputer(missing_values=np.nan, strategy='most_frequent')

# Fit and transform
ms_zoning_imputed = imputer.fit_transform(ms_zoning_column)

# Convert back to a pandas Series for inspection
df_ms_zoning_imputed = pd.Series(ms_zoning_imputed.flatten(), name='MSZoning_Imputed')

print("\nNumber of NaN in original 'MSZoning':", df['MSZoning'].isnull().sum())
print("Number of NaN in imputed 'MSZoning':", df_ms_zoning_imputed.isnull().sum())
print("Most frequent value used for imputation (mode):", imputer.statistics_[0])
print("\nFirst 10 rows of imputed 'MSZoning':")
print(df_ms_zoning_imputed.head(10))

><><>< NITIN KUMAR BERA T072 ><><><
3.B Perform feature dummificaƟon to convert categorical variables into numerical representaƟons.

Dataset Loaded:
     Id  MSSubClass MSZoning  LotFrontage  LotArea Street Alley LotShape  \
0  1461          20       RH         80.0    11622   Pave   NaN      Reg   
1  1462          20       RL         81.0    14267   Pave   NaN      IR1   
2  1463          60       RL         74.0    13830   Pave   NaN      IR1   
3  1464          60       RL         78.0     9978   Pave   NaN      IR1   
4  1465         120       RL         43.0     5005   Pave   NaN      IR1   
5  1466          60       RL         75.0    10000   Pave   NaN      IR1   
6  1467          20       RL          NaN     7980   Pave   NaN      IR1   
7  1468          60       RL         63.0     8402   Pave   NaN      IR1   
8  1469          20       RL         85.0    10176   Pave   NaN      Reg   
9  1470          20       RL         70.0     8400   Pave   NaN      Reg   

  LandContour

  exter_qual_ordinal_encoded = exter_qual_ordinal.replace(scale_mapper)
