<a href="https://colab.research.google.com/github/pankajalwr/advance-python/blob/main/encoders_assign.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder

In [None]:
# Load the COVID toy dataset from the Colab content directory.
df = pd.read_csv("/content/covid_toy.csv")

In [None]:
# Display the loaded DataFrame as the cell's rich output.
df

Unnamed: 0,age,gender,fever,cough,city,has_covid
0,60,Male,103.0,Mild,Kolkata,No
1,27,Male,100.0,Mild,Delhi,Yes
2,42,Male,101.0,Mild,Delhi,No
3,31,Female,98.0,Mild,Kolkata,No
4,65,Female,101.0,Mild,Mumbai,No
...,...,...,...,...,...,...
95,12,Female,104.0,Mild,Bangalore,No
96,51,Female,101.0,Strong,Kolkata,Yes
97,20,Female,101.0,Mild,Bangalore,No
98,5,Female,98.0,Strong,Mumbai,No


In [None]:
# Show only the text-like (object / category) columns.
categorical_kinds = ["object", "category"]
df.select_dtypes(include=categorical_kinds)

Unnamed: 0,gender,cough,city,has_covid
0,Male,Mild,Kolkata,No
1,Male,Mild,Delhi,Yes
2,Male,Mild,Delhi,No
3,Female,Mild,Kolkata,No
4,Female,Mild,Mumbai,No
...,...,...,...,...
95,Female,Mild,Bangalore,No
96,Female,Strong,Kolkata,Yes
97,Female,Mild,Bangalore,No
98,Female,Strong,Mumbai,No


In [None]:
# Show only the numeric columns; "number" already covers every int/float dtype.
df.select_dtypes(include="number")

Unnamed: 0,age,fever
0,60,103.0
1,27,100.0
2,42,101.0
3,31,98.0
4,65,101.0
...,...,...
95,12,104.0
96,51,101.0
97,20,101.0
98,5,98.0


In [None]:
class DataPreprocessor:
    """Impute, label-encode and standardize a private copy of a DataFrame.

    Column roles are decided once, at construction time:

    * ``categorical_columns`` -- columns of dtype ``object`` or ``category``.
    * ``numerical_columns``   -- columns of dtype ``int64`` or ``float64``.

    The fitted ``StandardScaler`` is kept on ``self.scaler`` and one fitted
    ``LabelEncoder`` per categorical column is kept in ``self.label_encoders``.
    """

    def __init__(self, df: pd.DataFrame) -> None:
        """Store a copy of *df* and record its categorical/numerical columns."""
        # Work on a copy so the caller's frame is never mutated.
        self.df = df.copy()
        self.categorical_columns = self.df.select_dtypes(include=['object', 'category']).columns
        self.numerical_columns = self.df.select_dtypes(include=['int64', 'float64']).columns
        self.scaler = StandardScaler()
        self.label_encoders = {}

    def fill_missing_values(self) -> None:
        """Fill gaps with the mode (categorical) or the median (numerical)."""
        for col in self.categorical_columns:
            modes = self.df[col].mode()
            # An entirely-missing column has no mode; leave it untouched
            # instead of failing on the empty result.
            if modes.empty:
                continue
            # Assign the result back: ``fillna(..., inplace=True)`` on a
            # column selection is chained assignment and is not guaranteed
            # to modify ``self.df``.
            self.df[col] = self.df[col].fillna(modes.iloc[0])
        for col in self.numerical_columns:
            self.df[col] = self.df[col].fillna(self.df[col].median())

    def encode_categorical(self) -> None:
        """Replace each categorical column with integer labels.

        The fitted encoder for every column is stored in
        ``self.label_encoders`` under the column name.
        """
        for col in self.categorical_columns:
            le = LabelEncoder()
            self.df[col] = le.fit_transform(self.df[col])
            self.label_encoders[col] = le

    def standardize_numerical(self) -> None:
        """Scale the numerical columns with ``self.scaler`` (fit + transform)."""
        # Nothing to scale; fitting a scaler on zero columns would fail.
        if len(self.numerical_columns) == 0:
            return
        self.df[self.numerical_columns] = self.scaler.fit_transform(self.df[self.numerical_columns])

    def preprocess(self) -> pd.DataFrame:
        """Run imputation, encoding and scaling in order; return the result."""
        self.fill_missing_values()
        self.encode_categorical()
        self.standardize_numerical()
        return self.df


In [None]:
class preprocessing:
    """Split a DataFrame by dtype, then scale and one-hot encode the parts."""

    def __init__(self, df):
        """Keep a reference to *df* (no copy is made)."""
        self.df = df

    def separate(self):
        """Split ``self.df`` into numerical and categorical frames.

        Columns whose dtype is ``float64`` or ``int64`` are numerical; every
        other column is treated as categorical.

        Returns:
            A ``(df_numerical, df_categorical)`` tuple of DataFrames.
        """
        numerical_names = []
        categorical_names = []
        # Iterate the instance's own frame, not a module-level ``df``.
        for name in self.df.columns:
            if self.df[name].dtypes in ["float64", "int64"]:
                numerical_names.append(name)
            else:
                categorical_names.append(name)

        df_numerical = self.df[numerical_names]
        df_categorical = self.df[categorical_names]

        return df_numerical, df_categorical

    def standardize(self, df_numerical):
        """Return *df_numerical* scaled with a freshly fitted StandardScaler.

        The result keeps the input's column names and row index.
        """
        scaler = StandardScaler()
        standardized_data = scaler.fit_transform(df_numerical)
        # Carry the index over so the rows still align with the source frame.
        df_numerical_scaled = pd.DataFrame(standardized_data,
                                           columns=df_numerical.columns,
                                           index=df_numerical.index)
        return df_numerical_scaled

    def encode(self, df_categorical):
        """Return *df_categorical* one-hot encoded, dropping each first level.

        Column names come from the encoder's ``get_feature_names_out``; the
        result keeps the input's row index.
        """
        encoder = OneHotEncoder(drop="first", sparse_output=False)
        encoded_data = encoder.fit_transform(df_categorical)
        # Carry the index over so the rows still align with the source frame.
        df_categorical_encoded = pd.DataFrame(encoded_data,
                                              columns=encoder.get_feature_names_out(df_categorical.columns),
                                              index=df_categorical.index)
        return df_categorical_encoded

# Load the COVID toy dataset from the Colab content directory.
df = pd.read_csv("/content/covid_toy.csv")

# Split by dtype, then scale the numeric part and one-hot encode the rest.
preprocessor = preprocessing(df)
numerical_data, categorical_data = preprocessor.separate()
numerical_data_scaled = preprocessor.standardize(numerical_data)
categorical_data_encoded = preprocessor.encode(categorical_data)

# Print summary statistics, rounded to two decimals, for both results.
for transformed in (numerical_data_scaled, categorical_data_encoded):
    print(transformed.describe().round(2))

          age  fever
count  100.00  90.00
mean     0.00  -0.00
std      1.01   1.01
min     -1.58  -1.39
25%     -0.98  -0.90
50%      0.03   0.08
75%      0.90   0.93
max      1.61   1.54
       gender_Male  cough_Strong  city_Delhi  city_Kolkata  city_Mumbai  \
count       100.00        100.00      100.00        100.00       100.00   
mean          0.41          0.38        0.22          0.32         0.16   
std           0.49          0.49        0.42          0.47         0.37   
min           0.00          0.00        0.00          0.00         0.00   
25%           0.00          0.00        0.00          0.00         0.00   
50%           0.00          0.00        0.00          0.00         0.00   
75%           1.00          1.00        0.00          1.00         0.00   
max           1.00          1.00        1.00          1.00         1.00   

       has_covid_Yes  
count         100.00  
mean            0.45  
std             0.50  
min             0.00  
25%             0.00