In [1]:
import warnings
warnings.filterwarnings('ignore')

In [2]:
import numpy as np
import pandas as pd

In [3]:
train_df = pd.read_csv(filepath_or_buffer='response_code_train.csv')
cv_df = pd.read_csv(filepath_or_buffer='respose_code_cv.csv')
test_df = pd.read_csv(filepath_or_buffer='respose_code_test.csv')

In [4]:
display(train_df)

Unnamed: 0,State,Element,class
0,A,P,0
1,B,Q,1
2,C,R,2
3,A,P,0
4,A,P,1
5,B,Q,2
6,A,P,0
7,A,P,1
8,C,R,1
9,C,R,0


In [5]:
display(test_df)

Unnamed: 0,State,Element
0,A,P
1,C,R
2,D,S
3,C,R
4,B,Q
5,E,T


In [6]:
display(train_df['State'].value_counts().to_dict())

{'A': 5, 'C': 3, 'B': 2}

In [7]:
display(train_df[['State', 'class']].value_counts().to_dict())

{('A', 0): 3,
 ('A', 1): 2,
 ('B', 1): 1,
 ('B', 2): 1,
 ('C', 0): 1,
 ('C', 1): 1,
 ('C', 2): 1}

In [8]:
class ResponseCoding(object):
    """
    This class is responsible for response coding.
    """
    
    def __init__(self, train_df, test_df, df_cols, t_col):
        self.train_df = train_df
        self.test_df = test_df
        self.df_cols = df_cols
        self.t_col = t_col
        self.unique_targets = pd.unique(values=self.train_df[self.t_col])
    
    def perform_response_coding_on_column(self, col, df):
        """
        This method performs response coding on a given column and dataframe.
        """
        col_df = pd.DataFrame()
        total_dict = self.train_df[col].value_counts().to_dict()
        t_col_wise_dict = self.train_df[[col, self.t_col]].value_counts().to_dict()
        if df == 'test':
            df = self.test_df
        else:
            df = self.train_df
        for t_val in self.unique_targets:
            t_list = list()
            for c_val in df[col]:
                try:
                    if c_val not in total_dict.keys():
                        t_list.append(1 / len(self.unique_targets))
                    else:
                        t_list.append(t_col_wise_dict[(c_val, t_val)] / total_dict[c_val])
                except KeyError as ke:
                    t_list.append(0)
            col_df[col+'_'+str(t_val)] = t_list
        return col_df
    
    def perform_response_coding(self, df):
        """
        This method performs response coding for an entire dataframe.
        """
        dfs = list()
        for col in self.df_cols:
            col_df = self.perform_response_coding_on_column(col=col, df=df)
            dfs.append(col_df)
        rc_df = pd.concat(dfs, axis=1)
        return rc_df

In [9]:
rc = ResponseCoding(train_df=train_df, test_df=test_df, df_cols=['State', 'Element'], t_col='class')
print("Encoded train data.")
display(rc.perform_response_coding(df='train'))
print("Encoded test data.")
display(rc.perform_response_coding(df='test'))

Encoded train data.


Unnamed: 0,State_0,State_1,State_2,Element_0,Element_1,Element_2
0,0.6,0.4,0.0,0.6,0.4,0.0
1,0.0,0.5,0.5,0.0,0.5,0.5
2,0.333333,0.333333,0.333333,0.333333,0.333333,0.333333
3,0.6,0.4,0.0,0.6,0.4,0.0
4,0.6,0.4,0.0,0.6,0.4,0.0
5,0.0,0.5,0.5,0.0,0.5,0.5
6,0.6,0.4,0.0,0.6,0.4,0.0
7,0.6,0.4,0.0,0.6,0.4,0.0
8,0.333333,0.333333,0.333333,0.333333,0.333333,0.333333
9,0.333333,0.333333,0.333333,0.333333,0.333333,0.333333


Encoded test data.


Unnamed: 0,State_0,State_1,State_2,Element_0,Element_1,Element_2
0,0.6,0.4,0.0,0.6,0.4,0.0
1,0.333333,0.333333,0.333333,0.333333,0.333333,0.333333
2,0.333333,0.333333,0.333333,0.333333,0.333333,0.333333
3,0.333333,0.333333,0.333333,0.333333,0.333333,0.333333
4,0.0,0.5,0.5,0.0,0.5,0.5
5,0.333333,0.333333,0.333333,0.333333,0.333333,0.333333


---

In [10]:
class ResponseCoding(object):
    """
    This class is responsible for response coding.
    """
    
    def __init__(self, train_df, cv_df, test_df, df_cols, t_col, alpha=1, factor=10):
        self.train_df = train_df
        self.cv_df = cv_df
        self.test_df = test_df
        self.df_cols = df_cols
        self.t_col = t_col
        self.unique_targets = pd.unique(values=self.train_df[self.t_col])
        self.alpha = alpha
        self.factor = factor
        self.k = len(self.unique_targets)
    
    def perform_response_coding_on_column(self, col, df, laplace=False):
        """
        This method performs response coding on a given column and dataframe.
        """
        col_df = pd.DataFrame()
        total_dict = self.train_df[col].value_counts().to_dict()
        t_col_wise_dict = self.train_df[[col, self.t_col]].value_counts().to_dict()
        if df == 'test':
            df = self.test_df
        elif df == 'cv':
            df = self.cv_df
        else:
            df = self.train_df
        for t_val in self.unique_targets:
            t_list = list()
            for c_val in df[col]:
                try:
                    if c_val not in total_dict.keys():
                        if laplace:
                            add_val = self.alpha * self.factor
                            n_val = (1 + add_val)
                            d_val = self.k + (add_val * self.k)
                            f_val = n_val / d_val
                        else:
                            f_val = 1 / self.k
                        t_list.append(f_val)
                    else:
                        if laplace:
                            add_val = self.alpha * self.factor
                            n_val = t_col_wise_dict[(c_val, t_val)] + add_val
                            d_val = total_dict[c_val] + (add_val * self.k)
                            f_val = n_val / d_val
                        else:
                            f_val = t_col_wise_dict[(c_val, t_val)] / total_dict[c_val]
                        t_list.append(f_val)
                except KeyError as ke:
                    t_list.append(0)
            col_df[col+'_'+str(t_val)] = t_list
        return col_df
    
    def perform_response_coding(self, df, laplace):
        """
        This method performs response coding for an entire dataframe.
        """
        dfs = list()
        for col in self.df_cols:
            col_df = self.perform_response_coding_on_column(col=col, df=df, laplace=laplace)
            dfs.append(col_df)
        rc_df = pd.concat(dfs, axis=1)
        return rc_df

In [11]:
rc = ResponseCoding(train_df=train_df, test_df=test_df, cv_df=cv_df, df_cols=['State', 'Element'], t_col='class')
print("Encoded train data.")
display(rc.perform_response_coding(df='train', laplace=False))
print("Encoded cv data.")
display(rc.perform_response_coding(df='cv', laplace=False))
print("Encoded test data.")
display(rc.perform_response_coding(df='test', laplace=False))

Encoded train data.


Unnamed: 0,State_0,State_1,State_2,Element_0,Element_1,Element_2
0,0.6,0.4,0.0,0.6,0.4,0.0
1,0.0,0.5,0.5,0.0,0.5,0.5
2,0.333333,0.333333,0.333333,0.333333,0.333333,0.333333
3,0.6,0.4,0.0,0.6,0.4,0.0
4,0.6,0.4,0.0,0.6,0.4,0.0
5,0.0,0.5,0.5,0.0,0.5,0.5
6,0.6,0.4,0.0,0.6,0.4,0.0
7,0.6,0.4,0.0,0.6,0.4,0.0
8,0.333333,0.333333,0.333333,0.333333,0.333333,0.333333
9,0.333333,0.333333,0.333333,0.333333,0.333333,0.333333


Encoded cv data.


Unnamed: 0,State_0,State_1,State_2,Element_0,Element_1,Element_2
0,0.6,0.4,0.0,0.6,0.4,0.0
1,0.333333,0.333333,0.333333,0.333333,0.333333,0.333333
2,0.333333,0.333333,0.333333,0.333333,0.333333,0.333333
3,0.333333,0.333333,0.333333,0.333333,0.333333,0.333333
4,0.0,0.5,0.5,0.0,0.5,0.5
5,0.333333,0.333333,0.333333,0.333333,0.333333,0.333333


Encoded test data.


Unnamed: 0,State_0,State_1,State_2,Element_0,Element_1,Element_2
0,0.6,0.4,0.0,0.6,0.4,0.0
1,0.333333,0.333333,0.333333,0.333333,0.333333,0.333333
2,0.333333,0.333333,0.333333,0.333333,0.333333,0.333333
3,0.333333,0.333333,0.333333,0.333333,0.333333,0.333333
4,0.0,0.5,0.5,0.0,0.5,0.5
5,0.333333,0.333333,0.333333,0.333333,0.333333,0.333333


In [12]:
rc = ResponseCoding(train_df=train_df, test_df=test_df, cv_df=cv_df, df_cols=['State', 'Element'], t_col='class')
print("Encoded train data.")
display(rc.perform_response_coding(df='train', laplace=True))
print("Encoded cv data.")
display(rc.perform_response_coding(df='cv', laplace=True))
print("Encoded test data.")
display(rc.perform_response_coding(df='test', laplace=True))

Encoded train data.


Unnamed: 0,State_0,State_1,State_2,Element_0,Element_1,Element_2
0,0.371429,0.342857,0.0,0.371429,0.342857,0.0
1,0.0,0.34375,0.34375,0.0,0.34375,0.34375
2,0.333333,0.333333,0.333333,0.333333,0.333333,0.333333
3,0.371429,0.342857,0.0,0.371429,0.342857,0.0
4,0.371429,0.342857,0.0,0.371429,0.342857,0.0
5,0.0,0.34375,0.34375,0.0,0.34375,0.34375
6,0.371429,0.342857,0.0,0.371429,0.342857,0.0
7,0.371429,0.342857,0.0,0.371429,0.342857,0.0
8,0.333333,0.333333,0.333333,0.333333,0.333333,0.333333
9,0.333333,0.333333,0.333333,0.333333,0.333333,0.333333


Encoded cv data.


Unnamed: 0,State_0,State_1,State_2,Element_0,Element_1,Element_2
0,0.371429,0.342857,0.0,0.371429,0.342857,0.0
1,0.333333,0.333333,0.333333,0.333333,0.333333,0.333333
2,0.333333,0.333333,0.333333,0.333333,0.333333,0.333333
3,0.333333,0.333333,0.333333,0.333333,0.333333,0.333333
4,0.0,0.34375,0.34375,0.0,0.34375,0.34375
5,0.333333,0.333333,0.333333,0.333333,0.333333,0.333333


Encoded test data.


Unnamed: 0,State_0,State_1,State_2,Element_0,Element_1,Element_2
0,0.371429,0.342857,0.0,0.371429,0.342857,0.0
1,0.333333,0.333333,0.333333,0.333333,0.333333,0.333333
2,0.333333,0.333333,0.333333,0.333333,0.333333,0.333333
3,0.333333,0.333333,0.333333,0.333333,0.333333,0.333333
4,0.0,0.34375,0.34375,0.0,0.34375,0.34375
5,0.333333,0.333333,0.333333,0.333333,0.333333,0.333333
