# Leave-One-Out Categorical Encoder

In [8]:
#Imports____________
import pandas as pd


## LOO Encoder Class

In [2]:
class Encoder_LOO():
    '''Leave-one-out target encoder for a single categorical feature column.

    ``fit`` records, for every category, how many rows carry it and the sum
    of the target over those rows. ``transform`` turns each category into
    sum / (count + RR); when the target column is supplied (training data)
    the current row's own target and count are removed first.

    ###Attributes
    RR: regularization factor added to the denominator of every ratio
    cat_values: list of the unique categories seen by the last ``fit``
    target_sums: per-category target sums, parallel to ``cat_values``
    sum_counts: per-category row counts, parallel to ``cat_values``
    '''
    def __init__(self, RR):
        self.RR = RR
        self.cat_values = []
        self.target_sums = []
        self.sum_counts = []

    def fit(self, feature_column, target_column):
        '''Learn the per-category counts and target sums.

        ###Parameters
        feature_column: Pandas series of the categorical feature
        target_column: Pandas series of the target, aligned with
            feature_column

        ###Returns
        self : the fitted encoder, so calls can be chained.
        '''
        cat_values = list(feature_column.unique())

        # Build into fresh lists so that calling fit() again replaces the
        # previous statistics instead of appending after them (which would
        # leave the three parallel lists misaligned).
        sum_counts = []
        target_sums = []
        for cat in cat_values:
            cat_mask = feature_column == cat  # Rows holding this category

            sum_counts.append(cat_mask.sum())  # Number of occurrences
            target_sums.append(target_column[cat_mask].sum())  # Target sum

        self.cat_values = cat_values
        self.sum_counts = sum_counts
        self.target_sums = target_sums

        return self

    def _category_positions(self):
        '''Map each fitted category to its position in the parallel lists.'''
        positions = {}
        for pos, cat in enumerate(self.cat_values):
            # setdefault keeps the first position, matching list.index()
            positions.setdefault(cat, pos)
        return positions

    def _stats_for(self, positions, category):
        '''Return (count, target_sum) for a category seen during fit.'''
        try:
            pos = positions[category]
        except (KeyError, TypeError):
            # Same exception type list.index() raised, with a clearer message
            raise ValueError(
                'Category {!r} was not seen during fit'.format(category))
        return self.sum_counts[pos], self.target_sums[pos]

    def transform(self, feature_column, target_column = None):
        '''Category encoding using Leave one out method, for training set the
        current record is left out, for testing it is included.

        ###Parameters
        feature_column: Pandas series of the feature column to be encoded
        target_column: Pandas series of the target column of the training
            set; leave as None to encode a testing set

        ###Returns
        df_output : Pandas dataframe of the column 'encoded' with encoded
            values, one row per entry of feature_column, indexed 0..n-1.

        ###Raises
        ValueError: if feature_column holds a category not seen during fit.
        '''
        # One dictionary lookup per row instead of a linear list scan
        positions = self._category_positions()
        encode_values = []

        #Testing Set: No target column is given
        if target_column is None:
            for xx in feature_column:
                count_i, sum_i = self._stats_for(positions, xx)
                encode_values.append(sum_i / (count_i + self.RR))

        #Training Set
        else:
            # Hoisted out of the loop: the original rebuilt the full list of
            # targets for every row
            targets = target_column.values
            overall_mean = target_column.mean()

            for i, xx in enumerate(feature_column):
                y_i = targets[i]
                count_i, sum_i = self._stats_for(positions, xx)

                #If only one entry, nothing is left once the row is removed,
                #so fall back to the mean of the whole target column
                if count_i == 1:
                    ci = overall_mean
                else:
                    ci = (sum_i - y_i) / (count_i - 1 + self.RR)

                encode_values.append(ci)

        return pd.DataFrame({'encoded': encode_values})

### Create Dataframe

In [15]:
# Toy training data: a categorical 'color' feature and a numeric 'outcome'
df_train = pd.DataFrame({
    'color': [
        'Red', 'White', 'Pink', 'Green',
        'Blue', 'Blue', 'Blue', 'Blue',
        'Red', 'Red', 'Purple',
    ],
    'outcome': [1, 0, 0, 1, 2, 1, 1, 0, 0, 1, 0],
})

# Toy testing data with the same two columns
df_test = pd.DataFrame({
    'color': ['Green', 'Pink', 'White', 'Green', 'Red'],
    'outcome': [2, 2, 1, 1, 1],
})

# Split each frame into features (X) and target (y); both stay DataFrames
X_train = df_train.drop(columns='outcome')
y_train = df_train.drop(columns='color')

X_test = df_test.drop(columns='outcome')
y_test = df_test.drop(columns='color')


### Encode Training and Testing Sets

In [27]:
# RR = regularization factor, added to the denominator of each encoded ratio
RR = 0

# Pass the configured factor through rather than repeating the literal, so
# changing RR above actually changes the encoding
ec = Encoder_LOO(RR=RR)
ec.fit(X_train['color'], y_train['outcome'])

# Training set: the target is supplied, so each row's own outcome is left out
X_train_encode = ec.transform(X_train['color'], y_train['outcome'])
# Testing set: no target is supplied, so the full category statistics are used
X_test_encode = ec.transform(X_test['color'])

# transform() returns a one-column DataFrame; select that column explicitly
df_train['encoded'] = X_train_encode['encoded']
df_test['encoded'] = X_test_encode['encoded']

print('Training set \n', df_train)
print('Testing set \n', df_test)



Training set 
      color  outcome   encoded
0      Red        1  0.500000
1    White        0  0.636364
2     Pink        0  0.636364
3    Green        1  0.636364
4     Blue        2  0.666667
5     Blue        1  1.000000
6     Blue        1  1.000000
7     Blue        0  1.333333
8      Red        0  1.000000
9      Red        1  0.500000
10  Purple        0  0.636364
Testing set 
    color  outcome   encoded
0  Green        2  1.000000
1   Pink        2  0.000000
2  White        1  0.000000
3  Green        1  1.000000
4    Red        1  0.666667
