# Simple demo of a cuDF min-max scaler that maps each column to the [0, 1] range
> https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.MinMaxScaler.html

> Quite inelegant at the moment: both the kernel and the `apply_rows` call are built as strings.
> Warning: no bounds checking.

In [1]:
import numpy as np
from cudf.dataframe import DataFrame
import sklearn

In [2]:
# Demo frame: dim1 is the row index; dim2 and dim3 are the row index
# multiplied by a random 0/1 mask (dim3 is additionally scaled by 3 and
# shifted by 1).
nItems = 10
df = DataFrame()

df['dim1'] = np.arange(nItems)
df['dim2'] = np.arange(nItems) * np.random.randint(0, 2, size=nItems)
df['dim3'] = np.arange(nItems) * np.random.randint(0, 2, size=nItems) * 3 + 1

In [3]:
# Show the unscaled input frame.
print(df)

   dim1 dim2 dim3
 0    0    0    1
 1    1    1    1
 2    2    0    7
 3    3    0    1
 4    4    0    1
 5    5    0   16
 6    6    0   19
 7    7    7    1
 8    8    8   25
 9    9    0    1


# Kernel (applied per row)

In [4]:
def kernel(dim1, dim2, dim3,
           scaled_dim1, scaled_dim2, scaled_dim3,
           min1, min2, min3, max1, max2, max3):
    """Min-max scale three input columns into three output columns.

    For every row, ``scaled_dimK[row] = (dimK[row] - minK) / (maxK - minK)``.

    A column whose ``maxK`` equals ``minK`` has no range to scale by. Its
    divisor is replaced with 1, so every scaled value is 0.0 instead of a
    division by zero (scikit-learn's MinMaxScaler treats constant features
    the same way).

    Args:
        dim1, dim2, dim3: input columns, iterated in lockstep.
        scaled_dim1, scaled_dim2, scaled_dim3: output columns, written by
            row position.
        min1, min2, min3: per-column minimum.
        max1, max2, max3: per-column maximum.

    NOTE(review): this function is handed to ``apply_rows`` in this file;
    the body is kept to plain arithmetic and simple branches on the
    assumption that it gets compiled -- confirm before adding anything
    fancier.
    """
    # The divisors do not depend on the row, so compute them once up front.
    span1 = max1 - min1
    if span1 == 0:
        span1 = 1
    span2 = max2 - min2
    if span2 == 0:
        span2 = 1
    span3 = max3 - min3
    if span3 == 0:
        span3 = 1

    for iRow, (d1, d2, d3) in enumerate(zip(dim1, dim2, dim3)):
        scaled_dim1[iRow] = (d1 - min1) / span1
        scaled_dim2[iRow] = (d2 - min2) / span2
        scaled_dim3[iRow] = (d3 - min3) / span3

In [5]:
# Per-column extrema via the builtin min/max; they are passed to the kernel
# below as keyword scalars.
min1 = min(df['dim1'])
min2 = min(df['dim2'])
min3 = min(df['dim3'])
max1 = max(df['dim1'])
max2 = max(df['dim2'])
max3 = max(df['dim3'])

# Run the hand-written kernel: three input columns, three float64 outputs.
outdf = df.apply_rows(
    kernel,
    incols=['dim1', 'dim2', 'dim3'],
    outcols={
        'scaled_dim1': np.float64,
        'scaled_dim2': np.float64,
        'scaled_dim3': np.float64,
    },
    kwargs={
        'min1': min1, 'min2': min2, 'min3': min3,
        'max1': max1, 'max2': max2, 'max3': max3,
    },
)

In [6]:
# Show the frame returned by apply_rows (inputs plus the scaled_* columns).
print(outdf)

   dim1 dim2 dim3        scaled_dim1 scaled_dim2 scaled_dim3
 0    0    0    1                0.0         0.0         0.0
 1    1    1    1 0.1111111111111111       0.125         0.0
 2    2    0    7 0.2222222222222222         0.0        0.25
 3    3    0    1 0.3333333333333333         0.0         0.0
 4    4    0    1 0.4444444444444444         0.0         0.0
 5    5    0   16 0.5555555555555556         0.0       0.625
 6    6    0   19 0.6666666666666666         0.0        0.75
 7    7    7    1 0.7777777777777778       0.875         0.0
 8    8    8   25 0.8888888888888888         1.0         1.0
 9    9    0    1                1.0         0.0         0.0


In [7]:
def _min_max_kernel_source(columnNames):
    """Return Python source for a ``kernel`` function scaling ``columnNames``.

    The generated function takes, in order: the input columns, one
    ``scaled_<col>`` output per column, one ``min_<col>`` per column and one
    ``max_<col>`` per column. For each row it writes
    ``(value - min) / (max - min)``; a zero ``max - min`` is replaced by 1 so
    constant columns scale to 0.0 instead of dividing by zero.

    Args:
        columnNames: list of column names; each becomes an argument name of
            the generated function.

    Raises:
        ValueError: if the list is empty, a name is not a plain Python
            identifier, or generated names would collide with each other.
    """
    import keyword  # only needed here, for name validation

    if not columnNames:
        raise ValueError('cannot build a min-max kernel: dataframe has no columns')

    # Column names are spliced into source that is later exec'd, so anything
    # that is not a plain identifier is rejected up front (this also stops a
    # crafted column name from injecting code).
    for name in columnNames:
        isPlainName = (isinstance(name, str) and name.isidentifier()
                       and not keyword.iskeyword(name))
        if not isPlainName:
            raise ValueError(
                'column name {!r} cannot be used as a kernel argument; '
                'rename it to a valid Python identifier'.format(name))

    scaledNames = ['scaled_' + name for name in columnNames]
    minNames = ['min_' + name for name in columnNames]
    maxNames = ['max_' + name for name in columnNames]
    argNames = columnNames + scaledNames + minNames + maxNames

    # Index-based locals, so they do not depend on the column names.
    valueNames = ['_value{}'.format(i) for i in range(len(columnNames))]
    spanNames = ['_span{}'.format(i) for i in range(len(columnNames))]

    usedNames = argNames + valueNames + spanNames + ['_row']
    if len(set(usedNames)) != len(usedNames):
        raise ValueError(
            'column names {!r} produce duplicate kernel names '
            '(e.g. both "x" and "scaled_x" present)'.format(columnNames))

    lines = ['def kernel({}):'.format(', '.join(argNames))]

    # Row-independent divisors, computed once before the loop.
    for spanName, minName, maxName in zip(spanNames, minNames, maxNames):
        lines.append('    {} = {} - {}'.format(spanName, maxName, minName))
        lines.append('    if {} == 0:'.format(spanName))
        lines.append('        {} = 1'.format(spanName))

    # The trailing comma keeps the unpacking target a tuple even when there
    # is only one column.
    lines.append('    for _row, ({},) in enumerate(zip({})):'.format(
        ', '.join(valueNames), ', '.join(columnNames)))
    for scaledName, valueName, minName, spanName in zip(
            scaledNames, valueNames, minNames, spanNames):
        lines.append('        {}[_row] = ({} - {}) / {}'.format(
            scaledName, valueName, minName, spanName))

    return '\n'.join(lines) + '\n'


def build_min_max_kernel( targetDF ):
    """Min-max scale every column of ``targetDF`` with a generated kernel.

    A ``kernel`` function whose argument names match the frame's column
    names is generated as source and exec'd, then passed to
    ``targetDF.apply_rows`` together with each column's min and max.

    Args:
        targetDF: dataframe providing ``columns``, column lookup by name and
            ``apply_rows``. Column names must be valid Python identifiers.

    Returns:
        Whatever ``targetDF.apply_rows`` returns; in this file that is the
        input columns plus one float64 ``scaled_<col>`` column per input.

    Raises:
        ValueError: if the frame has no columns or a column name cannot be
            used as a Python argument name.
    """
    columnNames = [ targetDF.columns[iCol] for iCol in range( targetDF.columns.size ) ]

    ''' build and compile the kernel (names are validated inside the helper) '''
    kernelStr = _min_max_kernel_source( columnNames )
    localDict = {}
    exec(kernelStr, globals(), localDict)
    generatedKernel = localDict['kernel']

    ''' gather apply_rows arguments as real objects -- no second exec needed '''
    outcols = {}
    kwargs = {}
    for name in columnNames:
        outcols['scaled_' + name] = np.float64
    for name in columnNames:
        kwargs['min_' + name] = min(targetDF[name])
    for name in columnNames:
        kwargs['max_' + name] = max(targetDF[name])

    return targetDF.apply_rows( generatedKernel,
                                incols = columnNames,
                                outcols = outcols,
                                kwargs = kwargs )

In [8]:
# Show the input frame again before running the generated kernel on it.
print ( df )

   dim1 dim2 dim3
 0    0    0    1
 1    1    1    1
 2    2    0    7
 3    3    0    1
 4    4    0    1
 5    5    0   16
 6    6    0   19
 7    7    7    1
 8    8    8   25
 9    9    0    1


In [9]:
# Scale every column of df through the generated kernel and show the result.
scaledDF = build_min_max_kernel(df)
print(scaledDF)

   dim1 dim2 dim3        scaled_dim1 scaled_dim2 scaled_dim3
 0    0    0    1                0.0         0.0         0.0
 1    1    1    1 0.1111111111111111       0.125         0.0
 2    2    0    7 0.2222222222222222         0.0        0.25
 3    3    0    1 0.3333333333333333         0.0         0.0
 4    4    0    1 0.4444444444444444         0.0         0.0
 5    5    0   16 0.5555555555555556         0.0       0.625
 6    6    0   19 0.6666666666666666         0.0        0.75
 7    7    7    1 0.7777777777777778       0.875         0.0
 8    8    8   25 0.8888888888888888         1.0         1.0
 9    9    0    1                1.0         0.0         0.0
