In [1]:
import numpy as np 
import math
#Random seed for reproducibility
np.random.seed(1000)

# visualization libraries
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd



In [2]:
entropy_chunk = 2

def evaluateRenyiEntropy(stringComboAndFrequencyMap, numItems):
    renyiEntropySummand = 0

    for i in stringComboAndFrequencyMap:
        renyiEntropySummand += stringComboAndFrequencyMap[i] * (stringComboAndFrequencyMap[i] -1) / numItems / (numItems -1)

    return -1 * math.log2(renyiEntropySummand) if renyiEntropySummand > 0 else float('inf')

################################################
def formPartialString(inputString, positions):
    hash_data = []
    x_size = len(inputString)//entropy_chunk

    for k in range(len(positions)):
        if(positions[k] <= x_size):
            hash_data.append(inputString[positions[k] : positions[k] + entropy_chunk])

    hash_string = "".join(hash_data)
    hash_string += str(len(inputString))

    return hash_string
########################################################

def chooseBestPositionRenyi(maxLengthString, inputVec, currentPositions):
    bestEntropy = 0.0
    bestPosition = -1

    for i in range(maxLengthString//entropy_chunk):
        posUsed = False
        
        for position in currentPositions:
            if(position == i):
                posUsed = True
                break
        
        if(posUsed):
            continue
        
        testPositions = currentPositions.copy()
        testPositions.append(i)

        stringComboAndFrequencyMap = {}

        for x in inputVec:
            hash_string = formPartialString(x, testPositions)
            stringComboAndFrequencyMap[hash_string] = stringComboAndFrequencyMap.get(hash_string, 0) + 1

        # evaluate Renyi-Entropy
        positionRenyiEntropy = evaluateRenyiEntropy(stringComboAndFrequencyMap, len(inputVec))

        if(positionRenyiEntropy > bestEntropy):
            bestEntropy = positionRenyiEntropy
            bestPosition = i

    return bestPosition, bestEntropy

##   
def evaluateEntropy(testVec, positions):
    stringComboAndFrequencyMap = dict()
    for x in testVec:
        hash_string = formPartialString(x, positions)
        stringComboAndFrequencyMap[hash_string] = stringComboAndFrequencyMap.get(hash_string, 0) + 1

    return evaluateRenyiEntropy(stringComboAndFrequencyMap, len(testVec))

##
def greedyRenyiSelectorStopK(maxLengthString, inputVec, validationVec, returnPositions, returnEntropies, numPositions):
    entropy = 0.0

    for _ in range(numPositions):
        if(len(returnPositions) == (maxLengthString // entropy_chunk)):
            break

        nextPos, nextEntropy = chooseBestPositionRenyi(maxLengthString, inputVec, returnPositions)
        returnPositions.append(nextPos)

#        if(validationVec == inputVec):
        entropy = nextEntropy
#         else:
#             entropy = evaluateEntropy(validationVec, returnPositions)

        returnEntropies.append(entropy)

    return entropy

##
def greedyRenyiSelectorStopEntropy(maxLengthString, inputVec, returnPositions, desiredEntropy):
    entropy = 0.0

    while(entropy < desiredEntropy):
        if(len(returnPositions) == maxLengthString):
            break
        
        nextPos, nextEntropy = chooseBestPositionRenyi(maxLengthString, inputVec, returnPositions)
        returnPositions.append(nextPos)
        entropy = nextEntropy

    return entropy

In [3]:
df_set = pd.read_csv("/kaggle/input/vertical-set/new.csv", names = ["string"])
df_set['length'] = df_set["string"].apply(lambda x: len(x))

df_set['length'].min()

df_set

Unnamed: 0,string,length
0,768EC88A581E7F5EEA53D2FB7370B8AD0F,34
1,7685C487591D7959E94ECEFA736AB3B00D,34
2,7487C371561D7653E94CCBEF7267B0B10C,34
3,7080C466551C774BEA47C1EA6E61A9A90D,34
4,7182C567571C7A4AE945BFEC6F68A9AC0D,34
...,...,...
6318,56D91398B3C5BAFE05D7906C58775F19F1,34
6319,55DB1397B3C4B9FE04D78F6D56765F18F5,34
6320,53DC149AB3C8BDFF04DA936C54786118F7,34
6321,53DC15AAB3CCBEFF05D7926C55786018F8,34


In [4]:
numPositions = 10

entropy_chunk = 2
start = 6000
step = 252
##############################################################

inputVec = list(df_set['string'].iloc[start : start + step])
returnPositions = []
returnEntropies = []
maxLengthString = df_set['length'].max()
#print(max_length)

greedyRenyiSelectorStopK(maxLengthString, inputVec, inputVec, returnPositions, returnEntropies, numPositions)

print("normal")
print([int(x * entropy_chunk/2) for x in returnPositions])
print(returnEntropies)
print("------")

##########################################################

# inputVec = list(df_array_T)
# returnPositions = []
# returnEntropies = []
# maxLengthString = len(inputVec[0])

# greedyRenyiSelectorStopK(maxLengthString, inputVec, inputVec, returnPositions, returnEntropies, numPositions)

# print("transpose")
# print(returnPositions)
# print(returnEntropies)
# print("------")

normal
[14, 6, 1, 0, 2, 3, 4, 5, 7, 8]
[6.047956669469941, 10.778898476008377, inf, inf, inf, inf, inf, inf, inf, inf]
------
