# Process protein sequences

# Summary process-protein 6.14.2024
1. read from gz
1. convert to a non-FASTA format: for each record, the first line is the definition (label) and the second line is the amino-acid sequence
1. hashes
1. save labelMap and labelSeqHash as gz
1. find duplicates and count them
    1. convert to a set() to include only unique seqHash
    1. convert set to a list of unique seqHash to give index order
    1. count duplicates and add to a list of numOfDups same length as the unique seqHash
    1. loop over the original labelseqhash list; for each record, find its seqHash in the unique seqHash list and increment the entry at the same index in the numOfDups list.
    1. create a list of dicts (newList) where each element has the form {'XXXXXX': 8}: the key is the seqHash and the value is the number of duplicates
    1. Test by running findDuplicates(obj, item), where obj is result[1] (the list of labelHash/seqHash pairs),
    1. and
    1. findDup(obj, item), where obj is newList; it uses dict.get(item) to find the seqHash/number-of-duplicates pair.
1. Sorting PENDING

# GLOBAL

In [10]:
# GLOBALS
import sys
import json
import gzip
from hashlib import sha256
import os
from dotenv import load_dotenv

# Small in-memory sample: four records, each a dictionary with a 'label' (the
# FASTA definition line, including the leading '>') and a 'seq' (the amino-acid
# sequence as one string). It is passed to dict2hash() in the hashing test cell.
# The third label used to be split over two physical lines inside the string
# literal, which is a syntax error; it is joined here into a single line.
fastaProtein = [
    {'label': '>NP_000005.3 alpha-2-macroglobulin isoform a precursor [Homo sapiens]', 'seq': 'MGKNKLLHPSLVLLLLVLLPTDASVSGKPQYMVLVPSLLHTETTEKGCVLLSYLNETVTVSASLESVRGNRSLFTDLEAENDVLHCVAFAVPKSSSNEEVMFLTVQVKGPTQEFKKRTTVMVKNEDSLVFVQTDKSIYKPGQTVKFRVVSMDENFHPLNELIPLVYIQDPKGNRIAQWQSFQLEGGLKQFSFPLSSEPFQGSYKVVVQKKSGGRTEHPFTVEEFVLPKFEVQVTVPKIITILEEEMNVSVCGLYTYGKPVPGHVTVSICRKYSDASDCHGEDSQAFCEKFSGQLNSHGCFYQQVKTKVFQLKRKEYEMKLHTEAQIQEEGTVVELTGRQSSEITRTITKLSFVKVDSHFRQGIPFFGQVRLVDGKGVPIPNKVIFIRGNEANYYSNATTDEHGLVQFSINTTNVMGTSLTVRVNYKDRSPCYGYQWVSEEHEEAHHTAYLVFSPSKSFVHLEPMSHELPCGHTQTVQAHYILNGGTLLGLKKLSFYYLIMAKGGIVRTGTHGLLVKQEDMKGHFSISIPVKSDIAPVARLLIYAVLPTGDVIGDSAKYDVENCLANKVDLSFSPSQSLPASHAHLRVTAAPQSVCALRAVDQSVLLMKPDAELSASSVYNLLPEKDLTGFPGPLNDQDNEDCINRHNVYINGITYTPVSSTNEKDMYSFLEDMGLKAFTNSKIRKPKMCPQLQQYEMHGPEGLRVGFYESDVMGRGHARLVHVEEPHTETVRKYFPETWIWDLVVVNSAGVAEVGVTVPDTITEWKAGAFCLSEDAGLGISSTASLRAFQPFFVELTMPYSVIRGEAFTLKATVLNYLPKCIRVSVQLEASPAFLAVPVEKEQAPHCICANGRQTVSWAVTPKSLGNVNFTVSAEALESQELCGTEVPSVPEHGRKDTVIKPLLVEPEGLEKETTFNSLLCPSGGEVSEELSLKLPPNVVEESARASVSVLGDILGSAMQNTQNLLQMPYGCGEQNMVLFAPNIYVLDYLNETQQLTPEIKSKAIGYLNTGYQRQLNYKHYDGSYSTFGERYGRNQGNTWLTAFVLKTFAQARAYIFIDEAHITQALIWLSQRQKDNGCFRSSGSLLNNAIKGGVEDEVTLSAYITIALLEIPLTVTHPVVRNALFCLESAWKTAQEGDHGSHVYTKALLAYAFALAGNQDKRKEVLKSLNEEAVKKDNSVHWERPQKPKAPVGHFYEPQAPSAEVEMTSYVLLAYLTAQPAPTSEDLTSATNIVKWITKQQNAQGGFSSTQDTVVALHALSKYGAATFTRTGKAAQVTIQSSGTFSSKFQVDNNNRLLLQQVSLPELPGEYSMKVTGEGCVYLQTSLKYNILPEKEEFPFALGVQTLPQTCDEPKAHTSFQISLSVSYTGSRSASNMAIVDVKMVSGFIPLKPTVKMLERSNHVSRTEVSSNHVLIYLDKVSNQTLSLFFTVLQDVPVRDLKPAIVKVYDYYETDEFAIAEYNAPCSKDLGNA'},
    {'label': '>NP_000006.2 arylamine N-acetyltransferase 2 [Homo sapiens]', 'seq': 'MDIEAYFERIGYKNSRNKLDLETLTDILEHQIRAVPFENLNMHCGQAMELGLEAIFDHIVRRNRGGWCLQVNQLLYWALTTIGFQTTMLGGYFYIPPVNKYSTGMVHLLLQVTIDGRNYIVDAGSGSSSQMWQPLELISGKDQPQVPCIFCLTEERGIWYLDQIRREQYITNKEFLNSHLLPKKKHQKIYLFTLEPRTIEDFESMNTYLQTSPTSSFITTSFCSLQTPEGVYCLVGFILTYRKFNYKDNTDLVEFKTLTEEEVEEVLRNIFKISLGRNLVPKPGDGSLTI'},
    {'label': '>YP_003024037.1 NADH dehydrogenase subunit 6 (mitochondrion) [Homo sapiens]', 'seq': 'MMYALFLLSVGLVMGFVGFSSKPSPIYGGLVLIVSGVVGCVIILNFGGGYMGLMVFLIYLGGMMVVFGYTTAMAIEEYPEAWGSGVEVLVSVLVGLAMEVGLVLWVKEYDGVVVVVNFNSVGSWMIYEGEGSGLIREDPIGAGALYDYGRWLVVVTGWTLFVGVYIVIEIARGN'},
    {'label': '>YP_003024038.1 cytochrome b (mitochondrion) [Homo sapiens]', 'seq': 'MTPMRKTNPLMKLINHSFIDLPTPSNISAWWNFGSLLGACLILQITTGLFLAMHYSPDASTAFSSIAHITRDVNYGWIIRYLHANGASMFFICLFLHIGRGLYYGSFLYSETWNIGIILLLATMATAFMGYVLPWGQMSFWGATVITNLLSAIPYIGTDLVQWIWGGYSVDSPTLTRFFTFHFILPFIIAALATLHLLFLHETGSNNPLGITSHSDKITFHPYYTIKDALGLLLFLLSLMTLTLFSPDLLGDPDNYTLANPLNTPPHIKPEWYFLFAYTILRSVPNKLGGVLALLLSILILAMIPILHMSKQQSMMFRPLSQSLYWLLAADLLILTWIGGQPVSYPFTIIGQVASVLYFTTILILMPTISLIENKMLKWA'},
]
# Directory (machine-specific absolute path, with trailing separator) and file
# name of the gzip-compressed input; later cells join them with '+'.
inputPath = 'C:\\Users\\creeperpandatrex\\Documents\\1000genomes\\DATA\\38.p14\\'
inputFile = 'GCF_000001405.40_GRCh38.p14_protein.faa.gz'

# print(os.getenv('SEED'))

# Definition/function

In [3]:
def fasta2one(myPath):
    """Parse FASTA text into a list of ``{"label": ..., "seq": ...}`` dictionaries.

    Parameters
    ----------
    myPath
        Despite the name, an already-open text handle, not a path string:
        any object that is a context manager and yields lines when iterated,
        e.g. ``gzip.open(path, 'rt')``. It is closed when the function returns.

    Returns
    -------
    list of dict
        One dictionary per line that starts with ``'>'``. ``"label"`` is that
        line stripped of surrounding whitespace (the ``'>'`` is kept);
        ``"seq"`` is the concatenation of the stripped lines that follow, up
        to the next label. Lines that appear before the first label are
        ignored. Input without any label line (including empty input) gives
        ``[]`` rather than a blank placeholder record.
    """
    records = []
    label = None      # label of the record being collected; None before the first '>'
    seqParts = []     # stripped sequence lines of the current record

    with myPath as file:
        for line in file:
            # startswith() is safe for empty strings, unlike line[0].
            if line.startswith('>'):
                if label is not None:
                    records.append({"label": label, "seq": ''.join(seqParts)})
                label = line.strip()
                seqParts = []
            else:
                seqParts.append(line.strip())

    # Flush the last record; skipped when no label line was ever seen.
    if label is not None:
        records.append({"label": label, "seq": ''.join(seqParts)})
    return records

# globalDictionary = fasta2one(gzip.open(inputFile, 'rt'))

# Save protein in non fasta format: two lines, definition/aminoacid sequence

In [8]:
def print2file (myPath, result):
    """Write the string *result* to the file at *myPath*.

    The file is opened in text mode ``'w'``, so existing content is replaced.
    *result* must already be a ``str`` (e.g. the output of ``json.dumps``).
    The ``with`` block closes the handle, so the data is flushed to disk
    before the function returns. Returns ``None``.
    """
    with open(myPath, 'w') as out:
        out.write(result) # it must be a string

# Open the gzip-compressed input in text mode ('rt') and parse it into a list of
# {"label", "seq"} dictionaries; fasta2one() closes the handle it is given.
globalDictionary = fasta2one(gzip.open(inputPath + inputFile, 'rt'))

# Dump the parsed records as a single JSON string into "<input file>.nonfasta".
print2file(inputPath + inputFile + '.nonfasta', json.dumps(globalDictionary))

# Hashing

In [None]:
# Test
def dict2hash (obj):
    """Print salted SHA-256 digests for every record in *obj* (test version).

    *obj* is an iterable of dictionaries with ``'label'`` and ``'seq'`` string
    values. For each record two tab-separated lines are printed:
    ``label<TAB>labelHash`` and ``labelHash<TAB>seqHash``, where each hash is
    the hex SHA-256 of the text with the ``SEED`` environment variable appended.
    Nothing is returned.

    Raises
    ------
    RuntimeError
        If ``SEED`` is not set. Without this check the string concatenation
        fails with an unhelpful ``TypeError`` about ``NoneType``.
    """
    # Read the salt once instead of twice per record.
    seed = os.getenv('SEED')
    if seed is None:
        raise RuntimeError(
            "environment variable SEED is not set; export it or call "
            "load_dotenv() before hashing"
        )
    for item in obj:
        labelHash = sha256((item['label'] + seed).encode()).hexdigest()
        seqHash = sha256((item['seq'] + seed).encode()).hexdigest()
        print(item['label'] + '\t' + labelHash)
        print(labelHash + '\t' + seqHash)
        

# Try the hashing on the small in-memory sample; this version only prints.
dict2hash(fastaProtein)

In [17]:
# All data
def dict2hash (obj):
    """Hash the label and sequence of every record in *obj*.

    *obj* is an iterable of dictionaries with ``'label'`` and ``'seq'`` string
    values. Each hash is the hex SHA-256 digest of the text with the ``SEED``
    environment variable appended.

    Returns
    -------
    list
        A two-element list ``[labelMap, labelSeqHash]``, both in input order:

        * ``labelMap``: ``{'label': ..., 'labelHash': ...}`` per record
        * ``labelSeqHash``: ``{'labelHash': ..., 'seqHash': ...}`` per record

    Raises
    ------
    RuntimeError
        If ``SEED`` is not set. Without this check the string concatenation
        fails with an unhelpful ``TypeError`` about ``NoneType``.
    """
    # Read the salt once instead of twice per record.
    seed = os.getenv('SEED')
    if seed is None:
        raise RuntimeError(
            "environment variable SEED is not set; export it or call "
            "load_dotenv() before hashing"
        )
    listDicLabelHash = []
    listDicLabelHashSeqHash = []
    for item in obj:
        labelHash = sha256((item['label'] + seed).encode()).hexdigest()
        seqHash = sha256((item['seq'] + seed).encode()).hexdigest()
        listDicLabelHash.append({'label': item['label'], 'labelHash': labelHash})
        listDicLabelHashSeqHash.append({'labelHash': labelHash, 'seqHash': seqHash})
    return [listDicLabelHash, listDicLabelHashSeqHash]

# dict2hash(fastaProtein)
    
print(inputPath)
print(inputFile)

# Hash the whole data set once and reuse the result. Calling
# dict2hash(globalDictionary) separately for each output file and again for
# `result` repeated the same SHA-256 work three times.
result = dict2hash(globalDictionary)

# result[0] is the label -> labelHash map, result[1] the labelHash -> seqHash pairs.
print2file(inputPath + inputFile + ".labelmap", json.dumps(result[0]))
print2file(inputPath + inputFile + ".labelseqhash", json.dumps(result[1]))

print(len(result[0]))
print(len(result[1]))

C:\Users\creeperpandatrex\Documents\1000genomes\DATA\38.p14\
GCF_000001405.40_GRCh38.p14_protein.faa.gz
136194
136194


## Save to file after compressing
1. https://docs.python.org/3/library/gzip.html#examples-of-usage
1. https://stackoverflow.com/questions/66250471/compresing-large-files-with-gzip-in-python

## Compressing options: compress a list of dictionaries, compress a saved file

### result : it contains an array of 2 elements: labelmap and labelseqhash

```
BLOCK_SIZE = 8192

with open(myfile, "rb") as f_in, gzip.open(output_file, 'wb') as f_out:
    while True:
        content = f_in.read(BLOCK_SIZE)
        if not content:
            break
        f_out.write(content)

import gzip
with gzip.open('/home/joe/file.txt.gz', 'rb') as f:
    file_content = f.read()

import gzip
content = b"Lots of content here"
with gzip.open('/home/joe/file.txt.gz', 'wb') as f:
    f.write(content)


import gzip
import shutil
with open('/home/joe/file.txt', 'rb') as f_in:
    with gzip.open('/home/joe/file.txt.gz', 'wb') as f_out:
        shutil.copyfileobj(f_in, f_out)

Example of how to GZIP compress a binary string:

import gzip
s_in = b"Lots of content here"
s_out = gzip.compress(s_in)

```

In [23]:
# Experiment: write the same JSON payload (result[0]) to two gzip files, once
# with mode 'wb' and once with mode 'w', to compare the two modes.
with gzip.open(inputPath+inputFile+'.WB', 'wb') as f:
    # f.write(json.dumps(result[0]))
    f.write((json.dumps(result[0])).encode())

with gzip.open(inputPath+inputFile+'.NOB', 'w') as f:
    # f.write(json.dumps(result[0])) # list object is not a str
    f.write((json.dumps(result[0])).encode()) # json.dumps() returns a str; encode() converts it to the bytes that gzip's binary write mode needs

#### creeperpandatrex@creepypandatrex MINGW64 ~/Documents/1000genomes/DATA/38.p14

1. $ diff GCF_000001405.40_GRCh38.p14_protein.faa.WB GCF_000001405.40_GRCh38.p14_protein.faa.NOB
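1. Binary files GCF_000001405.40_GRCh38.p14_protein.faa.WB and GCF_000001405.40_GRCh38.p14_protein.faa.NOB differ. This is expected even though the payload is identical: for `gzip.open` the modes `'w'` and `'wb'` are the same binary mode, but the gzip header stores the file name and a modification time, so the two files are not byte-identical.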


## Save labelMap and labelSeqHash in compressed form

In [24]:
# 
# print2file(inputPath + inputFile + ".labelmap", json.dumps(dict2hash(globalDictionary)[0]))
# print2file(inputPath + inputFile + ".labelseqhash", json.dumps(dict2hash(globalDictionary)[1]))

def print2gz (myPath, result, addExt):
    """Save *result* as gzip-compressed JSON in the file ``myPath + addExt``.

    *result* is serialized with ``json.dumps`` and encoded to bytes before it
    is written; an existing file of that name is replaced. Returns ``None``.
    """
    target = myPath + addExt
    with gzip.open(target, 'w') as gzHandle:
        # The gzip handle takes bytes, so the JSON text is encoded first.
        encoded = json.dumps(result).encode()
        gzHandle.write(encoded)

# Store both hash tables as gzip-compressed JSON next to the input file:
# result[0] (label -> labelHash) and result[1] (labelHash -> seqHash).
print2gz(inputPath + inputFile, result[0], '.labelmap.gz')
print2gz(inputPath + inputFile, result[1], '.labelseqhash.gz')

## In labelseqhash.gz
1. Find sequence duplicates and count them.

## Do statistics and save unique sequences into a file: 
1. One unique sequence can map to many labels/names (the definition lines that start with the `>` character)

In [None]:
#          labelmap    labelseqhash
print(len(result[0]), len(result[1]))

# Hand-copied sample of six labelHash/seqHash records, kept for reference;
# the fourth and fifth entries share the same seqHash (a duplicate sequence).
# The loop below searches the full result[1], not this sample.
labelSeqHashPartial = [
     {"labelHash": "87eecab07dd6bbe71a3205767544365c513b6a9a2e5492c066582ccc4f7b0f58", "seqHash": "df32ee41d17e99fbae8a7b983e4a0f75cc288f2c4aca5d9d783a7598212c2546"}, {"labelHash": "f3fd5d36fc2cc34c02fa81d61a7bea5c02f2eb9ab02f2d0fd0a0fe09cb56f9e4", "seqHash": "8889a919f2537b3c7e87d36ef6baf37f5eca22e6e2b29bb6e2d017aced2a2777"}, {"labelHash": "b7554290ad15c0b939e7cef993be2bd8e845763fdd31ca14cb9d4d55ea7f1587", "seqHash": "c6a35324b6fe560f3e1bc1c4df4aaebf9d53f8c0c034da0e7f98a9041d31b5dc"}, {"labelHash": "fc588a8606e3ecb6c848f06a0aa8db87bdf2f8522c73993a34fc7c81f449f112", "seqHash": "45075f7ccf617e0af6d435f1799c003ddaa7cc915683a265958551a1a519b262"}, {"labelHash": "493e573302b81fef363a1e4f01de0232f89139ed8da63df5f31d695d15c1c8dd", "seqHash": "45075f7ccf617e0af6d435f1799c003ddaa7cc915683a265958551a1a519b262"}, {"labelHash": "aa488e6c8dd16fd769848b36fb65284960573f0ae4023cf17cfd79e0826cd4df", "seqHash": "f897dd2f861cba0dd3f9a5215c0b3a4f6efae55b9ec74f93bffc8f418b1dead2"}
]

# item is the seqHash searched for below; item2 is the labelHash of the first
# sample entry and is not used in this cell.
item = '8889a919f2537b3c7e87d36ef6baf37f5eca22e6e2b29bb6e2d017aced2a2777'
item2 = '87eecab07dd6bbe71a3205767544365c513b6a9a2e5492c066582ccc4f7b0f58'




# Linear scan: print every labelHash/seqHash record whose seqHash equals item.
for i in result[1]:
    if item == i['seqHash']:
        print(i)
    


In [None]:
#          labelmap    labelseqhash
print(len(result[0]), len(result[1]))

# Hand-copied sample of six labelHash/seqHash records, kept for reference;
# the fourth and fifth entries share the same seqHash (a duplicate sequence).
labelSeqHashPartial = [
     {"labelHash": "87eecab07dd6bbe71a3205767544365c513b6a9a2e5492c066582ccc4f7b0f58", "seqHash": "df32ee41d17e99fbae8a7b983e4a0f75cc288f2c4aca5d9d783a7598212c2546"}, {"labelHash": "f3fd5d36fc2cc34c02fa81d61a7bea5c02f2eb9ab02f2d0fd0a0fe09cb56f9e4", "seqHash": "8889a919f2537b3c7e87d36ef6baf37f5eca22e6e2b29bb6e2d017aced2a2777"}, {"labelHash": "b7554290ad15c0b939e7cef993be2bd8e845763fdd31ca14cb9d4d55ea7f1587", "seqHash": "c6a35324b6fe560f3e1bc1c4df4aaebf9d53f8c0c034da0e7f98a9041d31b5dc"}, {"labelHash": "fc588a8606e3ecb6c848f06a0aa8db87bdf2f8522c73993a34fc7c81f449f112", "seqHash": "45075f7ccf617e0af6d435f1799c003ddaa7cc915683a265958551a1a519b262"}, {"labelHash": "493e573302b81fef363a1e4f01de0232f89139ed8da63df5f31d695d15c1c8dd", "seqHash": "45075f7ccf617e0af6d435f1799c003ddaa7cc915683a265958551a1a519b262"}, {"labelHash": "aa488e6c8dd16fd769848b36fb65284960573f0ae4023cf17cfd79e0826cd4df", "seqHash": "f897dd2f861cba0dd3f9a5215c0b3a4f6efae55b9ec74f93bffc8f418b1dead2"}
]

# seqHash used for the membership check against the set of unique hashes.
item = '8889a919f2537b3c7e87d36ef6baf37f5eca22e6e2b29bb6e2d017aced2a2777'


# for i in result[1]:
#     if item == i['seqHash']:
#         print(i)
    
# Collect the distinct seqHash values: a set keeps no duplicates and no order.
mySetUniqueSeqHash = {record['seqHash'] for record in result[1]}
print(len(mySetUniqueSeqHash)) # 89832
print(item in mySetUniqueSeqHash) # True

# Next step: build a list of dicts with key: seqHash, value: [label one, two, three, ...].
# A list copy of the set gives every unique seqHash a fixed index.

listOfUnique = list(mySetUniqueSeqHash) # 89832
print(len(listOfUnique))


In [63]:
def convert(mySet):
    """Return the elements of *mySet* as a new list, in the set's iteration order.

    ``list()`` already copies any iterable, so the former identity
    ``map(lambda x: x, ...)`` pass added nothing.
    """
    return list(mySet)

# Driver code: turn the set of unique seqHash values into a list via convert()
# and print its length.
listOfUniqueMap = convert(mySetUniqueSeqHash)
print(len(listOfUniqueMap))

89832


In [64]:
# Turn the list of unique seqHash values into a list of single-key
# dictionaries, one per hash, each starting with an empty list as its value:
# [{seqHash: []}, ...]. The order follows listOfUnique.
listOfDicsOfDups = [{uniqueHash: []} for uniqueHash in listOfUnique]

print(len(listOfDicsOfDups))

89832


In [None]:
# now append labels and/or labelhash to dics value array
def findIndex(item):
    """Return the position of *item* in the module-level list ``listOfUnique``.

    Uses ``list.index``, so a ``ValueError`` is raised when *item* is absent.
    """
    return listOfUnique.index(item)

print(listOfUnique.index('df32ee41d17e99fbae8a7b983e4a0f75cc288f2c4aca5d9d783a7598212c2546'))
# position 40220 (in the recorded run)

# Build the seqHash -> index table once. Looking each record up with
# list.index() scanned listOfUnique from the start for every record, which is
# quadratic work; a dict lookup is constant time per record. listOfUnique was
# built from a set, so every seqHash has exactly one index.
indexOfSeqHash = {uniqueHash: idx for idx, uniqueHash in enumerate(listOfUnique)}

# numofDups[k] counts how many records in result[1] carry listOfUnique[k].
numofDups = [0]*len(listOfUnique)
counter = 0
for i in result[1]:
    # A seqHash missing from listOfUnique now raises KeyError (list.index()
    # raised ValueError); neither can happen while listOfUnique is derived
    # from result[1].
    findIdx = indexOfSeqHash[i['seqHash']]
    counter += 1
    # Per-record progress trace kept from the original cell; it can be removed
    # now that the loop no longer takes long.
    print('--------------- ', counter)
    numofDups[findIdx] += 1
    
        

In [112]:
# Sanity check: the two parallel lists should have the same length.
print(len(numofDups))
print(len(listOfDicsOfDups)) # [{i:[]},......]

# Look at the first count and the first placeholder dictionary.
print(numofDups[0])
print(listOfDicsOfDups[0])
# dict.get() returns the stored value for the key, or None if this dictionary
# does not contain it.
# NOTE(review): the hard-coded key is the first seqHash of the recorded run;
# presumably the set-derived order changes between sessions - confirm before reuse.
print(listOfDicsOfDups[0].get('7930b7752c9b48d6cf08d8fd011db75fe8f5e750121a058a632ce3f9df44d5c0'))




89832
89832
1
{'7930b7752c9b48d6cf08d8fd011db75fe8f5e750121a058a632ce3f9df44d5c0': []}
[]


# newList: key: seqHash, value: number of duplicates
## algorithm from process-protein ipynb
1. cell 61: from result[0] labelMap result[1] labelseqHash
1. from result[1] loop dictionary['seqHash'] add to a set
1. set() does not accept duplicates, but it is unordered
1. mySetUniqueSeqHash set 89832
1. convert the set to list by list(mySet) to provide an index order
1. listOfUnique 89832
1. cell 64: create a list of dics duplicated as tuple('seqHash': 'xxxx', 'numOfDups': 19)
1. or
1. as {'XXXXXX': 19}
1. or
1. as {'XXXXX': []} to be able to add multiple label-name hashes
1. I decided for {'XXXXX': 19}
1. listOfDicsOfDups
1. cell 65: create a parallel list of the same length as listOfUnique and populate it with zeros or None values; otherwise assigning by index will complain that the index is out of bounds
1. count number of duplicates by looping result[1] and adding one at the findIndex(i['seqHash']) in numOfDups list
1. Finally, parallel loop listOfDicsOfDups and numOfDups to create a dic element type {'XXXXXXX': 3} total newList 89832
1. Pending to confirm the correct mapping of seqHash to number of duplicates

In [120]:
# Combine the two parallel lists into newList = [{seqHash: count}, ...].
# Each entry of listOfDicsOfDups is a single-key placeholder {seqHash: []};
# the matching count sits at the same index in numofDups. Both lists were
# sized from listOfUnique, so zip() pairs them one-to-one. This replaces the
# manual counter, the unused `obj` and the per-iteration reset of `newDic`.
newList = [
    {next(iter(placeholder)): dupCount}  # first (only) key of the placeholder
    for placeholder, dupCount in zip(listOfDicsOfDups, numofDups)
]

print(len(newList))
print(newList[0])

89832
{'7930b7752c9b48d6cf08d8fd011db75fe8f5e750121a058a632ce3f9df44d5c0': 1}


In [None]:
# Spot-check a few {seqHash: count} entries of newList.
print(newList[0])
print(newList[1])
print(newList[2])
print(newList[3])
print(newList[34])

print(newList[0])
print(newList[1])
print(newList[2])
print(newList[3])
print(newList[34])
### output
{'7930b7752c9b48d6cf08d8fd011db75fe8f5e750121a058a632ce3f9df44d5c0': 1}
{'b6d31d96c81febb887c419a0bc15361e2f4d426388110814d2eaab042834cc87': 1}
{'bbcefaf6463086437b6d2be1d8abf4d814386c2d0aa15f479c956002ca14d135': 1}
{'90e6b2a36ee7c3571d973fc78e49101eaf468cdff06cacecd31c53f015e277cb': 2}
{'eb28c8a63f6d3dac50ad1ce8b0f0b56fb2081e41ebe4a5b7a20212041b33c6a3': 1}


# algorithm from process-protein ipynb
1. cell 61: from result[0] labelMap result[1] labelseqHash
1. from result[1] loop dictionary['seqHash'] add to a set
1. set() does not accept duplicates, but it is unordered
1. mySetUniqueSeqHash set 89832
1. convert the set to list by list(mySet) to provide an index order
1. listOfUnique 89832
1. cell 64: create a list of dics duplicated as tuple('seqHash': 'xxxx', 'numOfDups': 19)
1. or
1. as {'XXXXXX': 19}
1. or
1. as {'XXXXX': []} to be able to add multiple label-name hashes
1. I decided for {'XXXXX': 19}
1. listOfDicsOfDups
1. cell 65: create a parallel list of the same length as listOfUnique and populate it with zeros or None values; otherwise assigning by index will complain that the index is out of bounds
1. count number of duplicates by looping result[1] and adding one at the findIndex(i['seqHash']) in numOfDups list
1. Finally, parallel loop listOfDicsOfDups and numOfDups to create a dic element type {'XXXXXXX': 3} total newList 89832
1. Pending to confirm the correct mapping of seqHash to number of duplicates

In [125]:
def findDuplicates(obj, item):
    """Print every record in *obj* whose ``'seqHash'`` value equals *item*.

    *obj* is an iterable of dictionaries that each have a ``'seqHash'`` key.
    Matching records are printed in input order; nothing is returned.
    """
    matching = (record for record in obj if item == record['seqHash'])
    for record in matching:
        print(record)
def findDup(obj, item):
    """Print every dictionary in *obj* that has *item* as a key.

    *obj* is an iterable of dictionaries such as ``{seqHash: count}``.
    Membership is tested with ``in`` rather than the truthiness of
    ``dict.get(item)``, so entries whose value is ``0``, ``[]`` or another
    falsy value are reported too. Nothing is returned.
    """
    for entry in obj:
        if item in entry:
            print(entry)
         
# Cross-check one seqHash: list its records in result[1] ...
findDuplicates(result[1], '90e6b2a36ee7c3571d973fc78e49101eaf468cdff06cacecd31c53f015e277cb')

# ... and show its {seqHash: count} entry in newList. findDup() prints the match
# itself and has no return statement, so the surrounding print() also emits "None".
print(findDup(newList, '90e6b2a36ee7c3571d973fc78e49101eaf468cdff06cacecd31c53f015e277cb'))


{'labelHash': 'bfba98cd8d86c34fc125975957ce1e6faeb612c76942eb6c34c19b00e670d168', 'seqHash': '90e6b2a36ee7c3571d973fc78e49101eaf468cdff06cacecd31c53f015e277cb'}
{'labelHash': '75d470abc1a64d03778514694fa7792f4e1de60d7f0bf34e9bd190ff3b28f0e2', 'seqHash': '90e6b2a36ee7c3571d973fc78e49101eaf468cdff06cacecd31c53f015e277cb'}
{'90e6b2a36ee7c3571d973fc78e49101eaf468cdff06cacecd31c53f015e277cb': 2}
None


## output
{'labelHash': 'bfba98cd8d86c34fc125975957ce1e6faeb612c76942eb6c34c19b00e670d168', 'seqHash': '90e6b2a36ee7c3571d973fc78e49101eaf468cdff06cacecd31c53f015e277cb'}
{'labelHash': '75d470abc1a64d03778514694fa7792f4e1de60d7f0bf34e9bd190ff3b28f0e2', 'seqHash': '90e6b2a36ee7c3571d973fc78e49101eaf468cdff06cacecd31c53f015e277cb'}
{'90e6b2a36ee7c3571d973fc78e49101eaf468cdff06cacecd31c53f015e277cb': 2}
None


# Sorting

In [133]:
print(newList[0])

# Sort the {seqHash: count} dictionaries by their single value (the count),
# ascending, so the most duplicated sequences end up at the tail.
# Notes kept from earlier attempts: sorted(list_of_dicts, key=lambda d: d['name'])
# works when every dictionary shares a key name; here each dictionary has its
# own key, so the sort key is the first value instead.
mySorted = sorted(newList, key=lambda entry: [*entry.values()][0])
print(len(mySorted))

# Print the five largest counts, highest first.
for offset in range(1, 6):
    print(mySorted[-offset])

{'7930b7752c9b48d6cf08d8fd011db75fe8f5e750121a058a632ce3f9df44d5c0': 1}
89832
{'09cfb22627c69fe2a832df7f09fdcb80ddd44ac5875432677b03e54fd1d3e337': 102}
{'3a96db023518af68d4879abed8848f635d5d655e9322d22a37100d32c89457a9': 101}
{'41013e8a64849f25582af88ac47befcda56d5f1c46b6332f1400203f252ca3f5': 71}
{'282fdbd8bf771ecef24ea29fe94f3cbe90f1f038891100bc44c7b84510488673': 62}
{'e3ded293d60d16bdf71d3d5bfa526cafe04588ad9c155ebd1bec524a319885ab': 54}
