Skip to content

Commit

Permalink
Fix visualcurrent#23: Cover full Unicode table
Browse files Browse the repository at this point in the history
  • Loading branch information
omegakid1902 committed Aug 29, 2021
1 parent 223dd72 commit 5bf81e2
Showing 1 changed file with 23 additions and 32 deletions.
55 changes: 23 additions & 32 deletions N2Omodule.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,24 @@
from csv import DictReader
from pathlib import Path

def notion_mix_decoded_string_convert(string: str) -> str:
    """Decode the percent-encoded bytes of a partially decoded string.

    The input may mix already-decoded characters with ``%XX`` escapes
    (for example ``"Pha%CC%81t%20trie"``).  Every ``%XX`` escape is turned
    into the raw byte it denotes, runs of such bytes are decoded as UTF-8,
    and all other characters are kept as they are.

    Hex digits are accepted in either case (``%C4`` and ``%c4`` are
    equivalent, as percent-encoding is case-insensitive).  A ``%`` that is
    not followed by two hex digits is left in the output verbatim.

    Args:
        string: Text that may contain ``%XX`` escapes.

    Returns:
        The text with every ``%XX`` escape replaced by the character(s) the
        escaped UTF-8 bytes encode.

    Raises:
        UnicodeDecodeError: If the escaped bytes do not form valid UTF-8.
    """
    # Imported locally so the function does not depend on which names the
    # module happens to import at file level.
    from urllib.parse import unquote

    # errors="strict" keeps the failure mode of decoding invalid UTF-8
    # loudly instead of silently inserting replacement characters (which is
    # what unquote() does by default).
    return unquote(string, encoding="utf-8", errors="strict")


def str_slash_char_remove(string):

Expand Down Expand Up @@ -116,7 +134,7 @@ def N2Ocsv(csvFile):
line = line.rstrip()
#1 Replace URL identifiers and/or symbols with a space
line = regexURLid.sub(" ",line)
line = regexSymbols.sub(" ",line)
line = regexSymbols.sub("",line)
#2 Remove duplicate spaces
line = regexSpaces.sub(" ", line)
#3 Remove any spaces at beginning
Expand Down Expand Up @@ -208,35 +226,10 @@ def embedded_link_convert(line):
relativePath = str_space_utf8_replace(relativePath)

utf8_match = regexutf8.search(relativePath)
while utf8_match:
is_special_utf8 = False
utf8_match = regexutf8.search(relativePath)
if utf8_match:
byte_1 = "0x" + utf8_match.group(1)
byte_2 = "0x" + utf8_match.group(2)

if (byte_1[0:3] == "0xE") and (byte_1[3] in ['1', '2', '3', '4', '5', '6']):

special_utf8_match = regexSpecialUtf8.search(relativePath)
byte_3 = "0x" + special_utf8_match.group(3)
bytes_unicode = bytes([int(byte_1,0), int(byte_2,0), int(byte_3,0)])
is_special_utf8 = True
else:
bytes_unicode = bytes([int(byte_1,0), int(byte_2,0)])

try:
unicode_str = str(bytes_unicode, 'utf-8')
except:
print("ERROR: convert unicode failed")
print(f" {bytes_unicode} in - {line}")
break

if is_special_utf8:
relativePath = regexSpecialUtf8.sub(unicode_str, relativePath, 1)
else:
relativePath = regexutf8.sub(unicode_str, relativePath, 1)

line, num_matchs = regexPath.subn("[["+relativePath+"]]", line)
if utf8_match:
relativePath = notion_mix_decoded_string_convert(relativePath)

line, num_matchs = regexPath.subn("![["+relativePath+"]]", line)

if num_matchs > 1:
print(f"Warning: {line} replaced {num_matchs} matchs!!")
Expand All @@ -260,7 +253,6 @@ def internal_link_convert(line):
regexRelativePathNotion = compile("https:\/\/www\.notion\.so")
regexRelativePathMdCsv = compile("(?:\.md|\.csv)")
regexRelativePathImage = compile("(?:\.png|\.jpg|\.gif|\.bmp|\.jpeg|\.svg)")
regexSlash = compile("\/")

num_matchs = 0
# Identify and group relative paths
Expand All @@ -272,7 +264,6 @@ def internal_link_convert(line):
relativePath = pathMatch.group(2)
notionMatch = regexRelativePathNotion.search(relativePath)
is_md_or_csv = regexRelativePathMdCsv.search(relativePath)
is_image = regexRelativePathImage.search(relativePath)

if is_md_or_csv or notionMatch:
# Replace all matchs
Expand Down

1 comment on commit 5bf81e2

@ooker777
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I used this file instead, but the result still contains URL encoding:

# Điều phối mạng lưới

Ban: ../Ban%209e3ad/Pha%CC%81t%20trie%200cef1.md
Hạng mục hành động: ../../Va%CC%82%CC%81n%20%C4%91e%CC%82%CC%80%20d9ec7/Ha%CC%A3ng%20mu%CC%A3c%2018231/Giu%CC%9B%CC%83%20tu%CC%9Bo%20f1036.md, ../../Va%CC%82%CC%81n%20%C4%91e%CC%82%CC%80%20d9ec7/Ha%CC%A3ng%20mu%CC%A3c%2018231/%C4%90a%CC%81nh%20gia%CC%81%20d8d82.md, ../../Va%CC%82%CC%81n%20%C4%91e%CC%82%CC%80%20d9ec7/Ha%CC%A3ng%20mu%CC%A3c%2018231/Pho%CC%89ng%20va%CC%82%209224a.md, ../../Va%CC%82%CC%81n%20%C4%91e%CC%82%CC%80%20d9ec7/Ha%CC%A3ng%20mu%CC%A3c%2018231/Ta%CC%A3o%20tra%CC%86%CC%81%20d78dd.md, ../../Va%CC%82%CC%81n%20%C4%91e%CC%82%CC%80%20d9ec7/Ha%CC%A3ng%20mu%CC%A3c%2018231/To%CC%81m%20ta%CC%86%CC%81c%20bd133.md, ../../Va%CC%82%CC%81n%20%C4%91e%CC%82%CC%80%20d9ec7/Ha%CC%A3ng%20mu%CC%A3c%2018231/To%CC%82%CC%89%20chu%CC%9B%CC%81%20d96c1.md, ../../Va%CC%82%CC%81n%20%C4%91e%CC%82%CC%80%20d9ec7/Ha%CC%A3ng%20mu%CC%A3c%2018231/%C4%90a%CC%A3%CC%86t%20ca%CC%81c%2055a34.md
Nhiệm vụ cụ thể: ../../Nhie%CC%A3%CC%82m%20vu%20cacfd/Nhie%CC%A3%CC%82m%20vu%20abf11/Khuye%CC%82%CC%81n%20k%2059edc.md
Nhân sự: ../../Danh%20sa%CC%81ch%20168a9/Tha%CC%80nh%20vie%20a6250/Ly%CC%81%20Minh%20N%20e9f20.md

Please sign in to comment.