In [9]:
# read sqlite3
import sqlite3
import anki
from anki.collection import Collection
import re
import os
import zipfile
import termcolor
import tqdm
import csv
import json
import shutil
import pickle
# import pandas as pd
import editdistance

# --- Notebook configuration -------------------------------------------------
# CSV dictionary that SubStrFinder.load_edict reads.
EDICT_PATH = '/Users/AlexG/Documents/GitHub/mindmaps/anki_tools/data/ecdict.csv'
# Anki media folder that card images are copied from.
MEDIA_FOLDER = '/Users/AlexG/Library/Application Support/Anki2/User 1/collection.media'
# Exactly one package is active. The alternatives used to be live assignments
# that were immediately overwritten; they are kept as comments so switching
# decks is a one-line change.
# PKG_PATH = '/Users/AlexG/Downloads/MyPaperNotes.apkg'
# PKG_PATH = '/Users/AlexG/Downloads/ivl10.apkg'
PKG_PATH = '/Users/AlexG/Downloads/极品GRE红宝书.apkg'
# Folder holding the pickled similarity caches.
DATA_PATH = '/Users/AlexG/Documents/GitHub/mindmaps/anki_tools/data'
# The package is unzipped next to itself.
outdir = PKG_PATH + '-decompress'
# outdir = '/Users/AlexG/Downloads/ivl10-decompress'
# ANKI2_PATH = '/Users/AlexG/Downloads/ivl10-decompress/collection.anki2'
ANKI21_PATH = outdir + '/collection.anki21'
# EXPORT_PATH_WEBSITE = '/Users/AlexG/Documents/GitHub/typingpractices/website/onepager/onepager/templates/onepager/cards.html'
EXPORT_PATH_WEBSITE = '/Users/AlexG/Documents/GitHub/anki_html/docs/gre_cards.html'

def unzip_anki(pkg_file_path, outdir):
    """Extract an Anki package (a zip archive) into ``outdir``.

    Args:
        pkg_file_path: Path of the ``.apkg`` file to unpack.
        outdir: Destination directory; created if it does not exist.

    Raises:
        FileNotFoundError, zipfile.BadZipFile: propagated from ``zipfile``
            when the package is missing or is not a valid archive.
    """
    # Create the extract directory if it doesn't exist
    os.makedirs(outdir, exist_ok=True)

    # Unzip the archive that was passed in. The previous version opened the
    # module-level PKG_PATH here, so the argument was silently ignored.
    with zipfile.ZipFile(pkg_file_path, 'r') as zip_ref:
        zip_ref.extractall(outdir)

    print(f"Extracted files to {outdir}")

class EditDistanceFinder(object):
    """Find entries whose headword is within an edit-distance band of a query.

    ``words`` is a list of indexable entries; position 0 is read as the
    headword and position 2 as the text shown next to it. Results are
    memoised in ``self.cache`` and can be persisted with ``save_cache``.
    """

    def __init__(self, words):
        self.words = words
        self.load_cache()

    def load_cache(self):
        """Load ``edit_distance_cache.pkl`` from ``DATA_PATH``, or start empty."""
        cache_path = os.path.join(DATA_PATH, 'edit_distance_cache.pkl')
        if os.path.exists(cache_path):
            # NOTE(review): pickle.load can execute code embedded in the file;
            # only load cache files this notebook wrote itself.
            with open(cache_path, 'rb') as f:
                self.cache = pickle.load(f)
        else:
            self.cache = {}

    def save_cache(self):
        """Pickle ``self.cache`` to ``edit_distance_cache.pkl`` in ``DATA_PATH``."""
        cache_path = os.path.join(DATA_PATH, 'edit_distance_cache.pkl')
        with open(cache_path, 'wb') as f:
            pickle.dump(self.cache, f)

    def find(self, query,
             edit_distance_lower=1,
             edit_distance_upper=1,
             html_class="sim1"):
        """Return HTML ``<div>`` snippets for entries near ``query``.

        Args:
            query: Word to compare against every headword in ``self.words``.
            edit_distance_lower: Smallest distance (inclusive) to accept.
            edit_distance_upper: Largest distance (inclusive) to accept.
            html_class: CSS class put on each generated ``<div>``.

        Returns:
            The snippets joined by single spaces, or ``''`` if nothing matches.
        """
        # The key does not include self.words, so a cache saved for one word
        # list is reused as-is for another.
        key = (query, edit_distance_lower, edit_distance_upper)
        if key in self.cache:
            # use cache
            similar_words = self.cache[key]
        else:
            similar_words = []
            for w in self.words:
                if w[0] == query:
                    continue
                d = editdistance.eval(w[0], query)
                if edit_distance_lower <= d <= edit_distance_upper:
                    similar_words.append((d, w[0], w[2]))
            # save cache
            self.cache[key] = similar_words

        # Cached items are (distance, headword, meaning). The previous code
        # formatted positions 0 and 2, which printed the distance in place of
        # the headword.
        return ' '.join(
            f'<div class="{html_class}">{word}: {meaning}</div>'
            for _, word, meaning in similar_words
        )
            
        
class SubStrFinder(object):
    """Find entries whose headword occurs inside a query word.

    ``words`` is a list of indexable entries read as positions 0 (headword),
    1 (shown in parentheses) and 2 (shown after the colon). ``load_edict``
    appends more entries to that same list. Results are memoised in
    ``self.cache`` and can be persisted with ``save_cache``.
    """

    def __init__(self, words):
        self.words = words
        self.load_cache()

    def load_cache(self):
        """Load ``sub_word_cache.pkl`` from ``DATA_PATH``, or start empty."""
        cache_path = os.path.join(DATA_PATH, 'sub_word_cache.pkl')
        if os.path.exists(cache_path):
            # NOTE(review): pickle.load can execute code embedded in the file;
            # only load cache files this notebook wrote itself.
            with open(cache_path, 'rb') as f:
                self.cache = pickle.load(f)
        else:
            self.cache = {}

    def save_cache(self):
        """Pickle ``self.cache`` to ``sub_word_cache.pkl`` in ``DATA_PATH``."""
        cache_path = os.path.join(DATA_PATH, 'sub_word_cache.pkl')
        with open(cache_path, 'wb') as f:
            pickle.dump(self.cache, f)

    def load_edict(self, path):
        """Append ``(col0, col1, col3)`` of each accepted CSV row to ``self.words``.

        A row is skipped when it has fewer than four columns, its first column
        is shorter than four characters, its fourth column is empty or contains
        ``'abbr.'``, or its first column is all upper-case.

        Args:
            path: CSV file to read (decoded as UTF-8).
        """
        total_lines = 0
        # Explicit encoding keeps non-ASCII text intact regardless of locale;
        # newline='' is what the csv module asks for when reading files.
        with open(path, 'r', encoding='utf-8', newline='') as f:
            rd = csv.reader(f)
            for cols in tqdm.tqdm(rd):
                # Blank or truncated rows used to raise IndexError on cols[3].
                if len(cols) < 4:
                    continue
                if len(cols[0]) < 4 or len(cols[3]) == 0:
                    continue
                if 'abbr.' in cols[3]:
                    continue
                if cols[0].isupper():
                    continue
                fields = (cols[0], cols[1], cols[3])
                self.words.append(fields)
                total_lines += 1
        print(f"Loaded {total_lines} lines from {path}")

    def find(self, query, html_class="simsub"):
        """Return HTML ``<div>`` snippets for entries contained in ``query``.

        The containment test is case-insensitive; an entry whose headword
        equals ``query`` exactly is skipped.

        Args:
            query: Word to search inside.
            html_class: CSS class put on each generated ``<div>``.

        Returns:
            The snippets joined by single spaces, or ``''`` if nothing matches.
        """
        print("Searching for {}...".format(termcolor.colored(query, 'red')))
        # The key does not include self.words, so a cache saved for one word
        # list is reused as-is for another.
        key = query.lower()
        if key in self.cache:
            similar_words = self.cache[key]
        else:
            similar_words = []
            for w in self.words:
                # An empty headword is a substring of every query; skip it.
                if not w[0] or w[0] == query:
                    continue
                # `key` is the lower-cased query, computed once above.
                if w[0].lower() in key:
                    similar_words.append(w)
            self.cache[key] = similar_words

        return ' '.join(
            f'<div class="{html_class}"><b>{w[0]}({w[1]})</b>: {w[2]}</div>'
            for w in similar_words
        )

class ImageSrcUrls(object):
    """Collect ``<img>`` sources from card HTML and repoint them at ``./images/``.

    Attributes set in ``__init__``:
        srcs: every ``src`` value seen by ``replace_image_url``, in order.
        exts: set of the file extensions of those values.
    """

    def __init__(self):
        self.purge_script_path = '/Users/AlexG/Documents/GitHub/anki_html/run_images.sh'
        self.git_push_script_path = '/Users/AlexG/Documents/GitHub/anki_html/run_upload.sh'
        self.source_image_folder = MEDIA_FOLDER
        self.dest_image_folder = '/Users/AlexG/Documents/GitHub/anki_html/docs/images'
        # `[^>]*?` keeps each match inside a single tag. The former greedy
        # `.*` ran from the first `<img` to the last `src="` on the line, so
        # with several images only the last was recorded and the substitution
        # deleted everything in between.
        self.IMG_PAT = re.compile(r'<img\b[^>]*?\bsrc="')
        self.SRC_PAT = re.compile(r'<img\b[^>]*?\bsrc="([^"]+)"')
        self.srcs = []
        self.exts = set()

    def replace_image_url(self, text):
        """Record every image source in ``text`` and prefix it with ``./images/``.

        Attributes that precede ``src`` inside the tag are dropped by the
        substitution.

        Args:
            text: HTML fragment of one note field.

        Returns:
            ``text`` with each ``<img ... src="`` replaced by
            ``<img src="./images/``.
        """
        # findall (rather than search) picks up every image in the field.
        for s in self.SRC_PAT.findall(text):
            self.srcs.append(s)
            ext = os.path.splitext(s)[1]
            self.exts.add(ext)
            print(termcolor.colored(s, 'red'))

        return self.IMG_PAT.sub('<img src="./images/', text)

    def purge_images(self):
        """Run the shell script at ``purge_script_path`` with bash."""
        os.system(f"bash {self.purge_script_path}")

    def git_push(self):
        """Run the shell script at ``git_push_script_path`` with bash."""
        os.system(f"bash {self.git_push_script_path}")

    def move_images(self):
        """Run the purge script, then copy every recorded image to the destination."""
        self.purge_images()
        # copyfile does not create directories; make sure the target exists.
        os.makedirs(self.dest_image_folder, exist_ok=True)
        for src in tqdm.tqdm(self.srcs):
            shutil.copyfile(
                os.path.join(self.source_image_folder, src),
                os.path.join(self.dest_image_folder, src)
            )


def format_cards_to_html(cards, collection, outpath, max_word_length=-1, max_cards=200):
    """Render cards as an HTML table, write it to ``outpath`` and publish it.

    Cards are sorted as ``(card.ivl, note.fields)`` tuples, optionally filtered
    by the length of the first field, and truncated to ``max_cards``. Each row
    shows the first two fields, a scroll box with similar words (edit distance
    1, edit distance 2, substring matches) and the third field.

    Side effects: saves both finder caches, runs ``ImageSrcUrls.move_images``
    and ``ImageSrcUrls.git_push``, and prints progress and samples.

    Args:
        cards: Card ids accepted by ``collection.get_card``.
        collection: Object providing ``get_card(cid)``.
        outpath: File the HTML page is written to.
        max_word_length: Keep only cards whose first field has at most this
            many characters; ``-1`` disables the filter.
        max_cards: Maximum number of cards rendered.
    """
    data = []
    image_src = ImageSrcUrls()
    for cid in tqdm.tqdm(cards):
        card = collection.get_card(cid)
        note = card.note()
        fields = note.fields
        data.append((card.ivl, fields))

    # Each finder gets its own list: SubStrFinder.load_edict appends to the
    # list it was given, and those entries must not leak into the other finder.
    edist_finder = EditDistanceFinder([fields for _, fields in data])
    substr_finder = SubStrFinder([fields for _, fields in data])
    substr_finder.load_edict(EDICT_PATH)
    data.sort()
    data = [v for v in data if max_word_length == -1 or len(v[1][0]) <= max_word_length]
    data = [[image_src.replace_image_url(field) for field in fields]
            for _, fields in data[:max_cards]]
    data = [['<div>{}</div><div>{}</div>'.format(fields[0], fields[1]),
             '<div class="textscroll">{} {} {}</div>'.format(
                edist_finder.find(fields[0]),
                edist_finder.find(fields[0],
                                  edit_distance_lower=2,
                                  edit_distance_upper=2,
                                  html_class="sim2"),
                substr_finder.find(fields[0]),
             ),
             fields[2],
             ] for fields in tqdm.tqdm(data)]
    # save cache
    edist_finder.save_cache()
    substr_finder.save_cache()

    print(image_src.exts)
    print(termcolor.colored('Data samples:', 'blue'))
    print('\n'.join(['{}'.format(v) for v in data[:10]]))
    image_src.move_images()
    rows = ['<tr><td class="indextd">{}</td><td>{}</td></tr>\n'.format(fi, '</td><td class="fixwtd">'.join(field)) for fi, field in enumerate(data)]
    html = f'''
    <head>
        <style>
            body {{
                background: #202020;
            }}

            table {{
                background-color: #A0A0A0;
                --color: #d0d0f5;
                margin: 3em;
            }}

            thead,
            tfoot {{
                background: var(--color);
            }}

            tbody tr:nth-child(even) {{
                background: color-mix(in srgb, var(--color), transparent 60%);
            }}
            div, p, td {{
                width: 200px;
                word-wrap: break-word;
                white-space: normal;
            }}
            td img {{
                max-height: 8em;
            }}
            .fixwtd {{
                width: 200px;
                word-wrap: break-word;
                white-space: normal;
            }}
            .indextd {{
                width: 10px;
                word-wrap: break-word;
                white-space: normal;
            }}
            td {{
                border: 1px ridge #333333;
            }}
            img {{
                max-height: 5em;
                max-width: 5em;
            }}
            .sim1 {{
                width: 26em;
                background-color: #eadcb7;
            }}
            .sim2 {{
                width: 26em;
                background-color: #e99a06;
            }}
            .simsub {{
                width: 26em;
                background-color: #afcd95;
            }}
            b {{
                background-color: #f1c40f;
            }}
            .textscroll {{
                height: 3em;
                width: 27em;
                border: 1px solid #ccc;
                overflow: auto;
            }}
        </style>
        <link rel="stylesheet" href="css/fancy-button.css">
    </head>
    <body>
        <button onclick="changeWidth(0.5)" class="button-74">Half Column Width</button>
        <button onclick="hideRows(10)" class="button-73 fix-width-button">Hide Rows</button>
        <table id="cards">
          {"".join(rows)}
        </table>
        <script src="js/change-width.js"></script>
        <script src="js/hide-rows.js"></script>
    </body>
    '''
    # Card text can contain non-ASCII characters, so pin the encoding instead
    # of depending on the platform default.
    with open(outpath, 'w', encoding='utf-8') as f:
        f.write(html)

    # git push
    image_src.git_push()

# Unpack the package and open the collection stored inside it.
unzip_anki(PKG_PATH, outdir)
col = Collection(ANKI21_PATH)

try:
    # Fetch all decks in the collection. This now runs inside the try block so
    # the collection is still closed if listing the decks fails.
    decks = col.decks.all()
    print("Decks in collection:")
    for deck in decks:
        print(f"Deck ID: {deck['id']}, Name: {deck['name']}")

    print(dir(col))
    print('total card count: ', col.card_count())
    # print('note count: ', col.node_count())

    # query = 'prop:ivl<=30'
    query = ''
    cards = col.find_cards(query)
    print(termcolor.colored(f'query {query} find {len(cards)} cards', 'green'))

    outpath = os.path.join(outdir, 'cards.html')
    # format_cards_to_html(cards, col, outpath)
    format_cards_to_html(cards, col, EXPORT_PATH_WEBSITE,
                         max_word_length=-1)

    # Fetch all notes (cards content) in the collection
    # print("\nNotes in collection:")
    # for note_id in col.db.list("select id from notes"):
    #     note = col.get_note(note_id)
    #     fields = note.fields
    #     print(f"Note ID: {note_id}, Fields: {fields}")

    # # Fetch all cards (metadata) in the collection
    # print("\nCards in collection:")
    # for card_id in col.db.list("select id from cards"):
    #     card = col.get_card(card_id)
    #     print(f"Card ID: {card.id}, Note ID: {card.nid}, Deck ID: {card.did}, Type: {card.type}")

except Exception as e:
    # Best-effort run: report the failure and fall through to the cleanup.
    print(f"Error: {e}")
finally:
    # Close the collection exactly once; the extra close() that used to follow
    # this block was redundant.
    col.close()

Extracted files to /Users/AlexG/Downloads/极品GRE红宝书.apkg-decompress
blocked main thread for 409ms:
  File "/usr/local/Cellar/python@3.9/3.9.5/Frameworks/Python.framework/Versions/3.9/lib/python3.9/runpy.py", line 197, in _run_module_as_main
    return _run_code(code, main_globals, None,
  File "/usr/local/Cellar/python@3.9/3.9.5/Frameworks/Python.framework/Versions/3.9/lib/python3.9/runpy.py", line 87, in _run_code
    exec(code, run_globals)
  File "/Users/AlexG/Documents/GitHub/mindmaps/devenv/lib/python3.9/site-packages/ipykernel_launcher.py", line 18, in <module>
    app.launch_new_instance()
  File "/Users/AlexG/Documents/GitHub/mindmaps/devenv/lib/python3.9/site-packages/traitlets/config/application.py", line 1075, in launch_instance
    app.start()
  File "/Users/AlexG/Documents/GitHub/mindmaps/devenv/lib/python3.9/site-packages/ipykernel/kernelapp.py", line 739, in start
    self.io_loop.start()
  File "/Users/AlexG/Documents/GitHub/mindmaps/devenv/lib/python3.9/site-packages/to

100%|██████████| 7513/7513 [00:01<00:00, 4832.34it/s]
770612it [00:04, 155693.98it/s]


Loaded 750698 lines from /Users/AlexG/Documents/GitHub/mindmaps/anki_tools/data/ecdict.csv
[31mimages-5186a2c0d72bb6ced6ab1099107be5415e7b0709.jpg[0m
[31mGuinea-asks-bauxite-miners-to-present-local-refinery-plans-by-May.jpeg[0m
[31mpaste-9c4c03b0a6345aaa754b5767e49078d8f7af2fe2.png[0m
[31mpaste-cfae8b5068b640cdbd4a2e2fbbe3dcdbc08a3ac6.jpg[0m
[31mimages-b98b8556ba0a25e88d1579b2ffc16a79c139c395.jpg[0m
[31mpaste-6ba10c81153adc71fe825e59ebf26ff0f720307c.png[0m
[31mpaste-62f481c52aa82b6eaa5daa5b73639b2703d21517.jpg[0m


100%|██████████| 200/200 [00:00<00:00, 8248.71it/s]
ls: images: No such file or directory


Searching for [31mabstinent[0m...
Searching for [31marachnid[0m...
Searching for [31mbroach[0m...
Searching for [31mcalamity[0m...
Searching for [31mcanard[0m...
Searching for [31mcongregate[0m...
Searching for [31mconsonance[0m...
Searching for [31mdeference[0m...
Searching for [31mdespondent[0m...
Searching for [31mdilettante[0m...
Searching for [31mdissemble[0m...
Searching for [31memollient[0m...
Searching for [31mephemeral[0m...
Searching for [31mfeign[0m...
Searching for [31mflabby[0m...
Searching for [31mfraught[0m...
Searching for [31mgaucherie[0m...
Searching for [31mgrueling[0m...
Searching for [31mhavoc[0m...
Searching for [31mimmolate[0m...
Searching for [31mimpassive[0m...
Searching for [31mlassitude[0m...
Searching for [31mluminary[0m...
Searching for [31mmalcontent[0m...
Searching for [31mmeek[0m...
Searching for [31mmesmerism[0m...
Searching for [31mmuzzy[0m...
Searching for [31mpallid[0m...
Searching for [31mpalt

100%|██████████| 7/7 [00:00<00:00, 52.70it/s]


20250112(01:54:00)
[main a576105] update cards.html on date 20250112(01:54:00)
 14 files changed, 237 insertions(+), 110 deletions(-)
 create mode 100644 docs/images/Guinea-asks-bauxite-miners-to-present-local-refinery-plans-by-May.jpeg
 create mode 100644 docs/images/images-5186a2c0d72bb6ced6ab1099107be5415e7b0709.jpg
 create mode 100644 docs/images/images-b98b8556ba0a25e88d1579b2ffc16a79c139c395.jpg
 delete mode 100644 docs/images/images-e232c45b1db6f1af89fc1688113275e37baaa329.jpg
 delete mode 100644 docs/images/paste-3734f2bb9ca20bb7bd2e2c08e753015ab130772f.jpg
 create mode 100644 docs/images/paste-62f481c52aa82b6eaa5daa5b73639b2703d21517.jpg
 create mode 100644 docs/images/paste-6ba10c81153adc71fe825e59ebf26ff0f720307c.png
 create mode 100644 docs/images/paste-9c4c03b0a6345aaa754b5767e49078d8f7af2fe2.png
 delete mode 100644 docs/images/paste-c0dd42e84aaaf23e322d5e9b978e109d75b170c6.jpg
 create mode 100644 docs/images/paste-cfae8b5068b640cdbd4a2e2fbbe3dcdbc08a3ac6.jpg
 delete mode 

To github.com:pengfeigao2021/anki_html.git
 + a616b22...a576105 main -> main (forced update)


# -------------------------------------
# Offline batch similar words
# -------------------------------------

In [None]:
# read sqlite3
import sqlite3
import anki
from anki.collection import Collection
import re
import os
import zipfile
import termcolor
import tqdm
import csv
import shutil
# import pandas as pd
import editdistance

# --- Notebook configuration -------------------------------------------------
# CSV dictionary that SubStrFinder.load_edict reads.
EDICT_PATH = '/Users/AlexG/Documents/GitHub/mindmaps/anki_tools/data/ecdict.csv'
# Anki media folder that card images are copied from.
MEDIA_FOLDER = '/Users/AlexG/Library/Application Support/Anki2/User 1/collection.media'
# Exactly one package is active. The alternatives used to be live assignments
# that were immediately overwritten; they are kept as comments so switching
# decks is a one-line change.
# PKG_PATH = '/Users/AlexG/Downloads/MyPaperNotes.apkg'
# PKG_PATH = '/Users/AlexG/Downloads/ivl10.apkg'
PKG_PATH = '/Users/AlexG/Downloads/极品GRE红宝书.apkg'
# The package is unzipped next to itself.
outdir = PKG_PATH + '-decompress'
# outdir = '/Users/AlexG/Downloads/ivl10-decompress'
# ANKI2_PATH = '/Users/AlexG/Downloads/ivl10-decompress/collection.anki2'
ANKI21_PATH = outdir + '/collection.anki21'
# EXPORT_PATH_WEBSITE = '/Users/AlexG/Documents/GitHub/typingpractices/website/onepager/onepager/templates/onepager/cards.html'
EXPORT_PATH_WEBSITE = '/Users/AlexG/Documents/GitHub/anki_html/docs/gre_cards.html'

def unzip_anki(pkg_file_path, outdir):
    """Extract an Anki package (a zip archive) into ``outdir``.

    Args:
        pkg_file_path: Path of the ``.apkg`` file to unpack.
        outdir: Destination directory; created if it does not exist.

    Raises:
        FileNotFoundError, zipfile.BadZipFile: propagated from ``zipfile``
            when the package is missing or is not a valid archive.
    """
    # Create the extract directory if it doesn't exist
    os.makedirs(outdir, exist_ok=True)

    # Unzip the archive that was passed in. The previous version opened the
    # module-level PKG_PATH here, so the argument was silently ignored.
    with zipfile.ZipFile(pkg_file_path, 'r') as zip_ref:
        zip_ref.extractall(outdir)

    print(f"Extracted files to {outdir}")

class EditDistanceFinder(object):
    """Uncached lookup of entries within an edit-distance band of a query.

    ``words`` is a list of indexable entries; position 0 is read as the
    headword and position 2 as the text shown next to it.
    """

    def __init__(self, words):
        self.words = words

    def find(self, query,
             edit_distance_lower=1,
             edit_distance_upper=1,
             html_class="sim1"):
        """Return space-joined ``<div>`` snippets for entries near ``query``.

        An entry qualifies when its headword differs from ``query`` and its
        edit distance to ``query`` lies in the inclusive range
        ``[edit_distance_lower, edit_distance_upper]``.
        """
        def _qualifies(headword):
            # Exact matches are skipped before any distance is computed.
            if headword == query:
                return False
            distance = editdistance.eval(headword, query)
            return edit_distance_lower <= distance <= edit_distance_upper

        snippets = [
            f'<div class="{html_class}">{entry[0]}: {entry[2]}</div>'
            for entry in self.words
            if _qualifies(entry[0])
        ]
        return ' '.join(snippets)
            
        
class SubStrFinder(object):
    """Uncached lookup of entries whose headword occurs inside a query word.

    ``words`` is a list of indexable entries read as positions 0 (headword),
    1 (shown in parentheses) and 2 (shown after the colon). ``load_edict``
    appends more entries to that same list.
    """

    def __init__(self, words):
        self.words = words

    def load_edict(self, path):
        """Append ``(col0, col1, col3)`` of each accepted CSV row to ``self.words``.

        A row is skipped when it has fewer than four columns, its first column
        is shorter than four characters, its fourth column is empty or contains
        ``'abbr.'``, or its first column is all upper-case.

        Args:
            path: CSV file to read (decoded as UTF-8).
        """
        total_lines = 0
        # Explicit encoding keeps non-ASCII text intact regardless of locale;
        # newline='' is what the csv module asks for when reading files.
        with open(path, 'r', encoding='utf-8', newline='') as f:
            rd = csv.reader(f)
            for cols in tqdm.tqdm(rd):
                # Blank or truncated rows used to raise IndexError on cols[3].
                if len(cols) < 4:
                    continue
                if len(cols[0]) < 4 or len(cols[3]) == 0:
                    continue
                if 'abbr.' in cols[3]:
                    continue
                if cols[0].isupper():
                    continue
                fields = (cols[0], cols[1], cols[3])
                self.words.append(fields)
                total_lines += 1
        print(f"Loaded {total_lines} lines from {path}")

    def find(self, query, html_class="simsub"):
        """Return space-joined ``<div>`` snippets for entries contained in ``query``.

        The containment test is case-insensitive; an entry whose headword
        equals ``query`` exactly is skipped.
        """
        res = []
        print("Searching for {}...".format(termcolor.colored(query, 'red')))
        # Lower-case the query once instead of once per entry.
        query_lower = query.lower()
        for w in tqdm.tqdm(self.words):
            # An empty headword is a substring of every query; skip it.
            if not w[0] or w[0] == query:
                continue
            if w[0].lower() in query_lower:
                text = f'<div class="{html_class}"><b>{w[0]}({w[1]})</b>: {w[2]}</div>'
                res.append(text)

        return ' '.join(res)

class ImageSrcUrls(object):
    """Collect ``<img>`` sources from card HTML and repoint them at ``./images/``.

    Attributes set in ``__init__``:
        srcs: every ``src`` value seen by ``replace_image_url``, in order.
        exts: set of the file extensions of those values.
    """

    def __init__(self):
        self.purge_script_path = '/Users/AlexG/Documents/GitHub/anki_html/run_images.sh'
        self.git_push_script_path = '/Users/AlexG/Documents/GitHub/anki_html/run_upload.sh'
        self.source_image_folder = MEDIA_FOLDER
        self.dest_image_folder = '/Users/AlexG/Documents/GitHub/anki_html/docs/images'
        # `[^>]*?` keeps each match inside a single tag. The former greedy
        # `.*` ran from the first `<img` to the last `src="` on the line, so
        # with several images only the last was recorded and the substitution
        # deleted everything in between.
        self.IMG_PAT = re.compile(r'<img\b[^>]*?\bsrc="')
        self.SRC_PAT = re.compile(r'<img\b[^>]*?\bsrc="([^"]+)"')
        self.srcs = []
        self.exts = set()

    def replace_image_url(self, text):
        """Record every image source in ``text`` and prefix it with ``./images/``.

        Attributes that precede ``src`` inside the tag are dropped by the
        substitution.

        Args:
            text: HTML fragment of one note field.

        Returns:
            ``text`` with each ``<img ... src="`` replaced by
            ``<img src="./images/``.
        """
        # findall (rather than search) picks up every image in the field.
        for s in self.SRC_PAT.findall(text):
            self.srcs.append(s)
            ext = os.path.splitext(s)[1]
            self.exts.add(ext)
            print(termcolor.colored(s, 'red'))

        return self.IMG_PAT.sub('<img src="./images/', text)

    def purge_images(self):
        """Run the shell script at ``purge_script_path`` with bash."""
        os.system(f"bash {self.purge_script_path}")

    def git_push(self):
        """Run the shell script at ``git_push_script_path`` with bash."""
        os.system(f"bash {self.git_push_script_path}")

    def move_images(self):
        """Run the purge script, then copy every recorded image to the destination."""
        self.purge_images()
        # copyfile does not create directories; make sure the target exists.
        os.makedirs(self.dest_image_folder, exist_ok=True)
        for src in tqdm.tqdm(self.srcs):
            shutil.copyfile(
                os.path.join(self.source_image_folder, src),
                os.path.join(self.dest_image_folder, src)
            )


def format_cards_to_html(cards, collection, outpath, max_word_length=-1, max_cards=200):
    """Build the similar-word table rows for the selected cards and print samples.

    Cards are sorted as ``(card.ivl, note.fields)`` tuples, optionally filtered
    by the length of the first field (``max_word_length``; ``-1`` disables the
    filter) and truncated to ``max_cards``. This variant only prints: nothing
    is written to ``outpath``.
    """
    image_src = ImageSrcUrls()

    # Pair every card's interval with the fields of its note.
    data = []
    for cid in tqdm.tqdm(cards):
        card = collection.get_card(cid)
        fields = card.note().fields
        data.append((card.ivl, fields))

    # Two independent lists: load_edict appends to the one SubStrFinder holds.
    edist_finder = EditDistanceFinder([fields for _, fields in data])
    substr_finder = SubStrFinder([fields for _, fields in data])
    substr_finder.load_edict(EDICT_PATH)

    data.sort()
    if max_word_length != -1:
        data = [item for item in data if len(item[1][0]) <= max_word_length]

    # Rewrite image URLs in every field of the cards that made the cut.
    selected = []
    for _, fields in data[:max_cards]:
        selected.append([image_src.replace_image_url(field) for field in fields])

    rendered = []
    for fields in selected:
        heading = '<div>{}</div><div>{}</div>'.format(fields[0], fields[1])
        distance_one = edist_finder.find(fields[0])
        distance_two = edist_finder.find(fields[0],
                                         edit_distance_lower=2,
                                         edit_distance_upper=2,
                                         html_class="sim2")
        contained = substr_finder.find(fields[0])
        similar = '<div class="textscroll">{} {} {}</div>'.format(
            distance_one, distance_two, contained)
        rendered.append([heading, similar, fields[2]])

    print(image_src.exts)
    print(termcolor.colored('Data samples:', 'blue'))
    print('\n'.join(['{}'.format(row) for row in rendered[:10]]))

# Unpack the package and open the collection stored inside it.
unzip_anki(PKG_PATH, outdir)
col = Collection(ANKI21_PATH)

try:
    # Fetch all decks in the collection. This now runs inside the try block so
    # the collection is still closed if listing the decks fails.
    decks = col.decks.all()
    print("Decks in collection:")
    for deck in decks:
        print(f"Deck ID: {deck['id']}, Name: {deck['name']}")

    print(dir(col))
    print('total card count: ', col.card_count())
    # print('note count: ', col.node_count())

    # query = 'prop:ivl<=30'
    query = ''
    cards = col.find_cards(query)
    print(termcolor.colored(f'query {query} find {len(cards)} cards', 'green'))

    outpath = os.path.join(outdir, 'cards.html')
    # format_cards_to_html(cards, col, outpath)
    format_cards_to_html(cards, col, EXPORT_PATH_WEBSITE,
                         max_word_length=-1)

except Exception as e:
    # Best-effort run: report the failure and fall through to the cleanup.
    print(f"Error: {e}")
finally:
    # Close the collection exactly once; the extra close() that used to follow
    # this block was redundant.
    col.close()

# ====================
# build all words list
# ====================


In [4]:
# read sqlite3
import sqlite3
import anki
from anki.collection import Collection
import re
import os
import zipfile
import termcolor
import tqdm
import csv
import json
import shutil
# import pandas as pd
import editdistance

# Dictionary CSV consumed by SubStrFinder.load_edict.
EDICT_PATH = '/Users/AlexG/Documents/GitHub/mindmaps/anki_tools/data/ecdict.csv'
# Anki package whose words are exported.
PKG_PATH = '/Users/AlexG/Downloads/极品GRE红宝书.apkg'
# Folder the package is extracted into, and the collection file inside it.
outdir = f'{PKG_PATH}-decompress'
ANKI21_PATH = f'{outdir}/collection.anki21'

class SubStrFinder(object):
    """Holder for a word list that can be extended from a dictionary CSV."""

    def __init__(self, words):
        self.words = words

    def load_edict(self, path):
        """Append ``(col0, col1, col3)`` of each accepted CSV row to ``self.words``.

        A row is skipped when it has fewer than four columns, its first column
        is shorter than four characters, its fourth column is empty or contains
        ``'abbr.'``, or its first column is all upper-case.

        Args:
            path: CSV file to read (decoded as UTF-8).
        """
        total_lines = 0
        # Explicit encoding keeps non-ASCII text intact regardless of locale;
        # newline='' is what the csv module asks for when reading files.
        with open(path, 'r', encoding='utf-8', newline='') as f:
            rd = csv.reader(f)
            for cols in tqdm.tqdm(rd):
                # Blank or truncated rows used to raise IndexError on cols[3].
                if len(cols) < 4:
                    continue
                if len(cols[0]) < 4 or len(cols[3]) == 0:
                    continue
                if 'abbr.' in cols[3]:
                    continue
                if cols[0].isupper():
                    continue
                fields = (cols[0], cols[1], cols[3])
                self.words.append(fields)
                total_lines += 1
        print(f"Loaded {total_lines} lines from {path}")


col = Collection(ANKI21_PATH)
all_words_set = set()
all_words = []
gre_words = []

try:
    # Fetch all decks in the collection. This now runs inside the try block so
    # the collection is still closed if listing the decks fails.
    decks = col.decks.all()
    print("Decks in collection:")
    for deck in decks:
        print(f"Deck ID: {deck['id']}, Name: {deck['name']}")

    print('total card count: ', col.card_count())

    query = ''
    cards = col.find_cards(query)
    print(termcolor.colored(f'query {query} find {len(cards)} cards', 'green'))

    for cid in tqdm.tqdm(cards):
        card = col.get_card(cid)
        note = card.note()
        fields = note.fields
        all_words_set.add(fields[0])
        gre_words.append(fields[0])
    substr_finder = SubStrFinder([])
    substr_finder.load_edict(EDICT_PATH)
    # NOTE(review): card words are added above as plain strings, while the
    # dictionary entries added here are 3-tuples. The set therefore never
    # de-duplicates a card word against a dictionary headword, and
    # all_words.json ends up with two entry shapes (string and 3-item list).
    # Confirm which shape downstream code expects before changing it.
    for w in tqdm.tqdm(substr_finder.words):
        all_words_set.add(w)

except Exception as e:
    # Best-effort run: report the failure and fall through to the cleanup.
    print(f"Error: {e}")
finally:
    # Close the collection exactly once; the extra close() that used to follow
    # this block was redundant.
    col.close()

all_words = list(all_words_set)
print(f"Total words: {len(all_words)}")
with open('/Users/AlexG/Documents/GitHub/mindmaps/anki_tools/all_words.json', 'w') as f:
    json.dump(all_words, f)
with open('/Users/AlexG/Documents/GitHub/mindmaps/anki_tools/gre_words.json', 'w') as f:
    json.dump(gre_words, f)

Decks in collection:
Deck ID: 1, Name: Default
Deck ID: 1685695741424, Name: 极品GRE红宝书
total card count:  7513
[32mquery  find 7513 cards[0m


100%|██████████| 7513/7513 [00:02<00:00, 3308.24it/s]
770612it [00:03, 229250.24it/s]


Loaded 750698 lines from /Users/AlexG/Documents/GitHub/mindmaps/anki_tools/data/ecdict.csv


100%|██████████| 750698/750698 [00:00<00:00, 1021786.20it/s]


Total words: 758211


In [9]:
import itertools
# read sqlite3
import sqlite3
import anki
from anki.collection import Collection
import re
import os
import zipfile
import termcolor
import tqdm
import csv
import json
import shutil
# import pandas as pd
import editdistance
# a = [1,2,3]
# b= itertools.product(a, a)
# print(list(b))

# Word lists read from the two JSON files below.
gre_words_path = '/Users/AlexG/Documents/GitHub/mindmaps/anki_tools/gre_words.json'
all_words_path = '/Users/AlexG/Documents/GitHub/mindmaps/anki_tools/all_words.json'
print('loading from {} and {}'.format(gre_words_path, all_words_path))
# Context managers close the files deterministically; json.load(open(...))
# left both handles open until garbage collection.
with open(gre_words_path, encoding='utf-8') as f:
    gre_words = json.load(f)
with open(all_words_path, encoding='utf-8') as f:
    all_words = json.load(f)
print('loaded {} gre words and {} all words'.format(len(gre_words), len(all_words)))

def find(self, query, html_class="simsub"):
    """Return space-joined ``<div>`` snippets for entries contained in ``query``.

    ``self`` only needs a ``words`` attribute: an iterable of entries indexed
    at 0 (headword), 1 (shown in parentheses) and 2 (shown after the colon).
    The containment test is case-insensitive; an entry whose headword equals
    ``query`` exactly is skipped.
    """
    print("Searching for {}...".format(termcolor.colored(query, 'red')))
    snippets = []
    for entry in tqdm.tqdm(self.words):
        headword = entry[0]
        if headword != query and headword.lower() in query.lower():
            snippets.append(
                f'<div class="{html_class}"><b>{headword}({entry[1]})</b>: {entry[2]}</div>'
            )
    return ' '.join(snippets)

def _entry_word(entry):
    """Return the headword of an entry: the entry itself if it is a string,
    otherwise its first item."""
    return entry if isinstance(entry, str) else entry[0]


N_gre = len(gre_words)
N_all = len(all_words)
# Resolve headwords once, up front. Indexing a string entry with [0], as the
# loop used to do, yields its first character rather than the word.
gre_vocab = [_entry_word(entry) for entry in gre_words]
all_vocab = [_entry_word(entry) for entry in all_words]
# Placeholder pass over every (GRE word, candidate word) pair; the distance
# computation is still disabled.
for w1, w2 in tqdm.tqdm(itertools.product(gre_vocab, all_vocab), total=N_gre * N_all):
    # d = editdistance.eval(w1, w2)
    pass
print('done')


loading from /Users/AlexG/Documents/GitHub/mindmaps/anki_tools/gre_words.json and /Users/AlexG/Documents/GitHub/mindmaps/anki_tools/all_words.json
loaded 7513 gre words and 758211 all words


  3%|▎         | 152303449/5696439243 [01:34<57:08, 1616872.64it/s]  


KeyboardInterrupt: 