In [1]:
import requests
from bs4 import BeautifulSoup
import json
import threading
import re
import numpy as np
from functools import reduce
from multiprocessing import Pool
from itertools import combinations
from time import time
import zipfile
import networkx as nx
import matplotlib.pyplot as plt

# Utils

In [2]:
def read_json_from_zip(zip_file_path, json_file_name):
    """
    Function to read a JSON file from a ZIP archive.

    Parameters:
        zip_file_path (str): Path to the ZIP archive.
        json_file_name (str): Name of the JSON file within the ZIP archive.

    Returns:
        dict: Dictionary containing the JSON data.
    """
    try:
        # Open the zip file
        with zipfile.ZipFile(zip_file_path, 'r') as zip_ref:
            # Extract the JSON file from the zip archive
            with zip_ref.open(json_file_name) as json_file:
                # Read the JSON data
                json_data = json.load(json_file)
        return json_data
    except Exception as e:
        print(f"Error reading JSON from ZIP: {e}")
        return None

In [3]:
def split_words(text):
    pattern = r"\b\w+(?:'\w+)?\b|\w+"
    return re.findall(pattern, text)

In [4]:
def print_key_structure(d, indent=0):
    for key, value in d.items():
        print('  ' * indent + str(key))
        if isinstance(value, dict):
            print_key_structure(value, indent + 1)

# Wordup Analysis

In [5]:
def lookup(word, wordup):
  for ele in wordup:
    if ele['root'] == word.lower(): return ele
  return None

In [6]:
def find_maximal_complete_graphs(edges):
    nodes = set()
    for edge in edges:
        nodes.update(edge)
    nodes = sorted(nodes)

    adjacency_dict = {node: set() for node in nodes}
    for edge in edges:
        node1, node2 = edge
        adjacency_dict[node1].add(node2)
        adjacency_dict[node2].add(node1)

    visited = set()
    maximal_complete_graphs = []

    for node in nodes:
        if node not in visited:
            queue = [node]
            component = set()
            while queue:
                current = queue.pop(0)
                visited.add(current)
                component.add(current)
                neighbors = adjacency_dict[current]
                for neighbor in neighbors:
                    if neighbor not in visited:
                        queue.append(neighbor)
            maximal_complete_graphs.append(component)

    return maximal_complete_graphs

In [7]:
wordup = read_json_from_zip("D:\Projects\play-with-words\wordup_processed.zip", "wordup_processed.json")

In [8]:
def get_all_words(wordup):
  words = set()
  for word_def in wordup:
    words.add(word_def['root'])
    for sense in word_def['senses']:
      words.update([ele.lower() for ele in split_words(sense['de'])])
  return words

In [9]:
def get_dependencies(wordup, lemmatize=True):
  depd = dict()
  for word_def in wordup:
      depd[word_def['root'].lower()] = set()
      for sense in word_def['senses']:
        for ele in re.findall(r'\b\w+\b', sense['de']):
            depd[word_def['root'].lower()].add(ele.lower())
  return depd

In [10]:
def find_random_cycle(G, cycles=None, numbers=1000):
  if cycles is None:
      cycles = []

  nodes = list(G.nodes())
  for step in range(numbers):
      idx = np.random.randint(len(nodes))
      try:
          cycle = nx.find_cycle(G, source=nodes[idx])
          cycles.append(cycle)
      except nx.NetworkXNoCycle:
          pass
  unique_cycles = []
  for sublist in cycles:
      if sublist not in unique_cycles:
          unique_cycles.append(sublist)
  return unique_cycles

In [11]:
words = get_all_words(wordup)

In [12]:
len(words)

34261

In [40]:
def get_dependants(word_list, wordup):
  result = set()
  for word_def in wordup:
    if word_def['root'] in word_list:
      for sense in word_def['senses']:
        result.update([ele.lower() for ele in re.findall(r'\b\w+\b', sense['de']) if ele.isalpha()])
  return result

In [None]:
def get_dependers():
  return

In [45]:
def get_minimal_wordlist(word_list, wordup):
  result = set()
  iter = 0

  print(f"Original list length: {len(word_list)}")

  while len(result) <= len(word_list) and iter < 20:
    result = get_dependants(word_list, wordup)
    iter += 1
    print(f"Reduced list length after {iter+1} iteration(s): {len(word_list)}")
    if word_list == result: 
      print(f"Minimalization converges!")
      break
    word_list = result 
    
  
  if len(result) > len(word_list):
    print("Dependant list is longer than last list!")


  return result

In [46]:
minimal_word_list = get_minimal_wordlist([word_def['root'] for word_def in wordup], wordup)

Original list length: 24287
Reduced list length after 2 iteration(s): 24287
Reduced list length after 3 iteration(s): 19638
Reduced list length after 4 iteration(s): 14560
Reduced list length after 5 iteration(s): 13528
Reduced list length after 6 iteration(s): 13296
Reduced list length after 7 iteration(s): 13257
Reduced list length after 8 iteration(s): 13240
Reduced list length after 9 iteration(s): 13236
Minimalization converges!


In [47]:
minimal_word_list = sorted(list(minimal_word_list))

In [48]:
minimal_word_list[:100]

['a',
 'abandon',
 'abandoned',
 'abandoning',
 'abbey',
 'abbots',
 'abbreviated',
 'abbreviation',
 'abdomen',
 'abdominal',
 'aberrant',
 'abilities',
 'ability',
 'ablative',
 'able',
 'abnormal',
 'abnormally',
 'aboard',
 'aboriginal',
 'aborigines',
 'abounding',
 'about',
 'above',
 'abrasive',
 'abroad',
 'abrupt',
 'abruptly',
 'absence',
 'absent',
 'absinthe',
 'absolute',
 'absolutely',
 'absorb',
 'absorbed',
 'absorbent',
 'absorbers',
 'absorbing',
 'absorbs',
 'absorption',
 'abstract',
 'absurd',
 'absurdly',
 'abundance',
 'abundant',
 'abundantly',
 'abuse',
 'abysmal',
 'acacia',
 'academic',
 'acanthus',
 'acceleration',
 'accelerator',
 'accent',
 'accented',
 'accept',
 'acceptable',
 'acceptance',
 'accepted',
 'accepting',
 'accepts',
 'access',
 'accessed',
 'accesses',
 'accessories',
 'accident',
 'accidental',
 'accidentally',
 'accidents',
 'acclaim',
 'accommodation',
 'accommodations',
 'accompanied',
 'accompanies',
 'accompaniment',
 'accompany',
 'ac

In [49]:
lookup("abbey", wordup)

{'root': 'abbey',
 'senses': [{'id': '244c59ff-ac44-455e-b93c-4639af733280',
   'de': 'A building or group of buildings where monks or nuns live and pray',
   'do': None,
   'ty': 'noun',
   'ex': 'The ancient abbey has a history of over a thousand years.',
   'use': '',
   'co': None,
   'fo': None,
   'sy': 'monastery, convent',
   'op': None,
   'gr': None,
   're': '',
   'br': False,
   'am': False,
   'cl': ['abbey church', ' abbey ruins', ' medieval abbey'],
   'ImageSrc': 'https://word-images.cdn-wordup.com/senses/244c59ff-ac44-455e-b93c-4639af733280.webp?v=1'},
  {'id': '73840545-c4a5-4962-b168-4191a6f59128',
   'de': 'A church that is or was part of an abbey',
   'do': None,
   'ty': 'noun',
   'ex': 'We visited the abbey during our tour of the historic city.',
   'use': '',
   'co': None,
   'fo': None,
   'sy': 'monastery church, convent church',
   'op': None,
   'gr': None,
   're': '',
   'br': False,
   'am': False,
   'cl': ['abbey church', ' historic abbey', ' old abb