In [2]:
import requests
from bs4 import BeautifulSoup
import json
import threading
import re
import numpy as np
from functools import reduce
from multiprocessing import Pool
from itertools import combinations
from time import time
import zipfile
import networkx as nx
import matplotlib.pyplot as plt

# Utils

In [3]:
def read_json_from_zip(zip_file_path, json_file_name):
    """
    Load and parse a single JSON member stored inside a ZIP archive.

    Parameters:
        zip_file_path (str): Location of the ZIP archive on disk.
        json_file_name (str): Name of the JSON member inside the archive.

    Returns:
        The decoded JSON content (whatever type the document's top level
        has), or None when opening, reading or parsing fails. Failures are
        reported with a printed message instead of being raised.
    """
    try:
        # Both the archive and the member stream are closed on exit,
        # including when parsing fails.
        with zipfile.ZipFile(zip_file_path, 'r') as archive, \
                archive.open(json_file_name) as member:
            return json.load(member)
    except Exception as e:
        # Best-effort loader: report the problem and signal failure with None.
        print(f"Error reading JSON from ZIP: {e}")
        return None

In [4]:
def split_words(text):
    """
    Split text into word tokens, in order of appearance.

    A token is a run of word characters, optionally followed by an
    apostrophe and further word characters, so "don't" stays one token.

    Parameters:
        text (str): Text to tokenize.

    Returns:
        list[str]: The matched tokens; empty when nothing matches.
    """
    word_regex = re.compile(r"\b\w+(?:'\w+)?\b|\w+")
    return word_regex.findall(text)

In [5]:
def print_key_structure(d, indent=0):
    """
    Print the keys of a nested dictionary as an indented outline.

    Each key is printed on its own line, prefixed by two spaces per nesting
    level. Values that are themselves dicts are descended into; all other
    values are ignored.

    Parameters:
        d (dict): Mapping whose key hierarchy is printed.
        indent (int): Nesting level of the keys of ``d`` (0 = no prefix).
    """
    prefix = '  ' * indent
    for name, child in d.items():
        print(prefix + str(name))
        if not isinstance(child, dict):
            continue
        print_key_structure(child, indent + 1)

# Wordup Analysis

In [6]:
def lookup(word, wordup):
    """
    Find the first entry whose 'root' equals the lowercased ``word``.

    Only ``word`` is lowercased; an entry's 'root' is compared as stored, so
    an entry whose root contains uppercase characters is never matched.

    Parameters:
        word (str): Word to search for.
        wordup (iterable): Entries (dicts) that each carry a 'root' key.

    Returns:
        The first matching entry, or None when no entry matches.
    """
    matches = (entry for entry in wordup if entry['root'] == word.lower())
    return next(matches, None)

In [7]:
def find_maximal_complete_graphs(edges):
    """
    Group the nodes of an undirected graph into connected components.

    NOTE: despite the name, this function does not search for cliques
    (complete subgraphs). Each returned set holds every node reachable from
    a starting node, i.e. one connected component.

    Parameters:
        edges (iterable): Pairs ``(node1, node2)``; direction is ignored.
            Nodes must be hashable and mutually orderable (they are sorted).
            One-shot iterables such as generators are supported because the
            edges are consumed in a single pass.

    Returns:
        list[set]: One set of nodes per component, ordered by the smallest
        node of each component. Empty when ``edges`` is empty.

    Raises:
        ValueError: If an edge does not unpack into exactly two nodes.
    """
    # Single pass over the edges: builds the node set and adjacency together.
    adjacency_dict = {}
    for node1, node2 in edges:
        adjacency_dict.setdefault(node1, set()).add(node2)
        adjacency_dict.setdefault(node2, set()).add(node1)

    visited = set()
    maximal_complete_graphs = []

    for start in sorted(adjacency_dict):
        if start in visited:
            continue
        # Nodes are marked as visited when they are scheduled, so each node
        # enters the work list once; list.pop() from the end is O(1).
        visited.add(start)
        component = {start}
        pending = [start]
        while pending:
            current = pending.pop()
            for neighbor in adjacency_dict[current]:
                if neighbor not in visited:
                    visited.add(neighbor)
                    component.add(neighbor)
                    pending.append(neighbor)
        maximal_complete_graphs.append(component)

    return maximal_complete_graphs

In [11]:
# Raw string: in a normal string literal "\P", "\p" and "\w" are invalid escape
# sequences (DeprecationWarning, SyntaxWarning on Python 3.12+). The resulting
# path is unchanged.
# NOTE(review): absolute, machine-specific path — consider a configurable data directory.
wordup = read_json_from_zip(r"D:\Projects\play-with-words\wordup_processed.zip", "wordup_processed.json")

In [14]:
def get_all_words(wordup):
    """
    Collect every word that occurs in the data set.

    The result contains each entry's 'root' exactly as stored, plus the
    lowercased tokens that ``split_words`` extracts from the 'de' text of
    each of the entry's senses.

    Parameters:
        wordup (iterable): Entries (dicts) with a 'root' key and a 'senses'
            list whose items carry text under the 'de' key.

    Returns:
        set[str]: All roots and lowercased sense tokens.
    """
    vocabulary = set()
    for entry in wordup:
        vocabulary.add(entry['root'])
        for sense in entry['senses']:
            vocabulary |= {token.lower() for token in split_words(sense['de'])}
    return vocabulary

In [12]:
def get_dependencies(wordup, lemmatize=True):
    """
    Map each root word to the set of words used in its senses.

    Parameters:
        wordup (iterable): Entries (dicts) with a 'root' string and a
            'senses' list whose items carry text under the 'de' key
            (presumably the definition text — confirm against the data).
        lemmatize (bool): Currently unused; kept so existing callers that
            pass it keep working.

    Returns:
        dict[str, set[str]]: Lowercased root -> lowercased ``\\w+`` tokens
        found in the 'de' text of all senses of every entry with that root.
        Entries that share a root (case-insensitively) are merged.
    """
    word_regex = re.compile(r'\b\w+\b')
    depd = dict()
    for word_def in wordup:
        # setdefault instead of plain assignment: a later entry with the same
        # root must extend, not wipe, the tokens already collected for it.
        tokens = depd.setdefault(word_def['root'].lower(), set())
        for sense in word_def['senses']:
            tokens.update(match.lower() for match in word_regex.findall(sense['de']))
    return depd

In [None]:
def find_random_cycle(G, cycles=None, numbers=1000):
    """
    Sample cycles of a graph by starting cycle searches at random nodes.

    For each of ``numbers`` attempts a node is drawn uniformly at random
    (with replacement) and ``nx.find_cycle`` is called with that node as
    source. Attempts that find no cycle are skipped.

    Parameters:
        G: Graph object exposing ``nodes()`` and accepted by ``nx.find_cycle``.
        cycles (list | None): Previously found cycles to extend. When a list
            is passed, new cycles are appended to it in place.
        numbers (int): Number of random attempts.

    Returns:
        list: The collected cycles with duplicates removed, in order of
        first occurrence. For a graph without nodes no sampling happens and
        only the de-duplicated input ``cycles`` are returned.
    """
    if cycles is None:
        cycles = []

    nodes = list(G.nodes())
    # np.random.randint(0) raises ValueError, so only sample when there is
    # at least one node to pick from.
    if nodes:
        for _ in range(numbers):
            idx = np.random.randint(len(nodes))
            try:
                cycles.append(nx.find_cycle(G, source=nodes[idx]))
            except nx.NetworkXNoCycle:
                # No cycle found from this start node: a normal outcome.
                continue

    # Order-preserving de-duplication by equality; kept list-based so that
    # cycles containing unhashable items keep working.
    unique_cycles = []
    for cycle in cycles:
        if cycle not in unique_cycles:
            unique_cycles.append(cycle)
    return unique_cycles

In [15]:
# Full vocabulary: every entry's root plus the lowercased tokens of its senses' 'de' text.
words = get_all_words(wordup)

In [16]:
# Number of distinct words in the vocabulary set.
len(words)

34261

In [17]:
# Dependency map: lowercased root -> set of lowercased words used in its senses' 'de' text.
depd = get_dependencies(wordup)

In [18]:
# Union of all dependency sets: every word that is used by at least one root.
# Only the values are needed; `key` is not used inside the loop.
dependant = set()
for key, value in depd.items():
  dependant.update(value)

In [19]:
# Number of distinct words that occur in at least one dependency set.
len(dependant)

19836