In [1]:
import requests
from bs4 import BeautifulSoup
import json
import threading
import re
import numpy as np
from functools import reduce
from multiprocessing import Pool
from itertools import combinations
from time import time
import zipfile
import networkx as nx
import matplotlib.pyplot as plt
import csv

# Utils

In [2]:
def read_json_from_zip(zip_file_path, json_file_name):
    """
    Function to read a JSON file from a ZIP archive.

    Parameters:
        zip_file_path (str): Path to the ZIP archive.
        json_file_name (str): Name of the JSON file within the ZIP archive.

    Returns:
        dict: Dictionary containing the JSON data.
    """
    try:
        # Open the zip file
        with zipfile.ZipFile(zip_file_path, 'r') as zip_ref:
            # Extract the JSON file from the zip archive
            with zip_ref.open(json_file_name) as json_file:
                # Read the JSON data
                json_data = json.load(json_file)
        return json_data
    except Exception as e:
        print(f"Error reading JSON from ZIP: {e}")
        return None

In [3]:
def split_words(text):
    pattern = r"\b\w+(?:'\w+)?\b|\w+"
    return re.findall(pattern, text)

In [4]:
def print_key_structure(d, indent=0):
    for key, value in d.items():
        print('  ' * indent + str(key))
        if isinstance(value, dict):
            print_key_structure(value, indent + 1)

In [5]:
def find_maximal_complete_graphs(edges):
    nodes = set()
    for edge in edges:
        nodes.update(edge)
    nodes = sorted(nodes)

    adjacency_dict = {node: set() for node in nodes}
    for edge in edges:
        node1, node2 = edge
        adjacency_dict[node1].add(node2)
        adjacency_dict[node2].add(node1)

    visited = set()
    maximal_complete_graphs = []

    for node in nodes:
        if node not in visited:
            queue = [node]
            component = set()
            while queue:
                current = queue.pop(0)
                visited.add(current)
                component.add(current)
                neighbors = adjacency_dict[current]
                for neighbor in neighbors:
                    if neighbor not in visited:
                        queue.append(neighbor)
            maximal_complete_graphs.append(component)

    return maximal_complete_graphs

In [6]:
def find_random_cycle(G, cycles=None, numbers=1000):
  if cycles is None:
      cycles = []

  nodes = list(G.nodes())
  for step in range(numbers):
      idx = np.random.randint(len(nodes))
      try:
          cycle = nx.find_cycle(G, source=nodes[idx])
          cycles.append(cycle)
      except nx.NetworkXNoCycle:
          pass
  unique_cycles = []
  for sublist in cycles:
      if sublist not in unique_cycles:
          unique_cycles.append(sublist)
  return unique_cycles

# Wordup Analysis

In [7]:
def lookup(word, wordup):
  for ele in wordup:
    if ele['root'] == word.lower(): return ele
  return None

In [8]:
def get_all_words(wordup):
  words = set()
  for word_def in wordup:
    words.add(word_def['root'])
    for sense in word_def['senses']:
      words.update([ele.lower() for ele in split_words(sense['de'])])
  return words

In [9]:
def get_dependencies(wordup, lemmatize=True):
  depd = dict()
  for word_def in wordup:
      depd[word_def['root'].lower()] = set()
      for sense in word_def['senses']:
        for ele in re.findall(r'\b\w+\b', sense['de']):
            depd[word_def['root'].lower()].add(ele.lower())
  return depd

In [10]:
def get_dependants(word_list, wordup):
  result = set()
  for word_def in wordup:
    if word_def['root'] in word_list:
      for sense in word_def['senses']:
        result.update([ele.lower() for ele in re.findall(r'\b\w+\b', sense['de']) if ele.isalpha()])
  return result

In [11]:
def get_dependers(word_list, wordup):
  result = set()
  for word_def in wordup:
    for sense in word_def['senses']:
      for ele in re.findall(r'\b\w+\b', sense['de']):
        if ele.lower() in word_list:
          result.add(word_def['root'])
  return result

In [12]:
def get_minimal_wordlist(word_list, wordup):
  result = set()
  iter = 0

  print(f"Original list length: {len(word_list)}")

  while len(result) <= len(word_list) and iter < 20:
    result = get_dependants(word_list, wordup)
    iter += 1
    print(f"Reduced list length after {iter+1} iteration(s): {len(word_list)}")
    if word_list == result: 
      print(f"Minimalization converges!")
      break
    word_list = result 
    
  
  if len(result) > len(word_list):
    print("Dependant list is longer than last list!")


  return sorted(list(result))

# Morpheme Analysis

In [13]:
def get_morpheme_dict():
  path = "D:\Projects\play-with-words\morphemes\lookup.csv"
  morpheme_dict = dict()
  with open(path, 'r', newline='', encoding='utf-8') as csv_file:
      csv_reader = csv.reader(csv_file)
      next(csv_reader)
      for row in csv_reader:
          morpheme_dict[row[0]] = row[1].split(" ")
  return morpheme_dict

In [14]:
def get_affix_dict(param='pre' or 'suf' or all):
  if param == 'pre': path = "D:\Projects\play-with-words\morphemes\prefixes.csv"
  elif param == 'suf':  path = "D:\Projects\play-with-words\morphemes\suffixes.csv"
  else: 
    raise TypeError("Parameter must be 'pre' or 'suf'")

  result = set()
  with open(path, 'r', newline='', encoding='utf-8') as csv_file:
      csv_reader = csv.reader(csv_file)
      next(csv_reader)
      for row in csv_reader:
          result.add(row[0])
  return list(result)

In [15]:
def get_root_dict():
  morpheme_dict = get_morpheme_dict()
  root_dict = dict()

  for word, morphemes in morpheme_dict.items():
    if len(morphemes) == 1:
      root_dict[word] = morphemes[0]
      continue
    
    found = False

    for morpheme in morphemes:
      if '##' not in morpheme:
        root_dict[word] = [morpheme]
        found = True
    
    if not found:
      """ Take all prefixes as roots """
      root_dict[word] = [morpheme.replace("##", "") for morpheme in morphemes if re.match(r'.*##$', morpheme)]
      
  return root_dict

In [16]:
def get_minimal_root_list(minimal_word_list, morpheme_dict):
  minimal_root_list = set()
  for word in minimal_word_list:
    if word in root_dict.keys():
      minimal_root_list.update(root_dict[word])
    else:
      minimal_root_list.add(word)
      
  return sorted(list(minimal_root_list))

# Execution

In [17]:
wordup = read_json_from_zip("D:\Projects\play-with-words\wordup_processed.zip", "wordup_processed.json")

In [18]:
words = get_all_words(wordup)
len(words)

34261

In [19]:
minimal_word_list = get_minimal_wordlist([word_def['root'] for word_def in wordup], wordup)

Original list length: 24287
Reduced list length after 2 iteration(s): 24287
Reduced list length after 3 iteration(s): 19638
Reduced list length after 4 iteration(s): 14560
Reduced list length after 5 iteration(s): 13528
Reduced list length after 6 iteration(s): 13296
Reduced list length after 7 iteration(s): 13257
Reduced list length after 8 iteration(s): 13240
Reduced list length after 9 iteration(s): 13236
Minimalization converges!


In [20]:
morpheme_dict = get_morpheme_dict()

In [22]:
prefixes = get_affix_dict('pre')
suffixes = get_affix_dict('suf')

In [23]:
root_dict = get_root_dict()

In [24]:
minimal_root_list = get_minimal_root_list(minimal_word_list, morpheme_dict)

In [25]:
len(minimal_root_list)

7013

In [27]:
'techno' in minimal_root_list

True

In [35]:
type(minimal_root_list)

list

In [37]:
word_set = set(words)
word_set.update(minimal_root_list)
wordup_roots = set(word_def['root'] for word_def in wordup)
words_to_scrape = word_set - wordup_roots

In [40]:
with open("D:/Projects/play-with-words/to-scrape.txt", "w") as txt_file:
    for word in words_to_scrape:
        txt_file.write(word + "\n")