In [1]:
import requests
from bs4 import BeautifulSoup
import json
import threading
import re
import numpy as np
from functools import reduce
from multiprocessing import Pool
from itertools import combinations
from time import time

In [2]:
# Load the pre-processed WordUp entries. The context manager closes the file
# handle as soon as parsing finishes instead of leaving it open for the kernel's lifetime.
with open("/content/drive/MyDrive/Datasets/wordup/wordup_processed.json") as wordup_file:
    wordup = json.load(wordup_file)

### Utils

In [None]:
def print_key_structure(d, indent=0):
    """Print every key of a nested dict, one key per line.

    Keys are indented by two spaces per nesting level, starting at ``indent``
    levels. Only values that are ``dict`` instances are descended into; every
    other value is ignored. Keys are printed in pre-order (a key first, then
    the keys nested under it).
    """
    # Each stack frame is (iterator over the remaining items, nesting depth).
    pending = [(iter(d.items()), indent)]
    while pending:
        entries, depth = pending[-1]
        for name, child in entries:
            print('  ' * depth + str(name))
            if isinstance(child, dict):
                # Descend now; the partially consumed iterator stays on the
                # stack so the siblings are resumed afterwards.
                pending.append((iter(child.items()), depth + 1))
                break
        else:
            # Iterator exhausted: this level is done.
            pending.pop()

In [None]:
def lookup(word, wordup):
  """Return the first entry of ``wordup`` whose ``'root'`` equals ``word`` lower-cased.

  Only ``word`` is lower-cased; each entry's ``'root'`` is compared as stored.
  Returns ``None`` when no entry matches.
  """
  matches = (entry for entry in wordup if entry['root'] == word.lower())
  return next(matches, None)

In [None]:
def find_maximal_complete_graphs(edges):
    """Return the connected components of an undirected graph as a list of sets.

    Despite its name, this version never checks that a component is complete:
    every connected component is returned.

    NOTE(review): a later cell defines another function with the same name,
    which replaces this one when the notebook is run top to bottom.

    Args:
        edges: iterable of edges, each unpackable into exactly two nodes.
            It is consumed once, so one-shot iterables work. Nodes must be
            hashable and mutually orderable, because components are started
            in sorted node order.

    Returns:
        A list of sets of nodes, one per connected component, ordered by the
        smallest node of each component. Empty input yields an empty list.

    Raises:
        ValueError: if an edge does not contain exactly two nodes.
    """
    from collections import deque  # local import keeps this cell self-contained

    adjacency_dict = {}
    for node1, node2 in edges:
        adjacency_dict.setdefault(node1, set()).add(node2)
        adjacency_dict.setdefault(node2, set()).add(node1)

    visited = set()
    components = []

    for node in sorted(adjacency_dict):
        if node in visited:
            continue
        # Mark nodes when they are enqueued so each node enters the queue
        # exactly once; deque.popleft() is O(1), unlike list.pop(0).
        visited.add(node)
        queue = deque([node])
        component = set()
        while queue:
            current = queue.popleft()
            component.add(current)
            for neighbor in adjacency_dict[current]:
                if neighbor not in visited:
                    visited.add(neighbor)
                    queue.append(neighbor)
        components.append(component)

    return components

In [None]:
def dict_lemmatize(wordup):
  """Group roots under the shortest not-yet-claimed root they contain.

  Entries are ordered by the length of their ``'root'`` (stable sort). Going
  from shortest to longest, each root that has not been claimed becomes a key
  of the result; every later unclaimed root that contains it as a substring is
  added to that key's set and is claimed, so it neither becomes a key nor joins
  another set. The test is purely textual (``'act'`` claims ``'react'``).

  Args:
    wordup: iterable of entries, each with a string under ``'root'``.

  Returns:
    dict mapping each base root to the set of roots claimed by it (possibly empty).
  """
  ordered = sorted(wordup, key=lambda entry: len(entry['root']))
  roots = [entry['root'] for entry in ordered]

  claimed = set()
  formations = {}
  for position, base in enumerate(roots):
    if base in claimed:
      continue
    claimed.add(base)
    derived = set()
    formations[base] = derived
    # Only longer-or-equal roots that come later in the ordering can be claimed.
    for candidate in roots[position + 1:]:
      if candidate not in claimed and base in candidate:
        derived.add(candidate)
        claimed.add(candidate)
  return formations

In [None]:
# Group roots under the shorter roots they contain (see dict_lemmatize).
# NOTE(review): `wordup_long` is not defined in any visible cell — confirm which
# cell creates it (or whether `wordup` was intended); as written this raises
# NameError on a fresh kernel.
formations = dict_lemmatize(wordup_long)

In [None]:
def get_pairs(wordup_processed):
  pairs = set()
  for word in wordup_processed:
    for comp in word['comparisons'].keys():
      pair = frozenset({word['root'].lower(), comp.lower()})
      comp_def = lookup(comp.lower(), wordup_processed)
      if comp_def and pair not in pairs and word['root'].lower() in [comp_comp.lower() for comp_comp in comp_def['comparisons']]:
        pairs.add(pair)
  return list(pairs)

In [None]:
def find_maximal_complete_graphs(edges):
    """Return the connected components of an undirected graph that are cliques.

    The graph is split into connected components; a component is kept only if
    every pair of its nodes is joined by an edge. Cliques that are merely part
    of a larger, non-complete component are not reported.

    Args:
        edges: re-iterable collection of edges, each unpackable into exactly
            two nodes. Nodes must be hashable and mutually orderable, because
            components are started in sorted node order.

    Returns:
        A list of sets of nodes, ordered by the smallest node of each kept
        component.
    """
    ordered_nodes = sorted({endpoint for edge in edges for endpoint in edge})

    neighbours = {vertex: set() for vertex in ordered_nodes}
    for left, right in edges:
        neighbours[left].add(right)
        neighbours[right].add(left)

    def is_clique(members):
        # Every unordered pair of members must be adjacent.
        return all(b in neighbours[a] for a, b in combinations(members, 2))

    seen = set()
    cliques = []
    for root in ordered_nodes:
        if root in seen:
            continue
        # Depth-first walk collecting everything reachable from `root`.
        members = set()
        frontier = [root]
        while frontier:
            vertex = frontier.pop()
            if vertex in seen:
                continue
            seen.add(vertex)
            members.add(vertex)
            frontier.extend(neighbours[vertex])
        if is_clique(members):
            cliques.append(members)

    return cliques

In [None]:
def analyze_word(word):
    """Return ``word`` paired with the list of its morphemes.

    NOTE(review): ``Text`` is not imported in any visible cell; the
    ``hint_language_code`` / ``.morphemes`` usage looks like polyglot's
    ``Text`` — confirm and add the import to the imports cell.
    """
    return word, list(Text(word, hint_language_code='en').morphemes)

# Number of worker processes for the pool below.
num_processes = 8

# Analyse every word in parallel.
# NOTE(review): `word_list` is not defined in any visible cell — confirm where it comes from.
with Pool(num_processes) as pool:
    results = pool.map(analyze_word, word_list)

# Each result is a (word, morphemes) pair, so the list converts directly into a mapping.
word_morphemes_dict = dict(results)

In [None]:
# Lemmatize one chunk of words with the module-level `nlp` pipeline.
def lemmatize_word(word_list):
    """Map each token's text to its lemma for one chunk of words.

    Despite the parameter name, ``word_list`` is passed straight to
    ``nlp(...)``; the caller in this cell supplies a single space-joined string
    of words. When the same token text occurs more than once, the lemma of its
    last occurrence is kept.
    """
    lemma_by_token = {}
    for token in nlp(word_list):
        lemma_by_token[token.text] = token.lemma_
    return lemma_by_token

# Load the small English spaCy pipeline with the parser and NER components disabled.
# NOTE(review): `spacy` is not imported in any visible cell — confirm and add the
# import to the imports cell.
nlp = spacy.load("en_core_web_sm", disable=["parser", "ner"])

# Number of processes to use (adjust according to your CPU)
num_processes = 8

# Split the word list into space-joined chunks of `chunk_size` words each.
# max(1, ...) keeps the step non-zero when there are fewer words than processes;
# range() raises ValueError for a zero step.
# NOTE(review): `word_list` is not defined in any visible cell — confirm where it comes from.
chunk_size = max(1, len(word_list) // num_processes)
word_chunks = [" ".join(word_list[i:i+chunk_size]) for i in range(0, len(word_list), chunk_size)]

# Lemmatize the chunks in parallel; `lemmas` holds one {token text: lemma} dict per chunk.
with Pool(num_processes) as pool:
    lemmas = pool.map(lemmatize_word, word_chunks)

In [None]:
# Merge the per-chunk {token text: lemma} dicts into one lookup table. When the
# same token text appears in several chunks, the entry from the later chunk wins.
lemma_dict = {
  token_text: token_lemma
  for chunk_lemmas in lemmas
  for token_text, token_lemma in chunk_lemmas.items()
}

In [None]:
# Build `depd`: lemma of each headword -> set of (lemmatized) words used in the
# text stored under 'de' in each of its senses.
# NOTE(review): this cell reads entries as word_def['currentWord']['wordRoot'] and
# word_def['senses'], while other cells read 'root' / 'comparisons' from `wordup`
# entries — confirm which schema `wordup` has at this point.
# NOTE(review): two headwords that share a lemma overwrite each other; only the
# last one's words are kept — confirm that is intended.
depd = {}
for word_def in wordup:
  headword = word_def['currentWord']['wordRoot'].lower()
  if headword not in lemma_dict:
    continue  # headwords without a known lemma are skipped entirely
  dependencies = set()
  depd[lemma_dict[headword]] = dependencies
  for sense in word_def['senses']:
    for token in re.findall(r'\b\w+\b', sense['de']):
      token = token.lower()
      # Use the lemma when one is known, otherwise the lower-cased token itself.
      dependencies.add(lemma_dict.get(token, token))

In [None]:
# Load a saved dependency mapping from Drive. This rebinds `depd`, replacing
# whatever an earlier cell stored under that name; JSON has no set type, so any
# collections come back as lists.
with open("/content/drive/MyDrive/Datasets/dependencies.json", "r") as saved_dependencies:
  depd = json.load(saved_dependencies)

In [None]:
import networkx as nx
import matplotlib.pyplot as plt

# Directed dependency graph: one node per key of `depd`, plus an edge
# key -> dependant for every dependant listed under that key.
G = nx.DiGraph()

# Iterating a dict yields its keys, so every key becomes a node
# (including keys that have no dependants).
G.add_nodes_from(depd)

G.add_edges_from(
    (key, dependant)
    for key, dependants in depd.items()
    for dependant in dependants
)

In [None]:
def find_random_cycle(G, cycles=None, numbers=1000):
  """Sample random start nodes and collect the cycles found from them.

  For each of ``numbers`` draws, a node is picked uniformly at random (with
  replacement) and ``nx.find_cycle`` is asked for a cycle starting from it.
  Draws from which no cycle is found are skipped.

  Args:
    G: graph exposing ``nodes()`` and accepted by ``nx.find_cycle``.
    cycles: optional list of previously found cycles. It is extended IN PLACE
      with every cycle found (duplicates included), so it can be passed again
      to accumulate results over several calls.
    numbers: how many random start nodes to try.

  Returns:
    A new list holding the distinct cycles of ``cycles`` in first-seen order.
    Two cycles count as equal only if their edge lists are equal, so rotations
    of the same cycle are kept separately.
  """
  if cycles is None:
    cycles = []

  nodes = list(G.nodes())
  # An empty graph has nothing to sample from (np.random.randint(0) raises
  # ValueError), so skip the search and only de-duplicate what was passed in.
  if nodes:
    for _ in range(numbers):
      source = nodes[np.random.randint(len(nodes))]
      try:
        cycles.append(nx.find_cycle(G, source=source))
      except nx.NetworkXNoCycle:
        # Deliberate best effort: no cycle was found from this start node.
        pass

  unique_cycles = []
  for cycle in cycles:
    if cycle not in unique_cycles:
      unique_cycles.append(cycle)
  return unique_cycles

In [None]:
# Start with an empty list of cycles.
# NOTE(review): presumably the accumulator later passed as `cycles` to
# find_random_cycle — confirm against the calling cell.
cycles = list()

In [None]:
def split_words(text):
    """Split ``text`` into word tokens, dropping punctuation and whitespace.

    A token is a run of word characters, optionally followed by an apostrophe
    and another run of word characters, so ``"someone's"`` and ``"don't"`` stay
    single tokens. Returns the tokens in order of appearance.
    """
    word_pattern = r"\b\w+(?:'\w+)?\b|\w+"
    return [found.group(0) for found in re.finditer(word_pattern, text)]

# Example usage: tokenize a sample two-paragraph explanation. Punctuation and the
# blank line between paragraphs are dropped, while possessives such as "someone's"
# stay single tokens.
text = "Censure is a formal expression of disapproval or criticism, usually by an official organization or authority figure. It is applicable when someone has done something wrong or unethical and needs to be held accountable for their actions in a professional or political setting.\n\nIn simpler terms, censure is a way to express disapproval or criticism of someone's actions, while recrimination is a way to shift blame onto someone else instead of taking responsibility for one's own actions."
words = split_words(text)
print(words)

['Censure', 'is', 'a', 'formal', 'expression', 'of', 'disapproval', 'or', 'criticism', 'usually', 'by', 'an', 'official', 'organization', 'or', 'authority', 'figure', 'It', 'is', 'applicable', 'when', 'someone', 'has', 'done', 'something', 'wrong', 'or', 'unethical', 'and', 'needs', 'to', 'be', 'held', 'accountable', 'for', 'their', 'actions', 'in', 'a', 'professional', 'or', 'political', 'setting', 'In', 'simpler', 'terms', 'censure', 'is', 'a', 'way', 'to', 'express', 'disapproval', 'or', 'criticism', 'of', "someone's", 'actions', 'while', 'recrimination', 'is', 'a', 'way', 'to', 'shift', 'blame', 'onto', 'someone', 'else', 'instead', 'of', 'taking', 'responsibility', 'for', "one's", 'own', 'actions']


In [None]:
# Porter-stemming demo on a single word.
# NOTE(review): `word_tokenize` and `PorterStemmer` are not imported in any visible
# cell; they look like NLTK's (`nltk.tokenize.word_tokenize`, `nltk.stem.PorterStemmer`)
# — confirm and add the imports to the imports cell.

# Sample text for stemming
text = "dissection"

# Tokenize the text
words = word_tokenize(text)

# Initialize the Porter Stemmer
porter_stemmer = PorterStemmer()

# Apply stemming to each word
stemmed_words = [porter_stemmer.stem(word) for word in words]

# Print the original text and the stemmed text
print("Original Text:")
print(text)
print("\nStemmed Text:")
print(" ".join(stemmed_words))

Original Text:
dissection

Stemmed Text:
dissect
