In [2]:
import requests
from bs4 import BeautifulSoup
import json
import threading
import re
import numpy as np
from functools import reduce
from multiprocessing import Pool
from itertools import combinations

In [2]:
threads = []
num_threads = 20

In [11]:
def scrape_word(word):
  search_for = word.lower().replace(" ", "-")
  url = f"https://www.wordupapp.co/dictionary/{search_for}"
  response = requests.get(url)

  if response.status_code == 200:
    soup = BeautifulSoup(response.text, 'html.parser')
    json_tag = soup.find('script', {'type': 'application/json'})

    if json_tag:
      json_data = json.loads(json_tag.text)
      json_data['protips'] = dict()

      pro_tip = soup.find('div', {'class': 'ProTips_container__HLam_'})
      for child in list(pro_tip.children):
        grandchildren = list(child.children)
        json_data['protips'][grandchildren[1].text] = grandchildren[2].text
    else:
      print(f"No tag with type='application/json' of {search_for}.")

  else:
    print(f"Failed to retrieve {search_for}")

  return json_data

In [12]:
data = scrape_word('abandon')

In [13]:
data

{'props': {'pageProps': {'currentWord': {'wordRoot': 'abandon',
    'id': 5966,
    'phonemic': 'É\x99Ë\x88bÃ¦ndÉ\x99n|É\x99Ë\x88bÃ¦ndÉ\x99n\r',
    'bigId': 'a4456971-8ef0-4903-859d-5bca5ee0280d'},
   'senses': [{'id': 'a4456971-8ef0-4903-859d-5bca5ee0280d',
     'de': 'to leave someone, especially when they need you',
     'ty': 'verb',
     'ex': 'He abandoned his family.',
     'use': '',
     'sy': '',
     'op': '',
     're': '',
     'tp': '',
     'cl': ['abandon family', ' abandon friend', ' abandon love'],
     'ImageSrc': 'https://word-images.cdn-wordup.com/senses/a4456971-8ef0-4903-859d-5bca5ee0280d.webp?v=1',
     'Tips': [{'title': 'Emotional Weight',
       'description': "Using 'abandon' can evoke strong feelings as it implies neglect or betrayal.",
       'example': "Children felt abandoned when their parents didn't show up for the play.\r",
       'imageUrl': 'https://word-images.cdn-wordup.com/tips/qgw6hw3qQOwi20McnsvzZ8hajbA.webp'},
      {'title': 'Not Temporary',

In [15]:
def scrape_thread(num, word_list, scraped_word):
    length = len(word_list)
    total = 0
    for i in range(length):
        if i % num_threads == num:
            search_for = word_list[i].lower().replace(" ", "-")
            url = f"https://www.wordupapp.co/dictionary/{search_for}"
            response = requests.get(url)

            # Check if the request was successful (status code 200)
            if response.status_code == 200:
                soup = BeautifulSoup(response.text, 'html.parser')
                json_tag = soup.find('script', {'type': 'application/json'})

                if json_tag:
                    json_data = json.loads(json_tag.text)
                    json_data['protips'] = dict()

                    pro_tip = soup.find('div', {'class': 'ProTips_container__HLam_'})
                    for child in list(pro_tip.children):
                        grandchildren = list(child.children)
                        json_data['protips'][grandchildren[1].text] = grandchildren[2].text

                    scraped_word.append(json_data)
                    total += 1
                    if total % 100 == 0:
                        print(f"Thread {i} reached {total} words")
                else:
                    print(f"No tag with type='application/json' of {search_for}.")
            else:
                print(f"Failed to retrieve {search_for}")

In [None]:
print(len(scraped_word))
for i in range(num_threads):
    t = threading.Thread(target=scrape_thread, args=(i, word_list, scraped_word))
    threads.append(t)
    t.start()

for t in threads:
    t.join()

print("All tasks completed")
print(len(scraped_word))

In [None]:
filtered_scraped_words = [word for word in scraped_word if 'wordRoot' in word.get('props', {}).get('pageProps', {}).get('currentWord', {})]

In [None]:
wordup_processed = [{'root':word['props']['pageProps']['currentWord']['wordRoot'],
                     'senses':word['props']['pageProps']['senses'],
                     'comparisons':word['props']['pageProps']['comparisons']}
                    for word in new_wordup]

In [None]:
def get_graph(wordup):
  G = nx.DiGraph()
  nodes = list()
  edges = list()
  for word_def in wordup:
    nodes.append(word_def['root'].lower())
    for comp in word_def['comparisons']:
      edges.append((word_def['root'].lower(), comp.lower()))
  G.add_nodes_from(nodes)
  G.add_edges_from(edges)
  return G

In [None]:
def get_unscraped(graph):
  result = set()
  all_nodes = set()
  presented = list(graph.keys())
  for value in graph.values():
    all_nodes.update(value)

  for node in all_nodes:
    if node not in presented:
      result.add(node)
  return list(result)

In [None]:
def get_unfound(wordup):
    words = set()
    for word in wordup:
      words.add(word['currentWord']['wordRoot'].lower())
      for comp in word['comparisons'].keys():
        words.add(comp.lower())

    unfound = set()
    for word in list(words):
      if not lookup(word, wordup):
        unfound.add(word)

    return unfound

In [None]:
def scrape(word, wordup):
  if " " in word:
    search_for = word.replace(" ", "-")
  elif "-" in word:
    search_for = word.replace("-", "3")
  else:
    search_for = word
  url = f"https://www.wordupapp.co/dictionary/{search_for}"
  response = requests.get(url)

  if response.status_code == 200:
      soup = BeautifulSoup(response.text, 'html.parser')
      json_tag = soup.find('script', {'type': 'application/json'})

      if json_tag:
          json_data = json.loads(json_tag.text)
          wordup.append(json_data)
          print(f"Scraped {search_for} successfully!")

      else:
          print(f"No tag with type='application/json' of {search_for}.")
  else:
      print(f"Failed to retrieve {search_for}")
  return wordup

In [None]:
def add_to_processed(word, wordup_raw, wordup_processed):
  for word_def in wordup_raw:
    if word_def['props']['pageProps']['currentWord']['wordRoot'] == word:
      if not lookup(word.lower(), wordup_processed):
        wordup_processed.append({'root':word_def['props']['pageProps']['currentWord']['wordRoot'],
                      'senses':word_def['props']['pageProps']['senses'],
                      'comparisons':word_def['props']['pageProps']['comparisons']})
        print(f"Add {word} successfully!")
      else:
        print(f"Already there {word}")
      return wordup_processed
  print(f"Cannot add {word}!")
  return wordup_processed