In [1]:
import this

The Zen of Python, by Tim Peters

Beautiful is better than ugly.
Explicit is better than implicit.
Simple is better than complex.
Complex is better than complicated.
Flat is better than nested.
Sparse is better than dense.
Readability counts.
Special cases aren't special enough to break the rules.
Although practicality beats purity.
Errors should never pass silently.
Unless explicitly silenced.
In the face of ambiguity, refuse the temptation to guess.
There should be one-- and preferably only one --obvious way to do it.
Although that way may not be obvious at first unless you're Dutch.
Now is better than never.
Although never is often better than *right* now.
If the implementation is hard to explain, it's a bad idea.
If the implementation is easy to explain, it may be a good idea.
Namespaces are one honking great idea -- let's do more of those!


In [6]:
#version 0.1.
# disambiguate homographs of differnet lgs,
# homonyms within the same lg
# overall more precise scraping
# remove col "links" - takes up too much space on disk. better to merge on word+disamb_nr 

from requests import get
from bs4 import BeautifulSoup
import pandas as pd
from tqdm import tqdm
import concurrent.futures
import re
import numpy as np
import shutil
import logging
logging.basicConfig(filename='parser.log', encoding='utf-8', level=logging.DEBUG)

class Scrape():
    
    def __init__(self):
        self.maxthreads = 100
        self.gloss = []
        self.ipa = []
        self.etym = []
        self.l1 = ""

    def download_url(self, url):
        html = get(url)
        soup = BeautifulSoup(html.text, 'html.parser')        
        word = url.rsplit('/', 1)[1]
        gloss = []
        ipa = []
        etym= []
        addcut = 0
        ety = ""
        theipa = ""
        
        for h2 in soup.find_all("h2"): #Language headers like "English", "Hawaiian", "Maori"
            h2L1 = h2.find("span", {"id": self.l1}) #find the heading of our target lg
            if h2L1:
                for sib in h2L1.parent.next_siblings: #loop through all siblings ("next sibling doesnt work")
                        
                    if sib.name == 'h2': #dont look for siblings beyond the next header (= the next language)
                        break

                    if sib.name == "h3":
                     
                        if "Pronunciation" in sib.text:
                            #pro = sib.find_next_sibling("ul")
                            for pro in sib.next_siblings:
                                if len(ipa)==1:
                                    break
                                if pro.name == "ul":
                                    for li in pro.find_all("li"):
                                        theipa = li.find("span", {"class": "IPA"})
                                        if theipa:
                                            ipa.append(theipa.text)
                                            break

                                elif pro.name == "h2":
                                    ipa.append("")
                                    break

                        if "Etymology" in sib.text:
                            #glo = sib.find_next_sibling("ol")
                            #if glo:
                            for glo in sib.next_siblings:
                                
                                if glo.name == "p":                   
                                    if "English" in glo.text: #some etymons occure twice, make sure to grab the one that is an LW from English
                                        try:
                                            ety = glo.i.text
                                        except:
                                            pass
                                            
                                elif glo.name == "ol":
                                    try:
                                        #gloss.append([i.text for i in glo.find_all("li")])
                                        glo = re.sub(r"\n", ", ", glo.text)
                                        addcut = glo[50:].find(" ")
                                        if addcut == -1:
                                            gloss.append(glo)
                                            etym.append(ety)
                                        else:
                                            gloss.append(glo[:50+addcut])
                                            etym.append(ety)
                                    except AttributeError:
                                        gloss.append("")
                                        etym.append(ety)
            
                                elif glo.name == "h2":
                                    break
                        


                if ipa == []:
                    ipa = [""]

                if gloss == []:
                    #if word == "hana":
                     #   print("list empty")
                      #  if etym == []:
                       #     print("etym also empty")
                        #else:
                         #   print("Here's the mistake: etym is not empty")
                    for sib in h2L1.parent.next_siblings: #loop through all siblings ("next sibling doesnt work")
                        if sib.name == "h2":
                            break
                        if sib.name == "ol":
                            #if word == "hana":
                             #   print(gloss, etym)
                            gloss.append(sib.li.text)
                            etym.append(ety)
                            #if word == "hana":
                             #   print(gloss, etym)

                            
                if gloss == []:
                    #if word == "hana":
                     #   print("yup, entered this weird spot here")
                    gloss = [""]
                    etym = [""]
                
                ipa = ipa*len(gloss)
                self.gloss.append((gloss, word))
                self.ipa.append((ipa, word))
                self.etym.append((etym, word))
                    
    def download_info(self, lg, url_list):
        threads = min(self.maxthreads, len(url_list))
        with concurrent.futures.ThreadPoolExecutor(max_workers=threads) as executor:
            executor.map(self.download_url, url_list)

    def main(self, lg, url_list):
        self.download_info(lg, url_list)

def main(lglist):
    scr = Scrape()
    lglist = [i for i in lglist if i !="English"]
    for lg in lglist:
        scr.ipa = []
        scr.gloss = []
        scr.etym = []
        scr.l1=re.sub(" ", "_", lg)
        lg = scr.l1.lower()
        print(lg)
        file = f"C:\\Users\\Viktor\\OneDrive\\PhD cloud\\Vorgehensweisen\\loanpy11\\tests_phd\\test_loanfinder\\false_positives\\lgs\\{lg}.txt"
        
        if "proto" in lg or "ancient" in lg:
            shutil.move(file, f"C:\\Users\\Viktor\\OneDrive\\PhD cloud\\Vorgehensweisen\\loanpy11\\tests_phd\\test_loanfinder\\false_positives\\lgs\\proto_lgs\\{lg}.txt")
            continue
        
        try:
            dflg = pd.read_csv(file, header=None, sep="\n")
        except pd.errors.EmptyDataError:
            shutil.move(file, f"C:\\Users\\Viktor\\OneDrive\\PhD cloud\\Vorgehensweisen\\loanpy11\\tests_phd\\test_loanfinder\\false_positives\\lgs\\empty_lists\\{lg}.txt")
            continue
        except FileNotFoundError:
            continue
            
        dflg.columns = ["L2_orth"]
        dflg["L2_orth"] = [str(i) for i in dflg["L2_orth"]]
        dflg = dflg[~dflg["L2_orth"].str.contains(lg)].reset_index(drop=True)
        dflg = dflg.replace(r'^\s*$', np.nan, regex=True).dropna()     
        url_list = [re.sub(" ", "_", f"https://en.wiktionary.org/wiki/{i}") for i in dflg["L2_orth"]]
        scr.main(lg, url_list)
        
        dfipa = pd.DataFrame(scr.ipa, columns = ['L2_ipa', 'L2_orth'])
        dfgloss = pd.DataFrame(scr.gloss, columns = ['L2_gloss', 'L2_orth'])
        dfetym = pd.DataFrame(scr.etym, columns = ['L2_etym', 'L2_orth'])
        
        dflg = dflg.merge(dfipa, left_on='L2_orth', right_on='L2_orth')
        dflg = dflg.merge(dfgloss, left_on='L2_orth', right_on='L2_orth')
        dflg = dflg.merge(dfetym, left_on='L2_orth', right_on='L2_orth')
            
        try: #usually this works, but sometimes there are 5-20 wrong rows
            dflg = dflg.explode(["L2_ipa", "L2_gloss", "L2_etym"])
        except ValueError: # pad wrong lengths
            ip, gl, et = [], [], []
            for nr, (i,g,e) in enumerate(zip(dflg["L2_ipa"], dflg["L2_gloss"], dflg["L2_etym"])):
                if len(i) == len(g) and len(g) == len(e):
                    ip.append(i)
                    gl.append(g)
                    et.append(e)
                else:
                    padi = [i[0]]+[""]*(len(g)-1)
                    pade = [e[0]]+[""]*(len(g)-1)
                    ip.append(padi)
                    gl.append(g)
                    et.append(pade)
                    logging.debug(f'different len in {lg}.txt for gloss {g} in row {nr}. Turned {i} to {padi} and {e} to {pade}')

            dflg["L2_ipa"], dflg["L2_gloss"], dflg["L2_etym"] = ip, gl, et
            dflg = dflg.explode(["L2_ipa", "L2_gloss", "L2_etym"])

        dflg.to_csv(f"C:\\Users\\Viktor\\OneDrive\\PhD cloud\\Vorgehensweisen\\loanpy11\\tests_phd\\test_loanfinder\\false_positives\\dfs\\{lg}.csv", encoding="utf-8", index=False)

In [7]:
#run above code
import ast

with open('lglist_full.txt', 'r', encoding="utf-8") as f:
    lglist = ast.literal_eval(f.read())

lglist = [i for i in lglist[:lglist.index("Norwegian Bokmål")] if " " in i]
main(lglist)

abenlen_ayta
abu'_arapesh
acatepec_me'phaa
aghu_tharrnggala
agusan_manobo
aka_(central_africa)
akkala_sami
alcozauca_mixtec
alemannic_german
algerian_arabic
aloápam_zapotec
amatlán_zapotec
ambala_ayta
ambonese_malay
american_sign_language
amganad_ifugao
ampari_dogon
ana_tinga_dogon
ancient_greek


FileNotFoundError: [Errno 2] No such file or directory: 'C:\\Users\\Viktor\\OneDrive\\PhD cloud\\Vorgehensweisen\\loanpy11\\tests_phd\\test_loanfinder\\false_positives\\lgs\\ancient_greek.txt'

In [None]:
html = get("https://en.wiktionary.org/wiki/амҵ")
soup = BeautifulSoup(html.text, 'html.parser')
ipa=[]

for h2 in soup.find_all("h2"): #Language headers like "English", "Hawaiian", "Maori"
    h2L1 = h2.find("span", {"id": "Abkhaz"}) #find the heading of our target lg
    if h2L1:
        for sib in h2L1.parent.next_siblings: #loop through all siblings ("next sibling doesnt work")

            if sib.name == 'h2': #dont look for siblings beyond the next header (= the next language)
                break

            if sib.name == "h3":

                if "Pronunciation" in sib.text:
                    #pro = sib.find_next_sibling("ul")
                    for pro in sib.next_siblings:
                        if pro.name == "ul":
                            theipa = pro.li.find("span", {"class": "IPA"})
                            if theipa:
                                print(theipa.text)

In [None]:
scr = Scrape()
scr.l1 = "A-Pucikwar" 
scr.download_url("https://en.wiktionary.org/wiki/burin")

In [43]:
scr = Scrape()
scr.l1 = "Northern_Yukaghir" 
scr.download_url("https://en.wiktionary.org/wiki/адуо")

In [44]:
scr.gloss

[]

In [52]:
lst = [1,2,3,"4",5]
lst[:lst.index("4")]

[1, 2, 3]

In [None]:
#development of version 0.1
#find out how to disambiguate without preprocessing

from requests import get
from bs4 import BeautifulSoup
import re

def main(url):
    html = get(url)
    word = url.rsplit('/', 1)[1]
    soup = BeautifulSoup(html.text, 'html.parser')
    gloss = []
    etym= []
    ipa = []
    addcut = 0
    
    l1= "'Are'are"
    for h2 in soup.find_all("h2"): #Language headers like "English", "Hawaiian", "Maori"
        h2L1 = h2.find("span", {"id": f"{l1}"}) #find the heading of our target lg
        if h2L1:
            for sib in h2L1.parent.next_siblings: #loop through all siblings ("next sibling doesnt work")
                if sib.name == "h3":
                    
                    if "Pronunciation" in sib.text:
                        pro = sib.find_next_sibling("ul")
                        if pro:
                            ipa.append(pro.li.span.text)
                        else:
                            ipa.append("")
                            
                    if "Etymology" in sib.text:
                
                        glo = sib.find_next_sibling("ol")
                        if glo:
                            try:
                                #gloss.append([i.text for i in glo.find_all("li")])
                                glo = re.sub(r"\n", ", ", glo.text)
                                addcut = glo[50:].find(" ")
                                if addcut == -1:
                                    gloss.append(glo)
                                else:
                                    gloss.append(glo[:50+addcut])
                            except AttributeError:
                                gloss.append("")
                        else:
                            gloss.append("")
                           
                        ety = sib.find_next_sibling("p")
                        if ety:
                            if "English" in ety.text: #some etymons occure twice, make sure to grab the one that is an LW from English
                                try:
                                    etym.append(ety.i.text) #this is the final info we want
                                except:
                                    etym.append("")
                            else:
                                etym.append("")
                        else:
                            etym.append("")
                            
                elif sib.name == 'h2': #dont look for siblings beyond the next header (= the next language)
                    break
                    
            if ipa == []:
                ipa = [""]

            if gloss == []:
                for sib in h2L1.parent.next_siblings: #loop through all siblings ("next sibling doesnt work")
                    if sib.name == "ol":                
                        gloss.append(sib.li.text) 
                        
    ipa = ipa*len(gloss)
    gloss, ipa, etym = (gloss, word), (ipa, word), (etym, word)
        
    return gloss, ipa, etym
main("https://en.wiktionary.org/wiki/teteku")

In [None]:
#version 0.0 - works.
#a bit buggy: main problem = homonyms

from requests import get
from bs4 import BeautifulSoup
import pandas as pd
from tqdm import tqdm
import concurrent.futures
import re
import numpy as np
import shutil

class Scrape():
    
    def __init__(self):
        self.maxthreads = 100
        self.gloss = []
        self.ipa = []

    def download_url(self, url):
        html = get(url)
        soup = BeautifulSoup(html.text, 'html.parser')
        try:
            self.gloss.append((soup.ol.li.text, url))
        except: 
            self.gloss.append(("", url))
        try:
            self.ipa.append((soup.find("span", class_="IPA").text, url))
        except:
            self.ipa.append(("-", url))
    
    def download_info(self, lg_list, url_list):
        for lg in lg_list: #delete this line and input lg instead of lg_list. bc I'm looping alredy in main()
            threads = min(self.maxthreads, len(url_list))
            with concurrent.futures.ThreadPoolExecutor(max_workers=threads) as executor:
                executor.map(self.download_url, url_list)

    def main(self, lg_list, url_list):
        self.download_info(lg_list, url_list)

def main(lglist):
    scr = Scrape()
    for lg in lglist:
        scr.ipa = []
        scr.gloass = []
        lg = re.sub(" ", "_", lg).lower()
        print(lg)
        file = f"C:\\Users\\Viktor\\OneDrive\\PhD cloud\\Vorgehensweisen\\baseline tests\\other_lgs\\lgs\\{lg}.txt"
        
        if "proto" in lg:
            shutil.move(file, f"C:\\Users\\Viktor\\OneDrive\\PhD cloud\\Vorgehensweisen\\baseline tests\\other_lgs\\lgs\\proto_lgs\\{lg}.txt")
            continue
        
        try:
            dflg = pd.read_csv(file, header=None, sep="\n")
        except pd.errors.EmptyDataError:
            shutil.move(file, f"C:\\Users\\Viktor\\OneDrive\\PhD cloud\\Vorgehensweisen\\baseline tests\\other_lgs\\lgs\\empty_lists\\{lg}.txt")
            continue
        except FileNotFoundError:
            continue
            
        dflg.columns = ["L2_orth"]
        dflg["L2_orth"] = [str(i) for i in dflg["L2_orth"]]
        dflg["L2_orth"] = [re.sub(f"Reconstruction:{lg}\/", "", i) for i in dflg["L2_orth"]]
        dflg = dflg[~dflg["L2_orth"].str.contains(lg)].reset_index(drop=True)
        dflg["L2_link"] = [re.sub(" ", "_", f"https://en.wiktionary.org/wiki/{i}#{lg}") for i in dflg["L2_orth"]]
        dflg = dflg.replace(r'^\s*$', np.nan, regex=True).dropna()
        scr.main([lg], dflg["L2_link"])
        dfipa = pd.DataFrame(scr.ipa, columns = ['L2_ipa', 'L2_link'])
        dfgloss = pd.DataFrame(scr.gloss, columns = ['L2_gloss', 'L2_link'])
        dflg = dflg.merge(dfipa, left_on='L2_link', right_on='L2_link')
        dflg = dflg.merge(dfgloss, left_on='L2_link', right_on='L2_link')
        dflg.to_csv(f"C:\\Users\\Viktor\\OneDrive\\PhD cloud\\Vorgehensweisen\\baseline tests\\other_lgs\\dfs\\{lg}.csv", encoding="utf-8", index=False)

In [None]:
#run above code
import ast

with open('lglist_full.txt', 'r', encoding="utf-8") as f:
    lglist = ast.literal_eval(f.read())

main(lglist[:2])