In [1]:
import re
import pandas as pd
import dhlab as dh
import datetime

In [2]:
# Show every row and never truncate cell text when DataFrames are displayed.
pd.set_option('display.max_rows', None)
pd.set_option('display.max_colwidth', None)

#### Bygge korpus

In [3]:
# Corpus parameters, passed to dh.Corpus in the next cell.
subject = 'lingvistikk'   # subject filter
corp_limit = 1_000        # value passed as `limit`
from_year = 1950          # value passed as `from_year`

In [4]:
# Build the corpus from the parameters defined in the previous cell.
corpus = dh.Corpus(
    doctype='digibok',
    subject=subject,
    limit=corp_limit,
    from_year=from_year,
)

In [5]:
#corpus

#### Finne konkordanser

In [6]:
# Calendar year at the time the notebook is run.
curr_year = datetime.date.today().year

In [7]:
# Concordance search / matching parameters.
strictness = 'moderate'          # note: reassigned further down, right before the regex dispatch
yearspan = (1000, curr_year)     # (first year, last year) to search for
conc_limit = 4_000               # value passed as `limit` to dh.Concordance

In [8]:
# Candidate year numbers to search for. `range` excludes its stop value, so
# add 1 to keep the last year of `yearspan` (the current year) in the list.
tall = list(range(yearspan[0], yearspan[1] + 1))

In [9]:
# Join all candidate years into one "1000 OR 1001 OR ..." query string.
tallOR = ' OR '.join(map(str, tall))

In [10]:
# Fetch concordances for the OR-joined year query within the corpus.
tallconc = dh.Concordance(
    corpus,
    tallOR,
    limit=conc_limit,
)

In [11]:
# Work on a copy so that `tallconc.frame` itself is never modified and the
# cell can be re-run safely.
concs = tallconc.frame.copy()

# Remove the '<b>' / '</b>' tags and the '...' markers from every concordance.
concs['concordance'] = concs['concordance'].apply(
    lambda x: x.replace('<b>', '').replace('</b>', '').replace('...', '')
)

# Explicit copy: later cells add a column to `concs1`, which would otherwise
# be a chained assignment on a slice of `concs` (SettingWithCopyWarning).
concs1 = concs[['urn', 'concordance']].copy()

In [12]:
#concs1

#### Definere alle regexene

In [13]:
# NOTE: every letter class below uses [A-Za-z] rather than [A-z]. The range
# A-z spans ASCII 0x41-0x7A and therefore also matches [ \ ] ^ _ and `,
# which are not letters.

# first filtering, finds concordances with year in parentheses
# (a four-digit number somewhere between '(' or ';' and ')' or ';')
regex1 = r'[\(;].*?\D\d{4}\D.*?[\);]'

# Strict
# s1: "Author[, Author og/and/& Author] [et al.|mfl.][,] 1999[a][: 12-14]",
#     preceded by '(' or ';' plus one whitespace and followed by whitespace and ')' or ';'.
# s2: the same author part followed by the year (and optional pages) in parentheses.
regex_s1 = r"(?<=[(;]\s)(?:[A-ZÀ-Ž][A-Za-zÀ-ž-]+\s*(?:,|og|and|&)\s*)*[A-ZÀ-Ž][A-Za-zÀ-ž-]+\s*(?:et\s*al\.|mfl\.)?\s*,?\s*\d{4}(?:\s*[a-zæøå])?(?:\s*[,:]\s*(?:[ps]\s*\.\s*)?\d{1,4}(?:\s*[,–-]\s*\d{1,4})*)?(?=\s[);])"
regex_s2 = r"(?:[A-ZÀ-Ž][A-Za-zÀ-ž-]+\s*(?:,|og|and|&)\s*)*[A-ZÀ-Ž][A-Za-zÀ-ž-]+\s*(?:et\s*al\.|mfl\.)?\s*\(\s*\d{4}(?:\s*[a-zæøå])?(?:\s*[,:]\s*(?:[ps]\s*\.\s*)?\d{1,4}(?:\s*[,–-]\s*\d{1,4})*)?\s*\)"

# Moderate
# Like strict, but also accepts free text before the last author name (m1),
# more spellings of "et al" / "med flere", a bracketed second year such as
# "1999 [1950]", and upper-case page markers (P./S.).
regex_m1 = r"(?<=[(;])\s*[^(;\d]*?[A-ZÀ-Ž][A-Za-zÀ-ž-]+\s*(?:et\s*al\.?|m(?:ed|\s*\.?)\s*fl(?:ei?re?|\s*\.?))?\s*,?\s*\d{4}\s*[a-zæøå]?(?:\s*\[\s*\d{4}\s*\]\s*[a-zæøå]?)?(?:\s*[,:]\s*(?:[PpSs]\s*\.\s*)?\d{1,4}(?:\s*[,–-]\s*\d{1,4})*)?(?=\s[);])"
regex_m2 = r"(?:[A-ZÀ-Ž][A-Za-zÀ-ž-]+\s*(?:,|og|and|&)\s*)*[A-ZÀ-Ž][A-Za-zÀ-ž-]+\s*(?:et\s*al\.?|m(?:ed|\s*\.?)\s*fl(?:ei?re?|\s*\.?))?\s*\(\s*\d{4}\s*[a-zæøå]?(?:\s*\[\s*\d{4}\s*\]\s*[a-zæøå]?)?(?:\s*[,:]\s*(?:[PpSs]\s*\.\s*)?\d{1,4}(?:\s*[,–-]\s*\d{1,4})*)?\s*\)"

# Lenient
# l1: capitalised words or initials followed by a year, ending in ')' or ';'
#     (used further down as a pre-filter to build `concs_l`).
# l2: anything between '(' / ';' and ')' / ';' that contains a letter before a
#     four-digit number (same pattern as regex_o1).
# l3: capitalised words or initials followed by a parenthesis starting with a year.
regex_l1 = r"(?:[A-ZÀ-Ž](?:[A-Za-zÀ-ž-]+|\s*\.)\s*,?\s*)+(?:\s*(?:et\s*al\.?|m(?:ed|\s*\.?)\s*fl(?:ei?re?|\s*\.?))\s*)?,?\s*\[?\s*\d{4}\s*[a-zæøå]?\s*\.?\s*\]?(?:\s*[,;:/)\s–-]\s*(?:[PpSs]\s*\.?)?\[?\s*\d{1,4}\s*[a-zæøå]?\s*\.?\s*\]?(?:\s*[,–-]\s*\d{1,4}\s*[a-zæøå]?\s*\.?)?)*\s*[);]"
regex_l2 = r"(?<=[(;])[^(;\d]*?[A-Za-zÀ-ž][^(;]*?\d{4}[^);]*?(?=[);])"
regex_l3 = r"(?:(?:[A-ZÀ-Ž](?:[A-Za-zÀ-ž-]+|\s*\.)\s*,?\s*)+(?:og|and|&)\s*)?(?:[A-ZÀ-Ž](?:[A-Za-zÀ-ž-]+|\s*\.)\s*,?\s*)+(?:\s*(?:et\s*al\.?|m(?:ed|\s*\.?)\s*fl(?:ei?re?|\s*\.?))\s*)?\s*\(\s*\d{4}\D[^)]*?\)"

# Open
# o1: same pattern as regex_l2.
# o2: one to five whitespace-separated tokens followed by a parenthesis
#     that starts with a four-digit number.
regex_o1 = r"(?<=[(;])[^(;\d]*?[A-Za-zÀ-ž][^(;]*?\d{4}[^);]*?(?=[);])"
regex_o2 = r"(?:[^()\s]+\s+){1,5}\(\s*\d{4}[^)]*?\)"

# NOU, StMeld, Prop
# Matches "NOU 1999: 12", "(St.) Meld. (St.) (nr.) 12 (1999-2000)" and
# "Prop. 12 L (1999-2000)", each optionally followed by a page reference.
regex_nou = r"(?:(?:NOU|nou)\s*\(?\s*\d{4}\s*:\s*\d{1,4}|(?:St\s*\.?\s*)?Meld\s*\.?\s*(?:St\s*\.?\s*)?(?:nr\s*\.?\s*)?\d{1,3}\s*\(?\s*\d{4}\s*-\s*\d{4}\s*\)?|Prop\s*\.?\s*\d{1,3}\s*[A-ZÆØÅ]\s*\(?\s*\d{4}\s*-\s*\d{4}\s*\)?)(?:\s*[,(]?\s*[PpSs]\s*\.?\s*\d{1,4})?"


#### Definere findone

In [14]:
def findone(regx, s):
    """Return the first match of ``regx`` in ``s``, or ``"itj no"`` if none.

    Parameters
    ----------
    regx : str or compiled pattern
        Regular expression handed to ``re.findall``.
    s : str
        Text to search.

    Returns
    -------
    The first element of ``re.findall(regx, s)``, or the sentinel string
    ``"itj no"`` when there is no match. Callers compare against this
    sentinel to filter out non-matching rows.
    """
    res = re.findall(regx, s)
    # Explicit emptiness check instead of a bare ``except:``, so that unrelated
    # errors are never silently turned into the "no match" sentinel.
    return res[0] if res else "itj no"

#### Lage concs2 som utgangspunkt

In [None]:
# First match of regex1 per concordance ("itj no" when there is none).
# `.assign` builds a new frame instead of setting a column on what may be a
# slice of another frame, which avoids SettingWithCopyWarning.
concs1 = concs1.assign(
    parentes=concs1['concordance'].apply(lambda x: findone(regex1, x))
)

In [15]:
# Keep only the concordances where regex1 matched, and drop the helper column.
# One `.loc` selection plus an explicit copy, so that adding a column to
# `concs2` in a later cell is not a chained assignment on a slice.
concs2 = concs1.loc[concs1['parentes'] != 'itj no', ['urn', 'concordance']].copy()

#### Forberede concs2 for lenient

In [None]:
# First match of regex_l1 per concordance ("itj no" when there is none).
# `.assign` builds a new frame instead of setting a column on what may be a
# slice of another frame, which avoids SettingWithCopyWarning.
concs2 = concs2.assign(
    parentes=concs2['concordance'].apply(lambda x: findone(regex_l1, x))
)

In [None]:
# Rows of concs2 where regex_l1 matched, reduced to the two original columns.
has_lenient_match = concs2['parentes'] != 'itj no'
concs_l = concs2.loc[has_lenient_match, ['urn', 'concordance']]

#### Definere match_and_explode

In [18]:
def match_and_explode(c, regex):
    """Find all matches of ``regex`` per row and return one row per match.

    Parameters
    ----------
    c : pandas.DataFrame
        Frame whose first column is an identifier and whose second column is
        the text to search; columns are read by position, any further columns
        are ignored.
    regex : str or compiled pattern
        Regular expression applied with ``findall`` to each text.

    Returns
    -------
    pandas.DataFrame
        Two columns labelled ``0`` (identifier) and ``1`` (a single match).
        Rows without any match are left out. The index repeats for matches
        that come from the same input row. If nothing matches at all, an
        empty frame with the same two columns is returned.
    """
    pattern = re.compile(regex)  # compile once instead of once per row
    match = []

    for row in c.values:
        found = pattern.findall(row[1])
        if found:
            match.append((row[0], found))

    # Fix the column labels up front: without them an empty result has no
    # column 1 and ``explode`` raises KeyError.
    match_df = pd.DataFrame(match, columns=[0, 1])
    return match_df.explode(column=1)

#### If-elif-else

In [19]:
# NOTE: this reassignment overrides the `strictness` value set earlier in the
# notebook; change it here to switch between the regex sets.
strictness = 'lenient'

# For each level: i_parentes holds the matches of the first regex of the pair,
# u_parentes the matches of the second.
if strictness == 'strict':
    i_parentes = match_and_explode(concs2, regex_s1)
    u_parentes = match_and_explode(concs2, regex_s2)

elif strictness == 'moderate':
    i_parentes = match_and_explode(concs2, regex_m1)
    u_parentes = match_and_explode(concs2, regex_m2)

elif strictness == 'lenient':
    # The first regex runs on the frame pre-filtered with regex_l1.
    i_parentes = match_and_explode(concs_l, regex_l2)
    u_parentes = match_and_explode(concs2, regex_l3)

elif strictness == 'open':
    i_parentes = match_and_explode(concs2, regex_o1)
    u_parentes = match_and_explode(concs2, regex_o2)

else:
    # Fail loudly: only printing would leave i_parentes / u_parentes undefined
    # (or stale from an earlier run) for the cells that follow.
    raise ValueError(f"Strictness argument is not valid: {strictness!r}")
    


#### Kjøre nou_stmeld_prop

In [20]:
# NOU / Meld. St. / Prop. references are searched in concs1, i.e. without the
# regex1 pre-filter that produced concs2.
noustp = match_and_explode(c=concs1, regex=regex_nou)

#### Legge sammen og sortere

In [21]:
# Stack the three result frames on top of each other with a fresh 0..n-1 index,
# then order the rows by column 0.
match_concat = pd.concat(
    [i_parentes, u_parentes, noustp],
    axis=0,
    ignore_index=True,
)
match_sorted = match_concat.sort_values(by=0, ignore_index=True)

In [22]:
# Display the concatenated matches, sorted by column 0.
match_sorted

Unnamed: 0,0,1
0,URN:NBN:no-nb_digibok_2007022801011,"Læstadius till Fellman , januari 1845 , eit . från Pentikåinen 1998 : 104"
1,URN:NBN:no-nb_digibok_2007022801011,Lapponia ( 1673 )
2,URN:NBN:no-nb_digibok_2007022801011,Læstadius 1840 - 1845 ] 1997
3,URN:NBN:no-nb_digibok_2007022801011,ifølgje Wikmark 1980 : 99
4,URN:NBN:no-nb_digibok_2007032301036,Transactions of the Philological Society 1947
5,URN:NBN:no-nb_digibok_2007050400063,Language Conflict and Language Planning ( 1966 )
6,URN:NBN:no-nb_digibok_2007051504031,Sprache ( 1851 )
7,URN:NBN:no-nb_digibok_2007051504031,"Sprache ( 1782 , 4 th . cd . 1796 )"
8,URN:NBN:no-nb_digibok_2007051504031,Sprache ( 1865 )
9,URN:NBN:no-nb_digibok_2007051504031,Darwin ( 1894 )
