In [24]:
import re
import pandas as pd
import dhlab as dh
import datetime

In [25]:
# Show every row and the full width of every cell when a DataFrame is displayed.
pd.set_option('display.max_rows', None)
pd.set_option('display.max_colwidth', None)

#### Bygge korpus

In [26]:
# Subject keyword passed as `subject` when the corpus is built.
subject = 'lingvistikk'
# Value passed as `limit` when the corpus is built.
corp_limit = 1000
# Value passed as `from_year` when the corpus is built.
from_year = 1950

In [27]:
# Build the dhlab corpus from the configuration values defined above.
corpus = dh.Corpus(
    doctype='digibok',
    subject=subject,
    limit=corp_limit,
    from_year=from_year,
)

In [28]:
#corpus

#### Finne konkordanser

In [29]:
# Calendar year at execution time; used as the upper bound of the year span.
curr_year = datetime.date.today().year

In [30]:
# Names the regex pair to use in the strictness-selection cell.
# NOTE(review): that later cell reassigns `strictness` to 'lenient' before
# branching, so the value set here is not the one in effect.
strictness = 'moderate'
# Lower and upper bounds used to generate the list of year search terms.
yearspan = (1000,curr_year)
# Value passed as `limit` to the concordance search.
conc_limit = 4000

In [31]:
# All years in the span, both bounds included. `range` excludes its stop value,
# so add 1; otherwise the upper bound (the current year) is never searched for.
tall = list(range(yearspan[0], yearspan[1] + 1))

In [32]:
# Join every year into a single "1000 OR 1001 OR ..." query string.
tallOR = ' OR '.join(map(str, tall))

In [33]:
# Fetch concordances from the corpus for the OR-joined year query.
tallconc = dh.Concordance(
    corpus,
    tallOR,
    limit=conc_limit,
)

In [34]:
# `concs` is the concordance frame itself (not a copy), so the cleaned text is
# also visible through `tallconc.frame`, as before.
concs = tallconc.frame

# Strip the highlighting tags and the "..." truncation markers.
# regex=False keeps every pattern literal; as a regex, '...' would match any
# three characters.
concs['concordance'] = (
    concs['concordance']
    .str.replace('<b>', '', regex=False)
    .str.replace('</b>', '', regex=False)
    .str.replace('...', '', regex=False)
)

# Take an explicit copy so that adding columns to `concs1` later neither
# triggers SettingWithCopyWarning nor depends on view/copy semantics.
concs1 = concs[['urn', 'concordance']].copy()

In [35]:
#concs1

#### Definere alle regexene

In [36]:
# First-pass filter: a span that opens with "(" or ";" and closes with ")" or ";"
# and contains a four-digit run bounded by non-digit characters, i.e. a candidate
# year inside parentheses.
regex1 = r'[\(;].*?\D\d{4}\D.*?[\);]'

# NOTE(review): several patterns below use the class range `A-z`, which in ASCII also
# covers the punctuation between "Z" and "a" ([ \ ] ^ _ `); presumably `A-Za-z` was
# intended -- confirm before tightening.

# Strict
# regex_s1: citation inside parentheses. Capitalised name(s) optionally joined by
# "," / "og" / "and" / "&", optional "et al." / "mfl.", a four-digit year with an
# optional letter suffix and an optional page reference. Must be preceded by "(" or ";"
# plus one whitespace character and followed by whitespace plus ")" or ";".
regex_s1 = r"(?<=[(;]\s)(?:[A-ZÀ-Ž][A-zÀ-ž-]+\s*(?:,|og|and|&)\s*)*[A-ZÀ-Ž][A-zÀ-ž-]+\s*(?:et\s*al\.|mfl\.)?\s*,?\s*\d{4}(?:\s*[a-zæøå])?(?:\s*[,:]\s*(?:[ps]\s*\.\s*)?\d{1,4}(?:\s*[,–-]\s*\d{1,4})*)?(?=\s[);])"
# regex_s2: same name list, but the year (with optional page reference) stands in its
# own parentheses after the name, i.e. the "Name ( 1999 )" shape.
regex_s2 = r"(?:[A-ZÀ-Ž][A-zÀ-ž-]+\s*(?:,|og|and|&)\s*)*[A-ZÀ-Ž][A-zÀ-ž-]+\s*(?:et\s*al\.|mfl\.)?\s*\(\s*\d{4}(?:\s*[a-zæøå])?(?:\s*[,:]\s*(?:[ps]\s*\.\s*)?\d{1,4}(?:\s*[,–-]\s*\d{1,4})*)?\s*\)"

# Moderate
# regex_m1: like regex_s1, but allows arbitrary text without digits, "(" or ";" before
# the capitalised name, accepts more spellings of "et al" / "med flere", an optional
# second year in square brackets after the first, and upper- or lower-case p/s page markers.
regex_m1 = r"(?<=[(;])\s*[^(;\d]*?[A-ZÀ-Ž][A-zÀ-ž-]+\s*(?:et\s*al\.?|m(?:ed|\s*\.?)\s*fl(?:ei?re?|\s*\.?))?\s*,?\s*\d{4}\s*[a-zæøå]?(?:\s*\[\s*\d{4}\s*\]\s*[a-zæøå]?)?(?:\s*[,:]\s*(?:[PpSs]\s*\.\s*)?\d{1,4}(?:\s*[,–-]\s*\d{1,4})*)?(?=\s[);])"
# regex_m2: like regex_s2 with the same extensions as regex_m1.
regex_m2 = r"(?:[A-ZÀ-Ž][A-zÀ-ž-]+\s*(?:,|og|and|&)\s*)*[A-ZÀ-Ž][A-zÀ-ž-]+\s*(?:et\s*al\.?|m(?:ed|\s*\.?)\s*fl(?:ei?re?|\s*\.?))?\s*\(\s*\d{4}\s*[a-zæøå]?(?:\s*\[\s*\d{4}\s*\]\s*[a-zæøå]?)?(?:\s*[,:]\s*(?:[PpSs]\s*\.\s*)?\d{1,4}(?:\s*[,–-]\s*\d{1,4})*)?\s*\)"

# Lenient
# regex_l1: used as a row filter to build `concs_l`. One or more capitalised words or
# initials, optional "et al" / "med flere", a four-digit year (optionally in square
# brackets), loosely separated trailing numbers, ending at ")" or ";".
regex_l1 = r"(?:[A-ZÀ-Ž](?:[A-zÀ-ž-]+|\s*\.)\s*,?\s*)+(?:\s*(?:et\s*al\.?|m(?:ed|\s*\.?)\s*fl(?:ei?re?|\s*\.?))\s*)?,?\s*\[?\s*\d{4}\s*[a-zæøå]?\s*\.?\s*\]?(?:\s*[,;:/)\s–-]\s*(?:[PpSs]\s*\.?)?\[?\s*\d{1,4}\s*[a-zæøå]?\s*\.?\s*\]?(?:\s*[,–-]\s*\d{1,4}\s*[a-zæøå]?\s*\.?)?)*\s*[);]"
# regex_l2: everything between "(" or ";" and the next ")" or ";" that contains a
# letter followed later by a four-digit run.
regex_l2 = r"(?<=[(;])[^(;\d]*?[A-zÀ-ž][^(;]*?\d{4}[^);]*?(?=[);])"
# regex_l3: capitalised name(s)/initials, optionally joined by "og" / "and" / "&",
# followed by a parenthesised span that starts with a four-digit run and a non-digit.
regex_l3 = r"(?:(?:[A-ZÀ-Ž](?:[A-zÀ-ž-]+|\s*\.)\s*,?\s*)+(?:og|and|&)\s*)?(?:[A-ZÀ-Ž](?:[A-zÀ-ž-]+|\s*\.)\s*,?\s*)+(?:\s*(?:et\s*al\.?|m(?:ed|\s*\.?)\s*fl(?:ei?re?|\s*\.?))\s*)?\s*\(\s*\d{4}\D[^)]*?\)"

# Open
# regex_o1: NOTE(review) -- this string is identical to regex_l2.
regex_o1 = r"(?<=[(;])[^(;\d]*?[A-zÀ-ž][^(;]*?\d{4}[^);]*?(?=[);])"
# regex_o2: one to five whitespace-separated tokens (containing no parentheses) followed
# by a parenthesised span that starts with a four-digit run.
regex_o2 = r"(?:[^()\s]+\s+){1,5}\(\s*\d{4}[^)]*?\)"

# NOU, StMeld, Prop
# Three alternatives: "NOU <year> : <number>", "(St.) Meld. (St.) (nr.) <number>
# (<year> - <year>)", and "Prop. <number> <capital letter> (<year> - <year>)",
# each optionally followed by a p/s page reference.
regex_nou = r"(?:(?:NOU|nou)\s*\(?\s*\d{4}\s*:\s*\d{1,4}|(?:St\s*\.?\s*)?Meld\s*\.?\s*(?:St\s*\.?\s*)?(?:nr\s*\.?\s*)?\d{1,3}\s*\(?\s*\d{4}\s*-\s*\d{4}\s*\)?|Prop\s*\.?\s*\d{1,3}\s*[A-ZÆØÅ]\s*\(?\s*\d{4}\s*-\s*\d{4}\s*\)?)(?:\s*[,(]?\s*[PpSs]\s*\.?\s*\d{1,4})?"


#### Definere findone

In [37]:
def findone(regx, s, default="itj no"):
    """Return the first match of ``regx`` in ``s``, or ``default`` if there is none.

    Parameters
    ----------
    regx : str or compiled pattern
        Regular expression handed to ``re.findall``.
    s : str
        Text to search.
    default : object, optional
        Value returned when nothing matches. Defaults to the sentinel string
        ``"itj no"`` that the rest of the notebook filters on.

    Returns
    -------
    object
        The first element of ``re.findall(regx, s)`` (a string, or a tuple of
        strings when the pattern has several capturing groups), or ``default``.
    """
    res = re.findall(regx, s)
    # Explicit emptiness check instead of a bare `except:`, which would also
    # swallow unrelated errors such as KeyboardInterrupt.
    return res[0] if res else default

#### Lage concs2 som utgangspunkt

In [38]:
# Tag each concordance with its first parenthesised-year span (or the "itj no"
# sentinel returned by findone). `.assign` builds a new frame, which avoids the
# SettingWithCopyWarning raised when a column is added to a slice of another frame.
concs1 = concs1.assign(
    parentes=concs1['concordance'].apply(lambda text: findone(regex1, text))
)

In [39]:
# Keep only rows where regex1 matched, and drop the helper column again.
# A single `.loc` plus `.copy()` gives an independent frame, so adding a column
# to `concs2` afterwards is safe and warning-free.
concs2 = concs1.loc[concs1['parentes'] != 'itj no', ['urn', 'concordance']].copy()

#### Forberede concs_l for lenient

In [40]:
# Tag each remaining concordance with its first regex_l1 match (or the "itj no"
# sentinel). `.assign` returns a new frame instead of writing into what may be
# a slice of `concs1`, which avoids SettingWithCopyWarning.
concs2 = concs2.assign(
    parentes=concs2['concordance'].apply(lambda text: findone(regex_l1, text))
)

In [41]:
# Rows of concs2 where regex_l1 matched, reduced to the two base columns.
concs_l = concs2.loc[concs2['parentes'] != 'itj no', ['urn', 'concordance']]

#### Definere match_and_explode

In [42]:
def match_and_explode(c, regex):
    """Find every match of ``regex`` per row and return one row per match.

    Parameters
    ----------
    c : pandas.DataFrame
        Frame whose first column is a document identifier and whose second
        column is the text to search. Columns are read by position; any
        further columns are ignored.
    regex : str or compiled pattern
        Regular expression handed to ``re.findall``.

    Returns
    -------
    pandas.DataFrame
        Columns ``0`` (identifier) and ``1`` (a single match). Rows without any
        match are left out. When nothing matches at all, an empty frame with
        the same two columns is returned.
    """
    match = []

    # Positional access keeps the contract of the original `c.values` loop.
    for doc_id, text in zip(c.iloc[:, 0], c.iloc[:, 1]):
        hits = re.findall(regex, text)
        if hits:
            match.append((doc_id, hits))

    if not match:
        # Without this guard the empty frame has no column 1 and
        # `explode(column=1)` raises KeyError.
        return pd.DataFrame(columns=[0, 1])

    match_df = pd.DataFrame(match, columns=[0, 1])
    return match_df.explode(column=1)

#### If-elif-else

In [43]:
# NOTE(review): this reassignment overrides the `strictness` value set in the
# configuration cell further up; keep only one of the two once the intended
# default is settled.
strictness = 'lenient'

# i_parentes: matches of the pattern anchored after "(" or ";" (citation inside
#             parentheses).
# u_parentes: matches of the pattern where the year is parenthesised after the
#             name.
if strictness == 'strict':
    i_parentes = match_and_explode(concs2, regex_s1)
    u_parentes = match_and_explode(concs2, regex_s2)

elif strictness == 'moderate':
    i_parentes = match_and_explode(concs2, regex_m1)
    u_parentes = match_and_explode(concs2, regex_m2)

elif strictness == 'lenient':
    # The in-parenthesis search runs on the rows pre-filtered by regex_l1.
    i_parentes = match_and_explode(concs_l, regex_l2)
    u_parentes = match_and_explode(concs2, regex_l3)

elif strictness == 'open':
    i_parentes = match_and_explode(concs2, regex_o1)
    u_parentes = match_and_explode(concs2, regex_o2)

else:
    # Fail loudly: merely printing would leave i_parentes / u_parentes undefined
    # (or stale from an earlier run) for the cells that follow.
    raise ValueError(f"Strictness argument is not valid: {strictness!r}")
    


#### Kjøre nou_stmeld_prop

In [44]:
# Display concs2 for inspection.
# NOTE(review): display.max_rows is set to None above, so this renders every row.
concs2

Unnamed: 0,urn,concordance,parentes
2,URN:NBN:no-nb_pliktmonografi_000001432,"( 2011 ) . Computer-mediated conversation . Introduction and overview . Language@Internet , 8 ( 2 ) .",itj no
7,URN:NBN:no-nb_digibok_2016072848085,"( 1851 ) Additions et éclaircissements ä 1 ’ Histoire de la Géorgie , St . - Pétersbourg .",itj no
12,URN:NBN:no-nb_digibok_2020112748551,"( 2015 ) . Spoken Grammar : Where Are We and Where Are We Going ? Applied Linguistics , 35",itj no
14,URN:NBN:no-nb_digibok_2016081708080,"Mind & Language as Recanati ’ s paper ( Carston 2002 a ) , is the standard position in pragmatics ,",Carston 2002 a )
15,URN:NBN:no-nb_digibok_2012121206240,( 1957 ) indeholdt en første sammenfattende fremstilling af principperne for sproglig og kulturel kontrastering . Den stigende interes-,itj no
16,URN:NBN:no-nb_digibok_2016062408083,"Dietrichson var påvirket av Herbert Spencer ( 1820 - 1903 ) , som han viste til i flere sammenhenger .",itj no
17,URN:NBN:no-nb_digibok_2010052003084,"as the Son of God elsewhere , but he has a point ( 1991 , 626 - 27 ) .",itj no
18,URN:NBN:no-nb_digibok_2010100803012,"et al ( 1993 ) , SlCStus Prolog User ' s Manual Version 2.1 # 8 , SICS Technical Report T",itj no
19,URN:NBN:no-nb_digibok_2011082608096,", og en med kort vokal og geminert konsonant kan fremstilles slik grafisk ( Kristoffersen 1991 : 97 ) :",Kristoffersen 1991 : 97 )
20,URN:NBN:no-nb_digibok_2011102108011,1 ( 1887 ) - 25 ( 1911 ) . Jg . 1926 : 1. Berl . 1888 - 1928.,itj no


In [45]:
# NOU / St.meld. / Prop. references, searched in concs1 rather than in the
# parenthesis-filtered concs2.
noustp = match_and_explode(c=concs1, regex=regex_nou)

#### Legge sammen og sortere

In [46]:
# Stack the three match tables on top of each other with a fresh index ...
match_frames = [i_parentes, u_parentes, noustp]
match_concat = pd.concat(match_frames, ignore_index=True)
# ... then order the rows by the identifier in column 0.
match_sorted = match_concat.sort_values(0, ignore_index=True)

In [47]:
# Display the combined matches, sorted by column 0.
match_sorted

Unnamed: 0,0,1
0,URN:NBN:no-nb_digibok_2007022801011,"Åberg , B . ( 1964 )"
1,URN:NBN:no-nb_digibok_2007022801011,Læstadius [ 1840 - 1845 ] 1997 : 16
2,URN:NBN:no-nb_digibok_2007022801011,Jonsell 2000 : 45
3,URN:NBN:no-nb_digibok_2007022801011,"Fellman 1906 , 2 : 7"
4,URN:NBN:no-nb_digibok_2007022801011,"om møte med Læstadius i Karesuando 1839 , s . 270 f ."
5,URN:NBN:no-nb_digibok_2007032301036,Vlamingen ( 1925 )
6,URN:NBN:no-nb_digibok_2007051504031,Grammar ( 1924 )
7,URN:NBN:no-nb_digibok_2007051504031,Rudbeck ( 1675 )
8,URN:NBN:no-nb_digibok_2007051504031,John Ries ( 1894 )
9,URN:NBN:no-nb_digibok_2007051504031,Grammaire Générale ( 1928 )
