In [1]:
import pandas as pd
import re

In [2]:
# read frequncy lists from disk (created using make_frequency_lists.ipynb)
aftenposten_words = pd.read_csv("AP_words.csv").set_index("Unnamed: 0")
klassekampen_words = pd.read_csv("KK_words.csv").set_index("Unnamed: 0")
dagsavisen_words = pd.read_csv("DA_words.csv").set_index("Unnamed: 0")

In [3]:
# inspect one freqlist
aftenposten_words

Unnamed: 0_level_0,freq,relfreq
Unnamed: 0,Unnamed: 1_level_1,Unnamed: 2_level_1
.,4142764,4.898341e-02
",",2754528,3.256912e-02
:,1675850,1.981499e-02
i,1627454,1.924277e-02
og,1370069,1.619949e-02
...,...,...
Teisbæk,1,1.182385e-08
Parafili,1,1.182385e-08
KJBHHBR,1,1.182385e-08
bortfortolke,1,1.182385e-08


In [4]:
# join frequency lists of Aftenposten and Dagsavisen
merged = aftenposten_words.merge(dagsavisen_words, left_index=True, right_index=True, suffixes=["_AP", "_DA"])

In [5]:
merged

Unnamed: 0_level_0,freq_AP,relfreq_AP,freq_DA,relfreq_DA
Unnamed: 0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
.,4142764,4.898341e-02,2578023,4.500726e-02
",",2754528,3.256912e-02,1892745,3.304364e-02
:,1675850,1.981499e-02,1178465,2.057370e-02
i,1627454,1.924277e-02,1117636,1.951175e-02
og,1370069,1.619949e-02,941387,1.643478e-02
...,...,...,...,...
livsstilsveileder,1,1.182385e-08,1,1.745805e-08
ungdomskjærester,1,1.182385e-08,1,1.745805e-08
Ambi,1,1.182385e-08,1,1.745805e-08
rhard,1,1.182385e-08,1,1.745805e-08


In [6]:
# ignore words with casing and non-word characters
def is_valid_index(idx):
    return isinstance(idx, str) and idx.islower() and bool(re.match(r'^\w+$', idx))

In [7]:
filtered_merged = merged[merged.index.map(is_valid_index)]

In [8]:
filtered_merged

Unnamed: 0_level_0,freq_AP,relfreq_AP,freq_DA,relfreq_DA
Unnamed: 0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
i,1627454,1.924277e-02,1117636,1.951175e-02
og,1370069,1.619949e-02,941387,1.643478e-02
er,950171,1.123468e-02,661978,1.155685e-02
å,852758,1.008288e-02,555856,9.704163e-03
på,755178,8.929109e-03,523656,9.142014e-03
...,...,...,...,...
defintivt,1,1.182385e-08,1,1.745805e-08
livsstilsveileder,1,1.182385e-08,1,1.745805e-08
ungdomskjærester,1,1.182385e-08,1,1.745805e-08
rhard,1,1.182385e-08,1,1.745805e-08


In [9]:
# divide relative frequencies
filtered_merged["underrep_DA"] = filtered_merged["relfreq_AP"] / filtered_merged["relfreq_DA"]
filtered_merged["underrep_AP"] = filtered_merged["relfreq_DA"] / filtered_merged["relfreq_AP"]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_merged["underrep_DA"] = filtered_merged["relfreq_AP"] / filtered_merged["relfreq_DA"]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_merged["underrep_AP"] = filtered_merged["relfreq_DA"] / filtered_merged["relfreq_AP"]


In [32]:
# look at underrepresented words in Aftenposten relative to Dagsavisen (ten times more common
filtered_merged[filtered_merged["underrep_AP"] > 10].sort_values(by="freq_DA", ascending=False).to_excel("AP_underrep.xlsx")

In [31]:
# look at underrepresented words in Dagsavisen relative to Aftenposten (ten times more common
filtered_merged[filtered_merged["underrep_DA"] > 10].sort_values(by="freq_AP", ascending=False).to_excel("DA_underrep.xlsx")