In [1]:
import pandas as pd
import re

In [2]:
# read frequncy lists from disk (created using make_frequency_lists.ipynb)
aftenposten_words = pd.read_csv("AP_words.csv").set_index("Unnamed: 0")
klassekampen_words = pd.read_csv("KK_words.csv").set_index("Unnamed: 0")
dagsavisen_words = pd.read_csv("DA_words.csv").set_index("Unnamed: 0")

In [3]:
# inspect one freqlist
aftenposten_words

Unnamed: 0_level_0,freq,relfreq
Unnamed: 0,Unnamed: 1_level_1,Unnamed: 2_level_1
.,4142764,4.898341e-02
",",2754528,3.256912e-02
:,1675850,1.981499e-02
i,1627454,1.924277e-02
og,1370069,1.619949e-02
...,...,...
Teisbæk,1,1.182385e-08
Parafili,1,1.182385e-08
KJBHHBR,1,1.182385e-08
bortfortolke,1,1.182385e-08


In [4]:
# join frequency lists of Aftenposten and Dagsavisen
merged = aftenposten_words.merge(dagsavisen_words, left_index=True, right_index=True, suffixes=["_AP", "_DA"])

In [5]:
merged

Unnamed: 0_level_0,freq_AP,relfreq_AP,freq_DA,relfreq_DA
Unnamed: 0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
.,4142764,4.898341e-02,2578023,4.500726e-02
",",2754528,3.256912e-02,1892745,3.304364e-02
:,1675850,1.981499e-02,1178465,2.057370e-02
i,1627454,1.924277e-02,1117636,1.951175e-02
og,1370069,1.619949e-02,941387,1.643478e-02
...,...,...,...,...
livsstilsveileder,1,1.182385e-08,1,1.745805e-08
ungdomskjærester,1,1.182385e-08,1,1.745805e-08
Ambi,1,1.182385e-08,1,1.745805e-08
rhard,1,1.182385e-08,1,1.745805e-08


In [6]:
# ignore words with casing and non-word characters
def is_valid_index(idx):
    return isinstance(idx, str) and idx.islower() and bool(re.match(r'^\w+$', idx))

In [7]:
filtered_merged = merged[merged.index.map(is_valid_index)]

In [8]:
filtered_merged

Unnamed: 0_level_0,freq_AP,relfreq_AP,freq_DA,relfreq_DA
Unnamed: 0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
i,1627454,1.924277e-02,1117636,1.951175e-02
og,1370069,1.619949e-02,941387,1.643478e-02
er,950171,1.123468e-02,661978,1.155685e-02
å,852758,1.008288e-02,555856,9.704163e-03
på,755178,8.929109e-03,523656,9.142014e-03
...,...,...,...,...
defintivt,1,1.182385e-08,1,1.745805e-08
livsstilsveileder,1,1.182385e-08,1,1.745805e-08
ungdomskjærester,1,1.182385e-08,1,1.745805e-08
rhard,1,1.182385e-08,1,1.745805e-08


In [9]:
# divide relative frequencies
filtered_merged["underrep_DA"] = filtered_merged["relfreq_AP"] / filtered_merged["relfreq_DA"]
filtered_merged["underrep_AP"] = filtered_merged["relfreq_DA"] / filtered_merged["relfreq_AP"]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_merged["underrep_DA"] = filtered_merged["relfreq_AP"] / filtered_merged["relfreq_DA"]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_merged["underrep_AP"] = filtered_merged["relfreq_DA"] / filtered_merged["relfreq_AP"]


In [10]:
# look at underrepresented words in Aftenposten relative to Dagsavisen (ten times more common
filtered_merged[filtered_merged["underrep_AP"] > 10].sort_values(by="freq_DA", ascending=False).head(50)

Unnamed: 0_level_0,freq_AP,relfreq_AP,freq_DA,relfreq_DA,underrep_DA,underrep_AP
Unnamed: 0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
lør,2601,3.075383e-05,46379,0.00081,0.037982,26.328008
fre,2526,2.986704e-05,43969,0.000768,0.038909,25.701011
søn,2352,2.780969e-05,41751,0.000729,0.038153,26.20997
tors,928,1.097253e-05,38212,0.000667,0.016448,60.797925
ons,977,1.15519e-05,33139,0.000579,0.019967,50.082016
tirs,315,3.724512e-06,32981,0.000576,0.006469,154.593145
fram,1680,1.986406e-05,22814,0.000398,0.049874,20.050681
tida,261,3.086024e-06,5724,0.0001,0.030882,32.381435
uka,343,4.05558e-06,5661,9.9e-05,0.041036,24.368904
sju,614,7.259842e-06,4660,8.1e-05,0.089237,11.206101


In [11]:
# look at underrepresented words in Dagsavisen relative to Aftenposten (ten times more common
filtered_merged[filtered_merged["underrep_DA"] > 10].sort_values(by="freq_AP", ascending=False).head(50)

Unnamed: 0_level_0,freq_AP,relfreq_AP,freq_DA,relfreq_DA,underrep_DA,underrep_AP
Unnamed: 0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
skyet,36626,0.000433,15,2.618708e-07,1653.71728,0.000605
vind,15334,0.000181,752,1.312845e-05,13.810222,0.07241
stoff,8000,9.5e-05,264,4.608926e-06,20.52339,0.048725
bris,7500,8.9e-05,69,1.204606e-06,73.616507,0.013584
veil,6069,7.2e-05,154,2.68854e-06,26.690668,0.037466
ili,5863,6.9e-05,252,4.399429e-06,15.757321,0.063463
lit,5805,6.9e-05,306,5.342164e-06,12.848246,0.077832
plussgrader,4399,5.2e-05,57,9.95109e-07,52.268753,0.019132
kvm,4276,5.1e-05,123,2.14734e-06,23.544833,0.042472
hverken,3878,4.6e-05,180,3.142449e-06,14.591446,0.068533
