# Bigram Counter
BNC and Lang-8 bigrams are fetched in this section.

In [1]:
! egrep "^([A-Za-z\-]+) ([A-Za-z\-]+)\t" BNC.2w.c.txt > bnc.txt
! head bnc.txt

A A	NP0 NP0	a a	3
A A	ZZ0 ZZ0	a a	1
A Adebayo	ZZ0 NP0-NN1	a adebayo	2
A Allan	ZZ0 NP0	a allan	1
A Allcock	ZZ0 NP0	a allcock	1
A Aubin	ZZ0 NN1-NP0	a aubin	1
A BABY	AT0 NN1	a baby	1
A BABYSITTER	AT0 NN1	a babysitter	1
A BACHELOR	AT0 NN1-NP0	a bachelor	1
A BAD	AT0 AJ0	a bad	1


In [2]:
from collections import Counter

def get_counter(text):
  counter_ptr = Counter()
  for line in text.split("\n")[:-1]:
    tokens_ptr = line.split("\t")
    bigram = tokens_ptr[2]
    frequency = int(tokens_ptr[3])
    counter_ptr[bigram] = frequency
  return counter_ptr

print(get_counter("British accident\tAJ0 NN1\tbritish accident\t1\nappalling accident\tAJ0 NN1\tappalling accident\t1\n"))

Counter({'british accident': 1, 'appalling accident': 1})


In [3]:
BNC_bigram_counter = get_counter(open("bnc.txt").read())
sum(BNC_bigram_counter.values())

3716195

In [4]:
BNC_bigram_counter.most_common(10)

[('would have', 4191),
 ('but the', 2268),
 ('cent of', 2026),
 ('at the', 2019),
 ('on the', 1764),
 ('will have', 1754),
 ('of you', 1466),
 ('he to', 1329),
 ('if the', 1312),
 ('they be', 1299)]

In [5]:
BNC_bigram_counter["big accident"]

1

In [6]:
import string

def tokenize(text):
  return list(filter(None, text.translate(str.maketrans("", "", string.punctuation)).lower().split(" ")))

print(tokenize("This is an  example. Don't 123 I'll do it p28 ibid, p300 ibid!"))

['this', 'is', 'an', 'example', 'dont', '123', 'ill', 'do', 'it', 'p28', 'ibid', 'p300', 'ibid']


In [7]:
def get_ngram(tokens_ptr, n=2):
  result_ptr = []
  for start_idx in range(len(tokens_ptr) - n + 1):
    ngram = ""
    for i in range(start_idx, start_idx + n):
      if i != start_idx:
        ngram += " "
      ngram += tokens_ptr[i]
    result_ptr.append(ngram)
  return result_ptr

print(get_ngram(tokenize("This is an example."), n=2))

['this is', 'is an', 'an example']


In [8]:
def calculate_frequency(tokens_ptr):
  return Counter(tokens_ptr)

print(calculate_frequency(tokenize("This is an example. This is great!")))

Counter({'this': 2, 'is': 2, 'an': 1, 'example': 1, 'great': 1})


In [9]:
lang_bigram = get_ngram(tokenize(open("clang8.txt").read()), n=2)
lang_bigram_counter = calculate_frequency(lang_bigram)
sum(lang_bigram_counter.values())

24159206

In [10]:
lang_bigram_counter.most_common(10)

[('in the', 69569),
 ('want to', 61005),
 ('of the', 59215),
 ('do nt', 54006),
 ('i have', 46139),
 ('\ni m', 43318),
 ('to the', 41491),
 ('i was', 39953),
 ('i m', 39225),
 ('and i', 38296)]

In [11]:
lang_bigram_counter["big accident"]

21

# Ranking
Get the table out bae.

In [12]:
! egrep "^(.*?) accident\t(AJ0|aj0)" bnc.txt > bnc_accident.txt
! head bnc_accident.txt

British accident	AJ0 NN1	british accident	1
appalling accident	AJ0 NN1	appalling accident	1
awful accident	AJ0 NN1	awful accident	1
bad accident	AJ0 NN1	bad accident	1
big accident	AJ0 NN1	big accident	1
catastrophic accident	AJ0 NN1	catastrophic accident	2
cerebrovascular accident	AJ0 NN1	cerebrovascular accident	1
collusive accident	AJ0 NN1	collusive accident	1
common accident	AJ0 NN1	common accident	1
complete accident	AJ0 NN1	complete accident	1


In [13]:
BNC_accident_bigram_counter = get_counter(open("bnc_accident.txt").read())

In [14]:
BNC_accident_bigram_counter.most_common(10)

[('serious accident', 20),
 ('fatal accident', 8),
 ('nuclear accident', 7),
 ('personal accident', 7),
 ('historical accident', 5),
 ('disabling accident', 3),
 ('industrial accident', 3),
 ('lucky accident', 3),
 ('mere accident', 3),
 ('catastrophic accident', 2)]

In [15]:
BNC_accident_bigram_sorted = BNC_accident_bigram_counter.most_common()
BNC_accident_bigram_sorted[0:5]

[('serious accident', 20),
 ('fatal accident', 8),
 ('nuclear accident', 7),
 ('personal accident', 7),
 ('historical accident', 5)]

In [16]:
lang_accident_bigram = []
for tuple_ptr in BNC_accident_bigram_sorted:
  lang_accident_bigram.append((tuple_ptr[0], lang_bigram_counter[tuple_ptr[0]]))
lang_accident_bigram[0:5]

[('serious accident', 13),
 ('fatal accident', 6),
 ('nuclear accident', 46),
 ('personal accident', 0),
 ('historical accident', 1)]

In [17]:
lang_accident_bigram_sorted = sorted(lang_accident_bigram, key=lambda x: x[1], reverse=True)
lang_accident_bigram_sorted[0:5]

[('nuclear accident', 46),
 ('big accident', 21),
 ('terrible accident', 19),
 ('bad accident', 17),
 ('serious accident', 13)]

In [18]:
def get_bigram_Rank(bigram_sorted):
  bigram_Rank = {}
  for idx in range(len(bigram_sorted)):
    bigram_Rank[bigram_sorted[idx][0]] = idx + 1
  return bigram_Rank

In [19]:
BNC_accident_bigram_Rank = get_bigram_Rank(BNC_accident_bigram_sorted)
print(BNC_accident_bigram_Rank["serious accident"]) # rank 1
print(BNC_accident_bigram_Rank["historical accident"]) # rank 5

1
5


In [20]:
lang_accident_bigram_Rank = get_bigram_Rank(lang_accident_bigram_sorted)
print(lang_accident_bigram_Rank["nuclear accident"]) # rank 1
print(lang_accident_bigram_Rank["serious accident"]) # rank 5

1
5


In [21]:
def get_accident_ranks():
  ranks_ptr = []
  for accident_bigram in BNC_accident_bigram_Rank:
    ranks_ptr.append((accident_bigram,
                      BNC_accident_bigram_Rank[accident_bigram] / lang_accident_bigram_Rank[accident_bigram],
                      BNC_accident_bigram_Rank[accident_bigram],
                      BNC_bigram_counter[accident_bigram] / sum(BNC_bigram_counter.values()) * 1e6,
                      lang_accident_bigram_Rank[accident_bigram],
                      lang_accident_bigram_Rank[accident_bigram] / sum(lang_bigram_counter.values()) * 1e6))
  return ranks_ptr

get_accident_ranks()[0:5]

[('serious accident', 0.2, 1, 3.229109344369711, 5, 0.20696044398147853),
 ('fatal accident',
  0.3333333333333333,
  2,
  0.2690924453641426,
  6,
  0.24835253277777425),
 ('nuclear accident', 3.0, 3, 0.8072773360924278, 1, 0.041392088796295703),
 ('personal accident',
  0.14814814814814814,
  4,
  0.2690924453641426,
  27,
  1.117586397499984),
 ('historical accident',
  0.2777777777777778,
  5,
  0.8072773360924278,
  18,
  0.7450575983333227)]

In [22]:
import pandas as pd

frame_ptr = pd.DataFrame(get_accident_ranks(), columns=["adj. accident", "Overuse rank/rank", "BNC rank", "BNC ratio (*1m)", "Lang-8 rank", "Lang-8 ratio (*1m)"])
frame_ptr.head(30)

Unnamed: 0,adj. accident,Overuse rank/rank,BNC rank,BNC ratio (*1m),Lang-8 rank,Lang-8 ratio (*1m)
0,serious accident,0.2,1,3.229109,5,0.20696
1,fatal accident,0.333333,2,0.269092,6,0.248353
2,nuclear accident,3.0,3,0.807277,1,0.041392
3,personal accident,0.148148,4,0.269092,27,1.117586
4,historical accident,0.277778,5,0.807277,18,0.745058
5,disabling accident,0.214286,6,0.807277,28,1.158978
6,industrial accident,0.368421,7,1.07637,19,0.78645
7,lucky accident,0.8,8,0.269092,10,0.413921
8,mere accident,0.45,9,0.807277,20,0.827842
9,catastrophic accident,0.47619,10,0.269092,21,0.869234


In [23]:
frame_ptr.round(2).head(30)

Unnamed: 0,adj. accident,Overuse rank/rank,BNC rank,BNC ratio (*1m),Lang-8 rank,Lang-8 ratio (*1m)
0,serious accident,0.2,1,3.23,5,0.21
1,fatal accident,0.33,2,0.27,6,0.25
2,nuclear accident,3.0,3,0.81,1,0.04
3,personal accident,0.15,4,0.27,27,1.12
4,historical accident,0.28,5,0.81,18,0.75
5,disabling accident,0.21,6,0.81,28,1.16
6,industrial accident,0.37,7,1.08,19,0.79
7,lucky accident,0.8,8,0.27,10,0.41
8,mere accident,0.45,9,0.81,20,0.83
9,catastrophic accident,0.48,10,0.27,21,0.87
