# Bigram Counter
BNC and Lang-8 bigrams are fetched in this section.

In [1]:
! egrep "^([A-Za-z\-]+) ([A-Za-z\-]+)\t" BNC.2w.c.txt > bnc.txt
! head bnc.txt

A -	ZZ0 -	a -	1
A -RNA	ZZ0 AJ0	a -rna	19
A A	AT0 AT0	a a	54
A A	NP0 NP0	a a	12
A A	ZZ0 ZZ0	a a	53
A AND	ZZ0 CJC	a and	2
A Aaby	NP0 NP0-NN1	a aaby	1
A Ability	ZZ0 NN1	a ability	1
A Ackroyd	ZZ0 NP0	a ackroyd	1
A Acky	UNC NP0	a acky	1


In [2]:
from collections import Counter

def get_counter(text):
  counter_ptr = Counter()
  for line in text.split("\n")[:-1]:
    tokens_ptr = line.split("\t")
    bigram = tokens_ptr[2]
    frequency = int(tokens_ptr[3])
    counter_ptr[bigram] = frequency
  return counter_ptr

print(get_counter("British accident\tAJ0 NN1\tbritish accident\t1\nappalling accident\tAJ0 NN1\tappalling accident\t1\n"))

Counter({'british accident': 1, 'appalling accident': 1})


In [3]:
BNC_bigram_counter = get_counter(open("bnc.txt").read())
sum(BNC_bigram_counter.values())

15394941

In [4]:
BNC_bigram_counter.most_common(10)

[('at the', 12086),
 ('on the', 12064),
 ('they be', 8449),
 ('there be', 7227),
 ('in this', 6618),
 ('possible to', 6499),
 ('and then', 5915),
 ('it would', 5507),
 ('some of', 4306),
 ('during the', 3929)]

In [5]:
BNC_bigram_counter["big accident"]

1

In [6]:
import string

def tokenize(text):
  return list(filter(None, text.translate(str.maketrans("", "", string.punctuation)).lower().split(" ")))

print(tokenize("This is an  example. Don't 123 I'll do it p28 ibid, p300 ibid!"))

['this', 'is', 'an', 'example', 'dont', '123', 'ill', 'do', 'it', 'p28', 'ibid', 'p300', 'ibid']


In [7]:
def get_ngram(tokens_ptr, n=2):
  result_ptr = []
  for start_idx in range(len(tokens_ptr) - n + 1):
    ngram = ""
    for i in range(start_idx, start_idx + n):
      if i != start_idx:
        ngram += " "
      ngram += tokens_ptr[i]
    result_ptr.append(ngram)
  return result_ptr

print(get_ngram(tokenize("This is an example."), n=2))

['this is', 'is an', 'an example']


In [8]:
def calculate_frequency(tokens_ptr):
  return Counter(tokens_ptr)

print(calculate_frequency(tokenize("This is an example. This is great!")))

Counter({'this': 2, 'is': 2, 'an': 1, 'example': 1, 'great': 1})


In [9]:
lang_bigram = get_ngram(tokenize(open("clang8.txt").read()), n=2)
lang_bigram_counter = calculate_frequency(lang_bigram)
sum(lang_bigram_counter.values())

24159206

In [10]:
lang_bigram_counter.most_common(10)

[('in the', 69569),
 ('want to', 61005),
 ('of the', 59215),
 ('do nt', 54006),
 ('i have', 46139),
 ('\ni m', 43318),
 ('to the', 41491),
 ('i was', 39953),
 ('i m', 39225),
 ('and i', 38296)]

In [11]:
lang_bigram_counter["big accident"]

21

# Ranking
Get the table out bae.

In [12]:
! egrep "^(.*?) accident\t(AJ0|aj0)" bnc.txt > bnc_accident.txt
! head bnc_accident.txt

British accident	AJ0 NN1	british accident	1
Chernobyl-type accident	AJ0 NN1	chernobyl-type accident	1
Chernobyl-type accident	AJ0-NN1 NN1	chernobyl-type accident	1
FATAL accident	AJ0 NN1	fatal accident	1
Ford-approved accident	AJ0-NN1 NN1	ford-approved accident	1
French accident	AJ0-NN1 NN1	french accident	1
Nasty accident	AJ0 NN1	nasty accident	2
Palazzese accident	AJ0 NN1	palazzese accident	1
Personal accident	AJ0 NN1	personal accident	1
TMI accident	AJ0-NN1 NN1	tmi accident	6


In [13]:
BNC_accident_bigram_counter = get_counter(open("bnc_accident.txt").read())

In [14]:
BNC_accident_bigram_counter.most_common(10)

[('serious accident', 61),
 ('tragic accident', 32),
 ('terrible accident', 26),
 ('major accident', 23),
 ('industrial accident', 18),
 ('bad accident', 16),
 ('unfortunate accident', 16),
 ('nasty accident', 15),
 ('little accident', 15),
 ('horrific accident', 12)]

In [15]:
BNC_accident_bigram_sorted = BNC_accident_bigram_counter.most_common()
BNC_accident_bigram_sorted[0:5]

[('serious accident', 61),
 ('tragic accident', 32),
 ('terrible accident', 26),
 ('major accident', 23),
 ('industrial accident', 18)]

In [16]:
lang_accident_bigram = []
for tuple_ptr in BNC_accident_bigram_sorted:
  lang_accident_bigram.append((tuple_ptr[0], lang_bigram_counter[tuple_ptr[0]]))
lang_accident_bigram[0:5]

[('serious accident', 13),
 ('tragic accident', 4),
 ('terrible accident', 19),
 ('major accident', 1),
 ('industrial accident', 1)]

In [17]:
lang_accident_bigram_sorted = sorted(lang_accident_bigram, key=lambda x: x[1], reverse=True)
lang_accident_bigram_sorted[0:5]

[('nuclear accident', 46),
 ('big accident', 21),
 ('terrible accident', 19),
 ('bad accident', 17),
 ('serious accident', 13)]

In [18]:
def get_bigram_Rank(bigram_sorted):
  bigram_Rank = {}
  for idx in range(len(bigram_sorted)):
    bigram_Rank[bigram_sorted[idx][0]] = idx + 1
  return bigram_Rank

In [19]:
BNC_accident_bigram_Rank = get_bigram_Rank(BNC_accident_bigram_sorted)
print(BNC_accident_bigram_Rank["serious accident"]) # rank 1
print(BNC_accident_bigram_Rank["historical accident"]) # rank 5

1
143


In [20]:
lang_accident_bigram_Rank = get_bigram_Rank(lang_accident_bigram_sorted)
print(lang_accident_bigram_Rank["nuclear accident"]) # rank 1
print(lang_accident_bigram_Rank["serious accident"]) # rank 5

1
5


In [21]:
def get_accident_ranks():
  ranks_ptr = []
  for accident_bigram in BNC_accident_bigram_Rank:
    ranks_ptr.append((accident_bigram,
                      BNC_accident_bigram_Rank[accident_bigram] / lang_accident_bigram_Rank[accident_bigram],
                      BNC_accident_bigram_Rank[accident_bigram],
                      BNC_bigram_counter[accident_bigram] / sum(BNC_bigram_counter.values()) * 1e6,
                      lang_accident_bigram_Rank[accident_bigram],
                      lang_accident_bigram_Rank[accident_bigram] / sum(lang_bigram_counter.values()) * 1e6))
  return ranks_ptr

get_accident_ranks()[0:5]

[('serious accident', 0.2, 1, 0.12991280707084232, 5, 0.20696044398147853),
 ('tragic accident', 0.2, 2, 0.06495640353542116, 10, 0.41392088796295706),
 ('terrible accident', 1.0, 3, 0.06495640353542116, 3, 0.12417626638888712),
 ('major accident',
  0.16666666666666666,
  4,
  1.1042588601021597,
  24,
  0.993410131111097),
 ('industrial accident', 0.2, 5, 1.2341716671730019, 25, 1.0348022199073927)]

In [22]:
import pandas as pd

frame_ptr = pd.DataFrame(get_accident_ranks(), columns=["adj. accident", "Overuse rank/rank", "BNC rank", "BNC ratio (*1m)", "Lang-8 rank", "Lang-8 ratio (*1m)"])
frame_ptr.head(30)

Unnamed: 0,adj. accident,Overuse rank/rank,BNC rank,BNC ratio (*1m),Lang-8 rank,Lang-8 ratio (*1m)
0,serious accident,0.2,1,0.129913,5,0.20696
1,tragic accident,0.2,2,0.064956,10,0.413921
2,terrible accident,1.0,3,0.064956,3,0.124176
3,major accident,0.166667,4,1.104259,24,0.99341
4,industrial accident,0.2,5,1.234172,25,1.034802
5,bad accident,1.5,6,0.064956,4,0.165568
6,unfortunate accident,0.269231,7,0.064956,26,1.076194
7,nasty accident,0.186047,8,0.259826,43,1.77986
8,little accident,0.818182,9,0.064956,11,0.455313
9,horrific accident,0.227273,10,0.259826,44,1.821252


In [23]:
frame_ptr.round(2).head(30)

Unnamed: 0,adj. accident,Overuse rank/rank,BNC rank,BNC ratio (*1m),Lang-8 rank,Lang-8 ratio (*1m)
0,serious accident,0.2,1,0.13,5,0.21
1,tragic accident,0.2,2,0.06,10,0.41
2,terrible accident,1.0,3,0.06,3,0.12
3,major accident,0.17,4,1.1,24,0.99
4,industrial accident,0.2,5,1.23,25,1.03
5,bad accident,1.5,6,0.06,4,0.17
6,unfortunate accident,0.27,7,0.06,26,1.08
7,nasty accident,0.19,8,0.26,43,1.78
8,little accident,0.82,9,0.06,11,0.46
9,horrific accident,0.23,10,0.26,44,1.82
