<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#Import-file" data-toc-modified-id="Import-file-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Import file</a></span></li><li><span><a href="#Stopwords" data-toc-modified-id="Stopwords-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>Stopwords</a></span></li><li><span><a href="#Function-for-collocation-extraction" data-toc-modified-id="Function-for-collocation-extraction-3"><span class="toc-item-num">3&nbsp;&nbsp;</span>Function for collocation extraction</a></span></li><li><span><a href="#Run-this" data-toc-modified-id="Run-this-4"><span class="toc-item-num">4&nbsp;&nbsp;</span>Run this</a></span></li><li><span><a href="#Comparion-with-single-word-collocations" data-toc-modified-id="Comparion-with-single-word-collocations-5"><span class="toc-item-num">5&nbsp;&nbsp;</span>Comparion with single-word collocations</a></span></li></ul></div>

In [1]:
import nltk
from nltk.collocations import *
from nltk import FreqDist
from nltk import ngrams
from tqdm.notebook import tqdm 
import re
import os
import pandas as pd

In [2]:
# Relative path of the corpus file that the "Import file" cell reads and tokenises.
file_path = '../data/Segmentation/2-word_dic_QTS.txt'

# Import file

In [3]:
#Get corpus
# Read inside a context manager so the file handle is closed as soon as the
# text is in memory, and pin the encoding so decoding does not depend on the
# platform default.
# NOTE(review): assumes the corpus file is UTF-8 encoded - confirm.
with open(file_path, encoding='utf-8') as corpus_file:
    corpus = corpus_file.read()

#Get tokens
tokens = nltk.wordpunct_tokenize(corpus)

# Stopwords

In [4]:
#Stoplist with only punctuation
punctuation = {',', '.', '。', '，'}

#List by Slingerland et al.
# One stopword per line. The file is read inside a context manager so the
# handle is closed, and blank lines are skipped so the empty string does not
# end up in the stoplist.
# NOTE(review): assumes the stopword file is UTF-8 encoded - confirm.
with open('../data/Slingerland_stopwords.txt', encoding='utf-8') as stop_file:
    stopw = [line.strip() for line in stop_file if line.strip()]
stopw.extend(punctuation)
slingerland = set(stopw)

# Named stoplists that get_collocats() looks up through its `stwords` argument.
# The punctuation entry reuses the set defined above instead of repeating the literal.
stopwords = {'punctuation': punctuation, 'slingerland': slingerland}

# Function for collocation extraction

In [15]:
def get_collocats(word, tokens, ngram='bigram', stwords='punctuation', min_freq=3, window=5, nmax=20, results='long'):
    '''
    Print the top-scoring collocations under five association measures
    (raw_freq, pmi, student_t, likelihood_ratio, chi_sq).

    Args:
        word: search term; only n-grams containing it are kept. None keeps
            every n-gram.
        tokens: sequence of tokens to search. It is traversed more than once
            (finder + frequency count), so pass a list rather than a generator.
        ngram: 'bigram' or 'trigram'.
        stwords: None (no stopword filtering), the name of a stoplist in the
            notebook-level `stopwords` dict ('punctuation' or 'slingerland'),
            or any iterable of words to exclude.
        min_freq: n-grams occurring fewer than this many times are dropped;
            None disables the cut-off.
        window: passed to the finder as `window_size`.
        nmax: number of top-scored n-grams printed per measure.
        results: 'short' prints the raw (ngram, score) tuples; any other value
            also prints the corpus frequency of every word in the n-gram.

    Returns:
        None. Everything is printed.

    Raises:
        ValueError: if `ngram` is neither 'bigram' nor 'trigram'.
        KeyError: if `stwords` is a string that is not a key of `stopwords`.
    '''
    # Pick the finder together with the matching measures class, so the
    # measures used below always fit the n-gram size (previously the trigram
    # branch crashed because only `bigram_measures` was ever read).
    if ngram == 'bigram':
        assoc_measures = nltk.collocations.BigramAssocMeasures()
        finder = nltk.collocations.BigramCollocationFinder.from_words(tokens, window_size=window)
    elif ngram == 'trigram':
        assoc_measures = nltk.collocations.TrigramAssocMeasures()
        finder = nltk.collocations.TrigramCollocationFinder.from_words(tokens, window_size=window)
    else:
        raise ValueError(f"ngram must be 'bigram' or 'trigram', got {ngram!r}")

    #Filter stopwords
    if stwords is not None:
        # A string selects one of the named stoplists; anything else is taken
        # as the collection of stopwords itself, as the docstring promises.
        if isinstance(stwords, str):
            stwords_set = stopwords[stwords]
        else:
            stwords_set = set(stwords)
        finder.apply_word_filter(lambda w: w in stwords_set)

    #Filter the search term
    if word is not None:
        # The filter receives the n-gram's words as positional arguments and
        # removes the n-gram when it returns True, i.e. when `word` is absent.
        finder.apply_ngram_filter(lambda *w: word not in w)

    #Set minimum n-gram frequency
    if min_freq is not None:
        finder.apply_freq_filter(min_freq)

    print (f'Results for "{word}", stoplist="{stwords}", min_freq={min_freq}, window={window}\n')
    print ('=========================================\n')

    #Find the collocations
    print ('Printing measures')
    measures = {
        'raw_freq': assoc_measures.raw_freq,
        'pmi': assoc_measures.pmi,
        'student_t': assoc_measures.student_t,
        'likelihood_ratio': assoc_measures.likelihood_ratio,
        'chi_sq': assoc_measures.chi_sq,
    }

    # Count the tokens once instead of rebuilding the distribution for every
    # printed row; it is only needed for the long output.
    token_freq = FreqDist(tokens) if results != 'short' else None

    for measure, score_fn in measures.items():
        print (measure.upper())
        for entry in finder.score_ngrams(score_fn)[:nmax]:
            if results == 'short':
                print (entry)
                continue
            ngram_words, score = entry
            # f1, f2 (and f3 for trigrams): corpus frequency of each word.
            freqs = ', '.join(f'f{pos}: {token_freq[w]}' for pos, w in enumerate(ngram_words, start=1))
            # Scores that would round to 0.00 are shown in full precision;
            # abs() keeps large negative scores (possible with PMI) at two decimals too.
            shown = score if abs(score) < 0.01 else '%.2f' % score
            print (f"{ngram_words}, {freqs}, score: {shown}")
        print ('=========================================\n')


# Run this

In [26]:
# Number of occurrences of the search term in the token list

tokens.count('仙')

2012

In [17]:
# Bigram collocates of '仙': punctuation-only stoplist, no frequency cut-off, window of 5
get_collocats(
    word='仙',
    tokens=tokens,
    ngram='bigram',
    stwords='punctuation',
    min_freq=None,
    window=5,
    nmax=50,
    results='long',
)

Results for "仙", stoplist="punctuation", min_freq=None, window=5


got to measures
RAW_FREQ
('仙', '家'), f1: 2012, f2: 2763, score: 8.220715972135448e-06
('有', '仙'), f1: 8742, f2: 2012, score: 6.673287083262893e-06
('仙', '山'), f1: 2012, f2: 9227, score: 5.8995726388266155e-06
('仙', '掌'), f1: 2012, f2: 336, score: 5.6094297221630116e-06
('仙', '去'), f1: 2012, f2: 6191, score: 5.029143888835804e-06
('得', '仙'), f1: 4393, f2: 2012, score: 5.029143888835804e-06
('仙', '宮'), f1: 2012, f2: 1521, score: 4.54557236106313e-06
('群', '仙'), f1: 1225, f2: 2012, score: 4.448858055508596e-06
('仙', '翁'), f1: 2012, f2: 603, score: 4.352143749954061e-06
('仙', '舟'), f1: 2012, f2: 1141, score: 4.062000833290456e-06
('仙', '路'), f1: 2012, f2: 3479, score: 3.965286527735922e-06
('中', '仙'), f1: 7231, f2: 2012, score: 3.675143611072318e-06
('是', '仙'), f1: 4177, f2: 2012, score: 3.675143611072318e-06
('靈', '仙'), f1: 1192, f2: 2012, score: 3.675143611072318e-06
('仙', '來'), f1: 2012, f2: 7912, score: 3.57842930551778

('仙', '駕'), f1: 2012, f2: 416, score: 20.88
('仙', '桂'), f1: 2012, f2: 746, score: 20.85
('仙', '馭'), f1: 2012, f2: 172, score: 19.54
('仙', '蹕'), f1: 2012, f2: 46, score: 18.62
('仙', '槎'), f1: 2012, f2: 142, score: 17.50
('真', '仙'), f1: 1291, f2: 2012, score: 16.21
('得', '仙'), f1: 4393, f2: 2012, score: 15.63
('仙', '都'), f1: 2012, f2: 668, score: 14.86
('愛神', '仙'), f1: 6, f2: 2012, score: 14.74
('仙', '術'), f1: 2012, f2: 273, score: 14.51
('尋', '仙'), f1: 1785, f2: 2012, score: 13.11
('得神', '仙'), f1: 21, f2: 2012, score: 13.04
('仙', '蹤'), f1: 2012, f2: 286, score: 12.77
('仙', '台'), f1: 2012, f2: 2286, score: 12.41
('仙', '路'), f1: 2012, f2: 3479, score: 12.25
('仙', '媛'), f1: 2012, f2: 14, score: 11.96
('有', '仙'), f1: 8742, f2: 2012, score: 11.27
('九', '仙'), f1: 1164, f2: 2012, score: 10.68
('洞', '仙'), f1: 678, f2: 2012, score: 10.31
('訪', '仙'), f1: 692, f2: 2012, score: 10.16
('望', '仙'), f1: 2945, f2: 2012, score: 9.90
('仙', '輿'), f1: 2012, f2: 176, score: 9.83
('仙', '觀'), f1: 2012, f2: 724

Re-running the collocation search with a minimum-frequency cut-off (`min_freq=3`) and a smaller window (`window=2`) to try to improve the MI (PMI) scores.

In [11]:
# Bigram collocates of '仙': punctuation-only stoplist, min_freq of 3, window of 2
get_collocats(
    word='仙',
    tokens=tokens,
    ngram='bigram',
    stwords='punctuation',
    min_freq=3,
    window=2,
    nmax=50,
    results='long',
)

Results for "仙", stoplist="punctuation", min_freq=3, window=2


got to measures
RAW_FREQ
('仙', '家'), f1: 2012, f2: 2763, score: 3.133543499966924e-05
('仙', '掌'), f1: 2012, f2: 336, score: 2.166400444421577e-05
('仙', '山'), f1: 2012, f2: 9227, score: 1.856914666647066e-05
('仙', '翁'), f1: 2012, f2: 603, score: 1.7021717777598106e-05
('群', '仙'), f1: 1225, f2: 2012, score: 1.7021717777598106e-05
('仙', '宮'), f1: 2012, f2: 1521, score: 1.6634860555379967e-05
('仙', '舟'), f1: 2012, f2: 1141, score: 1.5087431666507411e-05
('有', '仙'), f1: 8742, f2: 2012, score: 1.4700574444289272e-05
('仙', '侶'), f1: 2012, f2: 477, score: 1.2379431110980439e-05
('得', '仙'), f1: 4393, f2: 2012, score: 1.2379431110980439e-05
('靈', '仙'), f1: 1192, f2: 2012, score: 1.1605716666544162e-05
('仙', '路'), f1: 2012, f2: 3479, score: 1.1218859444326023e-05
('學', '仙'), f1: 1069, f2: 2012, score: 1.1218859444326023e-05
('求', '仙'), f1: 1148, f2: 2012, score: 1.1218859444326023e-05
('仙', '桂'), f1: 2012, f2: 746, score: 1.044514499

('有神', '仙'), f1: 79, f2: 2012, score: 138.17
('仙', '掖'), f1: 2012, f2: 121, score: 134.83
('仙', '馭'), f1: 2012, f2: 172, score: 122.97
('仙', '駕'), f1: 2012, f2: 416, score: 118.41
('真', '仙'), f1: 1291, f2: 2012, score: 113.45
('仙', '都'), f1: 2012, f2: 668, score: 107.79
('仙', '蹕'), f1: 2012, f2: 46, score: 106.99
('仙', '山'), f1: 2012, f2: 9227, score: 101.75
('仙', '台'), f1: 2012, f2: 2286, score: 91.57
('仙', '槎'), f1: 2012, f2: 142, score: 89.81
('得', '仙'), f1: 4393, f2: 2012, score: 86.56
('愛神', '仙'), f1: 6, f2: 2012, score: 85.92
('仙', '路'), f1: 2012, f2: 3479, score: 85.49
('望', '仙'), f1: 2945, f2: 2012, score: 84.28
('仙', '蹤'), f1: 2012, f2: 286, score: 80.86
('仙', '觀'), f1: 2012, f2: 724, score: 76.65
('九', '仙'), f1: 1164, f2: 2012, score: 73.82
('得神', '仙'), f1: 21, f2: 2012, score: 73.53
('有', '仙'), f1: 8742, f2: 2012, score: 68.93
('仙', '輿'), f1: 2012, f2: 176, score: 66.69
('尋', '仙'), f1: 1785, f2: 2012, score: 64.50
('仙', '術'), f1: 2012, f2: 273, score: 57.86
('仙', '禽'), f1: 2

# Comparison with single-word collocations

In [12]:
# Point file_path at a different corpus file (single-character segmentation, judging by
# the file name); the next cell reloads `corpus` and `tokens` from it.
file_path = '../data/Segmentation/single_char.txt'

In [13]:
#Get corpus
# Read inside a context manager so the file handle is closed as soon as the
# text is in memory, and pin the encoding so decoding does not depend on the
# platform default.
# NOTE(review): assumes the corpus file is UTF-8 encoded - confirm.
with open(file_path, encoding='utf-8') as corpus_file:
    corpus = corpus_file.read()

#Get tokens
tokens = nltk.wordpunct_tokenize(corpus)

In [16]:
# Bigram collocates of '仙': punctuation-only stoplist, no frequency cut-off, window of 5
get_collocats(
    word='仙',
    tokens=tokens,
    ngram='bigram',
    stwords='punctuation',
    min_freq=None,
    window=5,
    nmax=50,
    results='long',
)

Results for "仙", stoplist="punctuation", min_freq=None, window=5


Printing measures
RAW_FREQ
('神', '仙'), f1: 2919, f2: 3493, score: 3.171182386795327e-05
('仙', '人'), f1: 3493, f2: 20976, score: 2.133192275653558e-05
('有', '仙'), f1: 12674, f2: 3493, score: 1.185106819807532e-05
('是', '仙'), f1: 6457, f2: 3493, score: 1.021643810178907e-05
('上', '仙'), f1: 11102, f2: 3493, score: 9.56258606327457e-06
('仙', '不'), f1: 3493, f2: 26441, score: 9.317391548831632e-06
('仙', '家'), f1: 3493, f2: 6013, score: 9.317391548831632e-06
('得', '仙'), f1: 7322, f2: 3493, score: 7.846224462174006e-06
('仙', '山'), f1: 3493, f2: 16052, score: 7.764492957359693e-06
('仙', '來'), f1: 3493, f2: 12340, score: 7.110640918845193e-06
('仙', '雲'), f1: 3493, f2: 13349, score: 7.0289094140308805e-06
('仙', '子'), f1: 3493, f2: 6215, score: 6.783714899587942e-06
('人', '仙'), f1: 20976, f2: 3493, score: 6.538520385145005e-06
('中', '仙'), f1: 11406, f2: 3493, score: 6.456788880330692e-06
('仙', '去'), f1: 3493, f2: 7775, score: 6.45

('仙', '骨'), f1: 3493, f2: 1100, score: 31.69
('仙', '壇'), f1: 3493, f2: 540, score: 30.29
('群', '仙'), f1: 1570, f2: 3493, score: 27.24
('仙', '馭'), f1: 3493, f2: 198, score: 26.33
('仙', '駕'), f1: 3493, f2: 607, score: 24.62
('洞', '仙'), f1: 1586, f2: 3493, score: 23.22
('有', '仙'), f1: 12674, f2: 3493, score: 23.19
('求', '仙'), f1: 1310, f2: 3493, score: 21.06
('靈', '仙'), f1: 1899, f2: 3493, score: 19.78
('仙', '洞'), f1: 3493, f2: 1586, score: 19.60
('仙', '術'), f1: 3493, f2: 324, score: 19.52
('仙', '境'), f1: 3493, f2: 624, score: 19.49
('得', '仙'), f1: 7322, f2: 3493, score: 19.44
('仙', '掖'), f1: 3493, f2: 186, score: 19.01
('仙', '舟'), f1: 3493, f2: 2260, score: 18.89
('真', '仙'), f1: 1946, f2: 3493, score: 18.50
('尋', '仙'), f1: 2313, f2: 3493, score: 18.44
('仙', '槎'), f1: 3493, f2: 189, score: 17.35
('仙', '子'), f1: 3493, f2: 6215, score: 17.30
('仙', '宮'), f1: 3493, f2: 3096, score: 16.31
('上', '仙'), f1: 11102, f2: 3493, score: 15.87
('仙', '藥'), f1: 3493, f2: 1353, score: 15.12
('仙', '窟'), f1: