In [6]:
import os
from math import sqrt
import nltk
from nltk.corpus import stopwords

In [7]:
# Download the NLTK "stopwords" corpus so that stopwords.words(...) can be
# used by the sentence-loading cell further down.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\goras\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


True

In [8]:
# Absolute path of the annotated corpus; the relative path is resolved
# against the current working directory of the kernel.
assets_dir = os.path.realpath("../assets/annotated_corpus")
# The training split is read from the "train" sub-directory.
train_dir = os.path.join(assets_dir, "train")

In [9]:
# One sub-directory of train_dir per topic.
# os.listdir() returns entries in arbitrary order and also lists plain files,
# so keep only directories (the loader calls os.listdir() on each entry) and
# sort them to make every later cell reproducible across machines.
topics = sorted(
    entry
    for entry in os.listdir(train_dir)
    if os.path.isdir(os.path.join(train_dir, entry))
)

In [10]:
# Read every annotated file of every topic and collect, per sentence, the
# stems of the tokens whose lemma is not an English stopword.
#
# Layout as parsed below: sentences are separated by a blank line, each token
# sits on its own line, and columns are tab-separated with the stem in
# column 1 and the lemma in column 2 (0-based).

# Build the stopword lookup once. stopwords.words() re-reads the corpus and
# returns a new list on every call, so calling it per token (and doing a
# linear `in` search on that list) dominated the runtime of this cell.
english_stopwords = frozenset(stopwords.words("english"))

sentences = []
for t in topics:
    workdir = os.path.join(train_dir, t)
    # sorted(): os.listdir() order is arbitrary; fix it for reproducibility.
    for filename in sorted(os.listdir(workdir)):
        # NOTE(review): open() uses the platform default encoding here, as
        # before - pass encoding=... explicitly once the corpus encoding is
        # confirmed.
        with open(os.path.join(workdir, filename)) as f:
            sentences_raw = f.read().split("\n\n")
        for s in sentences_raw:
            words = s.split("\n")
            # str.split() always yields at least one element; an empty first
            # line marks an empty chunk (e.g. trailing blank lines).
            if words[0] == "":
                continue
            stems = []
            for w in words:
                columns = w.split("\t")
                if columns[2] not in english_stopwords:
                    stems.append(columns[1])
            # Appended even when every token was a stopword (empty list).
            sentences.append(stems)

In [11]:
# Number of sentences collected from the training split (sentences left
# empty after stopword removal are included).
len(sentences)

191598

In [12]:
# Size of the n-grams that are extracted and scored below (3 = trigrams).
ngram_length = 3

In [13]:
# Tally stem frequencies over every sentence and collect all contiguous
# n-grams of length ngram_length together with their frequencies.
ngrams = []        # every n-gram occurrence, in corpus order
word_count = {}    # stem -> number of occurrences
ngrams_count = {}  # n-gram tuple -> number of occurrences
for sentence in sentences:
    for stem in sentence:
        word_count[stem] = word_count.get(stem, 0) + 1
    # Sentences shorter than one n-gram only contribute to the word counts.
    if len(sentence) < ngram_length:
        continue
    last_start = len(sentence) - ngram_length
    for start in range(last_start + 1):
        window = tuple(sentence[start:start + ngram_length])
        ngrams_count[window] = ngrams_count.get(window, 0) + 1
        ngrams.append(window)

In [14]:
# Total number of n-gram occurrences (repeats included) in the corpus.
len(ngrams)

1130235

In [15]:
# The 30 most frequent stems, highest count first (ties keep dict order).
sorted(word_count.items(), key=lambda item: item[1], reverse=True)[:30]

[('was', 12300),
 ('use', 8687),
 ('one', 8670),
 ('write', 8416),
 ('would', 8281),
 ('has', 7031),
 ('articl', 6878),
 ('like', 5886),
 ('get', 5842),
 ('peopl', 5519),
 ('know', 5374),
 ('x', 5121),
 ('think', 4750),
 ('max', 4597),
 ('time', 4556),
 ('say', 4283),
 ('1', 4221),
 ('go', 4086),
 ('also', 3991),
 ('make', 3988),
 ('doe', 3730),
 ('work', 3650),
 ('year', 3385),
 ('good', 3341),
 ('new', 3304),
 ('system', 3251),
 ('want', 3222),
 ('could', 3203),
 ('right', 3200),
 ('way', 3109)]

In [16]:
# The 30 most frequent n-grams, highest count first (ties keep dict order).
sorted(ngrams_count.items(), key=lambda item: item[1], reverse=True)[:30]

[(('_', '_', '_'), 485),
 (('0', '1', '1'), 336),
 (('1', '0', '1'), 252),
 (('doe', 'anyon', 'know'), 218),
 (('2', '2', '2'), 157),
 (('1', '0', '0'), 140),
 (('0', '0', '0'), 133),
 (('1', '1', '0'), 117),
 (('0', '2', '2'), 100),
 (('7', '7', '7'), 97),
 (('1', '1', '2'), 84),
 (('pl', 'pl', 'pl'), 82),
 (('bank', 'n3jxp', 'skeptic'), 81),
 (('n3jxp', 'skeptic', 'chastiti'), 81),
 (('skeptic', 'chastiti', 'intellect'), 81),
 (('shame', 'surrend', 'soon'), 79),
 (('chastiti', 'intellect', 'shame'), 78),
 (('intellect', 'shame', 'surrend'), 78),
 (('scorer', 'g', 'pts'), 74),
 (('close', 'road', 'mountain'), 74),
 (('road', 'mountain', 'pass'), 74),
 (('proceed', 'work', 'extermin'), 74),
 (('would', 'great', 'appreci'), 72),
 (('p', 'p', 'p'), 72),
 (('soviet', 'armenia', 'today'), 72),
 (('armenia', 'today', 'longer'), 72),
 (('today', 'longer', 'exist'), 72),
 (('longer', 'exist', 'singl'), 72),
 (('exist', 'singl', 'turkish'), 72),
 (('singl', 'turkish', 'soul'), 72)]

In [17]:
# Corpus size N: total number of stem occurrences. The scoring cell uses it
# to normalise the expected n-gram frequency.
total_words = sum(word_count.values())

In [18]:
# Display the corpus size.
total_words

1482989

In [19]:
# Student's t-score for every distinct n-gram:
#     t = (observed - expected) / sqrt(observed)
# where `observed` is the n-gram frequency and `expected` is the frequency
# predicted if its words occurred independently:
#     prod(word counts) / total_words ** (ngram_length - 1)
#
# Iterate over ngrams_count directly: its keys are exactly the distinct
# n-grams, so there is no need to build set(ngrams) from the full occurrence
# list. It also makes the insertion order of ngram_score (and therefore
# tie-breaking in a later stable sort) follow corpus order instead of set
# iteration order, which for str-based keys can vary between interpreter runs.
independence_norm = total_words ** (ngram_length - 1)  # loop-invariant
ngram_score = {}
for ngram, observed in ngrams_count.items():
    count_mul_result = 1
    for word in ngram:
        count_mul_result *= word_count[word]
    expected = count_mul_result / independence_norm
    ngram_score[ngram] = (observed - expected) / sqrt(observed)

In [20]:
# The 30 n-grams with the highest t-score, best first (ties keep dict order).
sorted(ngram_score.items(), key=lambda item: item[1], reverse=True)[:30]

[(('_', '_', '_'), 22.02269575380346),
 (('0', '1', '1'), 18.329204061593007),
 (('1', '0', '1'), 15.873239175855426),
 (('doe', 'anyon', 'know'), 14.763420534899202),
 (('2', '2', '2'), 12.529041904853722),
 (('1', '0', '0'), 11.831157081567426),
 (('0', '0', '0'), 11.531956833135402),
 (('1', '1', '0'), 10.814791897725733),
 (('0', '2', '2'), 9.999022942943222),
 (('7', '7', '7'), 9.848803743779651),
 (('1', '1', '2'), 9.162552651699903),
 (('pl', 'pl', 'pl'), 9.055365684545587),
 (('n3jxp', 'skeptic', 'chastiti'), 8.999999956040595),
 (('skeptic', 'chastiti', 'intellect'), 8.999999951156218),
 (('bank', 'n3jxp', 'skeptic'), 8.999999805399222),
 (('shame', 'surrend', 'soon'), 8.888194088466022),
 (('chastiti', 'intellect', 'shame'), 8.831760817693489),
 (('intellect', 'shame', 'surrend'), 8.831760792783209),
 (('scorer', 'g', 'pts'), 8.602324680124548),
 (('road', 'mountain', 'pass'), 8.602321600730907),
 (('close', 'road', 'mountain'), 8.602321123945975),
 (('proceed', 'work', 'exte

In [21]:
from nltk.collocations import  *
from nltk import Text
import nltk

In [22]:
# Flatten the per-sentence stem lists into a single token stream; sentence
# boundaries are not kept in `text`.
text = [stem for sentence in sentences for stem in sentence]

In [23]:
# Build NLTK's trigram collocation finder over the flattened token stream
# (TrigramCollocationFinder comes from the star import of nltk.collocations).
# NOTE(review): `text` is one flat list, so trigrams here can presumably span
# two adjacent sentences, unlike the per-sentence counts computed by hand
# earlier - keep that in mind when comparing the two rankings.
finder = TrigramCollocationFinder.from_words(Text(text))

In [24]:
# Top 30 trigrams ranked by NLTK's Student's t measure, for comparison with
# the hand-computed ngram_score ranking.
finder.nbest(nltk.collocations.TrigramAssocMeasures().student_t, 30)

[('max', 'max', 'max'),
 ('_', '_', '_'),
 ('0', '1', '1'),
 ('x', 'x', 'x'),
 ('1', '0', '1'),
 ('doe', 'anyon', 'know'),
 ('2', '2', '2'),
 ('1', '0', '0'),
 ('0', '0', '0'),
 ('1', '1', '0'),
 ('0', '2', '2'),
 ('pl', 'pl', 'pl'),
 ('7', '7', '7'),
 ('0d', '0d', '0d'),
 ('1', '1', '2'),
 ('n3jxp', 'skeptic', 'chastiti'),
 ('skeptic', 'chastiti', 'intellect'),
 ('bank', 'n3jxp', 'skeptic'),
 ('shame', 'surrend', 'soon'),
 ('chastiti', 'intellect', 'shame'),
 ('intellect', 'shame', 'surrend'),
 ('max', 'max', 'pl'),
 ('p', 'p', 'p'),
 ('scorer', 'g', 'pts'),
 ('road', 'mountain', 'pass'),
 ('close', 'road', 'mountain'),
 ('proceed', 'work', 'extermin'),
 ('soviet', 'armenia', 'today'),
 ('singl', 'turkish', 'soul'),
 ('armenia', 'today', 'longer')]