In [1]:
import os
import numpy as np
import pandas as pd
from collections import Counter

from typing import List

from string import punctuation
from nltk import word_tokenize
from nltk.corpus import stopwords
from pymorphy2 import MorphAnalyzer

from sklearn.feature_extraction.text import TfidfVectorizer

import warnings
warnings.filterwarnings('ignore')

pd.set_option('display.max_colwidth', 1000)

punctuation += '«»—…“”*№–'
stopwords = set(stopwords.words('russian'))

# Data

In [2]:
PATH_TO_DATA = './data'
files = [os.path.join(PATH_TO_DATA, file) for file in os.listdir(PATH_TO_DATA) if file.endswith('.zip')]

In [3]:
data = pd.concat([pd.read_json(file, lines=True) for file in files], axis=0, ignore_index=True)

In [4]:
data.shape

(17266, 6)

## Normalization

In [5]:
class Lemmatizer:
    def __init__(self):
        self.morph = MorphAnalyzer()
        self.cache = {}
    
    def lemmatize(self, token):
        if token in self.cache:
            return self.cache[token]
        norm = self.morph.parse(token)[0]
        norm = (norm.normal_form, norm.tag.POS)
        self.cache[token] = norm
        return norm

lemmatizer = Lemmatizer()

In [6]:
def normalize(text: str) -> List[str]:
    output = []
    for token in word_tokenize(text):
        token = token.strip(punctuation)
        if not token:
            continue
            
        lemma, pos_tag = lemmatizer.lemmatize(token)
        if lemma in stopwords or pos_tag != 'NOUN':
            continue
            
        output.append(lemma)
    
    return output

In [7]:
%%time

data['title_norm'] = data['title'].apply(normalize)
data['content_norm'] = data['content'].apply(normalize)

CPU times: user 5min 3s, sys: 232 ms, total: 5min 4s
Wall time: 5min 4s


In [8]:
data[['title_norm']].head(10)

Unnamed: 0,title_norm
0,"[будущее, россия, конвертоплан]"
1,"[россия, чемпионат, мир, футболист, зенит, краневиттер, месси, жизнь, петербург]"
2,"[россия, запад, автоматизация, управление, оборона]"
3,"[открытие, богатство, месторождение, газа, средиземноморье, конфликт, ближний, восток]"
4,"[дорога, украина, принятие, закон, реинтеграция, донбасс]"
5,"[переворот, сознание, черногория, россия, попытка, смена, власть, страна]"
6,"[топливо, украина, контроль, система, европеец]"
7,"[фронт, кампания, марин, пена, власть]"
8,"[оружие, цска, плей-офф, лига, чемпион]"
9,"[новичок, багаж, великобритания, версия, отравление]"


# Experiments

## Evaluation metrics
We will use the same exact metric computation function that was used in class — for comparability.

In [9]:
def evaluate(true_kws, predicted_kws):
    assert len(true_kws) == len(predicted_kws)
    
    precisions = []
    recalls = []
    f1s = []
    jaccards = []
    
    for i in range(len(true_kws)):
        true_kw = set(true_kws[i])
        predicted_kw = set(predicted_kws[i])
        
        tp = len(true_kw & predicted_kw)
        union = len(true_kw | predicted_kw)
        fp = len(predicted_kw - true_kw)
        fn = len(true_kw - predicted_kw)
        
        if (tp+fp) == 0:
            prec = 0
        else:
            prec = tp / (tp + fp)
        
        if (tp+fn) == 0:
            rec = 0
        else:
            rec = tp / (tp + fn)
        if (prec+rec) == 0:
            f1 = 0
        else:
            f1 = (2*(prec*rec))/(prec+rec)
            
        jac = tp / union
        
        precisions.append(prec)
        recalls.append(rec)
        f1s.append(f1)
        jaccards.append(jac)
    
    print('Precision - ', round(np.mean(precisions), 2))
    print('Recall - ', round(np.mean(recalls), 2))
    print('F1 - ', round(np.mean(f1s), 2))
    print('Jaccard - ', round(np.mean(jaccards), 2))

## TF-IDF baseline

In [17]:
dummy = lambda x: x
tfidf = TfidfVectorizer(ngram_range=(1, 2), min_df=5, preprocessor=dummy, tokenizer=dummy)
tfidf_matrix = tfidf.fit_transform(data['content_norm'])

In [26]:
id2word = {i: word for i, word in enumerate(tfidf.get_feature_names())}
keywords = [[id2word[w] for w in row.toarray()[0, :].argsort()[-10:]] for row in tfidf_matrix]

In [27]:
evaluate(data['keywords'], keywords)

Precision -  0.09
Recall -  0.12
F1 -  0.09
Jaccard -  0.05


## TF-IDF 1-grams

In [29]:
tfidf = TfidfVectorizer(min_df=5, preprocessor=dummy, tokenizer=dummy)
tfidf_matrix = tfidf.fit_transform(data['content_norm'])

id2word = {i: word for i, word in enumerate(tfidf.get_feature_names())}
keywords = [[id2word[w] for w in row.toarray()[0, :].argsort()[-10:]] for row in tfidf_matrix]

evaluate(data['keywords'], keywords)

Precision -  0.1
Recall -  0.13
F1 -  0.1
Jaccard -  0.06


We were able to improve the metrics by only using 1-grams. Let's stick to this from now on.

## TF-IDF 1-grams on Title + Content
Let's try applying TF-IDF to a concatenation of the title and the content.

In [31]:
data['all_norm'] = data.apply(lambda x: x.title_norm + x.content_norm, axis=1)

tfidf = TfidfVectorizer(min_df=5, preprocessor=dummy, tokenizer=dummy)
tfidf_matrix = tfidf.fit_transform(data['all_norm'])

id2word = {i: word for i, word in enumerate(tfidf.get_feature_names())}
keywords = [[id2word[w] for w in row.toarray()[0, :].argsort()[-10:]] for row in tfidf_matrix]

evaluate(data['keywords'], keywords)

Precision -  0.1
Recall -  0.13
F1 -  0.11
Jaccard -  0.06


We did get one extra point in F1. Great!

## VoteRank

In [32]:
import networkx as nx
from itertools import combinations

In [44]:
def build_matrix(text: List[str], window_size: int = 5):
    vocab = set(text)
    word2id = {w: i for i, w in enumerate(vocab)}
    id2word = {i: w for i, w in enumerate(vocab)}
    ids = [word2id[word] for word in text]

    m = np.zeros((len(vocab), len(vocab)))

    for i in range(0, len(ids), window_size):
        window = ids[i: i+window_size]
        for j, k in combinations(window, 2):
            m[j][k] += 1
            m[k][j] += 1
    
    return m, id2word


def voterank(text: List[str], window_size: int = 5, topn: int = 5):
    matrix, id2word = build_matrix(text, window_size)
    G = nx.from_numpy_array(matrix)
    node2measure = nx.voterank(G)
    
    return [id2word[index] for index in node2measure[:topn]]

In [47]:
%%time

keywords_nx = data['all_norm'].apply(lambda x: voterank(x, 10, 10))

CPU times: user 1h 46min 54s, sys: 76 ms, total: 1h 46min 54s
Wall time: 1h 46min 55s


In [48]:
evaluate(data['keywords'], keywords_nx)

Precision -  0.1
Recall -  0.13
F1 -  0.1
Jaccard -  0.06


Using graph algorithms, we were able to beat the baseline, but not the previous best result. It also took a gigantic amount of time to compute everything, so playing with hyperparameters, although potentially useful, is not feasible in reasonable amounts of time.