#### Lab 7 
#### ROUGE Metrics

In [0]:
# Mount Google Drive so that files stored there are reachable under /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [0]:
!pip install fuzzywuzzy



In [0]:
import pandas as pd
from nltk.corpus import stopwords
from nltk.corpus import wordnet as wn
import numpy as np
from fuzzywuzzy import fuzz 
import ipywidgets as widgets
import pprint
from ipywidgets import interact, interact_manual
import re
# Location of the semicolon-separated dataset on the mounted Google Drive.
# NOTE(review): double-underscore names are conventionally reserved for Python itself;
# a plain UPPER_CASE constant would be more idiomatic.
__PATH__ = "/content/drive/My Drive/Lab 7 - Summarization Metrics/data.csv"



In [0]:
import nltk
# Fetch the NLTK resources used below: the stop-word lists and WordNet.
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [0]:
# Load the dataset; fields are separated by ';' and the first row holds the column names.
df = pd.read_csv(__PATH__,sep=";",header=0)

In [0]:
# Preview the first rows to check that the columns were parsed as expected.
df.head()

Unnamed: 0,id,updatedDate,publishedDate,title,summary,authors,category,metaData,downloadLink,filePath
0,http://arxiv.org/abs/1407.6950v1,2014-07-24T16:56:39Z,2014-07-24T16:56:39Z,"How,whenAndHowMuchACardDeckIsWellShuffled.pdf",The Thesis Consider The Mixing Of Few 3 4 ...,Benjamin Isac Fargion,cs.DM,"Italian Thesis In Engeenering Computer, 26 Feb...",http://arxiv.org/pdf/1407.6950v1.pdf,"./files/How,whenAndHowMuchACardDeckIsWellShuff..."
1,http://arxiv.org/abs/0907.0618v1,2009-07-03T12:35:10Z,2009-07-03T12:35:10Z,QuantumIsometryGroups.pdf,This Thesis Contains The Formulation And Com...,Jyotishman Bhowmick,math.OA,Thesis,http://arxiv.org/pdf/0907.0618v1.pdf,./files/QuantumIsometryGroups.pdf
2,http://arxiv.org/abs/1806.09601v2,2018-07-14T17:06:27Z,2018-06-25T17:55:59Z,ComputationAndBoundingOfFolkmanNumbers.pdf,Phd Thesis Under The Supervision Of Professo...,Aleksandar Bikov,math.CO,PhD Thesis,http://arxiv.org/pdf/1806.09601v2.pdf,./files/ComputationAndBoundingOfFolkmanNumbers...
3,http://arxiv.org/abs/1905.03014v1,2019-05-08T11:47:34Z,2019-05-08T11:47:34Z,OnChurch'sThesisInCubicalAssemblies.pdf,"We Show That Church's Thesis, The Axiom Stat...","Andrew Swan, Taichi Uemura,",math.LO,0,http://arxiv.org/pdf/1905.03014v1.pdf,./files/OnChurch'sThesisInCubicalAssemblies.pdf
4,http://arxiv.org/abs/1901.04911v1,2019-01-15T16:24:07Z,2019-01-15T16:24:07Z,UnconstrainedChurchTuringThesisCannotPossiblyB...,The Church Turing Thesis Asserts That If A P...,Yuri Gurevich,cs.LO,0,http://arxiv.org/pdf/1901.04911v1.pdf,./files/UnconstrainedChurchTuringThesisCannotP...


#### Preprocessing the titles into lists of tokens

In [0]:
# Inspect a single summary value (row label 5) with one explicit label-based lookup.
df.loc[5, 'summary']

'  This Thesis Settles A Number Of Questions Related To Computational Complexityand Algebraic, Semidefinite Programming Based Relaxations In Optimization Andcontrol.'

In [0]:
# Build the stop-word collection once. Evaluating stopwords.words('english') for every
# token rebuilds the list each time and makes every membership test a linear scan.
english_stopwords = set(stopwords.words('english'))


def title_to_tokens(raw_title):
    """Turn a CamelCase file name into a tuple of lower-case content words.

    The '.pdf' suffix is removed, then every run made of one capital letter
    followed by at least one lower-case letter is extracted (fragments that do
    not match this pattern, e.g. a lone capital or a word starting in lower
    case, are dropped). The words are lower-cased and English stop words are
    filtered out.

    Parameters
    ----------
    raw_title : str
        Title as stored in the ``title`` column, e.g. 'QuantumIsometryGroups.pdf'.

    Returns
    -------
    tuple of str
        The remaining tokens in their original order.
    """
    words = re.findall('([A-Z]{1}[a-z]+)', raw_title.replace('.pdf', ''))
    lowered = (word.lower() for word in words)
    return tuple(word for word in lowered if word not in english_stopwords)


# One token tuple per row of df, in row order.
titles = list(df['title'].apply(title_to_tokens))

In [0]:
# Map every tokenised title to a {word: [WordNet synsets of that word]} dictionary.
# Titles are the dictionary keys, so identical token tuples collapse into one entry.
res = {
    title: {word: list(wn.synsets(word)) for word in title}
    for title in titles
}

#### Top ten closest articles according to a fuzzy title-similarity metric

In [0]:
def get_hypernyms_1st_level(token):
    """Collect the direct hypernyms of every WordNet synset of ``token``.

    Parameters
    ----------
    token : str
        Word to look up in WordNet.

    Returns
    -------
    set
        Union of ``synset.hypernyms()`` over all synsets of ``token``;
        empty when WordNet returns no synsets for the word.
    """
    hypernyms = set()
    # Query WordNet once and walk the result instead of repeating the
    # lookup on every iteration.
    for synset in wn.synsets(token):
        hypernyms.update(synset.hypernyms())
    return hypernyms
def get_hypernyms_2nd_level(token):
    """Collect the hypernyms of the first-level hypernyms of ``token``.

    Returns a set holding ``h.hypernyms()`` for every ``h`` produced by
    ``get_hypernyms_1st_level(token)``.
    """
    second_level = set()
    for parent in get_hypernyms_1st_level(token):
        second_level.update(parent.hypernyms())
    return second_level

def common_hypernyms(a,b):
    """Return how many elements the set ``a`` shares with ``b``."""
    shared = a.intersection(b)
    return len(shared)

In [0]:
def fscore_(a,b):
    """Compute the overlap of two sets and the F-score derived from it.

    Precision is ``|a ∩ b| / |a|`` and recall is ``|a ∩ b| / |b|``; the score
    is their harmonic mean.

    Returns
    -------
    tuple
        ``(intersection, fscore)``; the score is ``0`` when the sets do not overlap.
    """
    overlap = a.intersection(b)
    if not overlap:
        # Nothing in common: skip the divisions and report a zero score.
        return overlap, 0
    hits = len(overlap)
    prec = hits / len(a)
    recall = hits / len(b)
    return overlap, 2 * prec * recall/(prec + recall)


def distance(a, b, first_level_weight=0.66, second_level_weight=0.33):
    """Distance between two token collections from word overlap and shared hypernyms.

    The exact-match F-score of the two token sets is computed first. Words that
    are not shared are then compared pairwise: a pair with a common first-level
    hypernym adds ``first_level_weight`` to a penalty; otherwise a pair that
    shares a hypernym across the first/second levels adds ``second_level_weight``.
    The penalty is turned into a second F-score over the unmatched words and the
    result is ``1 - (2 * fscore + fscore_penalty) / 3``.

    Parameters
    ----------
    a, b : iterable of str
        Tokens of the two titles.
    first_level_weight : float, optional
        Contribution of a word pair sharing a first-level hypernym (default 0.66).
    second_level_weight : float, optional
        Contribution of a word pair related only through second-level
        hypernyms (default 0.33).

    Returns
    -------
    float or int
        ``1 - fscore`` when either side has no unmatched words left, otherwise
        the combined value described above.

    Notes
    -----
    NOTE(review): the penalty is accumulated per word *pair*, so
    ``fscore_penalty`` is not bounded by 1 and the result can drop below 0 for
    long titles with many related words.
    """
    a = set(a)
    b = set(b)
    interseption, fscore = fscore_(a, b)
    a = a - interseption
    b = b - interseption
    if len(a) == 0 or len(b) == 0:
        return 1 - fscore

    # Look up each word's first-level hypernyms once instead of once per word pair.
    first_a = {word: get_hypernyms_1st_level(word) for word in a}
    first_b = {word: get_hypernyms_1st_level(word) for word in b}
    # Second-level hypernyms are only needed for pairs without a first-level
    # match, so they are computed on demand and remembered.
    second_a = {}
    second_b = {}

    penalty = 0
    for worda in a:
        hyp1a = first_a[worda]
        for wordb in b:
            hyp1b = first_b[wordb]
            if common_hypernyms(hyp1a, hyp1b) > 0:
                penalty = penalty + first_level_weight
                continue
            # Only reached when there is no common first-level hypernym.
            if worda not in second_a:
                second_a[worda] = get_hypernyms_2nd_level(worda)
            if wordb not in second_b:
                second_b[wordb] = get_hypernyms_2nd_level(wordb)
            hyp2a = second_a[worda]
            hyp2b = second_b[wordb]
            number_of_2level_hyp = (
                common_hypernyms(hyp2a, hyp2b)
                + common_hypernyms(hyp2a, hyp1b)
                + common_hypernyms(hyp1a, hyp2b)
            )
            if number_of_2level_hyp > 0:
                penalty = penalty + second_level_weight

    if penalty > 0:
        prec_penalty = penalty / len(a)
        recall_penalty = penalty / len(b)
        fscore_penalty = 2 * (prec_penalty * recall_penalty) / (prec_penalty + recall_penalty)
    else:
        fscore_penalty = 0

    # Exact matches count twice as much as hypernym-based matches.
    return 1 - (2 * fscore + fscore_penalty) / 3
                
                

In [0]:
# Pairwise distance matrix over all distinct tokenised titles.
buff = list(res.items())
n_titles = len(buff)
dist = np.zeros((n_titles, n_titles))
epoch = 0  # running count of evaluated pairs, used only for progress output
for lli, (left_title, _) in enumerate(buff):
    for rri, (right_title, _) in enumerate(buff):
        dist[lli, rri] = distance(left_title, right_title)
        epoch = lli * n_titles + rri + 1
        if epoch % 10000 == 0:
            print('####', epoch)

#### 10000
#### 20000
#### 30000
#### 40000
#### 50000
#### 60000
#### 70000
#### 80000
#### 90000
#### 100000
#### 110000
#### 120000
#### 130000
#### 140000
#### 150000
#### 160000
#### 170000
#### 180000
#### 190000
#### 200000
#### 210000
#### 220000
#### 230000
#### 240000
#### 250000
#### 260000
#### 270000
#### 280000
#### 290000
#### 300000
#### 310000
#### 320000
#### 330000
#### 340000
#### 350000
#### 360000
#### 370000
#### 380000
#### 390000
#### 400000
#### 410000
#### 420000
#### 430000
#### 440000
#### 450000
#### 460000
#### 470000
#### 480000
#### 490000
#### 500000
#### 510000
#### 520000
#### 530000
#### 540000
#### 550000
#### 560000
#### 570000
#### 580000
#### 590000
#### 600000
#### 610000
#### 620000
#### 630000
#### 640000
#### 650000
#### 660000
#### 670000
#### 680000
#### 690000
#### 700000
#### 710000
#### 720000
#### 730000
#### 740000
#### 750000
#### 760000
#### 770000
#### 780000
#### 790000
#### 800000
#### 810000
#### 820000
#### 830000
#### 840000
#

In [0]:
# Show the pairwise distance matrix.
print(dist)

[[0.         1.         0.90222222 ... 0.98166667 0.92666667 0.96615385]
 [1.         0.         0.90571429 ... 0.912      0.978      0.94      ]
 [0.90222222 0.90571429 0.         ... 1.         0.96       0.96333333]
 ...
 [0.98166667 0.912      1.         ... 0.         0.98428571 0.91111111]
 [0.92666667 0.978      0.96       ... 0.98428571 0.         0.97066667]
 [0.96615385 0.94       0.96333333 ... 0.91111111 0.97066667 0.        ]]


In [0]:
@interact(ind=(0,len(buff)-1,1))
def h(ind=0):
    """Print the title at position ``ind`` followed by its ten nearest titles."""
    print(' '.join(buff[ind][0]))
    # Position 0 of the ascending argsort is skipped; it is expected to be the
    # title itself, whose distance to itself is 0.
    nearest = dist[ind].argsort()[1:11]
    pprint.pprint([buff[i][0] for i in nearest], indent=4)


interactive(children=(IntSlider(value=0, description='ind', max=995), Output()), _dom_classes=('widget-interac…

In [0]:
# Iterate over buff rather than df: buff holds one entry per distinct tokenised
# title (it is built from dictionary keys), so it can be shorter than df and
# indexing it with range(len(df)) could raise IndexError.
for ind in range(len(buff)):
  print(' '.join(buff[ind][0]))
  # Skip position 0 of the argsort, which is expected to be the title itself.
  print([buff[i][0] for i in dist[ind][:].argsort()[1:11]])
  print('#########################')

much card deck well shuffled
[('efficient', 'information', 'aggregation', 'strategies', 'distributed', 'control', 'signal', 'processing'), ('paired', 'disjoint', 'path', 'covers', 'balanced', 'hypercubes'), ('explicit', 'gl', 'trace', 'formulas', 'uniform', 'mixed', 'weyl', 'laws'), ('proof', 'theory', 'work', 'complexity', 'analysis', 'term', 'rewrite', 'systems'), ('betti', 'numbers', 'locally', 'compact', 'groups'), ('distributed', 'weight', 'balancing', 'directed', 'topologies'), ('logical', 'concurrency', 'control', 'sequential', 'proofs'), ('moduli', 'flat', 'su', 'bundles', 'klein', 'bottle'), ('much', 'quantum', 'mechanics', 'really', 'needed', 'defy', 'extended', 'church', 'turing', 'thesis'), ('quantum', 'information', 'processing', 'adversarial', 'devices')]
#########################
quantum isometry groups
[('operator', 'algebras', 'quantum', 'computation'), ('foundations', 'quantum', 'decoherence'), ('cryptography', 'quantum', 'world'), ('quantum', 'complexity', 'classes')

In [0]:
# Sanity check: show the neighbours of two titles whose results looked reasonable on inspection.
for ind in (296, 77):
    print(' '.join(buff[ind][0]))
    print([buff[i][0] for i in dist[ind][:].argsort()[1:11]])

resource optimization fault tolerant quantum computing
[('quantum', 'computing', 'phase', 'estimation', 'applications'), ('quantum', 'algorithms', 'scientific', 'computing', 'approximate', 'optimization'), ('logic', 'synthesis', 'fault', 'tolerant', 'quantum', 'computers'), ('quantum', 'complexity', 'classes'), ('adiabatic', 'quantum', 'computing'), ('computing',), ('theory', 'measurement', 'based', 'quantum', 'computing'), ('completeness', 'quantum', 'computation', 'models'), ('computing', 'value', 'computation', 'planning'), ('proposal', 'quantum', 'fisher', 'information', 'optimization', 'relation', 'entanglement', 'measures')]
recognizing semantic features faces using deep learning
[('deep', 'reinforcement', 'learning', 'using', 'capsules', 'advanced', 'game', 'environments'), ('deep', 'learning', 'attributed', 'graphs', 'journey', 'graphs', 'embeddings', 'back'), ('machine', 'learning', 'approach', 'recovery', 'scene', 'geometry', 'images'), ('distributed', 'supervised', 'learning

In [0]:
@interact(ind=(0,len(buff)-1,1))
def hypernyms(ind=0):
    """Print the title at position ``ind`` and the WordNet synsets stored for each of its words."""
    title_tokens, synsets_by_word = buff[ind]
    print(' '.join(title_tokens))
    pprint.pprint(synsets_by_word, indent=4)

much card deck well shuffled
{   'card': [   Synset('card.n.01'),
                Synset('card.n.02'),
                Synset('card.n.03'),
                Synset('card.n.04'),
                Synset('wag.n.01'),
                Synset('poster.n.01'),
                Synset('calling_card.n.02'),
                Synset('card.n.08'),
                Synset('menu.n.01'),
                Synset('batting_order.n.01'),
                Synset('circuit_board.n.01'),
                Synset('tease.v.07'),
                Synset('card.v.02')],
    'deck': [   Synset('deck.n.01'),
                Synset('deck.n.02'),
                Synset('pack_of_cards.n.01'),
                Synset('deck.n.04'),
                Synset('deck.v.01'),
                Synset('deck.v.02'),
                Synset('deck.v.03')],
    'much': [   Synset('much.n.01'),
                Synset('much.a.01'),
                Synset('much.r.01'),
                Synset('much.r.02'),
                Synset('a_lot.r.01'),
      