In [1]:
from dask.distributed import Client, LocalCluster
from dask import delayed

In [2]:
import dask.bag as db
import os
import json
from operator import itemgetter
from operator import add

from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from nltk import word_tokenize

from collections import Counter

import time
import itertools

import numpy as np

In [None]:
#make client

client = Client()
client

In [None]:
#load data and repartition
filename = os.path.join('data', 'papers_in_json_singleline', '*.json')
lines = db.read_text(filename)
js = lines.map(json.loads).repartition(10)

## definition of functions

In [None]:
#merge all the body texts in one for each file
def merge(record):
    """Concatenate the ``'text'`` field of every entry of ``record``.

    ``record`` is iterated once; each entry is indexed with ``'text'`` and the
    pieces are glued together in order, without any separator. An empty
    ``record`` yields an empty string.
    """
    return ''.join(entry['text'] for entry in record)

texts = js.pluck("body_text").map(merge)
texts.take(1)

In [None]:
#function for text cleaning
#we remove punctuation, numbers and stopwords
#stopwords are taken from a library but also defined by us
#after this step each file is a list of words

tokenizer = RegexpTokenizer(r'\w+')
no_words={"i", "as", "or", "it", "et", "also", "may"}

def clean_func(text):
    """Tokenize ``text`` and return the list of words that survive filtering.

    ``text`` is split with ``word_tokenize``. A token is kept only when it is
    purely alphabetic and its lower-cased form is neither in NLTK's English
    stop-word list nor in the module-level ``no_words`` set. The kept tokens
    are joined with spaces and run through the module-level ``tokenizer``.
    """
    banned = set(stopwords.words('english'))
    raw_tokens = word_tokenize(text)
    banned = banned.union(no_words)

    def keep(token):
        # Same test as before: not a stop word, and letters only.
        return token.lower() not in banned and token.isalpha()

    # Each kept token is followed by one space (including the last one).
    sentence = ''.join(token + " " for token in raw_tokens if keep(token))
    return tokenizer.tokenize(sentence)

text_clean = texts.map(clean_func)
text_clean.take(1)

In [None]:
# we transform each list into a list of dictionaries
# where the unique words and their frequencies are stored

def count_words(text):
    """Count how often each distinct word occurs in ``text``.

    Parameters
    ----------
    text : iterable of hashable
        The words of one document.

    Returns
    -------
    list of dict
        One ``{"word": w, "counts": n}`` entry per distinct word, in order of
        first appearance in ``text``.
    """
    # A single Counter pass is enough; its items are already (word, count) pairs.
    return [{"word": word, "counts": n} for word, n in Counter(text).items()]

words = text_clean.map(count_words)
words.take(1)

In [None]:
# finally we sum over all files using the foldby method
# we pass the bag to foldby after flattening it
# the method groups the dictionaries by the value associated with the key 'word'
# and then sums the counts of each word

def incr_amount(tot, x):
    """Fold step: return ``tot`` increased by the ``'counts'`` entry of ``x``."""
    increment = x['counts']
    return tot + increment


total_counts = words.flatten().foldby('word', binop=incr_amount, 
                   initial=0, 
                   combine=add, 
                   combine_initial=0).compute()

total_counts

In [None]:
# at last we order the obtained list

total_counts_ordered = sorted(total_counts, key=itemgetter(1), reverse=True)
total_counts_ordered

In [None]:
#timing of the operation

# Wall-clock timing of the whole word-count pipeline: building the graph,
# compute() and the final sort all fall inside the timed region.
start = time.time()
word_count = (lines.map(json.loads).repartition(10).pluck('body_text')
                   .map(merge).map(clean_func)
                   .map(count_words).flatten()
                   .foldby('word', binop=incr_amount, 
                        initial=0, combine=add, 
                        combine_initial=0).compute())
# Order the folded results by their second item (the summed count), largest first.
word_sorted = sorted(word_count, key=itemgetter(1), reverse=True)
end = time.time()

print("time: ", end - start)
print(word_sorted[:10])

In [None]:
client.close()

In [None]:
cluster = LocalCluster(n_workers=2)
client = Client(cluster)

In [None]:
# Same timed pipeline as the earlier timing cell, run after the preceding cell
# created a LocalCluster with n_workers=2 and attached a new client to it.
start = time.time()
word_count = (lines.map(json.loads).repartition(10).pluck('body_text')
                   .map(merge).map(clean_func)
                   .map(count_words).flatten()
                   .foldby('word', binop=incr_amount, 
                        initial=0, combine=add, 
                        combine_initial=0).compute())
# Order the folded results by their second item (the summed count), largest first.
word_sorted = sorted(word_count, key=itemgetter(1), reverse=True)
end = time.time()

print("time: ", end - start)
print(word_sorted[:10])

In [None]:
def get_time(workers=4, partitions=10):
    """Time the word-count pipeline on a freshly started local cluster.

    Parameters
    ----------
    workers : int
        Number of workers passed to ``LocalCluster(n_workers=...)``.
    partitions : int
        Number of partitions the parsed bag is repartitioned into.

    Returns
    -------
    float
        Elapsed wall-clock seconds for building the graph, ``compute()`` and
        sorting the result. Cluster start-up and shutdown are not included.

    Notes
    -----
    Reads the module-level bag ``lines`` and the helpers ``merge``,
    ``clean_func``, ``count_words`` and ``incr_amount``.
    """
    # Context managers close the client and then the cluster even when the
    # computation raises, so a failed run does not leave workers behind.
    with LocalCluster(n_workers=workers) as my_cluster, Client(my_cluster):
        start = time.time()  # start taking time
        word_count = (lines.map(json.loads).repartition(partitions).pluck('body_text')
                           .map(merge).map(clean_func)
                           .map(count_words).flatten()
                           .foldby('word', binop=incr_amount,
                                initial=0, combine=add,
                                combine_initial=0).compute())
        # The sort is part of the timed work; its result is not needed here.
        sorted(word_count, key=itemgetter(1), reverse=True)
        end = time.time()
    return end - start
    

In [None]:
#for nw in range (3,5):
#    print("nw ", nw, " time ", get_time(nw))

In [None]:
#works = [1,2,3,4,5,6,7,8]
#parts = [1,2,5,10,50, 100, 150]
works = [8]
parts = [8]
for w in works:
    for p in parts:
        print("nw: ", w, " Partitions ", p, " time: ", get_time(w,p))

In [None]:
get_time(12, 12)

In [None]:
get_time(24, 48)

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import bokeh.palettes as palette

In [None]:
# Horizontal bar chart of the `index` most frequent words in `word_sorted`
# (each item is indexed as item[0] = word, item[1] = count).
height = [item[1] for item in word_sorted]
bars = [item[0] for item in word_sorted]
y_pos = np.arange(len(bars))

fig, ax = plt.subplots(figsize=(10,8))
index = 30
# Create bars
ax.barh(y_pos[:index], height[:index], color=palette.inferno(index))
# Label the ticks through the Axes object rather than the pyplot state machine.
ax.set_yticks(y_pos[:index])
ax.set_yticklabels(bars[:index])
ax.set_xlabel("occurrences")
ax.set_title("Most frequent words")
ax.grid(True, which="both", ls="-",color='0.93')
ax.set_axisbelow(True)

## Part 2

In [None]:
filename = os.path.join('data', 'papers_in_json_singleline', '*.json')
lines = db.read_text(filename)
js = lines.map(json.loads).repartition(10)

In [None]:
cluster=LocalCluster(n_workers=4)
client= Client(cluster)

In [None]:
js.pluck('metadata').pluck('authors').take(2)

In [None]:
authors = js.pluck('metadata').pluck('authors')

In [None]:
def flatten(record):
    """Reduce one author record to a flat dict of name and affiliation fields.

    Reads ``record['first']``, ``record['last']`` and ``record['affiliation']``.
    From the affiliation it takes ``'institution'``, ``'laboratory'`` and
    ``['location']['country']``; any of these that is absent or equal to the
    empty string is reported as ``"Unknown"``.

    NOTE: a later cell of this notebook defines another function that is also
    called ``flatten``; whichever cell ran last wins.
    """
    placeholder = "Unknown"

    def field(mapping, key):
        # Absent keys and empty strings both collapse to the placeholder.
        value = mapping[key] if key in mapping.keys() else placeholder
        return placeholder if value == '' else value

    affiliation = record['affiliation']
    university = field(affiliation, 'institution')
    laboratory = field(affiliation, 'laboratory')
    if 'location' in affiliation.keys():
        country = field(affiliation['location'], 'country')
    else:
        country = placeholder

    return {
        'name':       record['first'],
        'surname':    record['last'],
        'University': university,
        'Laboratory': laboratory,
        'Country':    country,
    }

authors.flatten().map(flatten).take(12)

In [None]:
auth_df = authors.flatten().map(flatten).to_dataframe()
auth_df.head()

In [None]:
univs = auth_df.University.value_counts().nlargest(10).compute()
univs

In [None]:
labos = auth_df.Laboratory.value_counts().nlargest(10).compute()
labos

In [None]:
countries = auth_df.Country.value_counts().nlargest(10).compute()
countries

In [None]:
def get_time_univs(workers=4, partitions=10):
    """Time the "top 10 universities" query on a freshly started local cluster.

    Parameters
    ----------
    workers : int
        Number of workers passed to ``LocalCluster(n_workers=...)``.
    partitions : int
        Number of partitions the parsed bag is repartitioned into.

    Returns
    -------
    float
        Elapsed wall-clock seconds for building the dataframe and computing
        the 10 largest ``University`` value counts.

    Notes
    -----
    Reads the module-level bag ``lines`` and the module-level ``flatten``.
    """
    # Context managers close the client and then the cluster even on error.
    with LocalCluster(n_workers=workers) as my_cluster, Client(my_cluster):
        start = time.time()  # start taking time
        # Honour `partitions` instead of a fixed partition count.
        auth_df = (lines.map(json.loads).repartition(partitions)
                        .pluck('metadata').pluck('authors')
                        .flatten().map(flatten)
                        .to_dataframe())
        auth_df.University.value_counts().nlargest(10).compute()
        end = time.time()
    return end - start

def get_time_countries(workers=4, partitions=10):
    """Time the "top 10 countries" query on a freshly started local cluster.

    Parameters
    ----------
    workers : int
        Number of workers passed to ``LocalCluster(n_workers=...)``.
    partitions : int
        Number of partitions the parsed bag is repartitioned into.

    Returns
    -------
    float
        Elapsed wall-clock seconds for building the dataframe and computing
        the 10 largest ``Country`` value counts.

    Notes
    -----
    Reads the module-level bag ``lines`` and the module-level ``flatten``.
    """
    # Context managers close the client and then the cluster even on error.
    with LocalCluster(n_workers=workers) as my_cluster, Client(my_cluster):
        start = time.time()  # start taking time
        # Honour `partitions` instead of a fixed partition count.
        auth_df = (lines.map(json.loads).repartition(partitions)
                        .pluck('metadata').pluck('authors')
                        .flatten().map(flatten)
                        .to_dataframe())
        auth_df.Country.value_counts().nlargest(10).compute()
        end = time.time()
    return end - start

def get_N_univs(workers=4, partitions=10, N=10):
    """Return the ``N`` most frequent ``University`` values among all authors.

    Parameters
    ----------
    workers : int
        Number of workers passed to ``LocalCluster(n_workers=...)``.
    partitions : int
        Number of partitions the parsed bag is repartitioned into.
    N : int
        How many of the largest value counts to return.

    Returns
    -------
    The computed result of ``value_counts().nlargest(N)`` on the
    ``University`` column.

    Notes
    -----
    Reads the module-level bag ``lines`` and the module-level ``flatten``.
    """
    # Context managers close the client and then the cluster even on error.
    with LocalCluster(n_workers=workers) as my_cluster, Client(my_cluster):
        # Honour `partitions` instead of a fixed partition count.
        auth_df = (lines.map(json.loads).repartition(partitions)
                        .pluck('metadata').pluck('authors')
                        .flatten().map(flatten)
                        .to_dataframe())
        univs = auth_df.University.value_counts().nlargest(N).compute()
    return univs
def get_N_countries(workers=4, partitions=10, N=10):
    """Return the ``N`` most frequent ``Country`` values among all authors.

    Parameters
    ----------
    workers : int
        Number of workers passed to ``LocalCluster(n_workers=...)``.
    partitions : int
        Number of partitions the parsed bag is repartitioned into.
    N : int
        How many of the largest value counts to return.

    Returns
    -------
    The computed result of ``value_counts().nlargest(N)`` on the
    ``Country`` column.

    Notes
    -----
    Reads the module-level bag ``lines`` and the module-level ``flatten``.
    """
    # Context managers close the client and then the cluster even on error.
    with LocalCluster(n_workers=workers) as my_cluster, Client(my_cluster):
        # Honour `partitions` instead of a fixed partition count.
        auth_df = (lines.map(json.loads).repartition(partitions)
                        .pluck('metadata').pluck('authors')
                        .flatten().map(flatten)
                        .to_dataframe())
        countries = auth_df.Country.value_counts().nlargest(N).compute()
    return countries
    

In [None]:
get_time_univs(10,10)

In [None]:
get_time_countries(10,10)

In [None]:
get_N_univs(10,10, 10)

In [None]:
get_N_countries(10,10, 10)

# Part 3

In [None]:
cluster=LocalCluster(n_workers=4)
client= Client(cluster)
client


In [None]:
filename = os.path.join('data', 'papers_in_json_singleline', '*.json')
lines = db.read_text(filename)
js = lines.map(json.loads).repartition(10)

In [None]:
metas = js.pluck(["paper_id", "metadata"])
m=metas.take(1)[0]

In [None]:
print (m[1]["title"])

In [None]:
import fasttext
import fasttext.util

# Location of the pre-trained fastText binary. Set the FASTTEXT_MODEL
# environment variable to point elsewhere; the fallback is the path that was
# previously hard-coded in this cell.
FASTTEXT_MODEL_PATH = os.environ.get(
    'FASTTEXT_MODEL', '/home/alessandro/Downloads/cc.en.300.bin')

ft = fasttext.load_model(FASTTEXT_MODEL_PATH)
ft.get_dimension()  # value is discarded: only the cell's last expression is displayed
fasttext.util.reduce_model(ft, 100)

In [None]:
import io
def load_vectors(fname):
    """Load word vectors from a text file into a dict.

    The first line must contain two integers; they are parsed and otherwise
    unused here (presumably vocabulary size and vector dimension — confirm
    against the file format). Every following line is split on single spaces:
    the first token is the word, the remaining tokens are its components.

    Parameters
    ----------
    fname : str
        Path of the vector file, read as UTF-8 with undecodable bytes ignored.

    Returns
    -------
    dict
        Maps each word to a list of floats.

    Raises
    ------
    ValueError
        If the header is not exactly two integers or a component is not a
        valid float.
    """
    # `with` guarantees the file is closed, also when a line fails to parse.
    with io.open(fname, 'r', encoding='utf-8', newline='\n', errors='ignore') as fin:
        # Consume and validate the header line; its values are not needed.
        _n, _d = map(int, fin.readline().split())
        data = {}
        for line in fin:
            tokens = line.rstrip().split(' ')
            data[tokens[0]] = list(map(float, tokens[1:]))
    return data
#model = load_vectors('/home/alessandro/Downloads/wiki-news-300d-1M.vec')
#to get the embedding of word ’hello’:
#model['hello']

# Use the fastText model loaded in the previous cell as the word -> vector lookup.
# NOTE(review): `ft` is presumably the object this assignment was meant to
# reference (later cells index `model` by word) — confirm.
model = ft

In [None]:
model.load

In [None]:
# Embed the first paper's title word by word, skipping words the model rejects.
text = (m[1]["title"])
text_split = text.split()

text_embedded = []
for t in text_split:
    try:
        text_embedded.append(model[t])
    except Exception:
        # Best effort: skip this word. `Exception` (rather than a bare
        # `except`) still lets KeyboardInterrupt stop the cell.
        pass
    
#text_embedded =[ model[t] for t in text_split]
text_embedded
#text_split

In [None]:
# Look up one word; fall back to a length-1 zero array if the lookup fails.
try: 
    t = np.array(model['PfSWIB'])
except Exception:
    # `Exception` (rather than a bare `except`) still lets KeyboardInterrupt through.
    t= np.zeros(1)

t

In [None]:
def embedd(text):
    """Look up an embedding for every whitespace-separated token of ``text``.

    Each token is looked up in the module-level ``model`` via ``model[token]``.
    Tokens whose lookup raises are skipped, so the result can be shorter than
    the number of tokens.

    Parameters
    ----------
    text : str
        The text to embed; split with ``str.split()``.

    Returns
    -------
    list
        The ``model[token]`` values of the tokens that could be looked up, in
        order.
    """
    text_embedded = []
    for token in text.split():
        try:
            text_embedded.append(model[token])
        except Exception:
            # Best effort: skip tokens the model cannot embed. Catching
            # `Exception` instead of using a bare `except` keeps
            # KeyboardInterrupt/SystemExit from being swallowed.
            continue
    return text_embedded

def reco_emb(reco):
    """Return ``reco``'s id together with its title passed through ``embedd``.

    ``reco`` is printed first (debug trace), then indexed with ``'paper_id'``
    and ``'title'``.
    """
    print(reco)
    paper_id = reco['paper_id']
    embedded_title = embedd(reco['title'])
    return {"paper_id": paper_id, "title": embedded_title}

def flatten(reco):
    """Turn a ``(paper_id, metadata)`` pair into ``{"paper_id", "title"}``.

    ``reco[1]['title']`` is printed (debug trace) and returned under
    ``"title"``; ``reco[0]`` is returned under ``"paper_id"``.

    NOTE: an earlier cell of this notebook defines a different function with
    the same name; running this cell replaces it.
    """
    title = reco[1]['title']
    print(title)
    return dict(paper_id=reco[0], title=title)

In [None]:
titles = db.from_sequence(metas.map(flatten).compute())
titles.take(3)


#titles = metas.map(flatten_embedding)
#titles.take(3)

In [None]:
t = titles.pluck('title').take(1)
#reco_emb(t[0])
t=t[0]
t
#ids = titles.pluck('paper_id').take(3)
#et = [embedd(x) for x in t]

model['hello']


In [None]:
et = t.map(reco_emb).compute()


In [None]:
et

In [None]:
# Materialize the flattened records as a dataframe on the client
# (the stray closing parenthesis that made this line a SyntaxError is removed).
titles = metas.map(flatten).to_dataframe().compute()


In [None]:

#c = list(itertools.product(a, b))

In [None]:
# Bag of all ordered pairs (Cartesian product of `small_titl` with itself), in 10 partitions.
# NOTE(review): `small_titl` is not defined in any visible cell of this notebook — confirm where it comes from.
grid = db.from_sequence(list(itertools.product(small_titl, small_titl))).repartition(10)

In [None]:
grid.take(2, npartitions=5)

In [None]:
list(itertools.product(small_titl, small_titl))

# PART 3 (WORKING VERSION)

In [3]:
cluster=LocalCluster(n_workers=4)
client= Client(cluster)
client


0,1
Client  Scheduler: tcp://127.0.0.1:42007  Dashboard: http://127.0.0.1:8787/status,Cluster  Workers: 4  Cores: 12  Memory: 16.80 GB


In [4]:
filename = os.path.join('data', 'embedded_papers', '*.json')
js = db.read_text(filename).map(json.loads)


In [5]:
def back_to_float(l):
    """Convert a nested sequence into a NumPy array of floats.

    Every element of every inner sequence of ``l`` is passed through
    ``float()`` before the array is built.
    """
    rows = []
    for arr in l:
        rows.append(list(map(float, arr)))
    return np.array(rows)

def db_to_float(reco):
    """Copy ``reco``'s ``'paper_id'`` and convert its ``'title'`` with ``back_to_float``."""
    converted = {"paper_id": reco['paper_id']}
    converted["title"] = back_to_float(reco['title'])
    return converted

titles = db.from_sequence(js.map(db_to_float).compute())

#the model we use converts each word into an array of 300 numbers
#so each title made of n words is now an n x 300 np array

In [None]:
titl = titles.take(2)
#t1 = titl[0]['title']
#t2 = titl[1]['title']



In [9]:
t= js.map(db_to_float).compute()

def make_pairs(reco):
    """Pair ``reco`` with every element of the module-level list ``t``.

    Returns a Dask bag of ``(reco, other)`` tuples, repartitioned into 1000
    partitions.

    NOTE(review): this builds a bag inside a function that the notebook maps
    over another bag; presumably that nesting is why the following cell was
    interrupted — confirm.
    """
    pairs = []
    for other in t:
        pairs.append((reco, other))
    return db.from_sequence(pairs).repartition(1000)

In [10]:
titles =db.from_sequence(t).repartition(100)
grid = titles.map(make_pairs).compute()


KeyboardInterrupt: 

In [None]:
elem = grid.take(2)

In [None]:
#the metric may still need some tuning, but it works

def cos_sim(x,y):
    """Mean row-wise cosine similarity between two 2-D arrays.

    Rows are compared pairwise by position. When the inputs have a different
    number of rows, only the first ``min(x.shape[0], y.shape[0])`` rows of each
    are used.

    Parameters
    ----------
    x, y : np.ndarray
        2-D arrays with the same number of columns.

    Returns
    -------
    float
        The mean of the per-row cosine similarities. The result is ``nan``
        when no rows are compared or when a compared row has zero norm
        (NumPy emits a runtime warning in those cases).
    """
    l = min(x.shape[0], y.shape[0])
    # Truncate both inputs first so the norms line up with the dot products;
    # otherwise inputs of different lengths fail to broadcast.
    x, y = x[:l], y[:l]
    nx = np.linalg.norm(x, axis=1)
    ny = np.linalg.norm(y, axis=1)
    prod = np.sum(x * y, axis=1) / (nx * ny)
    return np.mean(prod)

In [None]:
cos_sim(t1,t1)

In [None]:
e = elem[0].take(2)    

def make_sim(reco):
    """Compute the title similarity of a pair of paper records.

    Parameters
    ----------
    reco : sequence
        ``reco[0]`` and ``reco[1]`` are each indexed with ``'title'`` and
        ``'paper_id'``.

    Returns
    -------
    dict
        The two paper ids under ``"Paper 1:"`` / ``"Paper 2:"`` and the
        ``cos_sim`` of the two titles under ``"Similarity"``. The similarity
        is 0 when computing it raises.
    """
    try:
        c = cos_sim(reco[0]['title'], reco[1]['title'])
    except Exception:
        # Best effort: score the pair 0 if the titles cannot be compared.
        # `Exception` instead of a bare `except` keeps KeyboardInterrupt
        # able to stop a long run.
        c = 0
    return {
        "Paper 1:": reco[0]['paper_id'],
        "Paper 2:": reco[1]['paper_id'],
        "Similarity": c
    }

def make_sim_bag(reco):
    """Apply ``make_sim`` to every pair in the bag ``reco`` and compute the result.

    Returns whatever ``reco.map(make_sim).compute()`` yields. The dangling
    ``.re`` attribute access that used to follow ``compute()`` is dropped
    because it raised AttributeError on every call.
    """
    return reco.map(make_sim).compute()



In [None]:
similarities = grid.map(make_sim_bag).compute()

In [None]:
print(t1.shape)
print(t2.shape)
n= min(t1.shape[0], t2.shape[0])
print(n)

In [None]:
%% time
[np.dot(t1[i], t2[i]), for i in]

In [None]:
a=np.array([[1,2], [3,4]])
b=np.array([[-1,-2], [0.5, 10]])

In [None]:
np.inner(a,b)

In [None]:
a*b

In [None]:
np.sum(a*b, axis=1)

In [None]:
cos_sim(a,a)

In [None]:
a=np.array([[1,2], [3,4]])
a*a

In [None]:
np.sum(a*a, axis=1)

In [None]:
np.array([np.linalg.norm(b) for b in a])

In [None]:
np.linalg.norm(np.array([1,2,3,4]))