In [1]:
import pickle
import numpy as np
import pandas as pd
import torch
from torch.utils.data import DataLoader
import tqdm
import random
from csv import writer
from sentence_transformers import SentenceTransformer, util, SentencesDataset, InputExample, losses, evaluation, models
from transformers import RobertaTokenizer, RobertaForSequenceClassification
import sklearn
from numba import cuda
import math
import json

from dateutil.relativedelta import relativedelta

In [2]:
# Load the two pickled article dataframes (only unpickle files you produced yourself).
with open('dataframes\\final_wa_df.pickle', 'rb') as wa_file:
    wa_df = pickle.load(wa_file)
with open('dataframes\\final_stan_df.pickle', 'rb') as stan_file:
    stan_df = pickle.load(stan_file)

# Article texts as plain lists, in dataframe row order; these are the encoder inputs.
wa_content = wa_df["content"].to_list()
stan_content = stan_df["content"].to_list()

#### RobBERT - mqa - wastan
https://huggingface.co/jegormeister/robbert-v2-dutch-base-mqa-finetuned

#### jegor sts wastan

In [5]:
# Load the fine-tuned MQA sentence encoder from the local models directory onto the GPU.
model_save_path = 'models\\robbert-v2-dutch-base-mqa-wastan-finetuned'
model = SentenceTransformer(model_save_path, device='cuda')

In [None]:
# Encode every article text with the currently loaded model (progress bar shown per batch).
wa_embeddings = model.encode(wa_content, show_progress_bar=True)
stan_embeddings = model.encode(stan_content, show_progress_bar=True)

Batches:   0%|          | 0/274 [00:00<?, ?it/s]

Batches:   0%|          | 0/6322 [00:00<?, ?it/s]

In [13]:
# Cache the MQA-model embeddings on disk so they can be reloaded without re-encoding.
# Backslashes are doubled: '\w' and '\s' are not valid escape sequences in a normal
# string literal (the resulting path is unchanged).
with open('embeddings\\wa_mqa_wastan.pickle', 'wb') as handle:
    pickle.dump(wa_embeddings, handle, protocol=pickle.HIGHEST_PROTOCOL)

with open('embeddings\\stan_mqa_wastan.pickle', 'wb') as handle:
    pickle.dump(stan_embeddings, handle, protocol=pickle.HIGHEST_PROTOCOL)

# Reload variant, kept for reference:
# with open('embeddings\\wa_mqa_wastan.pickle', 'rb') as handle:
#     wa_embeddings = pickle.load(handle)

# with open('embeddings\\stan_mqa_wastan.pickle', 'rb') as handle:
#     stan_embeddings = pickle.load(handle)

#### robbert base wastan
https://huggingface.co/pdelobelle/robbert-v2-dutch-base

#### RobBERT - wastan

In [3]:
# Load the fine-tuned RobBERT-base sentence encoder from the local models directory onto the GPU.
# This rebinds `model`, so the encode cell below uses this model.
model_save_path = 'models\\robbert-v2-dutch-base-wastan-finetuned'
model = SentenceTransformer(model_save_path, device='cuda')

In [4]:
# Encode every article text with the currently loaded model; overwrites the previous embeddings.
wa_embeddings = model.encode(wa_content, show_progress_bar=True)
stan_embeddings = model.encode(stan_content, show_progress_bar=True)

Batches:   0%|          | 0/274 [00:00<?, ?it/s]

Batches:   0%|          | 0/6322 [00:00<?, ?it/s]

In [5]:
# Cache the RobBERT-model embeddings on disk so they can be reloaded without re-encoding.
# Backslashes are doubled: '\w' and '\s' are not valid escape sequences in a normal
# string literal (the resulting path is unchanged).
with open('embeddings\\wa_robbert_wastan.pickle', 'wb') as handle:
    pickle.dump(wa_embeddings, handle, protocol=pickle.HIGHEST_PROTOCOL)

with open('embeddings\\stan_robbert_wastan.pickle', 'wb') as handle:
    pickle.dump(stan_embeddings, handle, protocol=pickle.HIGHEST_PROTOCOL)

# Reload variant, kept for reference:
# with open('embeddings\\wa_robbert_wastan.pickle', 'rb') as handle:
#     wa_embeddings = pickle.load(handle)

# with open('embeddings\\stan_robbert_wastan.pickle', 'rb') as handle:
#     stan_embeddings = pickle.load(handle)

### Calculate similarity

In [6]:
# Remove any previously stored embeddings before attaching fresh ones.
# errors="ignore" keeps the cell re-runnable when the column is already gone.
wa_df = wa_df.drop("embeddings", axis=1, errors="ignore")
stan_df = stan_df.drop("embeddings", axis=1, errors="ignore")

In [7]:
# Store one embedding (as a plain Python list) per article row.
wa_df["embeddings"] = wa_embeddings.tolist()
stan_df["embeddings"] = stan_embeddings.tolist()

In [21]:
# Parse the 'date' column (format %Y%m%d) of both frames into pandas datetimes,
# so the two-week window comparison further down works on real dates.
for frame in (wa_df, stan_df):
    frame["date"] = pd.to_datetime(frame["date"], format='%Y%m%d')

### Retrieve top3 articles

- For each row in wa_df: filter stan_df to the two weeks before the article date (`relativedelta(weeks=-2)`)
- Convert the content of the limited dataframe to a list
- For each element in the list: calculate the cosine similarity
- Store the index of the top 3 scores

In [6]:
def find_all_indexes(list, searchcrit):
    """Return every position in `list` whose element equals `searchcrit` (ascending order)."""
    return [position for position, value in enumerate(list) if value == searchcrit]

In [7]:
# Code to retrieve the top 3 most similar articles within 2 weeks of the wablieft article
def search_top3(index=int, wa_df=None, stan_df=None):
    """Rank the Standaard articles of the preceding two weeks against one Wablieft article.

    Parameters
    ----------
    index : positional row number (``iloc``) of the Wablieft article in ``wa_df``.
        The ``int`` default is only a placeholder; a real integer must be passed.
    wa_df, stan_df : dataframes with ``date``, ``embeddings`` and (for ``stan_df``)
        ``filename`` columns. When omitted, the notebook-level ``wa_df`` / ``stan_df``
        are looked up at call time.

    Returns
    -------
    (sortedlist, indexlist)
        ``sortedlist``: all cosine similarities in the window, highest first.
        ``indexlist``: filenames of the (at most) three best articles, aligned
        position by position with ``sortedlist[:3]``.
    """
    # Resolve the frames when the function is CALLED. Using `wa_df=wa_df` as a
    # default freezes the object that existed when the cell was executed, so
    # frames re-assigned later (e.g. with other embeddings) would be ignored.
    if wa_df is None:
        wa_df = globals()["wa_df"]
    if stan_df is None:
        stan_df = globals()["stan_df"]

    row = wa_df.iloc[index]
    window_start = row.date + relativedelta(weeks=-2)
    in_window = (stan_df["date"] >= window_start) & (stan_df["date"] <= row.date)
    limited_stan_df = stan_df[in_window]

    resultlist = [
        util.cos_sim(row.embeddings, emb).item()
        for emb in limited_stan_df.embeddings.tolist()
    ]

    # Rank positions rather than values: the sort is stable, so tied scores keep
    # their dataframe order and each position appears exactly once. This
    # guarantees one filename per score (the value-lookup approach could return
    # fewer or more than three filenames when scores were tied).
    ranking = sorted(range(len(resultlist)), key=resultlist.__getitem__, reverse=True)
    sortedlist = [resultlist[position] for position in ranking]
    indexlist = [limited_stan_df.iloc[position].filename for position in ranking[:3]]
    return sortedlist, indexlist



In [7]:
## Return standaard article indexes based on the score
# The current frames are passed explicitly so the call does not depend on
# whichever frames were bound as defaults when search_top3 was defined.
sortedlist, indexlist = search_top3(150, wa_df=wa_df, stan_df=stan_df)

In [18]:
# Reload the cached MQA-model embeddings.
# Backslashes are doubled: '\w' and '\s' are not valid escape sequences.
with open('embeddings\\wa_mqa_wastan.pickle', 'rb') as handle:
    wa_embeddings = pickle.load(handle)

with open('embeddings\\stan_mqa_wastan.pickle', 'rb') as handle:
    stan_embeddings = pickle.load(handle)

In [19]:
# Remove the previously attached embeddings before attaching the MQA ones.
# errors="ignore" keeps the cell re-runnable when the column is already gone.
wa_df = wa_df.drop("embeddings", axis=1, errors="ignore")
stan_df = stan_df.drop("embeddings", axis=1, errors="ignore")

In [20]:
# Attach the embeddings loaded above (one plain Python list per article row).
wa_df["embeddings"] = wa_embeddings.tolist()
stan_df["embeddings"] = stan_embeddings.tolist()

In [8]:
# Map every Wablieft filename to its top-3 Standaard filenames and scores.
# search_top3 is evaluated once per article (the comprehension called it twice),
# and the current frames are passed explicitly so the freshly attached
# embeddings are the ones being compared.
simDict_mqa_WaStan = {}
for i in range(len(wa_content)):
    top_scores, top_filenames = search_top3(i, wa_df=wa_df, stan_df=stan_df)
    simDict_mqa_WaStan[wa_df.iloc[i].filename] = {'Standaard': top_filenames, 'Score': top_scores[:3]}

In [9]:
# Persist the MQA similarity dictionary as pretty-printed JSON.
# Backslash doubled: '\s' is not a valid escape sequence.
with open('comparable_corpora\\simDict_mqa_WaStan_definitive.json', 'w') as fp:
    json.dump(simDict_mqa_WaStan, fp,  indent=4)

In [3]:
# Remove the previously attached embeddings before attaching the RobBERT ones.
# errors="ignore" keeps the cell re-runnable when the column is already gone.
wa_df = wa_df.drop("embeddings", axis=1, errors="ignore")
stan_df = stan_df.drop("embeddings", axis=1, errors="ignore")

In [4]:
# Reload the cached RobBERT-model embeddings.
# Backslashes are doubled: '\w' and '\s' are not valid escape sequences.
with open('embeddings\\wa_robbert_wastan.pickle', 'rb') as handle:
    wa_embeddings = pickle.load(handle)

with open('embeddings\\stan_robbert_wastan.pickle', 'rb') as handle:
    stan_embeddings = pickle.load(handle)

In [5]:
# Attach the embeddings loaded above (one plain Python list per article row).
wa_df["embeddings"] = wa_embeddings.tolist()
stan_df["embeddings"] = stan_embeddings.tolist()

In [8]:
# Map every Wablieft filename to its top-3 Standaard filenames and scores.
# search_top3 is evaluated once per article (the comprehension called it twice),
# and the current frames are passed explicitly so the freshly attached
# embeddings are the ones being compared.
simDictRobBERT_WaStan = {}
for i in range(len(wa_content)):
    top_scores, top_filenames = search_top3(i, wa_df=wa_df, stan_df=stan_df)
    simDictRobBERT_WaStan[wa_df.iloc[i].filename] = {'Standaard': top_filenames, 'Score': top_scores[:3]}

In [10]:
# Persist the RobBERT similarity dictionary as pretty-printed JSON.
# Backslash doubled: '\s' is not a valid escape sequence.
with open('comparable_corpora\\simDict_RobBERT_WaStan.json', 'w') as fp:
    json.dump(simDictRobBERT_WaStan, fp,  indent=4)

---------------------

In [2]:
def getAllSimScores(dict):
    """Flatten the "Score" list of every entry into one list, in dictionary order."""
    return [
        entry.get("Score")[position]
        for entry in dict.values()
        for position in range(len(entry.get("Score")))
    ]

def getTop1SimScores(dict):
    """Collect the first "Score" value of every entry; entries with no scores are skipped."""
    best_scores = []
    for entry in dict.values():
        try:
            best = entry.get("Score")[0]
        except IndexError:
            # Empty score list: this entry has no candidate at all.
            continue
        best_scores.append(best)
    return best_scores

In [5]:
# Load in previously saved similarity dictionary
# (backslash doubled: '\s' is not a valid escape sequence)
with open('comparable_corpora\\simDict_RobBERT_WaStan.json') as json_file:
    robb_wastan = json.load(json_file)

# Load in previously saved similarity dictionary
# NOTE(review): the MQA dictionary is written above as
# 'simDict_mqa_WaStan_definitive.json' - confirm this is the intended file.
with open('comparable_corpora\\simDict_mqa_WaStan.json') as json_file:
    mqa_wastan = json.load(json_file)

In [7]:
# Extract the best score per Wablieft article for each model and cache the lists.
# Every backslash is doubled now: '\s' is not a valid escape sequence.
top1_robbwastan = getTop1SimScores(robb_wastan)

with open('comparable_corpora\\similarity_scores\\top1SimScoreListRobBERTWaStan.pickle', 'wb') as handle:
    pickle.dump(top1_robbwastan, handle, protocol=pickle.HIGHEST_PROTOCOL)

top1_mqawastan = getTop1SimScores(mqa_wastan)

with open('comparable_corpora\\similarity_scores\\top1SimScoreListmqaWaStan.pickle', 'wb') as handle:
    pickle.dump(top1_mqawastan, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [26]:
import itertools
def getSimfromDict(dict, amount=None):
    """Unpack a similarity dictionary into three parallel lists.

    Parameters
    ----------
    dict : mapping of Wablieft filename -> {"Standaard": ..., "Score": ...}.
    amount : maximum number of entries to read, in dictionary order.
        ``None`` (the default) reads every entry. The previous default was the
        ``int`` type itself, which made ``islice`` raise when ``amount`` was omitted.

    Returns
    -------
    (walist, stanlist, scorelist)
        Wablieft article texts (looked up in the notebook-level ``wa_df``), the
        stored "Standaard" values and the stored "Score" values.
    """
    walist = []
    stanlist = []
    scorelist = []
    # islice(iterable, None) yields the whole iterable.
    for wa, value in itertools.islice(dict.items(), amount):
        walist.append(wa_df.loc[wa_df['filename']==wa]["content"].tolist()[0])
        stanlist.append(value.get("Standaard"))
        scorelist.append(value.get("Score"))

    return walist, stanlist, scorelist

In [30]:
def printSimilarArticles(wablieftContent, stanFileNames, Scores):
    """Print each Wablieft article followed by up to three Standaard candidates with their scores.

    Article texts are looked up by filename in the notebook-level ``stan_df``.
    """
    for row, candidates in enumerate(stanFileNames):
        print("\n\n"+"Wablieft article: "+wablieftContent[row]+"\n")
        for rank in range(3):
            try:
                matches = stan_df.loc[stan_df["filename"] == candidates[rank]]['content'].tolist()
                similarity = Scores[row][rank]
            except IndexError:
                # Fewer than three candidates (or scores) stored for this article.
                break

            print("Standaard article: "+matches[0])
            print("Score: "+ str(similarity)+"\n")

In [13]:
# Unpack (at most) the first 7000 entries of simDict into parallel lists.
# NOTE(review): in the visible notebook simDict is only loaded in later cells,
# so this cell depends on out-of-order execution - confirm which dictionary is meant.
wab, stan, scores = getSimfromDict(dict=simDict, amount=7000)

In [None]:
# Print every unpacked Wablieft article with its candidate Standaard articles (large output).
printSimilarArticles(wab, stan, scores)

### Create dictionary of top 1 similar article

In [42]:
def search_top1(index=int, wa_df=None, stan_df=None):
    """Find the single most similar Standaard article of the preceding two weeks.

    Parameters
    ----------
    index : positional row number (``iloc``) of the Wablieft article in ``wa_df``.
        The ``int`` default is only a placeholder; a real integer must be passed.
    wa_df, stan_df : dataframes with ``date``, ``embeddings`` and (for ``stan_df``)
        ``filename`` columns. When omitted, the notebook-level ``wa_df`` / ``stan_df``
        are looked up at call time.

    Returns
    -------
    (sortedlist, indexlist)
        ``sortedlist``: all cosine similarities in the window, highest first.
        ``indexlist``: a list holding the filename of the best article, or an
        empty list when no Standaard article falls inside the window.
    """
    # Resolve the frames when the function is CALLED. Using `wa_df=wa_df` as a
    # default freezes the object that existed when the cell was executed, so
    # frames re-assigned later (e.g. with other embeddings) would be ignored.
    if wa_df is None:
        wa_df = globals()["wa_df"]
    if stan_df is None:
        stan_df = globals()["stan_df"]

    row = wa_df.iloc[index]
    window_start = row.date + relativedelta(weeks=-2)
    in_window = (stan_df["date"] >= window_start) & (stan_df["date"] <= row.date)
    limited_stan_df = stan_df[in_window]

    resultlist = [
        util.cos_sim(row.embeddings, emb).item()
        for emb in limited_stan_df.embeddings.tolist()
    ]

    ## Get top 1
    # Rank positions rather than values: the sort is stable, so on a tie the
    # article that comes first in the dataframe wins and exactly one filename
    # is returned.
    ranking = sorted(range(len(resultlist)), key=resultlist.__getitem__, reverse=True)
    sortedlist = [resultlist[position] for position in ranking]
    indexlist = [limited_stan_df.iloc[position].filename for position in ranking[:1]]
    return sortedlist, indexlist

#### Thresholding to construct comparable corpus

In [20]:
# Keep, per Wablieft article, the best Standaard match if its score reaches the threshold.
# NOTE(review): the derivation of the 0.642 cut-off is not shown in this notebook.
score_threshold = 0.642
simDictTop1 = {}
for i in range(len(wa_content)):
    # Pass the current frames explicitly rather than relying on the defaults
    # bound when search_top1 was defined.
    sim_score, stan_filename = search_top1(i, wa_df=wa_df, stan_df=stan_df)

    # No Standaard article inside the two-week window: nothing to compare.
    if not sim_score or not stan_filename:
        continue

    if float(sim_score[0]) >= score_threshold:
        simDictTop1[wa_df.iloc[i].filename] = {'Standaard': stan_filename[0], 'Score': sim_score[0]}

In [21]:
# Persist the thresholded top-1 mapping as pretty-printed JSON.
with open('similarityDictionaryTop1.json', 'w') as out_file:
    json.dump(simDictTop1, out_file, indent=4)

In [3]:
# Load the classifier-filtered similarity dictionary.
# Backslash doubled: '\s' is not a valid escape sequence.
with open('comparable_corpora\\simDict_LogRegClassified.json') as json_file:
    simDict = json.load(json_file)

In [4]:
def getSimfromDict(dict, amount=None):
    """Unpack a similarity dictionary into three parallel lists.

    This redefinition replaces the earlier ``getSimfromDict``; the optional
    ``amount`` parameter is kept so calls written for either version work.

    Parameters
    ----------
    dict : mapping of Wablieft filename -> {"Standaard": ..., "Score": ...}.
    amount : maximum number of entries to read, in dictionary order;
        ``None`` (the default) reads every entry.

    Returns
    -------
    (walist, stanlist, scorelist)
        Wablieft article texts (looked up in the notebook-level ``wa_df``), the
        stored "Standaard" values and the stored "Score" values.
    """
    # Local import keeps this cell runnable on its own.
    from itertools import islice

    walist = []
    stanlist = []
    scorelist = []
    # islice(iterable, None) yields the whole iterable.
    for wa, value in islice(dict.items(), amount):
        walist.append(wa_df.loc[wa_df['filename']==wa]["content"].tolist()[0])
        stanlist.append(value.get("Standaard"))
        scorelist.append(value.get("Score"))

    return walist, stanlist, scorelist

In [5]:
def printSimilarArticles(wablieftContent, stanFileNames, Scores):
    """Print each Wablieft article with its single Standaard match and score.

    Expects one filename and one score per article. Article texts are looked
    up by filename in the notebook-level ``stan_df``.
    """
    for row, stan_name in enumerate(stanFileNames):
        print("\n\n"+"Wablieft article: "+wablieftContent[row]+"\n")
        try:
            matches = stan_df.loc[stan_df["filename"] == stan_name]['content'].tolist()
            similarity = Scores[row]
        except IndexError:
            # Scores is shorter than stanFileNames: stop printing.
            break

        print("Standaard article: "+matches[0])
        print("Score: "+ str(similarity)+"\n")

In [6]:
# Unpack every entry of the loaded similarity dictionary into parallel lists.
wab, stan, scores = getSimfromDict(dict=simDict)

In [None]:
# Print every unpacked Wablieft article with its matched Standaard article (large output).
printSimilarArticles(wab, stan, scores)

### Create training set with scored articles

##### scorelists

In [59]:
# Similarity scores of article pairs labelled as matches; passed to
# createTrainingSet with scoreRated=1. createTrainingSet finds the pair for each
# value by substring-matching its string form against the stored scores, so the
# digits must stay exactly as written here.
matchesList = [0.7697,
        0.72141486,
        0.71388,
        0.788607,
        0.836498,
        0.704664,
        0.7805206,
        0.80987,
        0.797313,
        0.743338346,
        0.799838,
        0.781242,
        0.673715,
        0.7851778864,
        0.680668,
        0.8285657,
        0.77680337,
        0.89255422,
        0.710689,
        0.792078137,
        0.75573337,
        0.8639686,
        0.923899,
        0.8299018,
        0.818773388,
        0.774552166,
        0.791702,
        0.6543264,
        0.788223028,
        0.7355657,
        0.67106187,
        0.580847,
        0.627487,
        0.66209,
        0.681355,
        0.768626,
        0.733748,
        0.69154,
        0.51817,
        0.681477,
        0.78126,
        0.7633886,
        0.8638988,
        0.5258505,
        0.783164,
        0.777009,
        0.64473,
        0.729588,
        0.763598,
        0.8209926,
        0.8132002,
        0.7953649,
        0.776327,
        0.755874,
        0.756482899,
        0.75797,
        0.7536767,
        0.6356,
        0.708218,
        0.76766467,
        0.7696149,
        0.61336,
        0.6577181,
        0.7659018,
        0.759663,
        0.76228386,
        0.85247099,
        0.73827588,
        0.4924735,
        0.7233466,
        0.76480269,
        0.63647,
        0.72138,
        0.8175677,
        0.70753,
        0.737872,
        0.753278,
        0.838064,
        0.726966,
        0.8079567,
        0.653346,
        0.74183446,
        0.81097239,
        0.689377367,
        0.62185,
        0.75594,
        0.7558307,
        0.7798176,
        0.847305,
        0.754778,
        0.642159,
        0.74452215,
        0.7798978,
        0.75354,
        0.5777378,
        0.799755,
        0.73424,
        0.5977584,
        0.686517,
        0.6422056,
        0.6759755,
        0.695020914,
        0.853763,
        0.82201665,
        0.69139,
        0.631638,
        0.76909,
        0.847588956,
        0.79919,
        0.78601,
        0.7322576,
        0.7635,
        0.77563,
        0.72278499,
        0.811708,
        0.66525,
        0.6677179,
        0.78369,
        0.6500568,
        0.76673,
        0.718989,
        0.73766,
        0.77780485,
        0.751827,
        0.77792,
        0.5125471,
        0.7146214,
        0.712471485,
        0.64337754249,
        0.666738927,
        0.6788367629,
        0.6716255,
        0.843695998,
        0.674108445644,
        0.7829052805,
        0.70365959405899,
        0.77655506,
        0.6633684635,
        0.8291893601,
        0.7387883067,
        0.73019278049,
        0.6969285607,
        0.714788675,
        0.760915637,
        0.8273297548,
        0.923899471759,
        0.9047259688,
        0.9037084579467,
        0.5706487298,
        0.77131378,
        0.818009138,
        0.7564840316,
        0.7415358424,
        0.83821976,
        0.7958508729934,
        0.816393852233,
        0.855634,
        0.764023065567,
        0.6651859879,
        0.639792382717,
        0.7571749687,
        0.82533246,
        0.834297776,
        0.833146095,
        0.83930498361,
        0.61638921499,
        0.543234169,
        0.7669585347,
        0.6891207098,
        0.63000345,
        0.752358198,
        0.78967285,
        0.760514676,
        0.6784633994,
        0.7990390658,
        0.860850214958,
        0.689050734,
        0.711645185947,
        0.746158,
        0.74563032,
        0.7669916749,
        0.7235053777,
        0.8123298,
        0.6792410016,
        0.701078534,
        0.71442246437,
        0.722155749,
        0.80628609,
        0.7795466184616,
        0.739085,
        0.78737384,
        0.823471367,
        0.637516,
        0.7135028,
        0.64341038,
        0.6608159,
        0.79101634,
        0.60918426,
        0.6051189899,
        0.72729367,
        0.89086,
        0.6616236,
        0.71711957,
        0.69035869,
        0.6787018,
        0.6538558,
        0.74488943815,
        0.791976,
        0.82162,
        0.7984265089,
        0.76979,
        0.6847710609,
        0.8136815,
        0.6635019,
        0.87014955,
        0.820418238,
        0.74413615,
        0.69744706,
        0.7076369,
        0.704141259,
        0.772098,
        0.7256688,
        0.74525088,
        0.771788299,
        0.88479,
        0.78804278,
        0.703075,
        0.581823587,
        0.7161303,
        0.72783309,
        0.787395,
        0.714924,
        0.7885076999,
        0.7974855899,
        0.82318055,
        0.732838809,
        0.66058075,
        0.719847,
        0.809242546,
        0.7810799,
        0.746488,
        0.68827325,
        0.62747842,
        0.70261967,
        0.8243128,
        0.7264975309,
        0.7546389,
        0.7904949,
        0.73430216,
        0.739148557,
        0.64654779,
        0.81631809,
        0.7605623,
        0.4436697,
        0.7363989,
        0.840628445,
        0.7958248,
        0.8284159,
        0.81944477,
        0.7998687,
        0.7483889,
        0.76568597,
        0.778772,
        0.77987897,
        0.773603618,
        0.800198,
        0.73052,
        0.7266547,
        0.727787077,
        0.6135877,
        0.6108038,
        0.710565,
        0.6744478
        ]


In [60]:
# Similarity scores of article pairs labelled as non-matches; passed to
# createTrainingSet with scoreRated=0. As with matchesList, each value is looked
# up by substring match on its string form, so the digits must stay as written.
noMatchesList = [
    0.5535407,
    0.6986519,
    0.51107543,
    0.4589526,
    0.5063756,
    0.52886277,
    0.5447489,
    0.55582004,
    0.57022494,
    0.552105307579,
    0.6082630157,
    0.57582497,
    0.63724285,
    0.61024689,
    0.5978436,
    0.595395088,
    0.5180177,
    0.622035026,
    0.689134597,
    0.48909914449,
    0.498851418,
    0.44220787,
    0.512252509,
    0.548665285,
    0.5386719,
    0.50775814056,
    0.4760343,
    0.569921,
    0.4769304,
    0.571814239,
    0.5141970515,
    0.55523240566,
    0.6670523881,
    0.65486276,
    0.642258465,
    0.5293843746,
    0.622388899,
    0.59334117174,
    0.6066381335,
    0.52446991205,
    0.660279095,
    0.61309307,
    0.61719208955,
    0.509504199,
    0.53461307,
    0.55632734,
    0.5305280089378,
    0.5029930472,
    0.60603237,
    0.49210578,
    0.635853469,
    0.6405651569,
    0.678285896778,
    0.513371527,
    0.71126574,
    0.455496877,
    0.67995160818,
    0.691637158,
    0.637583673,
    0.63278347,
    0.4872620998,
    0.668940067,
    0.61449253559,
    0.554829,
    0.4880725,
    0.54832077026,
    0.5618026256,
    0.56353968,
    0.489055693149,
    0.46197,
    0.55689138,
    0.640002429,
    0.54211628,
    0.557211399,
    0.595259,
    0.6195359,
    0.5236569,
    0.664387,
    0.6819516,
    0.503838,
    0.576270997,
    0.62811075,
    0.604570508,
    0.6187288,
    0.654753,
    0.59569638967,
    0.5765741467,
    0.65639215,
    0.560441255,
    0.52692282,
    0.62480497,
    0.5717596,
    0.4443662,
    0.5964726,
    0.63212949,
    0.6097509,
    0.61967355,
    0.68474,
    0.73098,
    0.469717,
    0.6137657,
    0.61052447557,
    0.4984967,
    0.6565629,
    0.59753525257,
    0.5660267,
    0.6557744,
    0.56982338,
    0.62308949,
    0.545513,
    0.60677,
    0.715595,
    0.7052625,
    0.61761957,
    0.64363306,
    0.6009945,
    0.592138707,
    0.61603069,
    0.598185658,
    0.5933338,
    0.6277466,
    0.5623755,
    0.6203489,
    0.5040397,
    0.73360979,
    0.7952144,
    0.45291349,
    0.41730788,
    0.54704487,
    0.4835606,
    0.5867008,
    0.49841472,
    0.556829,
    0.74117487,
    0.444108039,
    0.488289,
    0.57693058,
    0.528937,
    0.520601
]


#### Creating dataframe with previously scored articles to explore and evaluate classifier and model

In [None]:
# Load in previously saved similarity dictionary
# (backslash doubled: '\s' is not a valid escape sequence)
with open('comparable_corpora\\simDict_mqa_WaStan.json') as json_file:
    simDict = json.load(json_file)

In [7]:
def createTrainingSet(dict, wa_df=None, stan_df=None, scoreList=list, scoreRated=int):
    """Build a labelled dataframe from manually rated similarity scores.

    For every score in ``scoreList`` the first article pair in ``dict`` whose
    stored score contains that score's string form (substring match, so a
    truncated score still finds its pair) is looked up, and one row is emitted.

    Parameters
    ----------
    dict : mapping of Wablieft filename -> {"Standaard": [filenames], "Score": [scores]};
        only the first three candidates of each entry are inspected.
    wa_df, stan_df : dataframes with ``filename``, ``title`` and ``content`` columns.
        When omitted, the notebook-level frames are looked up at call time.
    scoreList : iterable of rated scores. The ``list`` default is only a
        placeholder; a real iterable must be passed.
    scoreRated : label written to the ``Match`` column for every row. The
        ``int`` default is only a placeholder; pass the actual label.

    Returns
    -------
    pandas.DataFrame with columns WaTitle, WaContent, StanTitle, StanContent,
    Score and Match. Scores without a matching pair produce no row.
    """
    # Resolve the frames when the function is CALLED; binding them as default
    # values would freeze whichever objects existed when the cell was executed.
    if wa_df is None:
        wa_df = globals()["wa_df"]
    if stan_df is None:
        stan_df = globals()["stan_df"]

    def _lookup(frame, filename, column):
        # First value of `column` for the row(s) with this filename (IndexError if absent).
        return frame.loc[frame["filename"] == filename][column].tolist()[0]

    def _first_match(needle):
        # Return the row data of the first pair whose score contains `needle`, else None.
        for wa, value in dict.items():
            for i in range(3):
                try:
                    if needle not in str(value.get("Score")[i]):
                        continue
                    stan_name = value.get("Standaard")[i]
                    # All lookups finish before anything is stored, so a failed
                    # lookup can no longer leave a half-written row behind.
                    return {
                        "WaContent": _lookup(wa_df, wa, "content"),
                        "WaTitle": _lookup(wa_df, wa, "title"),
                        "StanContent": _lookup(stan_df, stan_name, "content"),
                        "StanTitle": _lookup(stan_df, stan_name, "title"),
                    }
                except IndexError:
                    # Fewer than three candidates, or an article missing from
                    # its frame: move on to the next Wablieft entry.
                    break
        return None

    rows = []
    for score in scoreList:
        needle = str(score)
        match = _first_match(needle)
        if match is None:
            continue
        match["Score"] = float(needle)
        match["Match"] = scoreRated
        rows.append(match)

    columns = ["WaTitle", "WaContent", "StanTitle", "StanContent", "Score", "Match"]
    return pd.DataFrame(rows, columns=columns)

In [116]:
# Build the labelled frames from the similarity dictionary loaded above.
# The dictionary is passed explicitly: `dict` is a required argument of
# createTrainingSet, so omitting it raises a TypeError.
matchDf = createTrainingSet(simDict, scoreList=matchesList, scoreRated=1)
noMatchDf = createTrainingSet(simDict, scoreList=noMatchesList, scoreRated=0)
# DataFrame.append was removed in pandas 2.0; pd.concat stacks the frames the
# same way and keeps each frame's own row index.
evaluationDf = pd.concat([matchDf, noMatchDf])

In [119]:
# Display the combined labelled frame (matches followed by non-matches).
evaluationDf

Unnamed: 0,WaTitle,WaContent,StanTitle,StanContent,Score,Match
0,Minder kinderen geboren,Minder kinderen geboren In Vlaamse ziekenhuize...,66.822,Geboortes 66.822 Het afgelopen jaar werden iet...,0.769700,1
1,Vlaams medicijn tegen tbc,Vlaams medicijn tegen tbc Onderzoekers van het...,Vlaams medicijn brengt hoop voor stervenden me...,Ook Artsen zonder Grenzen vraagt al lang om he...,0.721415,1
2,Vlaams medicijn tegen tbc,Vlaams medicijn tegen tbc Onderzoekers van het...,No title,Vlaams medicijn brengt hoop in strijd tegen tbc,0.713880,1
3,Directrice verdacht van spieken,Directrice verdacht van spieken De directrice ...,Directrice laat eigen kinderen examenvragen in...,Directrice laat eigen kinderen examenvragen in...,0.788607,1
4,Noémie is Miss België,Noémie is Miss België Noémie Happart is de moo...,Noémie Happart Miss België 2013,Noémie Happart Miss België 2013 Noémie Happart...,0.836498,1
...,...,...,...,...,...,...
131,Niet te scheiden,Niet te scheiden De Amerikaanse honden Hoshi e...,No title,De kleine parade Selena Gomez stopt met optred...,0.444108,0
132,Hoog en hoger,Hoog en hoger Meer dan 12 euro voor een ritje ...,No title,De kleine parade Het huis van Freddie Als hij ...,0.488289,0
133,Ruim het op!,Ruim het op! Vorige week zat ik in de klas van...,Warenhuisketen Colruyt bindt strijd aan met zw...,' Wanneer we de strijd willen aanbinden met zw...,0.576931,0
134,China ontruimt kloosters,China ontruimt kloosters De overheid in China ...,Ook slachthuizen slachten liever niet onverdoofd,' De ene imam is nog strenger dan de andere en...,0.528937,0


In [75]:
# Cache the manually rated frame and read it straight back.
# Backslash doubled: '\m' is not a valid escape sequence.
with open('dataframes\\manualRatedDf.pickle', 'wb') as handle:
    pickle.dump(evaluationDf, handle, protocol=pickle.HIGHEST_PROTOCOL)

with open('dataframes\\manualRatedDf.pickle', 'rb') as handle:
    evaluationDf = pickle.load(handle)

### Testing

In [54]:
# Number of Wablieft article texts.
len(wa_content)

8744

In [7]:
# Show the text of one Wablieft article, selected by filename.
wa_df.loc[wa_df["filename"] == 'wa1110bik1.txt', "content"].tolist()[0]

'Kies mee een naam voor babypanda In dierenpark Pairi Daiza wordt de babypanda drie maanden oud. Dan krijgt hij zijn echte naam. Nu heet hij enkel P. Een panda is een Chinees dier. Dus krijgt P een Chinese naam. Mensen kunnen mee kiezen uit vijf namen. Dat zijn Tian Bao (Schat van de Hemel), Xing Hao (Goede Ster), Ou Xing (Ster van Europa), Hua Li (China en België) en An Tuan (Verenigde Vrede). De pandaploeg van Pairi Daiza kiest uit de beste drie namen de echte naam. Stem op je favoriet op  www.pairidaiza.eu.  '

In [8]:
# Show the text of one Standaard article, selected by filename.
stan_df.loc[stan_df["filename"] == '08857008-7367-11e6-ba51-194230acc9a8_DB.000001.xml', "content"].tolist()[0]

"NATUUR Panda's niet meer met uitsterven bedreigd Het zal niet alleen aan het Belgische geboortecijfer liggen , maar op de jongste ' Rode Lijst ' is de reuzenpanda opgeschoven van ' bedreigd ' naar ' kwetsbaar '. Met de gorilla gaat het dan weer van kwaad naar erger . Van onze redacteur Pieter Van Dooren Met hun eenzijdige dieet van ' bamboe en niks dan bamboe ' hadden de reuzenpanda's zich in een kwetsbare situatie gemanoeuvreerd . De Chinezen proberen al decennia om hun aantallen op te voeren met intensieve kweek\xadprogramma's , maar de panda's werkten niet echt mee . Ook al hebben Xing Hui en Hao Hao in juni in Pairi Daiza hun plicht gedaan en de wereld verrijkt met \xadBaby P , die een week geleden zijn eerste stapjes zette . Belangrijker was de Chinese bescherming van de bamboebossen - alleen in die bossen komt de \xadreuzenpanda nog in het wild voor . Het jongste decennium gingen de aantallen met 17 procent vooruit , zegt het Wereldnatuurfonds ( dat een panda in zijn logo voert 

In [1]:
# Embedding of one Wablieft article, selected by filename.
# Requires the dataframe cells above to have been run in this kernel.
embedding = wa_df.loc[wa_df["filename"] == 'wa935bui2.txt', "embeddings"].tolist()[0]

NameError: name 'wa_df' is not defined

In [207]:
# Cosine similarity between the selected embedding and every Standaard embedding
# (no date window here), kept in dataframe row order.
emb_test = [util.cos_sim(embedding, stan_emb) for stan_emb in stan_df.embeddings.tolist()]

In [212]:
# Similarities ordered from highest to lowest.
sortedemb = sorted(emb_test, reverse=True)

In [215]:
# Row positions (first occurrence in emb_test) of the ten highest similarities.
indexlist = [emb_test.index(best) for best in sortedemb[:10]]

In [19]:
# Show the text of one Wablieft article, selected by filename.
wa_df.loc[wa_df["filename"] == 'wa973tip2.txt', "content"].tolist()[0]

"Heb jij ze wel alle vijf? De tijd van armoede in ons land is niet voorbij. Vandaag kennen heel wat mensen nog altijd armoede en uitsluiting. Wat betekent dat voor jongeren? Educatief Theater Antwerpen maakte er een toneelstuk over.   Schrijvers Marc Hendrickx en Dirk Dobbeleers duiken in de wereld van jongeren. School, tv, talentenjachten, liefde\x85  zijn belangrijk in die wereld. Hoofdrollen zijn er voor de meisjes Chelsea, Phaedra en Yasim. Maken ze hun dromen waar? Gezin en geld spelen daarbij een grote rol.  Educatief Theater Antwerpen toont 'Heb je ze wel alle vijf?' dit jaar nog 12 keer in Vlaanderen. Nadien krijgt het stuk een kans in scholen en verenigingen. Die kunnen het stuk aankopen voor een zaal tot 200 kijkers. Meer info op 03 226 42 00.  Praktisch Het toneelstuk 'Heb je ze wel alle vijf?' kan je tot 25 november gaan bekijken in Leuven, Sint-Truiden, Roeselare, Gent, Heist-op-den-Berg, Halle, Tongeren, Mechelen, Kortrijk, Hasselt, Deurne en Tienen. Meer info op www.etap

In [18]:
# Show the text of one Standaard article, selected by filename.
stan_df.loc[stan_df["filename"] == 'De_Standaard-2013-10-15(6).000001#3n.xml', 'content'].tolist()[0]

"MUZIEKExtra show Like Mike & Dimi VegasHet eerste optreden van de dj's Like Mike & Dimitri Vegas in het Sportpaleis is uitverkocht , dus komt er een tweede show op 20 december . De kaartverkoop start op woensdag 23 oktober om 10 uur . De broers uit Willebroek dragen de show op aan hun overleden overgrootvader , René Mampaey , die mee aan de wieg van het Sportpaleis stond . De Facebook-pagina van de twee heeft ondertussen de kaap van twee miljoen fans gerond. ( bpr ) JeugdliteratuurBelgen kandidaat voor LindgrenprijsBij de 238 genomineerden voor de Astrid Lindgren Memorial Award 2014 zijn vijf Belgen : illustratoren Carll Cneut en Klaas Verplancke en schrijvers Bart Moeyaert , Anne Herbauts en Thomas Lavachery . Ook de Waalse literatuurprijs Prix Bernard Versele maakt kans . De Zweedse award beloont schrijvers en illustratoren die ' in de geest van Astrid Lindgren werken '. De winnaar strijkt een bedrag van 570.000 euro op. ( belga ) Film'Het vonnis ' start ijzersterkHet vonnis van Jan

In [96]:
# Article texts of the manually rated pairs, as parallel lists in row order.
waContent_list = evaluationDf["WaContent"].tolist()
stanContent_list = evaluationDf["StanContent"].tolist()

In [97]:
# For every rated pair, recover both filenames from the article text and fetch
# the score stored for that pair in simDict_Wastan (0 when the Standaard article
# is not among the stored candidates).
# NOTE(review): simDict_Wastan is not defined anywhere in the visible notebook;
# presumably the RobBERT similarity dictionary is meant - confirm before running.
waFile_list = []
stanFile_list = []
simscorewastan = []
# The loop variables are deliberately not named wa_content / stan_content:
# reusing those names replaced the notebook-level lists of all article texts
# with a single string.
for wa_text, stan_text in zip(waContent_list, stanContent_list):
    wa_filename = wa_df.loc[wa_df["content"] == wa_text].filename.tolist()[0]
    waFile_list.append(wa_filename)
    stan_filename = stan_df.loc[stan_df["content"] == stan_text].filename.tolist()[0]
    stanFile_list.append(stan_filename)
    try:
        entry = simDict_Wastan[wa_filename]
        simscorewastan.append(entry["Score"][entry['Standaard'].index(stan_filename)])
    except ValueError:
        # list.index found no such Standaard filename for this Wablieft article.
        simscorewastan.append(0)

In [100]:
# Attach the scores collected above as a new column, one per rated pair.
evaluationDf['ScoreRobWaStan'] = simscorewastan

In [102]:
# Cache the rated frame, including the added score column, next to the notebook.
with open('manualRatedDfRobbWaStan.pickle', 'wb') as out_file:
    pickle.dump(evaluationDf, out_file, protocol=pickle.HIGHEST_PROTOCOL)