In [1]:
import warnings
warnings.filterwarnings('ignore')

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from gensim.models.keyedvectors import KeyedVectors
from nltk.stem.snowball import SpanishStemmer
import matplotlib.pyplot as plt
from nltk.corpus import stopwords
from datetime import datetime
import lxml.etree as ET
import seaborn as sns
import pandas as pd
import numpy as np
import regex as re
import itertools
import unidecode
import spacy
import html
import os

In [2]:
'''
INPUT_FILE_PATH
- path containing at least the columns: tweet_id, case_id, checked
if two different tweet_ids have the same case_id, then both articles are about 
the same case checked columns indicates whether that 
    * tweet_id has been explored (1) 
    * has been randomly assigned a case_id (0)
    * has been explored but the answer is not clear (2)
    
NEWS_PATH
- directory where articles are collected in different folders and following the 
NewsML-G2 format (xml)

OUTPUT_PATH
- path where to store a csv file containing pairwise similarity metrics and probability
of being about the same case for all the news in NEWS_PATH
'''

INPUT_FILE_PATH = '../data/cases_labeled.csv'
NEWS_PATH = '../data/news/'
OUTPUT_PATH = '../data/'
FASTTEXT_W2V_PATH = 'utilities/embeddings-l-model.vec'

DEBUG = False

In [3]:
def listdir_checked(path, unwanted = ['.DS_Store']):
    '''
    Discard unwanted files or directories when listing the elements in a given path
    '''
    return (f for f in os.listdir(path) if f not in unwanted)


def normalize_string(to_normalize, encoded = False):
    '''
    Normalize text given a string
    '''
    text = str(to_normalize).lower()  # lowering text
    if encoded: 
        text = ' '.join([html.unescape(term) for term in text.split()])
    text = unidecode.unidecode(text)

    text = re.sub(r'[^\w\s]', '', text)  # removing all the punctuations
    last_text = text.split()  # tokenize the text

    # remove stopwords
    stopwords_set = set(stopwords.words("spanish"))
    last_text = ' '.join([x for x in last_text if (x not in stopwords_set)])
    return last_text


def normalize_text(array_of_str):
    '''
    Normalize arrays of strings
    '''
    final_array = []
    for text in array_of_str:
        normalized = normalize_string(text)
        if normalized != '': final_array.append(normalized)
    return final_array


def add_similarity_column(pairs_df, mapping_keys, similarity_matrix, column_name):
    '''
    Arguments:
     * pairs_df
         - pd.DataFrame
         - contains pairs of tweet_ids --> column names: [tweet_id_A, tweet_id_B]
     * mapping_keys
         - dictionary
         - key: tweet_id, value: position
     * similarity_matrix
         - np.matrix
         - symmetrical matrix with the similarity between tweets
     * column_name
         - string
         - name of the new column to be added in pairs_df
    '''
    similarity_pairs = []               # Create list with the same order as pairs_df
    for i, row in pairs_df.iterrows():
        tid_A = row['tweet_id_A']       # Obtain tweet id
        tid_B = row['tweet_id_B']       # Get the position of each article in the matrix
        pos_A = mapping_keys[tid_A]
        pos_B = mapping_keys[tid_B]
        similarity_pairs.append(similarity_matrix[pos_A, pos_B])             # Order similarity following pairs_df order
    pairs_df.insert(len(pairs_df.columns), column_name, similarity_pairs)   # Add new column
    return pairs_df


def create_articles_dictionary(NEWS_PATH):
    '''
    Import articles information.
    Articles are stored in directories in the NEWS_PATH.
    '''
    data = {}               # keys: media, value: list of dictionaries with info about the news articles of the given media
    unique_urls = []        # list to store unique urls to discard repeated ones
    repeated_data = {}      # store repeated articles following the same format as 'data' dictionary

    for directory in listdir_checked(NEWS_PATH):
        for file in listdir_checked(NEWS_PATH + directory):
            full_path = NEWS_PATH + directory + '/' + file
            # Read xml file - info stored following NewsML-G2 format
            root = ET.parse(full_path).getroot()
            # Parse news
            media = file.rsplit('_', 1)[0]
            # Check repeated urls
            url = root.findall(".//infoSource")[0].get("uri")
            str_date = root.findall('.//contentMeta')[0].find('contentCreated').text[:10]
            info = {
                'id': file.split(':')[-1].replace('.xml', ''),
                'media': media,
                'publication_date': datetime.strptime(str_date, '%Y-%m-%d'),
                'title': normalize_string(root.findall('.//itemRef')[0].find('title').text, encoded = True),
                'headline': normalize_string(root.findall(".//itemRef")[0].find('description').text.strip(), encoded = True),
                'article': normalize_string(root.findall('.//itemRef')[1].find('description').text.strip(), encoded = True),                'url': url
            }

            if url not in unique_urls:
                unique_urls.append(url)
                try:
                    data[media].append(info)
                except:
                    data[media] = [info]

            else:
                try:
                    repeated_data[media].append(info)
                except:
                    repeated_data[media] = [info]
    return data, repeated_data


def classify_entities(doc):
    '''
    Given an nlp doc, returns:
    * A dictonary with the entities grouped by type: 'PER', 'LOC', 'ORG' and 'MISC'
    * A list with all the entities
    '''
    classif_dict = {}
    classif_list = []
    for ent in doc.ents:
        try:
            classif_list.append(ent)
            classif_dict[ent.label_].append(ent)
        except:
            classif_dict[ent.label_] = [ent]
    # Check that the dict has all the keys
    for key in ['PER', 'LOC', 'ORG', 'MISC']:
        if key not in classif_dict.keys():
            classif_dict[key] = None

    return classif_dict, classif_list


def Goodall1_similarity_matrix(entities_lists, DEBUG=False):
    '''
    Computes the similarity matrix between different documents given a list of lists.
    Each list should contain the categorical variables (strings) that represent the document.
    The distance is computed using the Goodall1 measure.

    Returns a NxN matrix with similarities (being N the number of documents == length of entities_lists).
    '''

    # NORMALIZE AND OBTAIN UNIQUE ENTITIES
    normalized_lists = [normalize_text(array) for array in entities_lists]
    unique_entities = list(set([entity for document in normalized_lists for entity in document]))

    N = len(entities_lists)  # number of documents
    d = len(unique_entities)  # number of unique_enitities


    # COMPUTE SQUARED FREQUENCY FOR EACH ENTITTY
    freq_2 = np.zeros(d)  # Squared frequency -> p*p-1 / N*N-1
    for i, entity in enumerate(unique_entities):
        freq = 0
        for document in normalized_lists:
            for element in document:
                freq += element.count(entity)
        freq_2[i] = (freq * (freq - 1)) / (N * (N - 1))

    # COMPUTE SIMILARITY MATRIX
    similarity_matrix = np.zeros(shape=(N, N))
    # Iterate for every pair of documents
    for i in range(0, N):
        similarity_matrix[i, i] = 1     # put 1 in the diagonal
        docA = normalized_lists[i]      # list of normalized entities of document A
        for j in range(i + 1, N):
            docB = normalized_lists[j]  # list of normalized entities of document B
            similarity = 0
            # Itereate over each entity
            for k in range(d):
                entity = unique_entities[k]
                if docA.count(entity) > 0 and docB.count(entity) > 0: # if an entity is in both documents, increase similarity
                    similarity += (1 / d) * (1 - freq_2[k])
            # Fill symmetrical similarity matrix
            similarity_matrix[i, j] = similarity
            similarity_matrix[j, i] = similarity


    return similarity_matrix


def jaccard_coefficient_matrix(entities_lists):
    # NORMALIZE THE LIST OF ENTITIES
    normalized_lists = [normalize_text(array) for array in entities_lists]
    N = len(normalized_lists)

    # CREATE SIMILARITY MATRIX
    similarity_matrix = np.zeros(shape=(N, N))
    # Iterate for every pair of documents
    for i in range(N):
        similarity_matrix[i, i] = 1
        docA = normalized_lists[i]      # list of normalized entities of document A
        for j in range(i + 1, N):
            docB = normalized_lists[j]  # list of normalized entities of document B
            # Compute jaccard similarity
            intersection = len(list(set(docA).intersection(docB)))
            union = (len(docA) + len(docB)) - intersection
            try:
                jaccard = float(intersection / union)
            except: 
                jaccard = 0
            # Fill similarity matrix
            similarity_matrix[i, j] = jaccard
            similarity_matrix[j, i] = jaccard

    return similarity_matrix


def length_difference_matrix(data, mapping_keys):
    N = len(mapping_keys)
    article_length = np.zeros(N)
    # Get length (number of terms) of each article
    for media, new in data.items():
        for element in new:
            try:
                pos = mapping_keys[element['id']]
                article_length[pos] = len(element['article'])
            except:
                pass
    # Compute absolute difference of length for each pair of articles
    length_diff_matrix = np.zeros(shape=(N, N))
    for i in range(N):
        for j in range(i + 1, N):
            diff = abs(article_length[i] - article_length[j])
            length_diff_matrix[i, j] = diff
            length_diff_matrix[j, i] = diff

    return length_diff_matrix


def cosine_similarity_matrix(data, mapping_keys):
    # GET NEWS ARTICLES
    N = len(mapping_keys)
    articles = [None] * N
    for media, new in data.items():
        for element in new:
            try:
                pos = mapping_keys[element['id']]
                articles[pos] = normalize_string(element['title'] + ' ' + 
                                                 element['headline'] + ' ' + 
                                                 element['article'])
            except:
                pass

    # COMPUTE TF-IDF
    # Stem
    stemmer = SpanishStemmer(ignore_stopwords=False)
    for i, article in enumerate(articles):
        articles[i] = str([stemmer.stem(word) for word in article.split()])
    # Compute tf-idf
    stopwords_spanish = [word.encode().decode('utf-8') for word in stopwords.words('spanish')] # Remove stopwords
    vectorizer = TfidfVectorizer(stop_words=stopwords_spanish)
    X = vectorizer.fit_transform(articles)

    # COMPUTE COSINE SIMILARITY MATRIX
    cosine_sim_matrix = np.zeros(shape=(N, N))
    for i in range(N):
        cosine_sim_matrix[i,i] = 1
        for j in range(i + 1, N):
            similarity = cosine_similarity(X[i], X[j])[0][0]
            cosine_sim_matrix[i, j] = similarity
            cosine_sim_matrix[j, i] = similarity

    return cosine_sim_matrix


def publication_difference_matrix(data, mapping_keys):
    N = len(mapping_keys)
    publication_dates = [None] * N
    # Get publication dates
    for media, new in data.items():
        for element in new:
            try:
                pos = mapping_keys[element['id']]
                publication_dates[pos] = element['publication_date']
            except:
                pass
    # Compute difference matrix
    dates_diff_matrix = np.zeros(shape=(N, N))
    for i in range(N):
        for j in range(i + 1, N):
            diff = abs((publication_dates[i] - publication_dates[j]).days)
            dates_diff_matrix[i, j] = diff
            dates_diff_matrix[j, i] = diff

    return dates_diff_matrix

def cosine_similarity_BERT(data, mapping_keys, key='title'):                          
    N = len(mapping_keys)
    articles = [None] * N
    for media, new in data.items():
        for element in new:
            try:
                pos = mapping_keys[element['id']]
                #title = normalize_string(element['title'])
                #description = normalize_string(element['headline'])
                articles[pos] = normalize_string(element[key])
            except:
                pass
   
    BERT_model = SentenceTransformer('dccuchile/bert-base-spanish-wwm-uncased')
    encoded = BERT_model.encode(articles)
    cosine_sim_matrix = np.zeros(shape=(N, N))

    for i in range(N):
        cosine_sim_matrix[i,i] = 1
        for j in range(i + 1, N):
    
            similarity = cosine_similarity(encoded[i].reshape(1,-1), encoded[j].reshape(1,-1))[0][0]
            cosine_sim_matrix[i, j] = similarity
            cosine_sim_matrix[j, i] = similarity
                          
    return cosine_sim_matrix 

def wmdistance(data, mapping_keys):
    N = len(mapping_keys)
    articles = [None] * N
    
    for media, new in data.items():
        for element in new:
            try:
                pos = mapping_keys[element['id']]
                #title = normalize_string(element['title'])
                #description = normalize_string(element['headline'])
                articles[pos] = normalize_string(element['title'])+' '+normalize_string(element['headline'].split('.')[0])
            except:
                pass
    
    wordvectors_fasttext_file = FASTTEXT_W2V_PATH
    wordvectors = KeyedVectors.load_word2vec_format(wordvectors_fasttext_file)
    wmd_matrix = np.zeros(shape=(N,N))
    
    for i in range(N):
        for j in range(i + 1, N):
            distance = wordvectors.wmdistance(articles[i], articles[j])
            wmd_matrix[i,j] = distance
            wmd_matrix[j,i] = distance
            
    return wmd_matrix
    

def train_log_reg(df, id_columns, target_column):
    # split training / test data
    df = df.drop(columns=id_columns)
    training_data, testing_data = train_test_split(df, random_state=2000, test_size=0.1)
    # get labels
    Y_train = training_data[target_column].values
    Y_test = testing_data[target_column].values

    X_train = training_data.drop(columns=[target_column])
    X_test = testing_data.drop(columns=[target_column])

    # logistic regression classifier
    scikit_log_reg = LogisticRegression(verbose=1, solver='liblinear', C=5, penalty='l2', max_iter=1000)

    model = scikit_log_reg.fit(X_train, Y_train.astype(int))
    score = model.score(X_test, Y_test.astype(int))
    print("Testing accuracy of LogReg = ", score)
    return (model)

In [4]:
def create_features(data, cases_df, pairs_df):
    # EXTRACT ENTITIES
    '''
    Entities will be stored two arrays:
    * entities - all entities
    * summary_entities - entities from the title and head (summary)
    Auxiliary variables:
    * mapping_keys - dict with key: tweet id -> value: absolute position in entities and summary_entities
    '''
    mapping_keys = {}  # key: tweet id -> value: absolute position in all_entities
    counter = 0

    summary_entities = []
    article_entities = []

    nlp = spacy.load("es_core_news_sm")

    for media, new in data.items():
        for element in new:
            mapping_keys[element['id']] = counter
            # Get entities from each part of the new
            try:
                title_doc = nlp(element['title'])
                headline_doc = nlp(element['headline'])
                article_doc = nlp(element['article'])
            except:
                if DEBUG:
                    print(f"Problem extracting entities from article identified with tweet_id = {element['id']}")
                pass
                          
            # Classify entities
            title_dict, title_list = classify_entities(title_doc)
            headline_dict, headline_list = classify_entities(headline_doc)
            article_dict, article_list = classify_entities(article_doc)
            
            # Add entities into data
            element['title_entities'] = title_dict
            element['headline_entities'] = headline_dict
            element['article_entities'] = article_dict
            
            # Store into entities and summary_entities array
            summary_entities.append(title_list + headline_list)
            article_entities.append(article_list)
            
            counter += 1
    
         
    # COMPUTE GOODALL1 SIMILARITY BETWEEN ENTITIES IN THE SUMMARY (title + summary) OF EACH PAIR OF ARTICLE
    goodall1_matrix = Goodall1_similarity_matrix(summary_entities)
    pairs_df = add_similarity_column(pairs_df, mapping_keys, goodall1_matrix, 'goodall1_summary')

    # COMPUTE GOODALL1 SIMILARITY BETWEEN ENTITIES IN THE ARTICLE  OF EACH PAIR OF ARTICLE
    goodall1_matrix = Goodall1_similarity_matrix(article_entities)
    pairs_df = add_similarity_column(pairs_df, mapping_keys, goodall1_matrix, 'goodall1_article')

                          
    # COMPUTE JACCARD COEFFICIENT BETWEEN SUMMARY ENTITIES OF EACH PAIR OF ARTICLES
    jaccard_matrix = jaccard_coefficient_matrix(summary_entities)
    pairs_df = add_similarity_column(pairs_df, mapping_keys, jaccard_matrix, 'jaccard_summary')
                                     
    # COMPUTE JACCARD COEFFICIENT BETWEEN ALL ENTITIES OF EACH PAIR OF ARTICLES
    jaccard_matrix = jaccard_coefficient_matrix(article_entities)
    pairs_df = add_similarity_column(pairs_df, mapping_keys, jaccard_matrix, 'jaccard_article')

                          
    # COMPUTE COSINE SIMILARITY OF TF-IDF FOR EACH PAIR OF ARTICLES
    cosine_matrix = cosine_similarity_matrix(data, mapping_keys) 
    pairs_df = add_similarity_column(pairs_df, mapping_keys, cosine_matrix, 'tf-idf')
     
                          
    # COMPUTE COSINE SIMILARITY OF BERT WORD EMBEDDING
    BERT_similarity_t = cosine_similarity_BERT(data, mapping_keys)
    pairs_df = add_similarity_column(pairs_df, mapping_keys, BERT_similarity_t, 'BETO_title')

                          
    # COMPUTE WORD MOVER'S DISTANCE OF FASTTEXT EMBEDDINGS OF TITLES
    wmd_matrix = wmdistance(data, mapping_keys)
    pairs_df = add_similarity_column(pairs_df, mapping_keys, wmd_matrix, 'word_movers_dist')
          
                          
    # COMPUTE ABSOLUTE DIFFERENCE PUBLICATION DATES IN DAYS
    publication_matrix = publication_difference_matrix(data, mapping_keys)
    pairs_df = add_similarity_column(pairs_df, mapping_keys, publication_matrix, 'publication_diff')
    
    # COMPUTE ABSOLUTE DIFFERENCE OF ARTICLES LENGTH BETWEEN EACH PAIR OF ARTICLES
    length_matrix = length_difference_matrix(data, mapping_keys)
    pairs_df = add_similarity_column(pairs_df, mapping_keys, length_matrix, 'length_diff')

    return pairs_df
                          

# Import data and create pairwise features

In [5]:
# IMPORT ARTICLES INFORMATION
data, repeated_data = create_articles_dictionary(NEWS_PATH)

if DEBUG:
    for media, news in repeated_data.items():
        print(f'''* Media {media} has {len(news)} repeated elements.''')

# IMPORT LABELS FROM CSV FILE
'''
Csv file containing information about each article using their corresponding tweet_id as an identifier.
Columns = [tweet_id, media, checked, case_id, title, headline, url]
* tweet_id - identifier
* media
* checked - whether the tweet_id has been supervised - values: 0/1
* case_id - an integer id attributed to each article, articles about the same case have the same case_id
* title
* headline
* url 
'''
cases_df = pd.read_csv(INPUT_FILE_PATH, sep=';')
cases_df['tweet_id'] = cases_df['tweet_id'].apply(lambda x: str(x))

# Get tweet ids of tweets not supervised
if 'checked' in cases_df.keys():
    checked_tweet_id = cases_df[cases_df['checked'] == 1]['tweet_id'].values

# Remove repeated articles
repeated_articles_tid = []
for media, news in repeated_data.items():
    for element in news:
        repeated_articles_tid.append(element['id'])
cases_df = cases_df[~cases_df['tweet_id'].isin(repeated_articles_tid)]

# CREATE ALL POSSIBLE PAIRS OF ARTICLES + ADD TARGET VARIABLE (same_case)
pairs_df = pd.DataFrame(columns=['tweet_id_A', 'tweet_id_B', 'same_case'])
for pair in list(itertools.combinations(cases_df.tweet_id, 2)):
    case_1 = int(cases_df[cases_df['tweet_id'] == pair[0]]['case_id'])
    case_2 = int(cases_df[cases_df['tweet_id'] == pair[1]]['case_id'])

    if case_1 == case_2:
        same_case = 1
    else:
        same_case = 0

    pairs_df = pairs_df.append({'tweet_id_A': pair[0], 'tweet_id_B': pair[1], 'same_case': same_case}, ignore_index=True)

if DEBUG:
    print(f"{(pairs_df['same_case'] == 1).sum() / len(pairs_df) * 100}% of positive cases")
    print(f"{(pairs_df['same_case'] == 1).sum()} / {len(pairs_df)}: same cases / total # pairs")
    print(f"{len(cases_df)} total number of tweets")

In [6]:
# CREATE FEATURES
pairs_df = create_features(data, cases_df, pairs_df)

In [7]:
pairs_df

Unnamed: 0,tweet_id_A,tweet_id_B,same_case,goodall1_summary,goodall1_article,jaccard_summary,jaccard_article,tf-idf,BETO_title,word_movers_dist,publication_diff,length_diff
0,1287463951375179776,1288437861973471232,0,0.0,0.0,0.0,0.0,0.039036,0.740956,0.826905,4.0,2570.0
1,1287463951375179776,1288541048520744962,0,0.0,0.0,0.0,0.0,0.113098,0.740605,0.930301,3.0,428.0
2,1287463951375179776,1288632903275106304,0,0.0,0.0,0.0,0.0,0.017701,0.605755,0.803036,3.0,251.0
3,1287463951375179776,1288919794729836544,0,0.0,0.0,0.0,0.0,0.036161,0.694141,0.842197,4.0,887.0
4,1287463951375179776,1288764366863708161,0,0.0,0.0,0.0,0.0,0.030533,0.711093,0.885106,4.0,920.0
...,...,...,...,...,...,...,...,...,...,...,...,...
122755,1303099478501666818,1303718556308303878,0,0.0,0.0,0.0,0.0,0.060689,0.709489,0.860457,2.0,451.0
122756,1303099478501666818,1303696679800012801,0,0.0,0.0,0.0,0.0,0.051923,0.718233,0.781999,2.0,296.0
122757,1304387150507651072,1303718556308303878,0,0.0,0.0,0.0,0.0,0.041908,0.684583,0.788712,2.0,28.0
122758,1304387150507651072,1303696679800012801,0,0.0,0.0,0.0,0.0,0.121317,0.687701,0.806449,2.0,127.0


# Train a logistic regression model 
GOAL: Identify pairs of news articles about the same case 

In [8]:
# TRAIN A LOGISTIC REGRESSION CLASSIFIER
# Filter only checked cases for training
supervised_pairs_df = pairs_df[pairs_df['same_case'] != 2]  # 2 has been used for ambiguous cases
supervised_pairs_df = supervised_pairs_df[supervised_pairs_df['tweet_id_A'].isin(checked_tweet_id)]
supervised_pairs_df = supervised_pairs_df[supervised_pairs_df['tweet_id_B'].isin(checked_tweet_id)]

# Train the model
log_reg_model = train_log_reg(supervised_pairs_df, id_columns=['tweet_id_A', 'tweet_id_B'], target_column='same_case')

# PREDICT FOR ALL PAIRS OF NEWS
prediction = log_reg_model.predict(pairs_df.drop(columns=['tweet_id_A', 'tweet_id_B', 'same_case']))
proba_prediction = log_reg_model.predict_proba(pairs_df.drop(columns=['tweet_id_A', 'tweet_id_B', 'same_case']))[:,1]
pairs_df.insert(len(pairs_df.columns), 'same_case_pred', prediction)
pairs_df.insert(len(pairs_df.columns), 'same_case_pred_proba', proba_prediction)

[LibLinear]Testing accuracy of LogReg =  0.9983356449375866


In [9]:
pairs_df

Unnamed: 0,tweet_id_A,tweet_id_B,same_case,goodall1_summary,goodall1_article,jaccard_summary,jaccard_article,tf-idf,BETO_title,word_movers_dist,publication_diff,length_diff,same_case_pred,same_case_pred_proba
0,1287463951375179776,1288437861973471232,0,0.0,0.0,0.0,0.0,0.039036,0.740956,0.826905,4.0,2570.0,0,0.001117
1,1287463951375179776,1288541048520744962,0,0.0,0.0,0.0,0.0,0.113098,0.740605,0.930301,3.0,428.0,0,0.003478
2,1287463951375179776,1288632903275106304,0,0.0,0.0,0.0,0.0,0.017701,0.605755,0.803036,3.0,251.0,0,0.000501
3,1287463951375179776,1288919794729836544,0,0.0,0.0,0.0,0.0,0.036161,0.694141,0.842197,4.0,887.0,0,0.000820
4,1287463951375179776,1288764366863708161,0,0.0,0.0,0.0,0.0,0.030533,0.711093,0.885106,4.0,920.0,0,0.000705
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
122755,1303099478501666818,1303718556308303878,0,0.0,0.0,0.0,0.0,0.060689,0.709489,0.860457,2.0,451.0,0,0.001279
122756,1303099478501666818,1303696679800012801,0,0.0,0.0,0.0,0.0,0.051923,0.718233,0.781999,2.0,296.0,0,0.001188
122757,1304387150507651072,1303718556308303878,0,0.0,0.0,0.0,0.0,0.041908,0.684583,0.788712,2.0,28.0,0,0.000897
122758,1304387150507651072,1303696679800012801,0,0.0,0.0,0.0,0.0,0.121317,0.687701,0.806449,2.0,127.0,0,0.004441


In [10]:
pairs_df.to_csv(f'{OUTPUT_DATA}/cases_pariwise_proba.csv', index=False)