In [8]:
import os
import sys
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn import tree
from sklearn.svm import OneClassSVM
from sklearn.metrics import f1_score

In [6]:
# Root folder of the dataset (expected to contain the 'results/' folder).
# Set the DATASET_SOURCE environment variable to run the notebook on another
# machine; the original local path is kept as the default.
dataset_source = os.environ.get('DATASET_SOURCE') or 'D:/Pablo/clases/UJM/2. Semester, 2021/Mining Uncertain Social Networks/Repository/Experiments/datasets/dancer_01/'
# Guarantee a trailing separator so that `dataset_source + 'sub/path'` style
# concatenation always yields a valid path.
if not dataset_source.endswith(('/', '\\')):
    dataset_source += '/'

In [9]:
# For every result file under <dataset_source>/results/ (except 'base.txt'):
#   1. train a decision tree on the two likelihood columns to predict
#      'edge_exists_modified',
#   2. score it with F1 against both 'edge_exists_modified' and
#      'edge_exists_original',
#   3. export the edge list made of the modified-graph edges plus the edges
#      predicted by the tree.
# The per-file scores are collected in `threshold_results` and saved as CSV.
FEATURE_COLUMNS = ['asim_likelihood', 'seal_likelihood']
RANDOM_SEED = 4269

results_dir = os.path.join(dataset_source, 'results')
add_edges_dir = os.path.join(dataset_source, 'resulting_graphs', 'add_edges')
# DataFrame.to_csv does not create missing directories, so make sure the
# output folder exists before the first write.
os.makedirs(add_edges_dir, exist_ok=True)

results_rows = []
# os.listdir returns entries in arbitrary order; sort for a stable row order.
for file_name in sorted(os.listdir(results_dir)):
    if file_name == 'base.txt':
        continue
    dataset = pd.read_csv(os.path.join(results_dir, file_name))
    dataset['seal_likelihood'] = dataset['seal_likelihood'].fillna(0)

    # Split into train/test, stratified on the training target
    X_train, X_test, E_train, E_test, Y_train, Y_test = train_test_split(
        dataset[FEATURE_COLUMNS],
        dataset['edge_exists_original'],
        dataset['edge_exists_modified'],
        test_size=0.3,
        random_state=RANDOM_SEED,
        stratify=dataset['edge_exists_modified'],
    )

    # Train a decision tree with both likelihood functions.
    # random_state is fixed because the tree permutes features at each split,
    # which otherwise makes repeated runs non-reproducible.
    clf = tree.DecisionTreeClassifier(random_state=RANDOM_SEED)
    clf = clf.fit(X_train, Y_train)

    # Predict once per split and reuse the predictions for every score.
    train_pred = clf.predict(X_train).astype('float64')
    test_pred = clf.predict(X_test).astype('float64')

    # f1_score expects (y_true, y_pred). Binary F1 is symmetric in its two
    # arguments, so the values match the swapped call used previously.
    train_f1_score = f1_score(Y_train, train_pred)
    test_f1_score = f1_score(Y_test, test_pred)
    # Comparing to the original graph (just the test ones)
    exists_test_f1_score = f1_score(E_test, test_pred)
    # Comparing to the original graph (all rows: train followed by test)
    exists_all_f1_score = f1_score(np.concatenate((E_train, E_test)),
                                   np.concatenate((train_pred, test_pred)))

    # To save the results of all the training
    results_rows.append({
        'file_name': file_name,
        'train_f1_score': train_f1_score,
        'test_f1_score': test_f1_score,
        'exists_f1_score': exists_test_f1_score,
        'exists_all_f1_score': exists_all_f1_score,
    })

    dataset['tree_result'] = clf.predict(dataset[FEATURE_COLUMNS])

    # Creating graph only adding new edges, with weight 1 for all.
    # 'base.txt' never reaches this point (it is skipped at the top of the
    # loop), so no separate base-graph export is needed here.
    selected_edges = dataset[(dataset['edge_exists_modified'] == 1) | (dataset['tree_result'] > 0.5)].copy()
    selected_edges[['from', 'to']].to_csv(os.path.join(add_edges_dir, file_name),
                                          sep=' ', header=False, index=False)

threshold_results = pd.DataFrame(results_rows)
threshold_results.to_csv(os.path.join(dataset_source, 'thresholds_scores.csv'), index=False)


In [10]:
# Display the per-file F1 score table assembled in the previous cell.
threshold_results

Unnamed: 0,file_name,train_f1_score,test_f1_score,exists_f1_score,exists_all_f1_score
0,bet_asc_01.txt,0.999583,0.147073,0.154711,0.706384
1,bet_asc_02.txt,0.999765,0.129421,0.136128,0.658177
2,bet_asc_03.txt,0.96285,0.11585,0.114241,0.586631
3,bet_asc_04.txt,0.999843,0.152,0.154659,0.579374
4,bet_asc_05.txt,0.990313,0.160786,0.140372,0.515678
5,bet_asc_06.txt,0.999765,0.181124,0.141261,0.46676
6,bet_asc_07.txt,0.999052,0.268613,0.176218,0.415507
7,bet_asc_08.txt,0.999513,0.47973,0.234032,0.371173
8,bet_asc_09.txt,0.998875,0.836066,0.29552,0.341505
9,bet_desc_01.txt,0.976183,0.135069,0.135066,0.686049
