In [1]:
from network import CNetwork

In [2]:
import pandas as pd
import numpy as np

In [3]:
import os

In [4]:
name_dataset = "dataset_1"
dataset_path = "datasetsv2/"
length_cut = 10000
random_flag = True
remove_punctuation = True
lemmatization_flag = True
dimension = 8112
feature_selection = 'common_words'

In [5]:
df = pd.read_csv(dataset_path + name_dataset + ".csv")

In [6]:
df.head(5)

Unnamed: 0,label,text,book
0,Pelham Grenville,"\n\n\n\n\n\n\n\nProduced by Christine Gehring,...",Right Ho Jeeves
1,Pelham Grenville,"\n\n\n\n\n\n\n\nProduced by Suzanne L. Shell, ...",Tales Of St Austin
2,Joseph Conrad,And that last\n\nword was the single word of ...,Victory
3,Joseph Conrad,\n\n\n\nIf I have ever had these gifts in any ...,Under Western Eyes
4,Bram Stoker,\n\n\n\n Lond...,The Lady Of The Shroud


In [7]:
from utils.text_processing import get_min_len_corpus

In [8]:
print("Min Length:", get_min_len_corpus(list(df["text"])))

Min Length: 55024


In [9]:
from utils.text_processing import get_corpus, get_random_corpus

In [10]:
texts = list(df['text'])

In [11]:
corpus, segmented_corpus = get_corpus(texts, length_cut)

In [12]:
selected_corpus, words_features, word_index, index_word = get_random_corpus(segmented_corpus, remove_punctuation=remove_punctuation, lemmatization_flag=lemmatization_flag, feature_selection = feature_selection)

In [17]:
len(selected_corpus)

78

In [18]:
labels = list(df['label'])

In [19]:
total_classes = list(set(labels))  ## or author
print("Total classes: {}".format(len(total_classes)))
number_books = (df[df['label'] == total_classes[0]]).shape[0]
print("Total entities for each class in train: {}".format(number_books))
dict_categories = {cat: index for index, cat in enumerate(total_classes)}

Total classes: 13
Total entities for each class in train: 6


In [20]:
y = [dict_categories[y] for y in labels]

In [21]:
total_classes

['Daniel Defoe',
 'Charles Darwin',
 'Charles Dickens',
 'Mark Twain',
 'Bram Stoker',
 'Arthur Conan Doyle',
 'George Eliot',
 'Jane Austen',
 'Hector Hugh',
 'Thomas Hardy',
 'Pelham Grenville',
 'Allan Poe',
 'Joseph Conrad']

In [22]:
import json
def graph2vec(operating_system, networks):
    input_path = "graph2vec/dataset/"+str(length_cut)+"_"+str(dimension)+"/"
    output_path = 'graph2vec/features/'
    try: 
        os.mkdir(input_path)
        os.mkdir(output_path)
    except:
        print("Existe")
    for i, netw in enumerate(networks):
        with open((input_path+str(i)+".json"), "w") as f:
            edges = []
            v_names = netw.vs["name"]
            for edge in netw.get_edgelist():
                (u,v) = edge
                edges.append([int(v_names[u]), int(v_names[v])])
            json.dump({"edges":edges}, f)
    if operating_system == 'linux':
        #path_command = "python struc2vec/src/main.py --dimensions "+str(dimension)+" --input "+input_path+str(length_cut)+"_"+str(dimension)+"_"+str(i)+".edgelist --output "+output_path+str(length_cut)+"_"+str(dimension)+"_"+str(i)+".emb"
        path_command = 'python graph2vec/src/graph2vec.py --input-path ' + input_path + ' --output-path ' + output_path+str(length_cut)+'_'+str(dimension)+'.csv' + ' --dimensions ' + str(dimension)
        os.system(path_command)
        print("graph2vec", str(i))

In [23]:
def read_graph2vec(output_path, networks):
    all_network_features = [[] for _ in range(len(networks))]
    with open(output_path+str(length_cut)+"_"+str(dimension)+".csv", "r") as f:
        lines = f.readlines()
        for l in lines[1:]:
            row = l.strip().split(",")
            id_graph = row[0]
            print("id graph", id_graph)
            network_features = np.array([float(v) for v in row[1:]])
            print(str(id_graph), "len",len(network_features))
            all_network_features[int(id_graph)] = network_features
    return np.array(all_network_features)

In [24]:
def get_graph2vec_features(sequences, index_word):
    nets = []
    for text in sequences:
        obj = CNetwork(text, model=None, index_word=index_word, percentages=None, path="")
        network = obj.create_network()
        nets.append(network)
    graph2vec("linux", nets)
    X = read_graph2vec('graph2vec/features/', nets)
    return X

In [25]:
X = get_graph2vec_features(selected_corpus, index_word)

Nodes: 2064 - Edges: 6189
Nodes: 2040 - Edges: 6320
Nodes: 2216 - Edges: 6691
Nodes: 2111 - Edges: 6629
Nodes: 1931 - Edges: 6369
Nodes: 1927 - Edges: 6172
Nodes: 2016 - Edges: 6272
Nodes: 2112 - Edges: 6550
Nodes: 2085 - Edges: 6560
Nodes: 2311 - Edges: 6665
Nodes: 2212 - Edges: 6696
Nodes: 1613 - Edges: 5907
Nodes: 2306 - Edges: 6798
Nodes: 1787 - Edges: 6186
Nodes: 2438 - Edges: 6571
Nodes: 1338 - Edges: 5270
Nodes: 2281 - Edges: 6690
Nodes: 2322 - Edges: 6670
Nodes: 1947 - Edges: 6165
Nodes: 2034 - Edges: 6560
Nodes: 2855 - Edges: 6961
Nodes: 1950 - Edges: 6479
Nodes: 1997 - Edges: 6491
Nodes: 2140 - Edges: 6564
Nodes: 1474 - Edges: 4773
Nodes: 2153 - Edges: 6607
Nodes: 2518 - Edges: 6862
Nodes: 1893 - Edges: 6161
Nodes: 2103 - Edges: 6586
Nodes: 2075 - Edges: 6117
Nodes: 1886 - Edges: 6118
Nodes: 2737 - Edges: 7043
Nodes: 2026 - Edges: 5839
Nodes: 1936 - Edges: 6390
Nodes: 1458 - Edges: 5489
Nodes: 1664 - Edges: 6035
Nodes: 1948 - Edges: 6200
Nodes: 2400 - Edges: 6785
Nodes: 2689 

  0%|          | 0/78 [00:00<?, ?it/s]


Feature extraction started.



100%|██████████| 78/78 [00:00<00:00, 78.05it/s]



Optimization started.

GRAPHHHHH Graph with 2016 nodes and 6272 edges
GRAPHHHHH Graph with 1613 nodes and 5907 edges
GRAPHHHHH Graph with 2311 nodes and 6406 edges
GRAPHHHHH Graph with 2273 nodes and 6590 edges
GRAPHHHHH Graph with 2855 nodes and 6961 edges
GRAPHHHHH Graph with 1863 nodes and 6319 edges
GRAPHHHHH Graph with 1947 nodes and 6165 edges
GRAPHHHHH Graph with 2247 nodes and 6621 edges
GRAPHHHHH Graph with 1796 nodes and 5949 edges
GRAPHHHHH Graph with 1992 nodes and 6237 edges
GRAPHHHHH Graph with 2484 nodes and 6717 edges
GRAPHHHHH Graph with 1474 nodes and 4068 edges
GRAPHHHHH Graph with 1844 nodes and 6434 edges
GRAPHHHHH Graph with 1532 nodes and 5787 edges
GRAPHHHHH Graph with 2306 nodes and 6798 edges
GRAPHHHHH Graph with 2363 nodes and 6736 edges
GRAPHHHHH Graph with 2438 nodes and 6571 edges
GRAPHHHHH Graph with 1886 nodes and 6118 edges
GRAPHHHHH Graph with 1913 nodes and 6245 edges
GRAPHHHHH Graph with 2059 nodes and 6711 edges
GRAPHHHHH Graph with 2064 nodes and 

In [26]:
print("Lenght of features:", X.shape)

Lenght of features: (78, 8112)


In [27]:
X

array([[-0.01065547,  0.08509989, -0.07897401, ...,  0.08628312,
        -0.05760979, -0.02948821],
       [ 0.0004353 , -0.02631473, -0.03687038, ..., -0.00676643,
         0.00810497, -0.03290671],
       [-0.03525976,  0.03387215, -0.02880013, ...,  0.00250711,
         0.01901873,  0.01939054],
       ...,
       [-0.02460619,  0.05112893, -0.06208546, ..., -0.01320273,
         0.01484986, -0.02255042],
       [ 0.02429416, -0.0105344 , -0.0589272 , ...,  0.05077451,
        -0.01652109,  0.01531364],
       [-0.00405608,  0.03780188, -0.02789822, ...,  0.01955577,
         0.04780468, -0.12546831]])

In [28]:
len(X[0])

8112

# Classification

In [29]:
import classifierv2

In [30]:
obj = classifierv2.Classification(X, y)
scores = obj.classification()

Score 0.25
Score 0.0625
Score 0.1875
Score 0.0
