In [1]:
from network import CNetwork

In [2]:
import pandas as pd
import numpy as np

In [3]:
import os

In [4]:
name_dataset = "dataset_1"
dataset_path = "datasetsv2/"
length_cut = 10000
random_flag = True
remove_punctuation = True
lemmatization_flag = True

In [5]:
df = pd.read_csv(dataset_path + name_dataset + ".csv")

In [6]:
df.head(5)

Unnamed: 0,label,text,book
0,Pelham Grenville,"\n\n\n\n\n\n\n\nProduced by Christine Gehring,...",Right Ho Jeeves
1,Pelham Grenville,"\n\n\n\n\n\n\n\nProduced by Suzanne L. Shell, ...",Tales Of St Austin
2,Joseph Conrad,And that last\n\nword was the single word of ...,Victory
3,Joseph Conrad,\n\n\n\nIf I have ever had these gifts in any ...,Under Western Eyes
4,Bram Stoker,\n\n\n\n Lond...,The Lady Of The Shroud


In [7]:
from utils.text_processing import get_min_len_corpus

In [8]:
print("Min Length:", get_min_len_corpus(list(df["text"])))

Min Length: 55024


In [9]:
from utils.text_processing import get_corpus, get_random_corpus

In [10]:
texts = list(df['text'])

In [11]:
corpus, segmented_corpus = get_corpus(texts, length_cut)

In [12]:
selected_corpus, words_features, word_index, index_word = get_random_corpus(segmented_corpus, remove_punctuation=remove_punctuation, lemmatization_flag=lemmatization_flag)

In [13]:
len(selected_corpus)

78

In [14]:
labels = list(df['label'])

In [15]:
total_classes = list(set(labels))  ## or author
print("Total classes: {}".format(len(total_classes)))
number_books = (df[df['label'] == total_classes[0]]).shape[0]
print("Total entities for each class in train: {}".format(number_books))
dict_categories = {cat: index for index, cat in enumerate(total_classes)}

Total classes: 13
Total entities for each class in train: 6


In [16]:
y = [dict_categories[y] for y in labels]

In [17]:
total_classes

['Hector Hugh',
 'Charles Dickens',
 'Joseph Conrad',
 'George Eliot',
 'Allan Poe',
 'Arthur Conan Doyle',
 'Charles Darwin',
 'Thomas Hardy',
 'Pelham Grenville',
 'Bram Stoker',
 'Daniel Defoe',
 'Mark Twain',
 'Jane Austen']

In [18]:
import json
def motifs(operating_system, networks):
    input_path = "motifs/dataset/"+str(length_cut)+"/"
    output_path = 'motifs/features/'
    try: 
        os.mkdir(input_path)
        os.mkdir(output_path)
    except:
        print("Existe")
    for i, netw in enumerate(networks):
        netw.write_pajek(input_path+str(i)+".paj")
    for i, netw in enumerate(networks):
        if operating_system == 'linux':
            #path_command = "python struc2vec/src/main.py --dimensions "+str(dimension)+" --input "+input_path+str(length_cut)+"_"+str(dimension)+"_"+str(i)+".edgelist --output "+output_path+str(length_cut)+"_"+str(dimension)+"_"+str(i)+".emb"
            path_command = 'python motifs/get_frequencies.py -r ' + input_path + str(i) + '.paj' + ' > ' + output_path+str(length_cut)+"_"+str(i)+'.emb'
            os.system(path_command)
            print("motifs", str(i))

In [57]:
def read_motifs(output_path, networks):
    all_network_features = [[] for _ in range(len(networks))]
    for i in range(len(networks)):
        with open(output_path+str(length_cut)+"_"+str(i)+".emb", "r") as f:
            motifs_features = f.read().replace("\n","").replace("[","").replace("]","").split(",")
            motifs_features = [float(v) for v in motifs_features]
            print(motifs_features)
            
            network_features = np.array(motifs_features)
            print(str(i), "len",len(network_features))
            all_network_features[i] = network_features
    return np.array(all_network_features)

In [60]:
def get_motifs_features(sequences, index_word):
    nets = []
    for text in sequences:
        obj = CNetwork(text, model=None, index_word=index_word, percentages=None, path="")
        network = obj.create_network(directed=True)
        nets.append(network)
    motifs("linux", nets)
    X = read_motifs('motifs/features/', nets)
    return X

In [None]:
X = get_motifs_features(selected_corpus, index_word)

Nodes: 1905 - Edges: 6075
Nodes: 1978 - Edges: 6361
Nodes: 1858 - Edges: 6388
Nodes: 2158 - Edges: 6742
Nodes: 1778 - Edges: 6267
Nodes: 1779 - Edges: 6286
Nodes: 1940 - Edges: 6440
Nodes: 1847 - Edges: 6455
Nodes: 1716 - Edges: 6272
Nodes: 2253 - Edges: 6609
Nodes: 2136 - Edges: 6520
Nodes: 1376 - Edges: 5675
Nodes: 2234 - Edges: 6992
Nodes: 1632 - Edges: 6322
Nodes: 2179 - Edges: 6624
Nodes: 1362 - Edges: 5571
Nodes: 2024 - Edges: 6576
Nodes: 2253 - Edges: 6620
Nodes: 2246 - Edges: 6843
Nodes: 2076 - Edges: 6881
Nodes: 2650 - Edges: 7028
Nodes: 1877 - Edges: 6583
Nodes: 1718 - Edges: 6319
Nodes: 2053 - Edges: 6617
Nodes: 1410 - Edges: 4786
Nodes: 2168 - Edges: 6780
Nodes: 2096 - Edges: 6605
Nodes: 2200 - Edges: 6788
Nodes: 2083 - Edges: 6783
Nodes: 2002 - Edges: 6208
Nodes: 1632 - Edges: 5796
Nodes: 2425 - Edges: 6834
Nodes: 1989 - Edges: 5897
Nodes: 1919 - Edges: 6731
Nodes: 1425 - Edges: 5847
Nodes: 1876 - Edges: 6569
Nodes: 1889 - Edges: 6461
Nodes: 2170 - Edges: 6787
Nodes: 2555 

In [52]:
print("Lenght of features:", X.shape)

Lenght of features: (78, 13)


In [53]:
X

array([[8.69840e+04, 1.56643e+05, 1.64950e+04, ..., 3.16000e+02,
        2.18000e+02, 1.80000e+01],
       [8.95340e+04, 1.70983e+05, 1.48080e+04, ..., 2.06000e+02,
        9.70000e+01, 1.50000e+01],
       [9.78830e+04, 1.65057e+05, 1.80960e+04, ..., 3.18000e+02,
        2.22000e+02, 1.20000e+01],
       ...,
       [9.93340e+04, 1.87523e+05, 1.85770e+04, ..., 3.12000e+02,
        1.35000e+02, 9.00000e+00],
       [1.15348e+05, 1.87443e+05, 1.38590e+04, ..., 1.73000e+02,
        6.80000e+01, 2.00000e+00],
       [1.24832e+05, 2.31795e+05, 2.45870e+04, ..., 2.91000e+02,
        1.56000e+02, 1.60000e+01]])

In [54]:
len(X[0])

13

# Classification

In [55]:
import classifierv2

In [56]:
obj = classifierv2.Classification(X, y)
scores = obj.classification()

Score 0.25
Score 0.1875
Score 0.25
Score 0.3125
