In [1]:
import os
import csv
import sys
from pathlib import Path

# Make the project's "src" directory importable: it sits next to the
# directory three levels above the current working directory.
src_dir = Path(os.getcwd()).parents[3] / "src"
sys.path.append(str(src_dir))

In [2]:
import numpy as np
import pandas as pd
from models import AiNet
from representations import NGram
from logging import getLogger
from utils import cosine_distances, euclidean_distances

In [3]:
# Load the cleansed dataset through the csv module so every field is kept as
# the raw string read from the file (hence the all-`object` dtypes below).
# The file is only read here, so it is opened read-only ("r") rather than
# "r+", which would also demand write permission on the file.
with open("filtered_cleansed_data.csv", "r", newline="", encoding="utf-8") as f:
    reader = csv.reader(f, delimiter=",", quotechar='"', quoting=csv.QUOTE_MINIMAL)
    # The first row holds the column names; the remaining rows are the records.
    header = next(reader)
    data = list(reader)

df = pd.DataFrame(data=data, columns=header)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15928 entries, 0 to 15927
Data columns (total 4 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   id        15928 non-null  object
 1   title     15928 non-null  object
 2   abstract  15928 non-null  object
 3   topic     15928 non-null  object
dtypes: object(4)
memory usage: 497.9+ KB


In [4]:
# Stratified 40% sample: draw the same fraction from every topic so the class
# proportions of the full dataset are preserved.
# A fixed seed keeps the sample (and everything computed from it further down,
# including the CSV written to disk) identical across Restart & Run All.
SAMPLE_SEED = 42
sample = df.groupby(by=["topic"], as_index=False, group_keys=False).apply(
    lambda c: c.sample(frac=0.4, random_state=SAMPLE_SEED)
)
# Peek at two rows; seeded as well so the displayed rows are stable.
sample.sample(2, random_state=SAMPLE_SEED)

Unnamed: 0,id,title,abstract,topic
12612,16597,Semi-supervised Embedding in Attributed Networ...,"In this paper, we propose a novel framework,...",computer science
14383,18940,Simultaneous Multiparty Communication Complexi...,In the Number On the Forehead (NOF) multipar...,computer science


In [5]:
# Summarise the sampled frame (row count, columns, dtypes, non-null counts).
sample.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 6371 entries, 8608 to 4685
Data columns (total 4 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   id        6371 non-null   object
 1   title     6371 non-null   object
 2   abstract  6371 non-null   object
 3   topic     6371 non-null   object
dtypes: object(4)
memory usage: 248.9+ KB


In [10]:
# Number of sampled rows per topic.
sample["topic"].value_counts()

physics                 2048
computer science        1964
mathematics             1444
statistics               654
quantitative biology     177
quantitative finance      84
Name: topic, dtype: int64

In [11]:
# Percentage share of each topic in the full dataset, rounded to two decimals.
df_topic_counts = df["topic"].value_counts()
(df_topic_counts / len(df) * 100).round(2)

physics                 32.14
computer science        30.83
mathematics             22.66
statistics              10.27
quantitative biology     2.78
quantitative finance     1.31
Name: topic, dtype: float64

In [12]:
# Percentage share of each topic in the sample, rounded to two decimals;
# compare with the full-dataset shares to confirm the sample is stratified.
sample_topic_counts = sample["topic"].value_counts()
(sample_topic_counts / len(sample) * 100).round(2)

physics                 32.15
computer science        30.83
mathematics             22.67
statistics              10.27
quantitative biology     2.78
quantitative finance     1.32
Name: topic, dtype: float64

In [13]:
# Vectorise the sampled abstracts with the project's NGram representation.
# `generate_representation` returns a pair; the second element (`result`) is
# the matrix whose shape is displayed below.
representation = NGram()
abstracts = sample["abstract"].tolist()
features, result = representation.generate_representation(abstracts)
result.shape

(6371, 23712)

In [14]:
# Distances of the representation matrix against itself, using the project's
# cosine_distances helper. The value is only displayed, not stored.
cosine_distances(result, result)

array([[0.        , 0.9777462 , 0.98847955, ..., 1.        , 0.9392195 ,
        0.9916393 ],
       [0.9777462 , 0.        , 0.9350088 , ..., 0.9948972 , 0.9599305 ,
        0.9833609 ],
       [0.98847955, 0.9350088 , 0.        , ..., 0.9277114 , 0.9599772 ,
        0.96104157],
       ...,
       [1.        , 0.9948972 , 0.9277114 , ..., 0.        , 0.9672066 ,
        0.99350744],
       [0.9392195 , 0.9599305 , 0.9599772 , ..., 0.9672066 , 0.        ,
        0.9494341 ],
       [0.9916393 , 0.9833609 , 0.96104157, ..., 0.99350744, 0.9494341 ,
        0.        ]], dtype=float32)

In [15]:
# Request the root logger explicitly by calling getLogger() with no name.
# Before Python 3.9, getLogger("root") returned a separate, non-root logger
# that merely happened to be called "root", so handlers/levels configured on
# the real root logger were only reached through propagation.
root_logger = getLogger()

In [16]:
# Create the AiNet model, configured with the "cosine" distance method and
# the logger obtained in the previous cell.
aiNet = AiNet(distance_method="cosine", logger=root_logger)

In [17]:
# Hyper-parameters for the aiNet run, gathered in one place so they are easy
# to read and tune. The initial antibody count is 10% of the number of rows
# in the representation matrix.
fit_params = dict(
    antigen_population=result,
    max_iter=15,
    number_of_antibodies=int(result.shape[0] * 0.10),
    clone_multiplier=10,
    no_best_cells_taken_each_selection=5,
    percent_clones_reselected=0.12,
    pruning_threshold=0.5,
)
aiNet.fit(**fit_params)

iter: 1 | cur_hyper_rate:  0.00000000 | |██████----------------------------------------------------------------------------------------------| 6.7% | avd: 0.00000 | net size: 205

In [18]:
# Shape of the antibody population held by the model after fitting.
aiNet.antibody_population.shape

(205, 23712)

In [20]:
# Persist the sample to disk (no index column) using the same CSV dialect
# that is used when reading the data back in.
sample.to_csv(
    "sample_filtered_cleansed_data.csv",
    sep=",",
    quotechar='"',
    quoting=csv.QUOTE_MINIMAL,
    encoding="utf-8",
    index=False,
)

In [21]:
# Read the persisted sample back in to verify the round trip through CSV.
# The file is only read here, so it is opened read-only ("r") rather than
# "r+", which would also demand write permission on the file.
with open("sample_filtered_cleansed_data.csv", "r", newline="", encoding="utf-8") as f:
    reader = csv.reader(f, delimiter=",", quoting=csv.QUOTE_MINIMAL, quotechar='"')
    # The first row holds the column names; the remaining rows are the records.
    header = next(reader)
    data = list(reader)

df1 = pd.DataFrame(data=data, columns=header)

In [22]:
# Summarise the reloaded sample; the row count should match the sample written out.
df1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6371 entries, 0 to 6370
Data columns (total 4 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   id        6371 non-null   object
 1   title     6371 non-null   object
 2   abstract  6371 non-null   object
 3   topic     6371 non-null   object
dtypes: object(4)
memory usage: 199.2+ KB


In [24]:
# Rows per topic in the reloaded sample, for comparison with the in-memory sample.
df1["topic"].value_counts()

physics                 2048
computer science        1964
mathematics             1444
statistics               654
quantitative biology     177
quantitative finance      84
Name: topic, dtype: int64

In [25]:
# Count missing abstracts. `isnull` is an alias of `isna`, so adding the two
# masks together counted every missing value twice; one mask is enough.
# NOTE(review): df1 is built from csv.reader rows, whose fields are strings,
# so an empty abstract would appear as "" rather than NaN — consider also
# checking for blank strings if that case matters.
df1["abstract"].isna().sum()

0