In [1]:
import os
import csv
import sys
from pathlib import Path

# Make the project's "src" directory importable: it sits next to the
# directory four levels above the current working directory.
project_src = Path.cwd().parents[3] / "src"
sys.path.append(str(project_src))

In [2]:
import numpy as np
import pandas as pd
from models import AiNet
from representations import NGram
from logging import getLogger
from utils import cosine_distances, euclidean_distances

In [3]:
# Load the cleansed, joined dataset through csv.reader so every field is kept
# exactly as the string found in the file (all columns end up as object dtype).
# The file is only read here, so it is opened read-only: "r+" would also
# demand write permission and fail on a read-only file.
with open("filtered_cleansed_joined_data.csv", "r", newline="", encoding="utf-8") as f:
    reader = csv.reader(f, delimiter=",", quotechar='"', quoting=csv.QUOTE_MINIMAL)
    data = list(reader)
    header = data.pop(0)  # the first row holds the column names

df = pd.DataFrame(data=data, columns=header)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32473 entries, 0 to 32472
Data columns (total 7 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   id            32473 non-null  object
 1   article_id    32473 non-null  object
 2   created       32473 non-null  object
 3   org_category  32473 non-null  object
 4   abstracts     32473 non-null  object
 5   method_id     32473 non-null  object
 6   method        32473 non-null  object
dtypes: object(7)
memory usage: 1.7+ MB


In [5]:
# Number of rows per method, ordered by method name.
method_counts = df["method"].value_counts()
method_counts.sort_index()

computer_vision                19812
natural_language_processing     9230
reinforcement_learning          3431
Name: method, dtype: int64

In [6]:
# Share of each method in the full dataset, as a percentage rounded to two
# decimals and ordered by method name.
method_pct_sorted = df["method"].value_counts().div(df.shape[0]).mul(100)
method_pct_sorted.round(2).sort_index()

computer_vision                61.01
natural_language_processing    28.42
reinforcement_learning         10.57
Name: method, dtype: float64

In [11]:
# Draw 20% of the rows inside each `method` group, so the sample keeps the
# class proportions of the full dataset.
# A fixed random_state makes the draw repeatable after a kernel restart;
# unseeded, every run selects different rows, and this sample is later
# written to disk and fed to the model.
sample = df.groupby(by=["method"], as_index=False, group_keys=False).apply(
    lambda c: c.sample(frac=0.2, random_state=42)
)
# Peek at two rows of the sample (seeded as well, so the preview is stable).
sample.sample(2, random_state=42)

Unnamed: 0,id,article_id,created,org_category,abstracts,method_id,method
12398,401146,1802.07997,2018,educ_only,generating high quality query suggestion candi...,25631,natural_language_processing
30907,518553,2006.11578,2020,educ_only,learning aligned embeddings for semi supervise...,31825,natural_language_processing


In [12]:
# Column names, non-null counts and dtypes of the per-method sample.
sample.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 6494 entries, 23019 to 17920
Data columns (total 7 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   id            6494 non-null   object
 1   article_id    6494 non-null   object
 2   created       6494 non-null   object
 3   org_category  6494 non-null   object
 4   abstracts     6494 non-null   object
 5   method_id     6494 non-null   object
 6   method        6494 non-null   object
dtypes: object(7)
memory usage: 405.9+ KB


In [13]:
# Number of sampled rows per method (most frequent first).
sample["method"].value_counts()

computer_vision                3962
natural_language_processing    1846
reinforcement_learning          686
Name: method, dtype: int64

In [14]:
# Percentage of rows per method in the full dataset, rounded to two decimals
# (shown again for side-by-side comparison with the sample).
df_method_pct = df["method"].value_counts().div(df.shape[0]).mul(100)
df_method_pct.round(2)

computer_vision                61.01
natural_language_processing    28.42
reinforcement_learning         10.57
Name: method, dtype: float64

In [15]:
# Percentage of rows per method in the sample, rounded to two decimals.
sample_method_pct = sample["method"].value_counts().div(sample.shape[0]).mul(100)
sample_method_pct.round(2)

computer_vision                61.01
natural_language_processing    28.43
reinforcement_learning         10.56
Name: method, dtype: float64

In [16]:
# Turn the sampled abstracts into a numeric representation with the
# project's NGram class; `result` is the matrix used by the cells below.
abstract_texts = sample["abstracts"].tolist()
representation = NGram()
features, result = representation.generate_representation(abstract_texts)
result.shape

(6494, 18361)

In [39]:
# Distances of `result` against itself, computed with the project's
# cosine_distances helper.
# NOTE(review): the returned matrix is only displayed, never assigned, so the
# computation is repeated if it is needed again.
cosine_distances(result, result)

array([[0.        , 0.9310231 , 0.9799266 , ..., 0.9894676 , 0.9662112 ,
        0.9783742 ],
       [0.9310231 , 0.        , 0.9899555 , ..., 0.97768676, 0.9471673 ,
        0.98890543],
       [0.9799266 , 0.9899555 , 0.        , ..., 0.99020207, 0.99143434,
        0.9765023 ],
       ...,
       [0.9894676 , 0.97768676, 0.99020207, ..., 0.        , 0.9629164 ,
        0.9757997 ],
       [0.9662112 , 0.9471673 , 0.99143434, ..., 0.9629164 , 0.        ,
        0.92421246],
       [0.9783742 , 0.98890543, 0.9765023 , ..., 0.9757997 , 0.92421246,
        0.        ]], dtype=float32)

In [40]:
# Fetch the root logger. getLogger() with no name returns the root logger on
# every Python version, whereas getLogger("root") only does so on Python 3.9+
# (earlier versions create a separate, non-root logger that is merely named
# "root").
root_logger = getLogger()

In [41]:
# Create the aiNet model, configured for cosine distance and wired to the
# logger obtained earlier.
aiNet = AiNet(
    logger=root_logger,
    distance_method="cosine",
)

In [42]:
# Fit settings, gathered in one place. The antibody count is 10% of the
# number of rows in `result`, which is passed in as the antigen population.
fit_params = dict(
    antigen_population=result,
    max_iter=15,
    number_of_antibodies=int(result.shape[0] * 0.10),
    clone_multiplier=10,
    no_best_cells_taken_each_selection=5,
    percent_clones_reselected=0.12,
    pruning_threshold=0.5,
)
aiNet.fit(**fit_params)

iter: 1 | cur_hyper_rate:  0.00000000 | |██████----------------------------------------------------------------------------------------------| 6.7% | avd: 0.00000 | net size: 278

In [43]:
# Shape of the antibody population held by the model after fitting.
# NOTE(review): the recorded output below has 18389 columns while the recorded
# `result.shape` has 18361, and the execution counts are out of order — these
# outputs appear to come from different runs; re-run top to bottom to confirm
# the two widths agree.
aiNet.antibody_population.shape

(278, 18389)

In [21]:
# Persist the sample as a UTF-8, comma-separated CSV without the index column.
sample.to_csv(
    "sample_filtered_cleansed_joined_data.csv",
    sep=",",
    quotechar='"',
    quoting=csv.QUOTE_MINIMAL,
    encoding="utf-8",
    index=False,
)

In [22]:
# Read the saved sample back to check the CSV round-trip. csv.reader keeps
# every field as the string found in the file.
# The file is only read here, so it is opened read-only: "r+" would also
# demand write permission and fail on a read-only file.
with open("sample_filtered_cleansed_joined_data.csv", "r", newline="", encoding="utf-8") as f:
    reader = csv.reader(f, delimiter=",", quoting=csv.QUOTE_MINIMAL, quotechar='"')
    data = list(reader)
    header = data.pop(0)  # the first row holds the column names

df1 = pd.DataFrame(data=data, columns=header)

In [23]:
# Column names, non-null counts and dtypes of the sample as re-read from disk.
df1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6494 entries, 0 to 6493
Data columns (total 7 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   id            6494 non-null   object
 1   article_id    6494 non-null   object
 2   created       6494 non-null   object
 3   org_category  6494 non-null   object
 4   abstracts     6494 non-null   object
 5   method_id     6494 non-null   object
 6   method        6494 non-null   object
dtypes: object(7)
memory usage: 355.3+ KB


In [26]:
# Rows per method in the re-read sample (most frequent first).
df1["method"].value_counts()

computer_vision                3962
natural_language_processing    1846
reinforcement_learning          686
Name: method, dtype: int64

In [27]:
# Count abstracts that are missing after the CSV round-trip.
# csv.reader yields every field as a string, so a missing abstract appears as
# an empty (or whitespace-only) string rather than NaN; isna() is kept to
# catch None values pandas inserts for short rows.
# The earlier check summed isna() and isnull(), which are aliases of each
# other, so it counted every null twice and never noticed empty strings.
abstracts = df1["abstracts"]
(abstracts.isna() | abstracts.str.strip().eq("")).sum()

0