In [2]:
import os
import csv
import sys
from pathlib import Path

# Expose the project's "src" folder, located under the fourth ancestor of the
# current working directory, to the import system.
src_dir = Path.cwd().parents[3] / "src"
sys.path.append(str(src_dir))

In [3]:
import numpy as np
import pandas as pd
from models import AiNet
from representations import NGram
from logging import getLogger
from utils import cosine_distances, euclidean_distances

In [4]:
# Load the cleansed corpus through the csv module, so every field stays a raw
# string (the frame summary printed below reports only `object` columns).
# The file is only read here, so it is opened read-only: the previous "r+"
# mode also required write permission on the data file for no benefit.
with open("cleansed_data.csv", "r", newline="", encoding="utf-8") as f:
    reader = csv.reader(f, delimiter=",", quotechar='"', quoting=csv.QUOTE_MINIMAL)
    header = next(reader)  # first row holds the column names
    data = list(reader)    # every remaining row is one record

df = pd.DataFrame(data=data, columns=header)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 611798 entries, 0 to 611797
Data columns (total 5 columns):
 #   Column        Non-Null Count   Dtype 
---  ------        --------------   ----- 
 0   id            611798 non-null  object
 1   article_id    611798 non-null  object
 2   created       611798 non-null  object
 3   org_category  611798 non-null  object
 4   abstracts     611798 non-null  object
dtypes: object(5)
memory usage: 23.3+ MB


In [5]:
# Number of rows per org_category in the full frame, ordered by category label.
(
    df["org_category"]
    .value_counts()
    .sort_index()
)

comp_educ_collab     20058
comp_non_educ         1259
comp_only             6673
educ_non_comp       131726
educ_only           452082
Name: org_category, dtype: int64

In [4]:
# Draw a 1 % stratified sample: every org_category group is sampled
# separately with the same fraction. A fixed seed makes the draw (and
# everything computed from `sample` afterwards) reproducible after a kernel
# restart; without it each run produced a different sample.
SAMPLE_SEED = 42
sample = df.groupby(by=["org_category"], as_index=False, group_keys=False).apply(
    lambda c: c.sample(frac=0.01, random_state=SAMPLE_SEED)
)
# Peek at two rows of the sample (seeded as well, so the preview is stable).
sample.sample(2, random_state=SAMPLE_SEED)

Unnamed: 0,id,article_id,created,org_category,abstracts
405282,405358,1803.08028,2018,educ_only,sexiphenyl on cu 100 nc afm tip functionalizat...
583160,583260,biorxiv-10.1101/543603,2019,educ_non_comp,unraveling diagnostic biomarkers of schizophre...


In [10]:
# Print the structural summary (row count, columns, dtypes, memory) of the sampled frame.
sample.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 6119 entries, 102361 to 487261
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   id            6119 non-null   object
 1   article_id    6119 non-null   object
 2   created       6119 non-null   object
 3   org_category  6119 non-null   object
 4   abstracts     6119 non-null   object
dtypes: object(5)
memory usage: 286.8+ KB


In [6]:
# Number of sampled rows per org_category.
sample["org_category"].value_counts()

educ_only           4521
educ_non_comp       1317
comp_educ_collab     201
comp_only             67
comp_non_educ         13
Name: org_category, dtype: int64

In [7]:
# Share of each org_category in the full frame, as a percentage rounded to
# two decimals.
full_counts = df["org_category"].value_counts()
(full_counts / len(df) * 100).round(2)

educ_only           73.89
educ_non_comp       21.53
comp_educ_collab     3.28
comp_only            1.09
comp_non_educ        0.21
Name: org_category, dtype: float64

In [8]:
# Share of each org_category in the sample, as a percentage rounded to two
# decimals.
sample_counts = sample["org_category"].value_counts()
(sample_counts / len(sample) * 100).round(2)

educ_only           73.88
educ_non_comp       21.52
comp_educ_collab     3.28
comp_only            1.09
comp_non_educ        0.21
Name: org_category, dtype: float64

In [9]:
# Build the NGram representation of the sampled abstracts; the final
# expression displays the shape of the returned `result` object.
representation = NGram()
abstract_texts = sample["abstracts"].tolist()
features, result = representation.generate_representation(abstract_texts)
result.shape

(6119, 27312)

In [11]:
# Sanity check of the project's `cosine_distances` helper, comparing `result`
# against itself. Presumably this yields the pairwise row-to-row distance
# matrix — confirm against the helper's implementation in `utils`.
cosine_distances(result, result)

array([[0.        , 0.97438127, 0.94376117, ..., 0.99235386, 0.99655795,
        0.9876183 ],
       [0.97438127, 0.        , 0.9764279 , ..., 0.9852223 , 0.9968798 ,
        0.9867945 ],
       [0.94376117, 0.9764279 , 0.        , ..., 0.9745851 , 0.97932845,
        0.997011  ],
       ...,
       [0.99235386, 0.9852223 , 0.9745851 , ..., 0.        , 1.        ,
        0.99834806],
       [0.99655795, 0.9968798 , 0.97932845, ..., 1.        , 0.        ,
        0.99391735],
       [0.9876183 , 0.9867945 , 0.997011  , ..., 0.99834806, 0.99391735,
        0.        ]], dtype=float32)

In [12]:
# Fetch the actual root logger. Calling getLogger() without a name returns
# the root of the logging hierarchy on every Python version, whereas
# getLogger("root") returned a separate, non-root logger merely named
# "root" before Python 3.9.
root_logger = getLogger()

In [13]:
# Instantiate the AiNet model with the cosine distance method and hand it the
# logger created earlier in the notebook.
aiNet = AiNet(
    distance_method="cosine",
    logger=root_logger,
)

In [14]:
# Keyword arguments for the training run, collected in one place so they can
# be inspected or tweaked before fitting.
fit_settings = dict(
    antigen_population=result,
    max_iter=15,
    # 10 % of the number of rows in `result`, truncated to an integer
    number_of_antibodies=int(result.shape[0] * 0.10),
    clone_multiplier=10,
    no_best_cells_taken_each_selection=5,
    percent_clones_reselected=0.12,
    pruning_threshold=0.5,
)
aiNet.fit(**fit_settings)

iter: 1 | cur_hyper_rate:  0.00000000 | |██████----------------------------------------------------------------------------------------------| 6.7% | avd: 0.00000 | net size: 90

In [43]:
# Display the shape of the fitted model's antibody population.
# NOTE(review): the recorded output has 18389 columns while `result` above has
# 27312, and this cell's execution count (43) is out of sequence — the output
# looks like it comes from an earlier run; re-run after fitting to confirm.
aiNet.antibody_population.shape

(278, 18389)

In [15]:
# Persist the sample as a UTF-8, comma-separated file without the index
# column, using minimal quoting with double quotes.
sample.to_csv(
    "sample_cleansed_data.csv",
    sep=",",
    quotechar='"',
    quoting=csv.QUOTE_MINIMAL,
    encoding="utf-8",
    index=False,
)

In [16]:
# Read the persisted sample back through the csv module to verify the
# round trip; every field is kept as a raw string.
# The file is only read here, so it is opened read-only: the previous "r+"
# mode also required write permission on the file for no benefit.
with open("sample_cleansed_data.csv", "r", newline="", encoding="utf-8") as f:
    reader = csv.reader(f, delimiter=",", quoting=csv.QUOTE_MINIMAL, quotechar='"')
    header = next(reader)  # first row holds the column names
    data = list(reader)    # every remaining row is one record

df1 = pd.DataFrame(data=data, columns=header)

In [17]:
# Print the structural summary of the reloaded sample frame.
df1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6119 entries, 0 to 6118
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   id            6119 non-null   object
 1   article_id    6119 non-null   object
 2   created       6119 non-null   object
 3   org_category  6119 non-null   object
 4   abstracts     6119 non-null   object
dtypes: object(5)
memory usage: 239.2+ KB


In [19]:
# Number of rows per org_category in the reloaded sample frame.
df1["org_category"].value_counts()

educ_only           4521
educ_non_comp       1317
comp_educ_collab     201
comp_only             67
comp_non_educ         13
Name: org_category, dtype: int64

In [20]:
# Count rows whose abstract is missing.
# - `isnull` is an alias of `isna`, so adding both (as before) counted every
#   missing value twice; a single `isna` check is enough.
# - `df1` is built from csv.reader rows, where an absent field arrives as an
#   empty string rather than NaN, so empty strings are counted as missing too.
abstracts = df1["abstracts"]
(abstracts.isna() | (abstracts == "")).sum()

0