In [1]:
import os
import csv
import sys
from pathlib import Path

# Make the project's ``src`` directory importable from this notebook.
# ``parents[3]`` is the directory four levels above the current working
# directory — presumably the repository root; confirm if the notebook is moved.
src_dir = os.path.join(Path(os.getcwd()).parents[3], "src")

# Guard the append so that re-running this cell does not keep adding the same
# entry to sys.path.
if src_dir not in sys.path:
    sys.path.append(src_dir)

In [2]:
import numpy as np
import pandas as pd
from models import AiNet
from representations import NGram
from logging import getLogger

In [3]:
# Load the cleansed dataset with the stdlib csv reader, which returns every
# field as a plain string (no dtype inference, no NaN conversion).
# The file is only read here, so it is opened read-only: the previous "r+"
# mode also required write permission and failed on read-only files.
with open("filtered_cleansed_data.csv", "r", newline="", encoding="utf-8") as f:
    reader = csv.reader(f, delimiter=",", quotechar='"', quoting=csv.QUOTE_MINIMAL)
    header = next(reader)  # first row holds the column names
    data = list(reader)    # every remaining row is one record

df = pd.DataFrame(data=data, columns=header)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 37768 entries, 0 to 37767
Data columns (total 4 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   terms        37768 non-null  object
 1   major_terms  37768 non-null  object
 2   titles       37768 non-null  object
 3   abstracts    37768 non-null  object
dtypes: object(4)
memory usage: 1.2+ MB


In [4]:
# Draw a 15% stratified sample: rows are grouped by ``major_terms`` and the
# same fraction is sampled inside every group, so the label proportions of the
# full dataset carry over to the sample.
# The fixed seed makes the draw reproducible across kernel restarts; without
# it every run produced a different sample and different downstream results.
sample = df.groupby(by=["major_terms"], as_index=False, group_keys=False).apply(
    lambda c: c.sample(frac=0.15, random_state=42)
)
sample.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5665 entries, 35361 to 19562
Data columns (total 4 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   terms        5665 non-null   object
 1   major_terms  5665 non-null   object
 2   titles       5665 non-null   object
 3   abstracts    5665 non-null   object
dtypes: object(4)
memory usage: 221.3+ KB


In [5]:
# Eyeball two randomly chosen rows of the sample as a quick sanity check.
sample.sample(n=2)

Unnamed: 0,terms,major_terms,titles,abstracts
31199,['cs.CV'],['cs'],Low-Rank Tensor Completion by Truncated Nuclea...,"Currently, low-rank tensor completion has gain..."
11509,"['cs.LG', 'cs.AI']",['cs'],Multi-dimensional Parametric Mincuts for Const...,"In this paper, we propose novel algorithms for..."


In [6]:
# Absolute number of sampled rows per ``major_terms`` label.
sample["major_terms"].value_counts()

['cs']            3728
['cs', 'stat']    1632
['cs', 'eess']     305
Name: major_terms, dtype: int64

In [7]:
# Share of each ``major_terms`` label in the full dataset, expressed as a
# percentage rounded to two decimals (baseline for the sample below).
df["major_terms"].value_counts().div(len(df)).mul(100).round(2)

['cs']            65.80
['cs', 'stat']    28.81
['cs', 'eess']     5.39
Name: major_terms, dtype: float64

In [8]:
# Share of each ``major_terms`` label in the sample, expressed as a percentage
# rounded to two decimals; it should closely match the full-dataset shares.
sample["major_terms"].value_counts().div(len(sample)).mul(100).round(2)

['cs']            65.81
['cs', 'stat']    28.81
['cs', 'eess']     5.38
Name: major_terms, dtype: float64

In [8]:
# Turn the sampled abstracts into the n-gram representation that is later
# passed to AiNet as the antigen population.
# NOTE(review): the recorded output reports 7554 rows while ``sample`` has
# 5665 entries. If one row per abstract is expected, this output is stale
# (produced from a different sample) — re-run the notebook top to bottom.
representation = NGram()
features, result = representation.generate_representation(
    sample["abstracts"].tolist()
)
result.shape

(7554, 19483)

In [9]:
# Fetch the real root logger. getLogger() without a name always returns it,
# whereas getLogger("root") only does so on Python 3.9+; older interpreters
# created an unrelated child logger that merely happened to be named "root".
root_logger = getLogger()

In [10]:
# Build the AiNet model: cosine distance between cells, logging through
# ``root_logger``.
aiNet = AiNet(
    logger=root_logger,
    distance_method="cosine",
)

In [None]:
# Fit the model on the n-gram matrix. ``number_of_antibodies`` is set to 10%
# of the rows in ``result``; the other values are hand-picked hyper-parameters
# whose exact semantics are defined by AiNet.fit.
aiNet.fit(
    antigen_population=result,
    number_of_antibodies=int(result.shape[0] * 0.10),
    max_iter=15,
    clone_multiplier=10,
    no_best_cells_taken_each_selection=5,
    percent_clones_reselected=0.12,
    pruning_threshold=0.7,
    suppression_threshold=0.12,
)

In [13]:
# Shape of the antibody population held by the model
# (presumably one row per antibody kept after fitting — confirm against AiNet).
aiNet.antibody_population.shape

(297, 19483)

In [18]:
# Number of null abstracts in the sample; summing the boolean mask counts the
# True entries directly.
sample["abstracts"].isna().sum()

0

In [19]:
# ``isnull`` is only an alias of ``isna``, so repeating the null count adds no
# information. The frame was built from csv.reader rows, where every field is
# a string and a missing abstract appears as "" rather than NaN, so count the
# abstracts that are missing in either form (NaN, empty or whitespace-only).
sample["abstracts"].fillna("").astype(str).str.strip().eq("").sum()

0

In [9]:
# Persist the stratified sample (index column omitted) so it can be reloaded
# without drawing it again.
sample.to_csv(
    "sample_filtered_cleansed_data.csv",
    sep=",",
    quotechar='"',
    quoting=csv.QUOTE_MINIMAL,
    index=False,
    encoding="utf-8",
)

In [10]:
# Read the saved sample back to verify the CSV round trip.
# The file is only read here, so it is opened read-only: the previous "r+"
# mode also required write permission and failed on read-only files.
with open("sample_filtered_cleansed_data.csv", "r", newline="", encoding="utf-8") as f:
    reader = csv.reader(f, delimiter=",", quoting=csv.QUOTE_MINIMAL, quotechar='"')
    header = next(reader)  # first row holds the column names
    data = list(reader)    # every remaining row is one record

df1 = pd.DataFrame(data=data, columns=header)

In [11]:
# Round-trip check: the reloaded frame should report the same row count and
# columns as ``sample`` did before it was written to disk.
df1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5665 entries, 0 to 5664
Data columns (total 4 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   terms        5665 non-null   object
 1   major_terms  5665 non-null   object
 2   titles       5665 non-null   object
 3   abstracts    5665 non-null   object
dtypes: object(4)
memory usage: 177.2+ KB


In [12]:
# Round-trip check: per-label counts of the reloaded sample, to compare with
# the counts of ``sample`` before it was saved.
df1["major_terms"].value_counts()

['cs']            3728
['cs', 'stat']    1632
['cs', 'eess']     305
Name: major_terms, dtype: int64

In [14]:
# Count the null abstracts in the reloaded sample exactly once. ``isnull`` is
# an alias of ``isna``, so adding both sums reported twice the real number of
# nulls whenever any were present.
df1["abstracts"].isna().sum()

0