In [1]:
import os
import csv
import sys
from pathlib import Path

# Make the project's "src" directory importable: it is resolved relative to
# the ancestor four levels above the current working directory.
sys.path.append(str(Path(os.getcwd()).parents[3] / "src"))

# Raise the csv module's per-field size cap to the largest value sys exposes.
# The call returns the previous limit, which is what the cell echoes.
csv.field_size_limit(sys.maxsize)

131072

In [3]:
import numpy as np
import pandas as pd
from models import AiNet
from representations import NGram
from logging import getLogger
from utils import cosine_distances, euclidean_distances

In [5]:
# Load the cleansed corpus with the stdlib csv reader and wrap it in a DataFrame.
# The file is only read here, so it is opened read-only: "r+" would also demand
# write permission and fail on a read-only file for no benefit.
with open("cleansed_data.csv", "r", newline="", encoding="utf-8") as f:
    reader = csv.reader(f, delimiter=",", quotechar='"', quoting=csv.QUOTE_MINIMAL)
    header = next(reader)  # first row holds the column names
    data = list(reader)    # every remaining row is one record

# csv.reader yields strings only, so every column ends up with dtype object.
df = pd.DataFrame(data=data, columns=header)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 62703 entries, 0 to 62702
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   id      62703 non-null  object
 1   title   62703 non-null  object
 2   text    62703 non-null  object
 3   label   62703 non-null  object
dtypes: object(4)
memory usage: 1.9+ MB


In [4]:
# Stratified 10% sample: the same fraction is drawn from every label group, so
# the label proportions of the full frame carry over to the sample.
# A fixed random_state makes the draw (and everything computed from it below)
# reproducible across kernel restarts.
sample = df.groupby(by=["label"], as_index=False, group_keys=False).apply(
    lambda c: c.sample(frac=0.10, random_state=42)
)
# Preview two rows of the sample; seeded so the displayed rows are stable too.
sample.sample(2, random_state=42)

Unnamed: 0,id,title,text,label
31247,33629,U.S. welcomes Hariri's return to Lebanon: Stat...,WASHINGTON (Reuters) - The United States welco...,0
62564,71954,Strange People Who Shifted to Other Dimensions,Mysterious Universe \nDo other realities brush...,1


In [5]:
# Summarize the stratified sample: index range, columns, non-null counts, dtypes.
sample.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 6270 entries, 9742 to 3423
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   id      6270 non-null   object
 1   title   6270 non-null   object
 2   text    6270 non-null   object
 3   label   6270 non-null   object
dtypes: object(4)
memory usage: 244.9+ KB


In [6]:
# Absolute number of sampled rows per label.
sample["label"].value_counts()

0    3462
1    2808
Name: label, dtype: int64

In [7]:
# Percentage share of each label in the full data set, rounded to two decimals.
df["label"].value_counts().div(len(df)).mul(100).round(2)

0    55.21
1    44.79
Name: label, dtype: float64

In [8]:
# Percentage share of each label in the sample, rounded to two decimals
# (for comparison with the shares of the full data set).
sample["label"].value_counts().div(len(sample)).mul(100).round(2)

0    55.22
1    44.78
Name: label, dtype: float64

In [9]:
# Build the project's NGram representation from the sampled texts.
# generate_representation returns a pair; the second element (`result`) is the
# matrix used by the cells below.
# NOTE(review): `features` is presumably the vocabulary / feature names - confirm
# against representations.NGram.
representation = NGram()
features, result = representation.generate_representation(
    sample["text"].to_list()
)
result.shape

(6270, 57155)

In [10]:
# Distances of `result` against itself via the project's cosine_distances helper.
# The matrix is only displayed here, not stored.
# NOTE(review): presumably one row/column per sampled document - confirm
# against utils.cosine_distances.
cosine_distances(result, result)

array([[0.        , 0.99063635, 0.9092292 , ..., 0.95825356, 0.9775527 ,
        0.9204125 ],
       [0.99063635, 0.        , 0.994514  , ..., 0.98332024, 0.9970965 ,
        0.97896427],
       [0.9092292 , 0.994514  , 0.        , ..., 0.9764874 , 0.9919133 ,
        0.9841107 ],
       ...,
       [0.95825356, 0.98332024, 0.9764874 , ..., 0.        , 0.9960306 ,
        0.98048794],
       [0.9775527 , 0.9970965 , 0.9919133 , ..., 0.9960306 , 0.        ,
        0.9899569 ],
       [0.9204125 , 0.97896427, 0.9841107 , ..., 0.98048794, 0.9899569 ,
        0.        ]], dtype=float32)

In [11]:
# Logger handed to the AiNet model below; looked up under the name "root".
root_logger = getLogger(name="root")

In [12]:
# Instantiate the project's AiNet model with the "cosine" distance method and
# the logger obtained above.
aiNet = AiNet(distance_method="cosine", logger=root_logger)

In [13]:
# Fit the AiNet model on the representation matrix `result`.
# The antibody count is set to 10% of the number of rows in `result`.
# NOTE(review): the meaning of the remaining hyper-parameters is defined by
# models.AiNet.fit - confirm there before tuning.
fit_settings = {
    "antigen_population": result,
    "max_iter": 15,
    "number_of_antibodies": int(result.shape[0] * 0.10),
    "clone_multiplier": 10,
    "no_best_cells_taken_each_selection": 5,
    "percent_clones_reselected": 0.12,
    "pruning_threshold": 0.5,
}
aiNet.fit(**fit_settings)

iter: 1 | cur_hyper_rate:  0.00000000 | |██████----------------------------------------------------------------------------------------------| 6.7% | avd: 0.00000 | net size: 47

In [14]:
# Inspect the shape of the antibody population held by the model.
aiNet.antibody_population.shape

(47, 57155)

In [15]:
# Persist the stratified sample as UTF-8 CSV without the DataFrame index,
# quoting only the fields that need it.
sample.to_csv(
    "sample_cleansed_train_file.csv",
    sep=",",
    index=False,
    encoding="utf-8",
    quotechar='"',
    quoting=csv.QUOTE_MINIMAL,
)

In [16]:
# Read the sample file back to verify that it round-trips through CSV.
# The file is only read here, so it is opened read-only: "r+" would also demand
# write permission and fail on a read-only file for no benefit.
with open("sample_cleansed_train_file.csv", "r", newline="", encoding="utf-8") as f:
    reader = csv.reader(f, delimiter=",", quoting=csv.QUOTE_MINIMAL, quotechar='"')
    header = next(reader)  # first row holds the column names
    data = list(reader)    # every remaining row is one record

# csv.reader yields strings only, so every column ends up with dtype object.
df1 = pd.DataFrame(data=data, columns=header)

In [17]:
# Summarize the reloaded sample: index range, columns, non-null counts, dtypes.
df1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6270 entries, 0 to 6269
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   id      6270 non-null   object
 1   title   6270 non-null   object
 2   text    6270 non-null   object
 3   label   6270 non-null   object
dtypes: object(4)
memory usage: 196.1+ KB


In [19]:
# Rows per label in the reloaded sample; expected to match the counts of `sample`,
# since df1 was read from the file written from it.
df1["label"].value_counts()

0    3462
1    2808
Name: label, dtype: int64

In [20]:
# Count missing values in the text column.
# Series.isnull is an alias of Series.isna, so adding both masks counted every
# missing value twice; a single mask yields the real count.
# NOTE(review): df1 is built from csv.reader rows, which are always strings, so
# a blank text would arrive as "" rather than NaN - consider checking for
# empty strings as well.
df1["text"].isna().sum()

0