In [1]:
import os
import csv
import sys
from pathlib import Path

sys.path.append(
    os.path.join(
        Path(os.getcwd()).parents[3],
        "src"
    )
)

csv.field_size_limit(sys.maxsize)

131072

In [2]:
import numpy as np
import pandas as pd
from models import AiNet
from representations import NGram
from logging import getLogger
from utils import cosine_distances, euclidean_distances

In [None]:
with open("cleansed_data.csv", "r+", newline="", encoding="utf-8") as f:
    reader = csv.reader(f, delimiter=",", quotechar='"', quoting=csv.QUOTE_MINIMAL)
    data = list(reader)
    header = data.pop(0)

df = pd.DataFrame(data=data, columns=header)
df.info()

In [7]:
df[df["category"].str.match("esg", case=False)].iloc[0].title

'The coronavirus epidemic: Implications for markets'

In [5]:
sample = df.groupby(by=["category"], as_index=False, group_keys=False).apply(lambda c: c.sample(frac=0.01))
sample.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4756 entries, 260427 to 149895
Data columns (total 4 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   original_id  4756 non-null   object
 1   title        4756 non-null   object
 2   content      4756 non-null   object
 3   category     4756 non-null   object
dtypes: object(4)
memory usage: 185.8+ KB


In [6]:
sample.sample(2)

Unnamed: 0,original_id,title,content,category
17744,17803,Rolling report: Impact of coronavirus on pro r...,"Get access to more than 30 brands, premium vid...",general
324573,326048,Minneapolis on trial: how the year of George F...,"At 9.52pm on 8 March 2020, my university sent ...",general


In [7]:
sample["category"].value_counts()

business    2631
general     1502
tech         522
science       81
esg           20
Name: category, dtype: int64

In [8]:
np.round((df["category"].value_counts() / df.shape[0]) * 100, 2)

business    55.31
general     31.59
tech        10.97
science      1.70
esg          0.43
Name: category, dtype: float64

In [9]:
np.round((sample["category"].value_counts() / sample.shape[0]) * 100, 2)

business    55.32
general     31.58
tech        10.98
science      1.70
esg          0.42
Name: category, dtype: float64

In [10]:
del df

In [12]:
del data

In [13]:
del reader

In [14]:
representation = NGram()
features, result = representation.generate_representation(sample["content"].tolist())
result.shape

(4756, 64752)

In [15]:
sys.getsizeof(result)

1231842176

In [16]:
2 ** 30

1073741824

In [17]:
(2 ** 10) ** 3

1073741824

In [18]:
sys.getsizeof(result) / (2 ** 30)

1.1472424268722534

In [19]:
cosine_distances(result, result)

array([[0.        , 0.9962292 , 0.9926793 , ..., 0.99515617, 0.95879716,
        0.9930818 ],
       [0.9962292 , 0.        , 0.9499531 , ..., 0.9853188 , 0.9894974 ,
        0.9642602 ],
       [0.9926793 , 0.9499531 , 0.        , ..., 0.9651548 , 0.9879954 ,
        0.9165024 ],
       ...,
       [0.99515617, 0.9853188 , 0.9651548 , ..., 0.        , 0.98443395,
        0.96122915],
       [0.95879716, 0.9894974 , 0.9879954 , ..., 0.98443395, 0.        ,
        0.9670766 ],
       [0.9930818 , 0.9642602 , 0.9165024 , ..., 0.96122915, 0.9670766 ,
        0.        ]], dtype=float32)

In [20]:
root_logger = getLogger("root")

In [21]:
aiNet = AiNet(distance_method="cosine", logger=root_logger)

In [22]:
aiNet.fit(
    antigen_population=result, 
    max_iter=15, 
    number_of_antibodies=int(result.shape[0] * 0.10), 
    clone_multiplier=10, 
    no_best_cells_taken_each_selection=5, 
    percent_clones_reselected=0.12, 
    pruning_threshold=0.5
)

iter: 1 | cur_hyper_rate:  0.00000000 | |██████----------------------------------------------------------------------------------------------| 6.7% | avd: 0.00000 | net size: 46

In [23]:
aiNet.antibody_population.shape

(46, 64752)

In [25]:
sample.to_csv("sample_cleansed_data.csv", encoding="utf-8", index=False, sep=",", quoting=csv.QUOTE_MINIMAL, quotechar='"')

In [3]:
with open("sample_cleansed_data.csv", "r+", newline="", encoding="utf-8") as f:
    reader = csv.reader(f, delimiter=",", quoting=csv.QUOTE_MINIMAL, quotechar='"')
    data = list(reader)
    header = data.pop(0)

df1 = pd.DataFrame(data=data, columns=header)

In [4]:
df1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4756 entries, 0 to 4755
Data columns (total 4 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   original_id  4756 non-null   object
 1   title        4756 non-null   object
 2   content      4756 non-null   object
 3   category     4756 non-null   object
dtypes: object(4)
memory usage: 148.8+ KB


In [5]:
df1["category"].value_counts()

business    2631
general     1502
tech         522
science       81
esg           20
Name: category, dtype: int64

In [6]:
(df1["content"].isna().astype(int) + df1["content"].isnull().astype(int)).sum()

0