In [1]:
import os
import csv
import sys
from pathlib import Path

# Put "<fourth ancestor of the working directory>/src" on the import path;
# the cells below import project-local modules that are resolved through it.
project_root = Path(os.getcwd()).parents[3]
src_dir = os.path.join(project_root, "src")
sys.path.append(src_dir)

# Raise the csv module's per-field size cap to sys.maxsize. The call returns
# the previous limit, which becomes the cell's displayed output.
csv.field_size_limit(sys.maxsize)

131072

In [None]:
import numpy as np
import pandas as pd
from models import AiNet
from representations import NGram
from logging import getLogger
from utils import cosine_distances, euclidean_distances

In [None]:
# Load the cleansed reviews through the csv module; every field arrives as a
# plain string. The file is only read here, so it is opened read-only ("r")
# rather than "r+", which would needlessly demand write permission on the
# data file.
with open("cleansed_data.csv", "r", newline="", encoding="utf-8") as f:
    reader = csv.reader(f, delimiter=",", quotechar='"', quoting=csv.QUOTE_MINIMAL)
    header = next(reader)  # first row holds the column names
    data = list(reader)    # all remaining rows

# Build the frame from the raw rows and print its column summary.
df = pd.DataFrame(data=data, columns=header)
df.info()

In [None]:
# Stratified sample: draw the same fraction of rows from every score class.
# A seeded RandomState is shared by all groups so that "Restart & Run All"
# reproduces exactly the same sample; the previous unseeded draw made every
# downstream output change from run to run.
SAMPLE_FRACTION = 0.30
SAMPLE_SEED = 42
sample_rng = np.random.RandomState(SAMPLE_SEED)

sample = df.groupby(by=["score_class"], as_index=False, group_keys=False).apply(
    lambda c: c.sample(frac=SAMPLE_FRACTION, random_state=sample_rng)
)
sample.info()

In [12]:
# Display two randomly drawn rows of the sample as a quick sanity check.
sample.sample(n=2)

Unnamed: 0,id,review_title,review,url,best_new_music,author,author_type,score,score_class
3562,18831,drilluminati 2,As Chicago’s King L continues to orbit around ...,http://pitchfork.com/reviews/albums/18831-king...,0,jordan sargent,,6.7,7
10409,11224,grand archives,\r\n I'll give this to Sub Pop: They certai...,http://pitchfork.com/reviews/albums/11224-gran...,0,ian cohen,contributor,7.8,8


In [13]:
# Number of sampled rows per score class.
sample["score_class"].value_counts()

8     1952
7     1690
6      951
5      300
9      286
4      170
3       64
10      50
2       36
1        7
0        5
Name: score_class, dtype: int64

In [14]:
# Share of each score class in the full dataset, in percent with two decimals.
df["score_class"].value_counts().div(df.shape[0]).mul(100).round(2)

8     35.42
7     30.66
6     17.26
5      5.44
9      5.19
4      3.07
3      1.16
10     0.91
2      0.66
1      0.13
0      0.10
Name: score_class, dtype: float64

In [15]:
# Share of each score class in the sample, in percent with two decimals;
# meant to be compared against the same figures for the full dataset.
sample["score_class"].value_counts().div(sample.shape[0]).mul(100).round(2)

8     35.42
7     30.67
6     17.26
5      5.44
9      5.19
4      3.08
3      1.16
10     0.91
2      0.65
1      0.13
0      0.09
Name: score_class, dtype: float64

In [17]:
# Turn the sampled review texts into their n-gram representation.
representation = NGram()
review_texts = sample["review"].tolist()
features, result = representation.generate_representation(review_texts)

# Dimensions of the resulting matrix.
result.shape

(5511, 65847)

In [20]:
# Size of the representation matrix in bytes, as reported by the interpreter.
sys.getsizeof(result)

1451531396

In [21]:
# Same size expressed in GiB (1 GiB = 1024 ** 3 bytes).
sys.getsizeof(result) / 1024 ** 3

1.3518439568579197

In [22]:
# Cosine distances of the representation matrix against itself.
# NOTE(review): presumably one row/column per review (pairwise distances) -
# confirm against utils.cosine_distances.
cosine_distances(result, result)

array([[0.        , 0.95929897, 0.9650641 , ..., 0.9608364 , 0.969087  ,
        0.96520764],
       [0.95929897, 0.        , 0.9397638 , ..., 0.9552146 , 0.9545004 ,
        0.9536209 ],
       [0.9650641 , 0.9397638 , 0.        , ..., 0.9578414 , 0.93697447,
        0.7263733 ],
       ...,
       [0.9608364 , 0.9552146 , 0.9578414 , ..., 0.        , 0.9624095 ,
        0.96179396],
       [0.969087  , 0.9545004 , 0.93697447, ..., 0.9624095 , 0.        ,
        0.9427093 ],
       [0.96520764, 0.9536209 , 0.7263733 , ..., 0.96179396, 0.9427093 ,
        0.        ]], dtype=float32)

In [23]:
# Obtain the real root logger. getLogger() without a name returns it on every
# Python version, whereas getLogger("root") only does so from Python 3.9 on;
# earlier versions hand back a separate, non-root logger that is merely
# named "root".
root_logger = getLogger()

In [24]:
# Create the network, configured for cosine distance and wired to the logger
# obtained above.
aiNet = AiNet(
    logger=root_logger,
    distance_method="cosine",
)

In [25]:
# Settings for this training run, gathered in one place so they are easy to
# scan and tweak. number_of_antibodies is 10% of the rows in `result`.
fit_params = dict(
    antigen_population=result,
    max_iter=15,
    number_of_antibodies=int(result.shape[0] * 0.10),
    clone_multiplier=10,
    no_best_cells_taken_each_selection=5,
    percent_clones_reselected=0.12,
    pruning_threshold=0.5,
)

# Fit the network on the representation matrix.
aiNet.fit(**fit_params)

iter: 1 | cur_hyper_rate:  0.00000000 | |██████----------------------------------------------------------------------------------------------| 6.7% | avd: 0.00000 | net size: 142

In [26]:
# Shape of the antibody population held by the network after fitting.
aiNet.antibody_population.shape

(142, 65847)

In [27]:
# Write the sample to disk as a comma-separated UTF-8 file without the index
# column, using minimal double-quote quoting.
sample.to_csv(
    "sample_cleansed_data.csv",
    sep=",",
    quotechar='"',
    quoting=csv.QUOTE_MINIMAL,
    index=False,
    encoding="utf-8",
)

In [28]:
# Read the exported sample back to verify the round trip. The file is only
# read here, so it is opened read-only ("r") rather than "r+", which would
# needlessly demand write permission on the file.
with open("sample_cleansed_data.csv", "r", newline="", encoding="utf-8") as f:
    reader = csv.reader(f, delimiter=",", quoting=csv.QUOTE_MINIMAL, quotechar='"')
    header = next(reader)  # first row holds the column names
    data = list(reader)    # all remaining rows, every field a plain string

df1 = pd.DataFrame(data=data, columns=header)

In [31]:
# Column names, non-null counts and dtypes of the reloaded frame.
df1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5511 entries, 0 to 5510
Data columns (total 9 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   id              5511 non-null   object
 1   review_title    5511 non-null   object
 2   review          5511 non-null   object
 3   url             5511 non-null   object
 4   best_new_music  5511 non-null   object
 5   author          5511 non-null   object
 6   author_type     5511 non-null   object
 7   score           5511 non-null   object
 8   score_class     5511 non-null   object
dtypes: object(9)
memory usage: 387.6+ KB


In [32]:
# Rows per score class in the reloaded frame.
df1["score_class"].value_counts()

8     1952
7     1690
6      951
5      300
9      286
4      170
3       64
10      50
2       36
1        7
0        5
Name: score_class, dtype: int64

In [33]:
# Number of rows in the reloaded sample whose review is missing.
# `isnull` is an alias of `isna`, so adding the two masks only counted every
# NaN twice. The frame is also built from csv.reader rows, where a missing
# review arrives as an empty string rather than NaN, so the old check could
# never fire. Count each row once and treat blank strings as missing too.
(df1["review"].isna() | df1["review"].str.strip().eq("")).sum()

0