In [1]:
import os
import csv
import sys
from pathlib import Path

sys.path.append(
    os.path.join(
        Path(os.getcwd()).parents[3],
        "src"
    )
)

csv.field_size_limit(sys.maxsize)

131072

In [2]:
import numpy as np
import pandas as pd
from models import AiNet
from representations import NGram
from logging import getLogger
from utils import cosine_distances, euclidean_distances

In [3]:
with open("cleansed_train_data.csv", "r+", newline="", encoding="utf-8") as f:
    reader = csv.reader(f, delimiter=",", quotechar='"', quoting=csv.QUOTE_MINIMAL)
    data = list(reader)
    header = data.pop(0)

df = pd.DataFrame(data=data, columns=header)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 112303 entries, 0 to 112302
Data columns (total 7 columns):
 #   Column       Non-Null Count   Dtype 
---  ------       --------------   ----- 
 0   id           112303 non-null  object
 1   drugName     112303 non-null  object
 2   condition    112303 non-null  object
 3   review       112303 non-null  object
 4   rating       112303 non-null  object
 5   date         112303 non-null  object
 6   usefulCount  112303 non-null  object
dtypes: object(7)
memory usage: 6.0+ MB


In [6]:
df["rating"].value_counts()

10    35514
9     19143
1     15094
8     13197
7      6621
5      5585
2      4802
3      4481
6      4407
4      3459
Name: rating, dtype: int64

In [15]:
sample = df.groupby(by=["rating"], as_index=False, group_keys=False).apply(lambda c: c.sample(frac=0.05))
sample.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5615 entries, 95810 to 56819
Data columns (total 7 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   id           5615 non-null   object
 1   drugName     5615 non-null   object
 2   condition    5615 non-null   object
 3   review       5615 non-null   object
 4   rating       5615 non-null   object
 5   date         5615 non-null   object
 6   usefulCount  5615 non-null   object
dtypes: object(7)
memory usage: 350.9+ KB


In [16]:
sample.sample(2)

Unnamed: 0,id,drugName,condition,review,rating,date,usefulCount
92761,230402,Meloxicam,Inflammatory Conditions,"""My significant other died within 5 days of st...",1,"August 26, 2015",69
53619,113380,Bisacodyl,Constipation,"""I took it right before bed (around 10) I slep...",8,"June 16, 2015",14


In [35]:
sample["rating"].value_counts().sort_index()

1      755
10    1776
2      240
3      224
4      173
5      279
6      220
7      331
8      660
9      957
Name: rating, dtype: int64

In [37]:
np.round((df["rating"].astype(int).value_counts() / df.shape[0]) * 100, 2).sort_index()

1     13.44
2      4.28
3      3.99
4      3.08
5      4.97
6      3.92
7      5.90
8     11.75
9     17.05
10    31.62
Name: rating, dtype: float64

In [39]:
np.round((sample["rating"].astype(int).value_counts() / sample.shape[0]) * 100, 2).sort_index()

1     13.45
2      4.27
3      3.99
4      3.08
5      4.97
6      3.92
7      5.89
8     11.75
9     17.04
10    31.63
Name: rating, dtype: float64

In [20]:
representation = NGram()
features, result = representation.generate_representation(sample["review"].tolist())
result.shape

(5615, 9534)

In [21]:
sys.getsizeof(result)

214133768

In [22]:
sys.getsizeof(result) / (2 ** 30)

0.19942761212587357

In [23]:
cosine_distances(result, result)

array([[0.        , 0.8648496 , 1.        , ..., 1.        , 0.98855263,
        1.        ],
       [0.8648496 , 0.        , 0.9854263 , ..., 1.        , 0.9536495 ,
        0.98339695],
       [1.        , 0.9854263 , 0.        , ..., 0.9806092 , 0.9847667 ,
        1.        ],
       ...,
       [1.        , 1.        , 0.9806092 , ..., 0.        , 0.95522135,
        1.        ],
       [0.98855263, 0.9536495 , 0.9847667 , ..., 0.95522135, 0.        ,
        0.9457799 ],
       [1.        , 0.98339695, 1.        , ..., 1.        , 0.9457799 ,
        0.        ]], dtype=float32)

In [24]:
root_logger = getLogger("root")

In [25]:
aiNet = AiNet(distance_method="cosine", logger=root_logger)

In [26]:
aiNet.fit(
    antigen_population=result, 
    max_iter=15, 
    number_of_antibodies=int(result.shape[0] * 0.10), 
    clone_multiplier=10, 
    no_best_cells_taken_each_selection=5, 
    percent_clones_reselected=0.12, 
    pruning_threshold=0.5
)

iter: 1 | cur_hyper_rate:  0.00000000 | |██████----------------------------------------------------------------------------------------------| 6.7% | avd: 0.00000 | net size: 130

In [27]:
aiNet.antibody_population.shape

(130, 9534)

In [28]:
sample.to_csv("sample_cleansed_train_data.csv", encoding="utf-8", index=False, sep=",", quoting=csv.QUOTE_MINIMAL, quotechar='"')

In [30]:
with open("sample_cleansed_train_data.csv", "r+", newline="", encoding="utf-8") as f:
    reader = csv.reader(f, delimiter=",", quoting=csv.QUOTE_MINIMAL, quotechar='"')
    data = list(reader)
    header = data.pop(0)

df1 = pd.DataFrame(data=data, columns=header)

In [31]:
df1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5615 entries, 0 to 5614
Data columns (total 7 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   id           5615 non-null   object
 1   drugName     5615 non-null   object
 2   condition    5615 non-null   object
 3   review       5615 non-null   object
 4   rating       5615 non-null   object
 5   date         5615 non-null   object
 6   usefulCount  5615 non-null   object
dtypes: object(7)
memory usage: 307.2+ KB


In [32]:
df1["score_class"].value_counts()

8     1952
7     1690
6      951
5      300
9      286
4      170
3       64
10      50
2       36
1        7
0        5
Name: score_class, dtype: int64

In [32]:
(df1["review"].isna().astype(int) + df1["review"].isnull().astype(int)).sum()

0