In [1]:
import os
import csv
import sys
from pathlib import Path

sys.path.append(
    os.path.join(
        Path(os.getcwd()).parents[3],
        "src"
    )
)

In [2]:
import numpy as np
import pandas as pd
from models import AiNet
from representations import NGram
from logging import getLogger
from utils import cosine_distances, euclidean_distances

In [3]:
with open("cleansed_train_file.csv", "r+", newline="", encoding="utf-8") as f:
    reader = csv.reader(f, delimiter=",", quotechar='"', quoting=csv.QUOTE_MINIMAL)
    data = list(reader)
    header = data.pop(0)

df = pd.DataFrame(data=data, columns=header)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 52041 entries, 0 to 52040
Data columns (total 11 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   IDLink             52041 non-null  object
 1   Title              52041 non-null  object
 2   Headline           52041 non-null  object
 3   Source             52041 non-null  object
 4   Topic              52041 non-null  object
 5   PublishDate        52041 non-null  object
 6   Facebook           52041 non-null  object
 7   GooglePlus         52041 non-null  object
 8   LinkedIn           52041 non-null  object
 9   SentimentTitle     52041 non-null  object
 10  SentimentHeadline  52041 non-null  object
dtypes: object(11)
memory usage: 4.4+ MB


In [4]:
df["Topic"].value_counts().sort_index()

economy      18939
microsoft    12132
obama        15808
palestine     5162
Name: Topic, dtype: int64

In [5]:
np.round((df["Topic"].value_counts() / df.shape[0]) * 100, 2).sort_index()

economy      36.39
microsoft    23.31
obama        30.38
palestine     9.92
Name: Topic, dtype: float64

In [4]:
sample = df.groupby(by=["Topic"], as_index=False, group_keys=False).apply(lambda c: c.sample(frac=0.12))
sample.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 6245 entries, 40029 to 11135
Data columns (total 11 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   IDLink             6245 non-null   object
 1   Title              6245 non-null   object
 2   Headline           6245 non-null   object
 3   Source             6245 non-null   object
 4   Topic              6245 non-null   object
 5   PublishDate        6245 non-null   object
 6   Facebook           6245 non-null   object
 7   GooglePlus         6245 non-null   object
 8   LinkedIn           6245 non-null   object
 9   SentimentTitle     6245 non-null   object
 10  SentimentHeadline  6245 non-null   object
dtypes: object(11)
memory usage: 585.5+ KB


In [5]:
sample.sample(2)

Unnamed: 0,IDLink,Title,Headline,Source,Topic,PublishDate,Facebook,GooglePlus,LinkedIn,SentimentTitle,SentimentHeadline
25124,bcARglJdaA,High Court Will Hear Microsoft Appeal Over Xbo...,The Supreme Court will decide whether Microsof...,New York Times,microsoft,2016-01-15 13:48:00,0,0,0,0.189583333333333,-0.0739509972887452
9451,bjUzYccNlm,Saving Europe's Economy From Going Down ...,Philippe Legrain is the author of European Spr...,The Brussels Times,economy,2015-12-03 17:08:18,1,0,0,-0.089623257515895,-0.0761123511904762


In [15]:
sample["Topic"].value_counts().sort_index()

economy      2273
microsoft    1456
obama        1897
palestine     619
Name: Topic, dtype: int64

In [16]:
np.round((df["Topic"].value_counts() / df.shape[0]) * 100, 2).sort_index()

economy      36.39
microsoft    23.31
obama        30.38
palestine     9.92
Name: Topic, dtype: float64

In [8]:
np.round((sample["Topic"].value_counts() / sample.shape[0]) * 100, 2)

economy      36.40
obama        30.38
microsoft    23.31
palestine     9.91
Name: Topic, dtype: float64

In [None]:
representation = NGram()
features, result =representation.generate_representation(sample["Headline"].tolist())
result.shape

In [17]:
cosine_distances(result, result)

array([[0.        , 0.98551255, 0.98605776, ..., 0.9635516 , 1.        ,
        1.        ],
       [0.98551255, 0.        , 0.9862035 , ..., 1.        , 1.        ,
        1.        ],
       [0.98605776, 0.9862035 , 0.        , ..., 1.        , 0.9812703 ,
        0.9547632 ],
       ...,
       [0.9635516 , 1.        , 1.        , ..., 0.        , 1.        ,
        0.96716624],
       [1.        , 1.        , 0.9812703 , ..., 1.        , 0.        ,
        1.        ],
       [1.        , 1.        , 0.9547632 , ..., 0.96716624, 1.        ,
        0.        ]], dtype=float32)

In [18]:
root_logger = getLogger("root")

In [19]:
aiNet = AiNet(distance_method="cosine", logger=root_logger)

In [20]:
aiNet.fit(
    antigen_population=result, 
    max_iter=15, 
    number_of_antibodies=int(result.shape[0] * 0.10), 
    clone_multiplier=10, 
    no_best_cells_taken_each_selection=5, 
    percent_clones_reselected=0.12, 
    pruning_threshold=0.5
)

iter: 1 | cur_hyper_rate:  0.00000000 | |██████----------------------------------------------------------------------------------------------| 6.7% | avd: 0.00000 | net size: 100

In [21]:
aiNet.antibody_population.shape

(100, 14072)

In [9]:
sample.to_csv("sample_cleansed_train_file.csv", encoding="utf-8", index=False, sep=",", quoting=csv.QUOTE_MINIMAL, quotechar='"')

In [10]:
with open("sample_cleansed_train_file.csv", "r+", newline="", encoding="utf-8") as f:
    reader = csv.reader(f, delimiter=",", quoting=csv.QUOTE_MINIMAL, quotechar='"')
    data = list(reader)
    header = data.pop(0)

df1 = pd.DataFrame(data=data, columns=header)

In [11]:
df1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6245 entries, 0 to 6244
Data columns (total 11 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   IDLink             6245 non-null   object
 1   Title              6245 non-null   object
 2   Headline           6245 non-null   object
 3   Source             6245 non-null   object
 4   Topic              6245 non-null   object
 5   PublishDate        6245 non-null   object
 6   Facebook           6245 non-null   object
 7   GooglePlus         6245 non-null   object
 8   LinkedIn           6245 non-null   object
 9   SentimentTitle     6245 non-null   object
 10  SentimentHeadline  6245 non-null   object
dtypes: object(11)
memory usage: 536.8+ KB


In [12]:
df1["Topic"].value_counts()

economy      2273
obama        1897
microsoft    1456
palestine     619
Name: Topic, dtype: int64

In [14]:
(df1["Headline"].isna().astype(int) + df1["Headline"].isnull().astype(int)).sum()

0