In [1]:
import os
import csv
import sys
from pathlib import Path

# Make the project's "src" directory importable. It is resolved as the fourth
# ancestor of the current working directory (parents[3]) joined with "src".
src_dir = os.path.join(Path(os.getcwd()).parents[3], "src")

# Only register the path once so that re-running this cell does not keep
# growing sys.path with duplicate entries.
if src_dir not in sys.path:
    sys.path.append(src_dir)

In [2]:
import numpy as np
import pandas as pd
from models import AiNet
from representations import NGram
from logging import getLogger
from utils import cosine_distances, euclidean_distances

In [3]:
# Load the cleansed, labelled training file.
# The file is parsed with csv.reader, so every field stays a raw string and
# all DataFrame columns end up with dtype object.
# Opened read-only ("r"): this cell never writes to the file, so read/write
# mode ("r+") would only demand write permission and risk touching the source data.
with open("cleansed_labelled_train_file.csv", "r", newline="", encoding="utf-8") as f:
    reader = csv.reader(f, delimiter=",", quotechar='"', quoting=csv.QUOTE_MINIMAL)
    data = list(reader)
    # The first row holds the column names; the remaining rows are the records.
    header = data.pop(0)

df = pd.DataFrame(data=data, columns=header)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 40406 entries, 0 to 40405
Data columns (total 12 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   IDLink             40406 non-null  object
 1   Title              40406 non-null  object
 2   Headline           40406 non-null  object
 3   Source             40406 non-null  object
 4   Topic              40406 non-null  object
 5   PublishDate        40406 non-null  object
 6   Facebook           40406 non-null  object
 7   GooglePlus         40406 non-null  object
 8   LinkedIn           40406 non-null  object
 9   SentimentTitle     40406 non-null  object
 10  SentimentHeadline  40406 non-null  object
 11  label              40406 non-null  object
dtypes: object(12)
memory usage: 3.7+ MB


In [4]:
# Absolute number of rows per label, ordered alphabetically by label.
(
    df["label"]
    .value_counts()
    .sort_index()
)

Facebook      32975
GooglePlus     1281
LinkedIn       6150
Name: label, dtype: int64

In [5]:
# Share of each label in the full data set, as a percentage rounded to two
# decimals and ordered alphabetically by label.
(
    df["label"]
    .value_counts()
    .div(df.shape[0])
    .mul(100)
    .round(2)
    .sort_index()
)

Facebook      81.61
GooglePlus     3.17
LinkedIn      15.22
Name: label, dtype: float64

In [4]:
# Stratified 15% sample: the same fraction is drawn from every label group,
# so the label proportions of the full data set carry over to the sample.
# A fixed random_state makes the draw reproducible across kernel restarts;
# without it every run produces a different sample (and different downstream results).
sample = df.groupby(by=["label"], as_index=False, group_keys=False).apply(
    lambda c: c.sample(frac=0.15, random_state=42)
)
sample.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 6060 entries, 35420 to 34299
Data columns (total 12 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   IDLink             6060 non-null   object
 1   Title              6060 non-null   object
 2   Headline           6060 non-null   object
 3   Source             6060 non-null   object
 4   Topic              6060 non-null   object
 5   PublishDate        6060 non-null   object
 6   Facebook           6060 non-null   object
 7   GooglePlus         6060 non-null   object
 8   LinkedIn           6060 non-null   object
 9   SentimentTitle     6060 non-null   object
 10  SentimentHeadline  6060 non-null   object
 11  label              6060 non-null   object
dtypes: object(12)
memory usage: 615.5+ KB


In [5]:
# Peek at two randomly chosen rows of the sample.
sample.sample(n=2)

Unnamed: 0,IDLink,Title,Headline,Source,Topic,PublishDate,Facebook,GooglePlus,LinkedIn,SentimentTitle,SentimentHeadline,label
11809,G01ModYUV0,Ingram Micro joins Microsoft cloud programme i...,Ingram Micro announced that it has joined Micr...,Telecompaper (subscription),microsoft,2015-12-21 09:57:23,0,0,1,-0.0668402777777777,-0.0168869717050765,LinkedIn
1247,nqqALiJIkv,"Microsoft, GM, Ford among several corporations...",Automotive and technology firms including Ford...,WDIV Detroit,microsoft,2015-11-15 21:26:16,8,1,1,0.0,-0.0430154501007303,Facebook


In [14]:
# Absolute number of sampled rows per label, ordered alphabetically by label.
(
    sample["label"]
    .value_counts()
    .sort_index()
)

Facebook      4946
GooglePlus     192
LinkedIn       922
Name: label, dtype: int64

In [7]:
# Share of each label in the full data set, as a percentage rounded to two
# decimals (most frequent label first, the value_counts default order).
(
    df["label"]
    .value_counts()
    .div(df.shape[0])
    .mul(100)
    .round(2)
)

Facebook      81.61
LinkedIn      15.22
GooglePlus     3.17
Name: label, dtype: float64

In [8]:
# Share of each label in the sample, as a percentage rounded to two decimals;
# meant to be compared against the same figures for the full data set.
(
    sample["label"]
    .value_counts()
    .div(sample.shape[0])
    .mul(100)
    .round(2)
)

Facebook      81.62
LinkedIn      15.21
GooglePlus     3.17
Name: label, dtype: float64

In [22]:
# Build the NGram representation of the sampled headlines and show the shape
# of the resulting matrix.
representation = NGram()
features, result = representation.generate_representation(
    sample["Headline"].to_list()
)
result.shape

(10102, 13621)

In [23]:
# Pairwise distances of the representation matrix against itself, using the
# project's cosine_distances helper. The value is only displayed, not stored.
# NOTE(review): this materialises a full rows-by-rows matrix; confirm that is
# acceptable for the sample size in use.
cosine_distances(result, result)

array([[0.        , 0.9313711 , 1.        , ..., 0.96771014, 1.        ,
        1.        ],
       [0.9313711 , 0.        , 1.        , ..., 0.97608423, 1.        ,
        0.9490532 ],
       [1.        , 1.        , 0.        , ..., 1.        , 0.9849643 ,
        1.        ],
       ...,
       [0.96771014, 0.97608423, 1.        , ..., 0.        , 1.        ,
        0.8994009 ],
       [1.        , 1.        , 0.9849643 , ..., 1.        , 0.        ,
        1.        ],
       [1.        , 0.9490532 , 1.        , ..., 0.8994009 , 1.        ,
        0.        ]], dtype=float32)

In [12]:
# Obtain the root logger. Calling getLogger() without a name returns the root
# logger on every Python version, whereas getLogger("root") only does so on
# Python 3.9+ (earlier versions create a separate logger that is merely named "root").
root_logger = getLogger()

In [12]:
# Instantiate the AiNet model with cosine distance, logging through root_logger.
aiNet = AiNet(
    logger=root_logger,
    distance_method="cosine",
)

In [13]:
# Fit the network on the representation matrix.
# number_of_antibodies is set to 10% of the number of rows in `result`.
aiNet.fit(
    antigen_population=result,
    number_of_antibodies=int(result.shape[0] * 0.10),
    max_iter=15,
    clone_multiplier=10,
    no_best_cells_taken_each_selection=5,
    percent_clones_reselected=0.12,
    pruning_threshold=0.5,
)

iter: 1 | cur_hyper_rate:  0.00000000 | |██████----------------------------------------------------------------------------------------------| 6.7% | avd: 0.00000 | net size: 68

In [14]:
# Inspect the shape of the model's antibody_population after the fit call.
aiNet.antibody_population.shape

(68, 13698)

In [9]:
# Persist the sample as a UTF-8, comma-separated file without the index
# column, quoting fields only where necessary.
sample.to_csv(
    "sample_cleansed_labelled_train_file.csv",
    sep=",",
    quotechar='"',
    quoting=csv.QUOTE_MINIMAL,
    index=False,
    encoding="utf-8",
)

In [10]:
# Read the written sample file back to verify the CSV round trip.
# The file is parsed with csv.reader, so every field stays a raw string.
# Opened read-only ("r"): nothing is written here, so read/write mode ("r+")
# would only demand write permission and risk touching the file.
with open("sample_cleansed_labelled_train_file.csv", "r", newline="", encoding="utf-8") as f:
    reader = csv.reader(f, delimiter=",", quoting=csv.QUOTE_MINIMAL, quotechar='"')
    data = list(reader)
    # The first row holds the column names; the remaining rows are the records.
    header = data.pop(0)

df1 = pd.DataFrame(data=data, columns=header)

In [11]:
# Summarise the frame re-read from the sample CSV (columns, non-null counts,
# dtypes) to compare against the in-memory sample.
df1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6060 entries, 0 to 6059
Data columns (total 12 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   IDLink             6060 non-null   object
 1   Title              6060 non-null   object
 2   Headline           6060 non-null   object
 3   Source             6060 non-null   object
 4   Topic              6060 non-null   object
 5   PublishDate        6060 non-null   object
 6   Facebook           6060 non-null   object
 7   GooglePlus         6060 non-null   object
 8   LinkedIn           6060 non-null   object
 9   SentimentTitle     6060 non-null   object
 10  SentimentHeadline  6060 non-null   object
 11  label              6060 non-null   object
dtypes: object(12)
memory usage: 568.3+ KB


In [12]:
# Number of rows per label in the re-read sample (most frequent label first).
(
    df1["label"]
    .value_counts()
)

Facebook      4946
LinkedIn       922
GooglePlus     192
Name: label, dtype: int64

In [13]:
# Count headlines that are missing after the CSV round trip.
# isna() and isnull() are aliases, so summing both counted every null twice.
# Because df1 is built from csv.reader rows, a missing value arrives as an
# empty string rather than NaN, so blank / whitespace-only headlines are
# counted as missing as well.
(df1["Headline"].isna() | df1["Headline"].str.strip().eq("")).sum()

0