In [1]:
import os
import csv
import sys
from pathlib import Path

# Make the project's `src` directory importable: it lives under the fourth
# ancestor of the notebook's current working directory.
project_root = Path(os.getcwd()).parents[3]
sys.path.append(os.path.join(project_root, "src"))

# Raise the csv field size limit as far as the platform allows so very long
# fields can be parsed. `sys.maxsize` does not fit in a C long on some
# platforms (e.g. 64-bit Windows) and raises OverflowError there, so halve
# the requested limit until it is accepted.
field_limit = sys.maxsize
while True:
    try:
        # field_size_limit returns the limit that was in effect before the call.
        previous_limit = csv.field_size_limit(field_limit)
        break
    except OverflowError:
        field_limit //= 2
previous_limit

131072

In [2]:
import numpy as np
import pandas as pd
from models import AiNet
from representations import NGram
from logging import getLogger
from utils import cosine_distances, euclidean_distances

In [3]:
# Load the cleansed dataset with the csv module.
# The file is only read here, so open it read-only ("r") instead of "r+",
# which needlessly requires write permission on the data file.
with open("cleansed_mapped_data.csv", "r", newline="", encoding="utf-8") as f:
    reader = csv.reader(f, delimiter=",", quotechar='"', quoting=csv.QUOTE_MINIMAL)
    header = next(reader)  # first row holds the column names
    data = list(reader)    # every remaining row is a record

# csv.reader returns each field as a string, hence all columns are `object`.
df = pd.DataFrame(data=data, columns=header)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 111507 entries, 0 to 111506
Data columns (total 11 columns):
 #   Column                   Non-Null Count   Dtype 
---  ------                   --------------   ----- 
 0   id                       111507 non-null  object
 1   author                   111507 non-null  object
 2   statement                111507 non-null  object
 3   target                   111507 non-null  object
 4   BinaryNumTarget          111507 non-null  object
 5   manual_keywords          111507 non-null  object
 6   tweet                    111507 non-null  object
 7   5_label_majority_answer  111507 non-null  object
 8   5_truthfulness           111507 non-null  object
 9   3_label_majority_answer  111507 non-null  object
 10  3_truthfulness           111507 non-null  object
dtypes: object(11)
memory usage: 9.4+ MB


In [6]:
# Row count per `3_truthfulness` label in the full dataset, ordered by label.
df.loc[:, "3_truthfulness"].value_counts(sort=False).sort_index()

False    54132
True     57375
Name: 3_truthfulness, dtype: int64

In [5]:
# Row count per `5_truthfulness` label in the full dataset, ordered by label.
df.loc[:, "5_truthfulness"].value_counts(sort=False).sort_index()

False           26606
Mostly False    27526
Mostly True     29124
True            28251
Name: 5_truthfulness, dtype: int64

In [7]:
# Percentage of rows per `5_truthfulness` label (two decimals), ordered by label.
label_counts = df["5_truthfulness"].value_counts()
np.round(label_counts / len(df) * 100, 2).sort_index()

False           23.86
Mostly False    24.69
Mostly True     26.12
True            25.34
Name: 5_truthfulness, dtype: float64

In [9]:
# Draw a 5% stratified sample: rows are sampled inside each `5_truthfulness`
# group, so every label contributes the same fraction of its rows.
# A fixed seed keeps the draw identical across kernel restarts; without it
# every run produced a different sample (and a different exported CSV).
SAMPLE_SEED = 42
sample = df.groupby(by=["5_truthfulness"], as_index=False, group_keys=False).apply(
    lambda c: c.sample(frac=0.05, random_state=SAMPLE_SEED)
)
sample.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5575 entries, 67610 to 54757
Data columns (total 11 columns):
 #   Column                   Non-Null Count  Dtype 
---  ------                   --------------  ----- 
 0   id                       5575 non-null   object
 1   author                   5575 non-null   object
 2   statement                5575 non-null   object
 3   target                   5575 non-null   object
 4   BinaryNumTarget          5575 non-null   object
 5   manual_keywords          5575 non-null   object
 6   tweet                    5575 non-null   object
 7   5_label_majority_answer  5575 non-null   object
 8   5_truthfulness           5575 non-null   object
 9   3_label_majority_answer  5575 non-null   object
 10  3_truthfulness           5575 non-null   object
dtypes: object(11)
memory usage: 522.7+ KB


In [10]:
# Eyeball two randomly chosen rows of the stratified sample.
sample.sample(n=2)

Unnamed: 0,id,author,statement,target,BinaryNumTarget,manual_keywords,tweet,5_label_majority_answer,5_truthfulness,3_label_majority_answer,3_truthfulness
57210,68981,Sue Owen,Nearly 40 percent of his McAllen-area constitu...,True,1,"40 percent, obesity",@MarkOkanagan @bigfatsurprise @amyalkon @DrJAs...,Agree,True,Agree,True
90618,109321,Tom Kertscher,If you were forced to use a Sharpie to fill ou...,False,0,"Sharpie, voter fraud",@awardsaa APOLLO JUSTICE ARIZONA ELECTION BALL...,Mostly Agree,Mostly False,Agree,False


In [23]:
# Row count per `3_truthfulness` label in the sample, ordered by label.
sample.loc[:, "3_truthfulness"].value_counts(sort=False).sort_index()

False    2706
True     2869
Name: 3_truthfulness, dtype: int64

In [12]:
# Row count per `5_truthfulness` label in the sample, most frequent first.
sample.loc[:, "5_truthfulness"].value_counts()

Mostly True     1456
True            1413
Mostly False    1376
False           1330
Name: 5_truthfulness, dtype: int64

In [13]:
# Percentage of rows per `3_truthfulness` label in the full dataset (two decimals).
pct_3_full = df["3_truthfulness"].value_counts() / len(df) * 100
np.round(pct_3_full, 2)

True     51.45
False    48.55
Name: 3_truthfulness, dtype: float64

In [14]:
# Percentage of rows per `5_truthfulness` label in the full dataset (two decimals).
pct_5_full = df["5_truthfulness"].value_counts() / len(df) * 100
np.round(pct_5_full, 2)

Mostly True     26.12
True            25.34
Mostly False    24.69
False           23.86
Name: 5_truthfulness, dtype: float64

In [15]:
# Percentage of rows per `3_truthfulness` label in the sample (two decimals);
# meant to be compared against the same figures for the full dataset.
pct_3_sample = sample["3_truthfulness"].value_counts() / len(sample) * 100
np.round(pct_3_sample, 2)

True     51.46
False    48.54
Name: 3_truthfulness, dtype: float64

In [16]:
# Percentage of rows per `5_truthfulness` label in the sample (two decimals);
# meant to be compared against the same figures for the full dataset.
pct_5_sample = sample["5_truthfulness"].value_counts() / len(sample) * 100
np.round(pct_5_sample, 2)

Mostly True     26.12
True            25.35
Mostly False    24.68
False           23.86
Name: 5_truthfulness, dtype: float64

In [12]:
# Turn the sampled tweets into an n-gram representation.
# `generate_representation` returns a pair, unpacked here as (features, result).
representation = NGram()
tweets = sample["tweet"].tolist()
features, result = representation.generate_representation(tweets)
result.shape

(11151, 26380)

In [13]:
# Size of the representation object in bytes, as reported by sys.getsizeof.
sys.getsizeof(result)

1176653648

In [14]:
# Same size expressed in GiB (1 GiB = 1024 ** 3 bytes).
sys.getsizeof(result) / 1024 ** 3

1.095844104886055

In [15]:
# Sanity check of the project's cosine distance helper: compare the
# representation against itself. The result is only displayed, not stored.
# NOTE(review): presumably returns a square pairwise matrix — confirm in utils.
cosine_distances(result, result)

array([[0.        , 1.        , 0.9714923 , ..., 1.        , 1.        ,
        0.98369694],
       [1.        , 0.        , 1.        , ..., 1.        , 1.        ,
        1.        ],
       [0.9714923 , 1.        , 0.        , ..., 1.        , 1.        ,
        1.        ],
       ...,
       [1.        , 1.        , 1.        , ..., 0.        , 1.        ,
        0.98115957],
       [1.        , 1.        , 1.        , ..., 1.        , 0.        ,
        0.9528623 ],
       [0.98369694, 1.        , 1.        , ..., 0.98115957, 0.9528623 ,
        0.        ]], dtype=float32)

In [16]:
# Fetch the actual root logger. Calling getLogger() without a name always
# returns it, whereas getLogger("root") only does so from Python 3.9 on;
# older versions hand back a separate child logger that happens to be
# named "root".
root_logger = getLogger()

In [17]:
# Instantiate the project's AiNet model with cosine distance selected by name,
# handing it the logger created above for its log output.
aiNet = AiNet(distance_method="cosine", logger=root_logger)

In [19]:
# Fit the network on the n-gram representation. Hyper-parameters are gathered
# in one mapping so they can be read (and tuned) at a glance.
fit_params = dict(
    antigen_population=result,
    max_iter=15,
    # initial antibody count: 10% of the number of antigens (rows of `result`)
    number_of_antibodies=int(result.shape[0] * 0.10),
    clone_multiplier=10,
    no_best_cells_taken_each_selection=5,
    percent_clones_reselected=0.12,
    pruning_threshold=0.5,
)
aiNet.fit(**fit_params)

iter: 1 | cur_hyper_rate:  0.00000000 | |██████----------------------------------------------------------------------------------------------| 6.7% | avd: 0.00000 | net size: 0

In [20]:
# Shape of the antibody population held by the model after fitting.
aiNet.antibody_population.shape

(0,)

In [17]:
# Persist the stratified sample. The DataFrame index is dropped and every
# non-numeric field is wrapped in double quotes.
sample.to_csv(
    "sample_cleansed_mapped_data.csv",
    sep=",",
    index=False,
    encoding="utf-8",
    quotechar='"',
    quoting=csv.QUOTE_NONNUMERIC,
)

In [18]:
# Read the exported sample back to verify the round trip.
# The file is only read here, so open it read-only ("r") instead of "r+",
# which needlessly requires write permission on the file.
with open("sample_cleansed_mapped_data.csv", "r", newline="", encoding="utf-8") as f:
    reader = csv.reader(f, delimiter=",", quoting=csv.QUOTE_MINIMAL, quotechar='"')
    header = next(reader)  # first row holds the column names
    data = list(reader)    # every remaining row is a record

# csv.reader returns each field as a string, hence all columns are `object`.
df1 = pd.DataFrame(data=data, columns=header)

In [24]:
# Column overview of the re-loaded sample (row count, non-null counts, dtypes).
df1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5575 entries, 0 to 5574
Data columns (total 11 columns):
 #   Column                   Non-Null Count  Dtype 
---  ------                   --------------  ----- 
 0   id                       5575 non-null   object
 1   author                   5575 non-null   object
 2   statement                5575 non-null   object
 3   target                   5575 non-null   object
 4   BinaryNumTarget          5575 non-null   object
 5   manual_keywords          5575 non-null   object
 6   tweet                    5575 non-null   object
 7   5_label_majority_answer  5575 non-null   object
 8   5_truthfulness           5575 non-null   object
 9   3_label_majority_answer  5575 non-null   object
 10  3_truthfulness           5575 non-null   object
dtypes: object(11)
memory usage: 479.2+ KB


In [20]:
# Row count per `3_truthfulness` label in the re-loaded sample, most frequent first.
df1.loc[:, "3_truthfulness"].value_counts()

True     2869
False    2706
Name: 3_truthfulness, dtype: int64

In [21]:
# Row count per `5_truthfulness` label in the re-loaded sample, most frequent first.
df1.loc[:, "5_truthfulness"].value_counts()

Mostly True     1456
True            1413
Mostly False    1376
False           1330
Name: 5_truthfulness, dtype: int64

In [25]:
# Count unusable tweets in the re-loaded sample.
# `isnull` is only an alias of `isna`, so adding the two counted each missing
# value twice. In addition, csv.reader returns absent fields as "" rather than
# NaN, so blank strings must be tested explicitly for the check to mean anything.
tweet_col = df1["tweet"]
(tweet_col.isna() | tweet_col.astype(str).str.strip().eq("")).sum()

0