In [None]:
import yaml
import pandas as pd
from tqdm import tqdm
from yaspin import yaspin
from yaspin.spinners import Spinners
import ast
import swifter
from datetime import datetime
from socialvec.socialvec import SocialVec
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

tqdm.pandas()

from aux_functions import *

In [None]:

# Path of the YAML configuration file read below.
config = "config.yaml"

# Parse the configuration into `conf`; later cells look their settings up in it.
# NOTE(review): FullLoader is acceptable for a trusted local file; consider
# yaml.safe_load if this file could ever come from an untrusted source.
with open(config, mode='r') as config_stream:
    conf = yaml.load(config_stream, Loader=yaml.FullLoader)


In [None]:
# Read the raw dataset and keep only the rows coming from the configured source.
with yaspin(Spinners.arc, text="Reading Data") as sp:
    data_for_training = pd.read_csv(conf['data_file'])

    # reset_index() materialises the old index as an 'index' column, which is
    # discarded straight away; the frame ends up with a fresh 0..n-1 index.
    data_for_training = data_for_training.reset_index().drop(columns='index')

    # Drop every column whose name contains "unnamed" (case-insensitive),
    # presumably index columns left behind by an earlier CSV export.
    unnamed_columns = data_for_training.columns[
        data_for_training.columns.str.contains('unnamed', case=False)
    ]
    data_for_training = data_for_training.drop(columns=unnamed_columns)

    # .copy() detaches the filtered rows from the full frame, so the column
    # assignments below write to an independent DataFrame instead of raising
    # pandas' SettingWithCopyWarning on a slice.
    data_for_training = data_for_training[
        data_for_training['source'] == conf["source_for_modeling"]
    ].copy()

print("Parsing Lists")
# fix_list receives one row at a time (axis=1); its result becomes the 'list' column.
data_for_training['list'] = data_for_training.progress_apply(fix_list, axis=1)
# Turn the textual representation into a real Python list. Bracket access is
# used because attribute-style assignment (`df.list = ...`) only works when
# the column already exists and is easy to misread.
data_for_training['list'] = data_for_training['list'].swifter.apply(
    lambda raw: list(ast.literal_eval(raw.strip()))
)

In [None]:
if conf["use_existing_train_test_split"] == True:

    print("Using existing train/test split ✅✅")
    df_test_set = pd.read_excel(conf['train_test_split_file'])
    data_for_training = data_for_training.merge(df_test_set, on='twitter_id', how='left')

    # A left merge keeps rows whose twitter_id is missing from the split file;
    # they get NaN in 'train_test' and therefore match neither 'train' nor
    # 'test' when the partitions are selected later. Report them instead of
    # losing them silently.
    if 'train_test' in data_for_training.columns:
        n_unassigned = int(data_for_training['train_test'].isna().sum())
        if n_unassigned:
            print(f"Warning: {n_unassigned} rows have no train/test assignment in the split file")
else:

    print("Creating a new train/test split ‼️‼️")
    # Split the data into train and test sets, stratified by the target column.
    # An optional 'random_state' key in the config makes the split repeatable;
    # when the key is absent, None is passed and the split stays random as before.
    train_df, test_df = train_test_split(
        data_for_training,
        test_size=0.2,
        stratify=data_for_training[conf['field_to_classify']],
        random_state=conf.get('random_state'),
    )

    # Record in the 'train_test' column which partition each row landed in.
    data_for_training.loc[data_for_training.index.isin(train_df.index), 'train_test'] = 'train'
    data_for_training.loc[data_for_training.index.isin(test_df.index), 'train_test'] = 'test'

    # Persist the assignment (date-stamped) so it can be reused in later runs.
    date_string = datetime.now().strftime("%Y%m%d")
    data_for_training[['twitter_id','train_test']].to_excel(f'../Data/train_test_split_{date_string}.xlsx', index=False)


# Compute the SocialVec representation of every row's 'list'.
# result_type='expand' spreads the value returned by get_average_embeddings
# over the two target columns (presumably an (embedding, length) pair - TODO confirm).
sv = SocialVec(conf["SocialVec_version"])
data_for_training[['socialvec', 'socialvec_len']] = data_for_training.progress_apply(
    lambda row: sv.get_average_embeddings(row['list']),
    axis=1,
    result_type='expand',
)

In [None]:
#data_for_training.drop('political_train_test', axis=1, inplace=True)

In [None]:
# Encode the target column as integer class ids in a new 'class' column.
le = LabelEncoder()
target_values = data_for_training[conf['field_to_classify']]
data_for_training['class'] = le.fit_transform(target_values)

# Keep only the rows whose 'socialvec_len' exceeds the configured minimum.
is_long_enough = data_for_training['socialvec_len'] > conf['minimal_socialvec_len']
data_for_training = data_for_training[is_long_enough]
print(f"number of samples after filtering by SocialVec len: {data_for_training.shape[0]}")

In [None]:
# Partition the rows by their 'train_test' label; .copy() yields independent
# frames that can be modified without touching data_for_training.
is_train_row = data_for_training['train_test'] == 'train'
is_test_row = data_for_training['train_test'] == 'test'

train_df = data_for_training[is_train_row].copy()
test_df = data_for_training[is_test_row].copy()

In [None]:
# Build the model inputs from the 'socialvec' column of each partition.
# prep_tf_inputs is not defined in this notebook (presumably provided by the
# aux_functions star import); it is unpacked here as a (features, labels) pair.
X_train, y_train = prep_tf_inputs(train_df, 'socialvec')
X_test, y_test = prep_tf_inputs(test_df, 'socialvec')

In [None]:
#todo: try label smoothing

In [None]:
def label_smoothing_loss(y_true, y_pred, smoothing=0.1):
    """
    Categorical cross-entropy computed against label-smoothed targets.

    Every target value is scaled by ``1 - smoothing`` and then shifted by
    ``smoothing / num_classes``, where ``num_classes`` is the size of the
    last axis of ``y_true``.

    Args:
        y_true: target array/tensor (assumed one-hot encoded - TODO confirm).
        y_pred: model predictions, forwarded unchanged to Keras.
        smoothing: amount of smoothing applied to the targets (default 0.1).

    Returns:
        Whatever ``keras.losses.categorical_crossentropy`` returns for the
        smoothed targets and ``y_pred``.
    """
    class_count = y_true.shape[-1]
    keep_weight = 1.0 - smoothing
    uniform_share = smoothing / class_count
    softened_targets = y_true * keep_weight + uniform_share

    return keras.losses.categorical_crossentropy(softened_targets, y_pred)


## Model

In [None]:
# `tf` is referenced below (and in later cells) but was never imported
# explicitly in this notebook; import it here so the cell does not depend on
# whatever names the aux_functions star import happens to export.
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.metrics import Precision

# Clear Keras' global state before building the model, so re-running this
# cell does not accumulate state from earlier model builds.
tf.keras.backend.clear_session()

# One input feature per column of the training matrix.
input_shape = (X_train.shape[1],)

# Create the model
model = keras.Sequential()

# Hidden Dense layer with 64 units and ReLU activation
model.add(layers.Dense(64, activation='relu', input_shape=input_shape))

# Output Dense layer with 2 neurons (one per class) and softmax activation
model.add(layers.Dense(2, activation='softmax'))

# Compile the model with the custom label-smoothing loss
# (plain 'categorical_crossentropy' was the previous choice).
# NOTE(review): run_eagerly=True is kept from the original; confirm it is
# still required, since eager execution is typically slower than graph mode.
model.compile(optimizer='adam',
              loss=label_smoothing_loss,
              metrics=[Precision()],
              run_eagerly=True)

# Print the model summary
model.summary()

In [None]:
import numpy as np
from sklearn.preprocessing import OneHotEncoder

# One-dimensional array containing zeros and ones
data = y_train

# Reshape the data to a 2D array with a single feature column,
# which is the input layout OneHotEncoder expects.
data_2d = data.reshape(-1, 1)

# Initialize the OneHotEncoder with dense output.
# The `sparse` keyword was renamed to `sparse_output` in scikit-learn 1.2 and
# removed in 1.4, so try the current spelling first and fall back to the old
# one on releases that do not know it yet.
try:
    encoder = OneHotEncoder(sparse_output=False)
except TypeError:
    encoder = OneHotEncoder(sparse=False)

# Fit and transform the data, then wrap the dense matrix in a TF constant
one_hot_encoded = tf.constant(encoder.fit_transform(data_2d))

In [None]:
# Train on the one-hot targets; validation_split=0.1 has Keras hold out part
# of the training data for validation. The call is the cell's last
# expression, so its return value is shown as the cell output.
fit_options = dict(
    epochs=100,
    batch_size=50,
    validation_split=0.1,
    verbose=True,
)
model.fit(X_train, one_hot_encoded, **fit_options)

In [None]:
# Raw model outputs for every test row (one value per output neuron);
# reused further down to derive the per-row confidence.
preds_full = model.predict(X_test)


In [None]:
# Predicted class id per test row = index of the largest model output.
# Reuses preds_full (computed in the preceding cell) rather than running a
# second, redundant model.predict pass over X_test.
test_preds = tf.argmax(preds_full, axis=1)

In [None]:
from sklearn.metrics import classification_report

# Text report comparing the true test labels with the predicted class ids.
report_text = classification_report(y_test, test_preds)
print(report_text)

In [None]:
# data = y_test
# data_2d = data.reshape(-1, 1)
# encoder = OneHotEncoder(sparse=False)
# y_test = tf.constant(encoder.fit_transform(data_2d))
# y_test = tf.argmax(y_test, axis=1)

In [None]:
# Attach the predicted class id and, as a confidence score, the largest
# model output of each row to the test frame.
test_df['preds'] = test_preds
row_max_output = np.max(preds_full, axis=1)
test_df['confidence'] = row_max_output

In [None]:
import seaborn as sns
from sklearn.metrics import confusion_matrix

# y_test     : actual labels
# test_preds : predicted labels
cm_counts = confusion_matrix(y_test, test_preds)

# Annotated integer counts; the trailing ';' suppresses the Axes repr.
sns.heatmap(cm_counts, annot=True, fmt='d', cmap='Blues', square=True, cbar=False);

## Save Model

In [None]:
# Save the trained model under "<target field>_model_<YYYYMMDD>.h5".
date_string = datetime.now().strftime("%Y%m%d")
# Single quotes inside the f-string: reusing the enclosing double quote in a
# replacement field is a SyntaxError on Python versions before 3.12.
model.save(f"{conf['field_to_classify']}_model_{date_string}.h5")

## Save the test-set predictions for debugging (rows where `class != preds` are the wrong predictions)

In [None]:
# Columns exported for inspection: identifiers, inputs, the true target and
# its encoded class, plus the prediction and its confidence.
debug_columns = [
    'twitter_id',
    'list',
    conf["field_to_classify"],
    'source',
    'train_test',
    'socialvec_len',
    'class',
    'preds',
    'confidence',
]
test_df_debug = test_df[debug_columns].copy()

# Write the selection as a gzip-compressed CSV in the working directory.
test_df_debug.to_csv('test_df_debug.csv.gz', compression='gzip')