In [None]:
import ast
import os

import numpy as np
import pandas as pd
import swifter  # imported for its side effect: provides the `.swifter` accessor used below
import tensorflow as tf
import yaml
from tensorflow import keras
from tqdm import tqdm

from socialvec.socialvec import SocialVec

# Register tqdm's `progress_apply` on pandas objects; the cells below rely on it.
tqdm.pandas()

In [None]:

# Path of the YAML file holding this run's settings.
config = "config.yaml"

# The encoding is pinned so parsing does not depend on the platform's locale default.
# NOTE(review): yaml.FullLoader can construct more than plain scalars and
# containers; if config.yaml only holds plain settings, yaml.safe_load would be
# the stricter choice -- confirm before switching.
with open(config, 'r', encoding='utf-8') as file:
    conf = yaml.load(file, Loader=yaml.FullLoader)


In [None]:
# Echo the parsed configuration so the settings used for this run are visible.
conf

In [None]:
# Directory holding the Volkova dataset files, relative to this notebook.
data_path = "../../Twitter/Data/Volkova/"

# Build `df_volkova` with a binary `class` column for the configured attribute.
if conf['field_to_classify'] == 'political':
    data_file = "volkova_SocialVec_political_augmented_2020.csv"
    df_volkova = pd.read_csv(os.path.join(data_path, data_file), index_col=0)
    # Keep only the rows whose source is 'volkova'. The .copy() makes the result
    # an independent frame, so adding `class` below does not write into a
    # filtered slice (which triggers SettingWithCopyWarning).
    df_volkova = df_volkova[df_volkova['source'] == 'volkova'].copy()
    # Target encoding: 0 for "Democrat", 1 for every other value.
    df_volkova['class'] = df_volkova['political'].apply(lambda x: 0 if x == "Democrat" else 1)

elif conf['field_to_classify'] == 'gender':
    data_file = "volkova_dataset_with_followees.csv"
    attributes_file = "Volkova_all_arrtibues.csv"
    df_volkova = pd.read_csv(os.path.join(data_path, data_file), index_col=0)
    df_volkova_attribues = pd.read_csv(os.path.join(data_path, attributes_file), index_col=0)

    # Attach the per-user attributes (inner join: users missing from either file are dropped).
    df_volkova = pd.merge(df_volkova, df_volkova_attribues, left_on='user', right_on='user_id')
    # Select the needed columns and rename `user`; the non-inplace rename returns
    # a new frame, so the column assignments below do not touch a slice.
    df_volkova = (
        df_volkova[['user', 'screen_name', 'name', 'description', 'list', conf['field_to_classify']]]
        .rename(columns={'user': 'twitter_id'})
    )
    # Tag every row with the same source label the political file uses.
    df_volkova['source'] = 'volkova'
    # Target encoding: 0 for "Female", 1 for every other value.
    df_volkova['class'] = df_volkova['gender'].apply(lambda x: 0 if x == "Female" else 1)

else:
    # Fail here with a clear message instead of a NameError on `df_volkova` in a later cell.
    raise ValueError(
        f"Unsupported field_to_classify: {conf['field_to_classify']!r} "
        "(expected 'political' or 'gender')"
    )

## Load and manipulate data

In [None]:
def fix_list(row):
    """Return the row's `list` value, bracket-wrapped for 'volkova' rows.

    Rows whose `source` is 'volkova' get their `list` string wrapped in
    "[" and "]"; rows from any other source are returned unchanged.

    Args:
        row: mapping-like object with `source` and `list` entries.

    Returns:
        The (possibly bracket-wrapped) `list` value.
    """
    if row['source'] != 'volkova':
        return row['list']
    return "[" + row['list'] + "]"

# Step 1: normalise the raw `list` strings row by row (with a progress bar).
df_volkova['list'] = df_volkova.progress_apply(fix_list, axis=1)
# Step 2: parse each stripped string as a Python literal and store it as a list.
df_volkova['list'] = df_volkova['list'].swifter.apply(
    lambda raw: list(ast.literal_eval(raw.strip()))
)


In [None]:
# Instantiate SocialVec for the version named in the configuration.
sv = SocialVec(conf['SocialVec_version'])

# Embed every user's `list`; result_type='expand' spreads the values returned by
# get_average_embeddings across the two target columns.
# NOTE(review): presumably the pair is (average vector, number of entries used) -- confirm.
df_volkova[['socialvec', 'socialvec_len']] = df_volkova.progress_apply(
    lambda user_row: sv.get_average_embeddings(user_row['list']),
    axis=1,
    result_type='expand',
)

In [None]:
print(f"size before filtering: {df_volkova.shape[0]}")
# Keep only rows whose `socialvec_len` exceeds the configured threshold.
# NOTE(review): the comparison is strict, so rows exactly equal to
# `minimal_socialvec_len` are dropped -- confirm that is intended.
# The .copy() makes the result an independent frame, so columns added to it
# later are not written into a boolean-mask slice (SettingWithCopyWarning).
df_volkova = df_volkova[df_volkova["socialvec_len"] > conf["minimal_socialvec_len"]].copy()
print(f"size after filtering: {df_volkova.shape[0]}")

## Register the custom loss function and load the model

In [None]:
def label_smoothing_loss(y_true, y_pred, smoothing=0.1):
    """Categorical cross-entropy computed against label-smoothed targets.

    Every target entry is scaled by ``1 - smoothing`` and then shifted by
    ``smoothing / num_classes``, where ``num_classes`` is the size of the last
    axis of ``y_true``. The smoothed targets are passed to
    ``keras.losses.categorical_crossentropy``.

    Args:
        y_true: target tensor; its last axis indexes the classes
            (presumably one-hot -- confirm against the training code).
        y_pred: predicted class scores, forwarded unchanged.
        smoothing: amount of probability mass spread across the classes.

    Returns:
        The value returned by ``keras.losses.categorical_crossentropy``.
    """
    class_count = y_true.shape[-1]
    keep_weight = 1.0 - smoothing
    uniform_share = smoothing / class_count
    smoothed_targets = y_true * keep_weight + uniform_share
    return keras.losses.categorical_crossentropy(smoothed_targets, y_pred)

# Keras needs the custom loss registered under the name stored in the saved
# model, otherwise deserialisation cannot resolve it.
custom_objects = {'label_smoothing_loss': label_smoothing_loss}

with keras.utils.custom_object_scope(custom_objects):
    # The model file is selected by `model_name` from the configuration.
    model = keras.models.load_model(f"models/{conf['model_name']}.h5")

In [None]:
def prep_tf_inputs(df_subset, field):
    """Turn a DataFrame into a (features, labels) pair for the model.

    Args:
        df_subset: DataFrame holding the feature column `field` and a
            `class` column.
        field: name of the column whose per-row values are stacked into
            one array.

    Returns:
        Tuple ``(X, y)``: ``X`` is a ``tf.constant`` built from the stacked
        values of `field`; ``y`` is the `class` column's underlying values.
    """
    feature_tensor = tf.constant(np.stack(df_subset[field].to_numpy()))
    labels = df_subset['class'].values
    return feature_tensor, labels

In [None]:
# Features come from the `socialvec` column; labels come from the `class` column.
X_volkova, y_volkova = prep_tf_inputs(df_volkova, 'socialvec')

In [None]:
# Score every remaining user with the loaded model.
preds = model.predict(X_volkova)

In [None]:
# Assumes `preds` is a 2-D array of per-class scores, one row per user -- TODO confirm.
# NumPy is used for both reductions so the new columns hold plain NumPy values
# rather than a TensorFlow tensor that pandas would have to convert implicitly.
# Predicted class = index of the highest score in each row.
df_volkova['preds'] = np.argmax(preds, axis=1)
# Confidence = the highest score in each row.
df_volkova['confidence'] = np.max(preds, axis=1)

In [None]:
from sklearn.metrics import classification_report

# Per-class precision / recall / F1 over all scored users.
print(
    classification_report(
        y_true=df_volkova['class'],
        y_pred=df_volkova['preds'],
    )
)

In [None]:
import seaborn as sns
from sklearn.metrics import confusion_matrix

# confusion_matrix puts the true labels on the rows and the predicted labels on
# the columns, so the heatmap's y-axis is the true class and its x-axis the
# predicted class. The axes are labelled so the figure can be read on its own.
sns.heatmap(
    confusion_matrix(df_volkova['class'], df_volkova['preds']),
    square=True,
    annot=True,
    cmap='Blues',
    fmt='d',
    cbar=False,
).set(
    xlabel='Predicted class',
    ylabel='True class',
    title='Confusion matrix (all scored users)',
);

In [None]:
# Subset of users whose top class score is strictly above 0.9.
df_volkova_high_confidence = df_volkova.loc[lambda frame: frame['confidence'] > 0.9]

In [None]:
from sklearn.metrics import classification_report

# Same report as before, restricted to the high-confidence subset.
print(
    classification_report(
        y_true=df_volkova_high_confidence['class'],
        y_pred=df_volkova_high_confidence['preds'],
    )
)

In [None]:
# Confusion matrix for the high-confidence subset: rows (y-axis) are the true
# classes and columns (x-axis) the predicted ones. The axes are labelled so the
# figure can be read on its own.
sns.heatmap(
    confusion_matrix(
        df_volkova_high_confidence['class'],
        df_volkova_high_confidence['preds'],
    ),
    square=True,
    annot=True,
    cmap='Blues',
    fmt='d',
    cbar=False,
).set(
    xlabel='Predicted class',
    ylabel='True class',
    title='Confusion matrix (confidence > 0.9)',
);