**Step 3. Modelling**

This step applies machine learning and deep learning techniques to build models for moral prediction.

**Machine Learning & Word Embedding**

To process and learn from text data, we use various techniques that convert words into numerical forms that a machine can understand:

*   **Bag-of-Words**: Creates a vocabulary from all the words in the text and counts how many times each word appears in each document. It's a straightforward approach to understand the prominence of words.

*   **TF-IDF (Term Frequency-Inverse Document Frequency)**: Goes beyond simple counting by considering how unique a word is to each document. It helps in emphasizing words that are important in a document but less common in the entire corpus.

*   **Word2Vec**: This method maps words into a high-dimensional space based on their usage context in the corpus. Words used in similar contexts are placed close together in this space, capturing their semantic relationships.

*   **GloVe (Global Vectors for Word Representation)**: Similar to Word2Vec, GloVe also constructs a space where words with similar meanings are closer together. Instead of relying only on local context windows, it learns the vectors from global word co-occurrence statistics computed over the entire corpus.

In [None]:
# Import necessary libraries
import os
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.multiclass import OneVsRestClassifier
from sklearn.multioutput import MultiOutputClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb
from sklearn.metrics import roc_auc_score, accuracy_score, f1_score, roc_curve, confusion_matrix, multilabel_confusion_matrix, classification_report, log_loss, hamming_loss

# Set display options for pandas
pd.set_option('display.max_colwidth', None)  # Updated to None as -1 is deprecated

# Mount Google Drive (specific to Google Colab)
from google.colab import drive
drive.mount('/content/gdrive')
os.chdir("/content/gdrive/My Drive/Colab Notebooks/PSIV")

# Load data
df = pd.read_excel('output_for_R_0.50.xlsx')  # Load specific threshold data, e.g., 0.50 threshold with n = 31,277 rows

# Install scikit-multilearn for multi-label classification (if not already installed)
!pip install scikit-multilearn

# Importing additional necessary libraries from scikit-learn
from skmultilearn.problem_transform import ClassifierChain, LabelPowerset

# Setting seeds for reproducibility
seeds = [1, 43, 678, 90, 135]

# Splitting the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(
    df['new_clean_text_deep_stem'],  # Predictor variables
    df.drop(['new_clean_text_deep_stem'], axis=1),  # Target variables
    test_size=0.3,  # 70% training, 30% test
    random_state=seeds[0],  # Use the first seed from the list
    shuffle=True  # Shuffle the dataset before splitting
)

# Output the shape of the training and testing sets to verify the splits
print(f"Training data shape: {X_train.shape}, {y_train.shape}")
print(f"Testing data shape: {X_test.shape}, {y_test.shape}")

(21893,) (21893, 11)
(9384,) (9384, 11)


In [None]:
## Text Vectorization: Classic Bag of Words Model
# CountVectorizer is not imported anywhere else in this notebook, so import it here;
# without this the cell raises NameError on a fresh kernel.
from sklearn.feature_extraction.text import CountVectorizer

# Convert each document into a vector of token counts, keeping at most 5000 vocabulary terms
vectorizer = CountVectorizer(max_features=5000)

# Learn the vocabulary from the training texts only, then encode both splits with it
X_train_vect = vectorizer.fit_transform(X_train)
X_test_vect = vectorizer.transform(X_test)

# Names of the target labels for the classification
labels = ['care', 'harm', 'fairness', 'cheating', 'loyalty', 'betrayal', 'authority', 'subversion', 'purity', 'degradation', 'non-moral']

## Function to run the machine learning pipeline
def run_pipeline(pipeline, X_train, y_train, X_test, y_test):
    """Fit ``pipeline`` on the training split and print test-set metrics.

    Args:
        pipeline: Estimator exposing ``fit(X, y)`` and ``predict(X)``.
        X_train, y_train: Training features and targets.
        X_test, y_test: Held-out features and targets.

    Returns:
        None. The scores are only printed.

    Note:
        All three scores, including ROC AUC, are computed from the hard
        predictions returned by ``predict``.
    """
    pipeline.fit(X_train, y_train)
    y_pred = pipeline.predict(X_test)

    # (printed label, metric function, extra keyword arguments), evaluated in order
    metric_specs = (
        ("Accuracy:", accuracy_score, {}),
        ("F1 Score (Weighted):", f1_score, {"average": 'weighted'}),
        ("ROC AUC Score:", roc_auc_score, {}),
    )
    for label, metric_fn, extra_kwargs in metric_specs:
        print(label, metric_fn(y_test, y_pred, **extra_kwargs))

# Define and evaluate multiple classifiers on the Bag-of-Words features.
# Each base estimator is wrapped in MultiOutputClassifier and run through run_pipeline.
for title, base_estimator in (
    ("MultinomialNB:", MultinomialNB()),
    ("LogisticRegression:", LogisticRegression(solver='sag')),
    ("LinearSVC:", LinearSVC()),
    ("XGBClassifier:", xgb.XGBClassifier()),
):
    print(title)
    run_pipeline(Pipeline([('clf', MultiOutputClassifier(base_estimator))]),
                 X_train_vect, y_train, X_test_vect, y_test)

LogisticRegression:
0.5180093776641091 0.6618914625084691 0.7682915987417605
None
LogisticRegression:


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


0.5908994032395567 0.6908824887974319 0.7412523984107313
None
LinearSVC:




0.5467817561807332 0.6757134553427353 0.7580540296483739
None
XGBClassifier:
0.547847399829497 0.6247237474816334 0.6941535424862953
None


In [None]:
## Text Vectorization: TF-IDF
# Convert the raw documents into a matrix of TF-IDF features:
# word-level 1- to 3-grams, accents stripped, at most 5000 features, L2-normalised rows.
vectorizer = TfidfVectorizer(
    analyzer='word',
    ngram_range=(1, 3),
    strip_accents='unicode',
    max_features=5000,
    norm='l2',
)

# Fit on the training texts only and encode them in the same step,
# then encode the test texts with the fitted vectorizer
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

## Function to run machine learning pipeline and print evaluation metrics
def run_pipeline(pipeline, X_train, y_train, X_test, y_test):
    """Train ``pipeline`` and print its accuracy, weighted F1 and ROC AUC on the test split.

    Args:
        pipeline: Estimator exposing ``fit(X, y)`` and ``predict(X)``.
        X_train, y_train: Training features and targets.
        X_test, y_test: Held-out features and targets.

    Returns:
        None. The scores are only printed.

    Note:
        This redefines the ``run_pipeline`` from the Bag-of-Words cell with
        different print labels; cells executed after this one use this version.
        Every score is computed from the hard predictions of ``predict``.
    """
    pipeline.fit(X_train, y_train)
    y_hat = pipeline.predict(X_test)

    accuracy = accuracy_score(y_test, y_hat)
    print("Accuracy Score:", accuracy)

    weighted_f1 = f1_score(y_test, y_hat, average='weighted')
    print("Weighted F1 Score:", weighted_f1)

    auc = roc_auc_score(y_test, y_hat)
    print("ROC AUC Score:", auc)

# Evaluating multiple classifiers on the TF-IDF features.
# Same model line-up as for Bag of Words; only the feature matrices differ.
tfidf_estimators = (
    ("MultinomialNB:", MultinomialNB()),
    ("LogisticRegression:", LogisticRegression(solver='sag')),
    ("LinearSVC:", LinearSVC()),
    ("XGBClassifier:", xgb.XGBClassifier()),
)
for title, base_estimator in tfidf_estimators:
    print(title)
    run_pipeline(Pipeline([('clf', MultiOutputClassifier(base_estimator))]),
                 X_train_tfidf, y_train, X_test_tfidf, y_test)

MultinomialNB:
0.4729326513213981 0.5588070883350609 0.6423210961337116
None
LogisticRegression:
0.5680946291560103 0.6516124338278884 0.696659934083119
None
LinearSVC:
0.5919650468883205 0.6953741727561449 0.7460732769913477
None
XGBClassifier:
0.5563725490196079 0.6260073806199076 0.6940687860665743
None


In [None]:
import gensim
from gensim.models import Word2Vec

# Prepare combined dataset for Word2Vec training.
# Series.append was removed in pandas 2.0; pd.concat builds the same combined Series.
# Note that the embeddings are trained on the training AND test texts.
combined_df = pd.concat([X_train, X_test])

# Whitespace-tokenise every document (str() guards against non-string cells)
Vocab_list = combined_df.apply(lambda x: str(x).strip().split())

# Train Word2Vec model: 100-dimensional vectors, context window of 5,
# min_count=1 so no token is dropped for being rare
models = Word2Vec(Vocab_list, vector_size=100, window=5, min_count=1, workers=4)

# Plain dict {token: vector} consumed by the averaging vectorizer
WordVectorz = dict(zip(models.wv.index_to_key, models.wv.vectors))

# Class to convert text data to averaged word vectors
class AverageEmbeddingVectorizer(object):
    """Represent each document by the mean of its tokens' word vectors.

    Documents may be given as raw strings (split on whitespace, the same
    tokenisation used to build the Word2Vec vocabulary) or as already
    tokenised sequences of words. Previously a raw string was iterated
    character by character, so embeddings were looked up per character
    instead of per word.

    Args:
        word2vec: Mapping from token to vector; must support ``in`` and ``[]``.
        dim: Length of the word vectors. When omitted it is inferred from
            ``word2vec`` and falls back to 100 if that is not possible.
    """

    def __init__(self, word2vec, dim=None):
        self.word2vec = word2vec
        self.dim = self._infer_dim(word2vec) if dim is None else dim

    @staticmethod
    def _infer_dim(word2vec, default=100):
        """Best-effort lookup of the vector length of ``word2vec``."""
        size = getattr(word2vec, "vector_size", None)
        if size is not None:
            return size
        try:
            first_vector = next(iter(word2vec.values()))
        except (AttributeError, StopIteration):
            # Not dict-like, or empty: keep the historical default
            return default
        return len(first_vector)

    @staticmethod
    def _tokens(document):
        """Return the tokens of ``document``, splitting raw strings on whitespace."""
        if isinstance(document, str):
            return document.strip().split()
        return document

    def fit(self, X, y=None):
        """No-op; present so the object can be used as a pipeline step."""
        return self

    def transform(self, X):
        """Return an array with one averaged word vector per document in ``X``.

        Tokens missing from ``word2vec`` are ignored; a document with no known
        token maps to the zero vector of length ``dim``.
        """
        zero_vector = np.zeros(self.dim)
        rows = []
        for document in X:
            known = [self.word2vec[token]
                     for token in self._tokens(document)
                     if token in self.word2vec]
            rows.append(np.mean(known, axis=0) if known else zero_vector)
        return np.array(rows)

# Setting up machine learning pipelines with word vector average embedding.
# Every pipeline first averages the Word2Vec vectors of a document, then fits
# one copy of the base estimator per label.
for title, base_estimator in (
    ("LogisticRegression:", LogisticRegression()),
    ("LinearSVC:", LinearSVC()),
    ("XGBClassifier:", xgb.XGBClassifier()),
):
    print(title)
    embedding_pipeline = Pipeline([
        ("wordVectz", AverageEmbeddingVectorizer(WordVectorz)),
        ("multilabel", MultiOutputClassifier(base_estimator)),
    ])
    run_pipeline(embedding_pipeline, X_train, y_train, X_test, y_test)

LogisticRegression:
0.17604433077578857 0.19238847558186056 0.5079920173797385
LinearSVC:
0.20737425404944587 0.21859851818446543 0.5116604892614635
XGBClassifier:
0.2693947144075021 0.3128694845732286 0.5498994655478208


In [None]:
# nltk and word_tokenize are used in this section but were never imported
import nltk
from nltk.tokenize import word_tokenize
from tqdm import tqdm

nltk.download('punkt')

# Split the raw texts into training and testing sets.
# The previous version split Keras-padded integer id sequences here, so the
# sentence vectoriser below stringified arrays of ids and looked those up in
# GloVe instead of the actual words. It also needed Tokenizer / pad_sequences
# before the cell that imports them. The deep-learning section builds its own
# tokenizer and padded sequences, so nothing later depends on the removed ones.
X_train, X_test, y_train, y_test = train_test_split(
    df['new_clean_text_deep_stem'],
    df[df.columns[1:]],  # Assuming the target variables start from the second column
    test_size=0.3,
    random_state=seeds[4]
)

# Load the GloVe pre-trained word vectors into {word: float32 vector}.
# The context manager closes the file even if parsing a line fails.
embeddings_dictionary = dict()
with open('glove.6B.100d.txt', encoding="utf8") as glove_file:
    for line in glove_file:
        records = line.split()
        word = records[0]  # first field is the token, the rest are its coordinates
        vector_dimensions = np.asarray(records[1:], dtype='float32')
        embeddings_dictionary[word] = vector_dimensions

# Function to convert sentences to vectors
def sent2vec(s):
    """Turn one sentence into a unit-length, 100-dimensional GloVe vector.

    The input is stringified, lower-cased and tokenised with ``word_tokenize``.
    The vectors of all tokens are summed (tokens missing from
    ``embeddings_dictionary`` contribute a zero vector) and the sum is divided
    by its Euclidean length. A sum of length zero yields the zero vector.
    """
    tokens = word_tokenize(str(s).lower())
    stacked = np.array([embeddings_dictionary.get(token, np.zeros(100)) for token in tokens])
    total = stacked.sum(axis=0)
    if not np.linalg.norm(total):
        # Nothing to normalise (no tokens, or only unknown tokens)
        return np.zeros(100)
    return total / np.sqrt((total ** 2).sum())

# Vectorize training and testing data (tqdm shows a progress bar per split).
# X_train / X_test are overwritten with lists of sentence vectors.
X_train = list(map(sent2vec, tqdm(X_train, desc="Vectorizing training data")))
X_test = list(map(sent2vec, tqdm(X_test, desc="Vectorizing testing data")))

# Machine learning pipelines on the sentence vectors
for title, base_estimator in (
    ("LogisticRegression:", LogisticRegression()),
    ("LinearSVC:", LinearSVC()),
):
    print(title)
    run_pipeline(Pipeline([('clf', MultiOutputClassifier(base_estimator))]),
                 X_train, y_train, X_test, y_test)

100%|██████████| 21893/21893 [00:23<00:00, 950.88it/s]
100%|██████████| 9384/9384 [00:10<00:00, 935.31it/s]


LogisticRegression:
0.05530690537084399 0.057154021028701325 0.5032139860446211
None
LinearSVC:
0.2405157715260017 0.24976911054276785 0.5106806962207272
None


**Deep Learning**

To improve on the classical models above for this multi-label moral-prediction task, we explore the following deep learning architectures:

*   **DNN (Deep Neural Network)**
*   **CNN (Convolutional Neural Network)**
*   **LSTM (Long Short-Term Memory) & BiLSTM (Bidirectional Long Short-Term Memory)**: These models excel in processing sequential data, making them particularly suited for tasks where the order of elements is crucial, such as text processing. They are capable of remembering information over extended periods, an essential feature in NLP for preserving context in sentences or more extensive text passages.

In [None]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
import keras.models
from keras.models import Sequential, Model
from keras.layers import Embedding, SpatialDropout1D, Flatten, Dense, LSTM, GlobalMaxPool1D, Activation, Conv1D, Input, Bidirectional
from keras.callbacks import ReduceLROnPlateau, EarlyStopping, ModelCheckpoint
from tensorflow.keras.optimizers import Adam
from keras import backend as K
import tensorflow as tf
from sklearn.model_selection import train_test_split
!pip install tensorflow-addons
import tensorflow_addons as tfa

# Tokenizer configuration: keep the 5000 most frequent lower-cased words,
# map every text to a sequence of word ids and pad/truncate it to 200 ids
tokenizer = Tokenizer(num_words=5000, lower=True)
tokenizer.fit_on_texts(df['new_clean_text_deep_stem'])
sequences = tokenizer.texts_to_sequences(df['new_clean_text_deep_stem'])
x = pad_sequences(sequences, maxlen=200)
print('Shape of data tensor:', x.shape)

# Define seeds for reproducibility (only the first one is used for this split)
seeds = [1, 43, 678, 90, 135]
# Every column after the first is treated as a label column
X_train, X_test, y_train, y_test = train_test_split(x, df.iloc[:, 1:], test_size=0.3, random_state=seeds[0])
print('Training data shape:', X_train.shape, y_train.shape)
print('Testing data shape:', X_test.shape, y_test.shape)

# Calculate class weights for imbalanced classes:
# one row per label with its number of positive examples, most frequent first
most_common_cat = (
    pd.DataFrame({'cat': df.columns[1:], 'count': df.iloc[:, 1:].sum().values})
    .sort_values('count', ascending=False)
    .reset_index(drop=True)
)

# Weight = (number of labels) / (positive count), i.e. inversely proportional to label frequency
most_common_cat['class_weight'] = len(most_common_cat) / most_common_cat['count']

# Keras-style mapping {label position in df.columns[1:]: weight}
weight_by_label = most_common_cat.set_index('cat')['class_weight']
class_weight = {index: weight_by_label[label] for index, label in enumerate(df.columns[1:])}

# Display the first few rows of the class weights DataFrame
most_common_cat.head()

Shape of data tensor: (31277, 200)
(21893, 200) (21893, 11)
(9384, 200) (9384, 11)


Unnamed: 0,cat,count,class_weight
0,non-moral,14649,0.000751
1,harm,3966,0.002774
2,cheating,3724,0.002954
3,care,2550,0.004314
4,fairness,2300,0.004783


In [None]:
# Shared settings read by the model builder functions below
maxlen = 200                                 # input sequence length given to the Embedding layers
max_words = 1 + len(tokenizer.word_index)    # embedding vocabulary size (word ids start at 1)
num_classes = y_train.shape[1]               # one output unit per label column
filter_length = 300                          # passed as the first argument of Conv1D in getModel_cnn

# Define a simple DNN model
def getModel_dnn():
    """Build the baseline network: embedding -> global max pooling -> sigmoid outputs.

    Reads the notebook-level ``max_words``, ``maxlen`` and ``num_classes``.
    The returned model is not compiled.
    """
    model = Sequential(name="DNN_Model")
    model.add(Embedding(max_words, 128, input_length=maxlen))
    model.add(GlobalMaxPool1D())
    # One sigmoid unit per label, so labels are predicted independently
    model.add(Dense(num_classes, activation='sigmoid'))
    return model

# Define a CNN model
def getModel_cnn():
    """Build the convolutional network: embedding -> Conv1D -> global max pooling -> sigmoid outputs.

    Reads the notebook-level ``max_words``, ``maxlen``, ``filter_length`` and
    ``num_classes``. The returned model is not compiled.
    """
    model = Sequential(name="CNN_Model")
    model.add(Embedding(max_words, 128, input_length=maxlen))
    # filter_length is Conv1D's first argument; 3 is the kernel size
    model.add(Conv1D(filter_length, 3, padding='valid', activation='relu', strides=1))
    model.add(GlobalMaxPool1D())
    model.add(Dense(num_classes, activation='sigmoid'))
    return model

# Define an LSTM model
def getModel_lstm():
    """Build the recurrent network: embedding -> spatial dropout -> LSTM(128) -> sigmoid outputs.

    Reads the notebook-level ``max_words``, ``maxlen`` and ``num_classes``.
    The returned model is not compiled.
    """
    layer_stack = [
        Embedding(max_words, 128, input_length=maxlen),
        SpatialDropout1D(0.2),
        LSTM(128, dropout=0.2, recurrent_dropout=0.0),
        Dense(num_classes, activation='sigmoid'),
    ]
    return Sequential(layer_stack, name="LSTM_Model")

# Define a BiLSTM model
def getModel_bilstm():
    """Build the bidirectional variant: embedding -> spatial dropout -> Bidirectional(LSTM(128)) -> sigmoid outputs.

    Reads the notebook-level ``max_words``, ``maxlen`` and ``num_classes``.
    The returned model is not compiled.
    """
    recurrent_layer = Bidirectional(LSTM(128, dropout=0.2, recurrent_dropout=0.0))
    layer_stack = [
        Embedding(max_words, 128, input_length=maxlen),
        SpatialDropout1D(0.2),
        recurrent_layer,
        Dense(num_classes, activation='sigmoid'),
    ]
    return Sequential(layer_stack, name="BiLSTM_Model")

# Initialize and compile the LSTM model with appropriate loss function and metrics
training_model = getModel_lstm()
training_model.compile(
    optimizer='adam',
    loss='binary_crossentropy',  # each sigmoid output is scored as its own binary decision
    metrics=[
        tf.keras.metrics.BinaryAccuracy(),
        tf.keras.metrics.AUC(),
        # num_classes comes from y_train's column count; the previous hard-coded 11
        # would silently go stale if the label set changed
        tfa.metrics.F1Score(num_classes=num_classes, average='micro', threshold=0.49)
    ]
)

# Define callbacks for learning rate adjustment and model checkpointing
callbacks = [
    ReduceLROnPlateau(),  # Reduces learning rate when a metric has stopped improving
    ModelCheckpoint(filepath='model-lstm.h5', save_best_only=True)  # Saves the best model observed during training
]

# Train the model with class weights, validation split, and callbacks.
# validation_split=0.3 holds out 30% of the training data for validation.
history = training_model.fit(
    X_train, y_train,
    class_weight=class_weight,
    epochs=6,
    batch_size=32,
    validation_split=0.3,
    callbacks=callbacks
)

# Evaluate the model on the test set.
# This scores the in-memory model as it is after the last epoch, not the
# checkpoint written to 'model-lstm.h5'.
metrics = training_model.evaluate(X_test, y_test)
# Position 1 is reported as binary_accuracy in the recorded run
print("{}: {:.2f}".format(training_model.metrics_names[1], metrics[1]))

# Optional: Code to serialize model to JSON and save weights (commented out for potential use)
# lstm_model_json = training_model.to_json()
# with open("lstm_model.json", "w") as json_file:
#     json_file.write(lstm_model_json)
# training_model.save_weights("lstm_model.h5")
# print("Saved model to disk")

Epoch 1/6
Epoch 2/6
Epoch 3/6
Epoch 4/6
Epoch 5/6
Epoch 6/6
binary_accuracy: 0.9409826993942261


In [None]:
# Initialize and compile the BiLSTM model
training_model = getModel_bilstm()
training_model.compile(
    optimizer='adam',
    loss='binary_crossentropy',  # each sigmoid output is scored as its own binary decision
    metrics=[
        tf.keras.metrics.BinaryAccuracy(),
        tf.keras.metrics.AUC(),
        # Micro-average is used here, can be switched to 'weighted' if needed.
        # num_classes comes from y_train's column count; the previous hard-coded 11
        # would silently go stale if the label set changed.
        tfa.metrics.F1Score(num_classes=num_classes, average='micro', threshold=0.49)
    ]
)

# Define callbacks for adaptive learning rate and model checkpointing
callbacks = [
    ReduceLROnPlateau(),  # Dynamically reduce learning rate when validation performance stalls
    ModelCheckpoint(filepath='model-bilstm.h5', save_best_only=True)  # Save only the best version of the model seen during training
]

# Train the BiLSTM model
history = training_model.fit(
    X_train, y_train,
    class_weight=class_weight,  # Handle class imbalance
    epochs=6,
    batch_size=32,
    validation_split=0.3,  # Use 30% of the training data for validation
    callbacks=callbacks
)

# Evaluate the model's performance on the test dataset.
# This scores the in-memory model as it is after the last epoch, not the
# checkpoint written to 'model-bilstm.h5'.
metrics = training_model.evaluate(X_test, y_test)
# Position 1 is reported as binary_accuracy in the recorded run
print("{}: {:.2f}".format(training_model.metrics_names[1], metrics[1]))

# Optional code to serialize the BiLSTM model to JSON and save its weights (currently commented out)
# bilstm_model_json = training_model.to_json()
# with open("bilstm_model.json", "w") as json_file:
#     json_file.write(bilstm_model_json)
# training_model.save_weights("bilstm_model.h5")
# print("Saved model to disk")

Epoch 1/6
Epoch 2/6
Epoch 3/6
Epoch 4/6
Epoch 5/6
Epoch 6/6
binary_accuracy: 0.9409242272377014
