In [1]:
# Data manipulation libraries
import pandas as pd
import numpy as np

# Visualization libraries
import plotly.express as px
import plotly.graph_objects as go
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.cm as cm 
from wordcloud import WordCloud, ImageColorGenerator
from PIL import Image

# Text analysis and NLP libraries
import re
import time
import nltk
from nltk.tokenize import word_tokenize
from nltk import pos_tag
from nltk.sentiment import SentimentIntensityAnalyzer
from tqdm.notebook import tqdm
from textblob import TextBlob
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler

from nltk.corpus import stopwords
import gensim
from gensim.models import Word2Vec
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from scipy.sparse import hstack

# Machine learning libraries
from scipy.cluster.hierarchy import dendrogram, linkage
from sklearn.cluster import KMeans
from sklearn.datasets import make_blobs
from sklearn.metrics import silhouette_samples, silhouette_score
from sklearn.model_selection import train_test_split
from nltk.stem import WordNetLemmatizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, f1_score, classification_report
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import roc_curve, roc_auc_score
from sklearn.model_selection import GridSearchCV
from sklearn.feature_selection import RFE
from sklearn.preprocessing import FunctionTransformer
from sklearn.mixture import GaussianMixture
from sklearn.linear_model import SGDClassifier
from sklearn.preprocessing import MaxAbsScaler
from sklearn.multioutput import MultiOutputClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MultiLabelBinarizer

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Bidirectional, Dropout, BatchNormalization, Conv1D, GlobalMaxPooling1D
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.optimizers import Adam
from sklearn.utils.class_weight import compute_class_weight
from tensorflow.keras.preprocessing.text import Tokenizer
from keras.utils import to_categorical
from tensorflow.keras.models import load_model
from tensorflow.keras.layers import Bidirectional
import tensorflow as tf

import joblib
import pickle

# Download NLTK data at the beginning of the script
# nltk.download('punkt')
# nltk.download('averaged_perceptron_tagger')
# nltk.download('stopwords')




# Emotional Detection

## Importing the datasets

In [2]:
#read the dataset
kaggle_df = pd.read_csv('./data/tweet_emotions.csv').drop('tweet_id', axis=1)
kaggle_df = kaggle_df.rename(columns={'content': 'text', 'sentiment': 'label'})
huggingface_df = pd.read_parquet('./data/huggingface_emotions.parquet')
# Replace numerical labels with corresponding emotions gotten from the metadata on hugging face's website
label_to_emotion = {
    0: 'sadness',
    1: 'joy',
    2: 'love',
    3: 'anger',
    4: 'fear',
    5: 'surprise'
}

# Mapping the enocded labels to corresponding emotions 
huggingface_df['label'] = huggingface_df['label'].map(label_to_emotion)

# Merge the datasets based on the common columns
emotional_detection = pd.concat([huggingface_df, kaggle_df])

# Display the first few rows of the merged dataset
emotional_detection.head()

Unnamed: 0,text,label
0,i feel awful about it too because it s my job ...,sadness
1,im alone i feel awful,sadness
2,ive probably mentioned this before but i reall...,joy
3,i was feeling a little low few days back,sadness
4,i beleive that i am much more sensitive to oth...,love


# Data Preparation and Preprocessing

# Aggregate Duplicates by Median

In [3]:
# Get the duplicated rows based on 'text' and 'label'
duplicated_rows = emotional_detection[emotional_detection.duplicated(subset=['text', 'label'])]

# Order the result by the 'text' column
duplicated_rows_sorted = duplicated_rows.sort_values('text')
duplicated_rows_sorted[1:20]

Unnamed: 0,text,label
31363,#frenchieb-day #frenchieb-day #frenchieb-day #...,neutral
29869,0,neutral
39415,0,neutral
30644,@JonathanRKnight BTW I STILL can't believe how...,happiness
39260,@RealHughJackman Wolverine is awesome.. love i...,love
37781,@andyclemmensen have you seen the game on the ...,happiness
10918,@ericbolling Where's Dani Babb?,worry
37807,@mari_possa Happy Happy Bday Baby Girl. Love Y...,love
30257,@mcraddictal,neutral
32098,@thecompletes seen u a couple of times. Liked it,love


In [4]:
# Count the occurrences of each label for each text
label_counts = emotional_detection.groupby(['text', 'label']).size().reset_index(name='count')

# Find the label with the highest count for each text
idx = label_counts.groupby(['text'])['count'].transform(max) == label_counts['count']
most_frequent_labels = label_counts[idx][['text', 'label']]

# Merge the most frequent labels back to the original DataFrame
aggregated_df = pd.merge(emotional_detection, most_frequent_labels, on='text', how='inner')

# Drop duplicate rows based on the selected label
aggregated_df = aggregated_df.drop_duplicates(subset=['text', 'label_y'])

# Rename the 'label_y' column to 'aggregated_label'
aggregated_df.rename(columns={'label_y': 'aggregated_label'}, inplace=True)

# Print or display the result
print("DataFrame with aggregated labels for each text:")
print(aggregated_df[['text', 'aggregated_label']])

DataFrame with aggregated labels for each text:
                                                     text aggregated_label
0       i feel awful about it too because it s my job ...          sadness
1                                   im alone i feel awful          sadness
2       ive probably mentioned this before but i reall...              joy
3                i was feeling a little low few days back          sadness
4       i beleive that i am much more sensitive to oth...             love
...                                                   ...              ...
501577                                   @JohnLloydTaylor          neutral
501578                     Happy Mothers Day  All my love             love
501579  Happy Mother's Day to all the mommies out ther...             love
501580  @niariley WASSUP BEAUTIFUL!!! FOLLOW ME!!  PEE...        happiness
501581  @mopedronin bullet train from tokyo    the gf ...             love

[455989 rows x 2 columns]


In [5]:
emotional_detection = aggregated_df[['text', 'aggregated_label']]

In [6]:
# Rename the 'aggregated_label' column to 'label'
emotional_detection.rename(columns={'aggregated_label': 'label'}, inplace=True)
# Print or display the result
print("DataFrame with aggregated labels for each text:")
emotional_detection[['text', 'label']]

DataFrame with aggregated labels for each text:


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  emotional_detection.rename(columns={'aggregated_label': 'label'}, inplace=True)


Unnamed: 0,text,label
0,i feel awful about it too because it s my job ...,sadness
1,im alone i feel awful,sadness
2,ive probably mentioned this before but i reall...,joy
3,i was feeling a little low few days back,sadness
4,i beleive that i am much more sensitive to oth...,love
...,...,...
501577,@JohnLloydTaylor,neutral
501578,Happy Mothers Day All my love,love
501579,Happy Mother's Day to all the mommies out ther...,love
501580,@niariley WASSUP BEAUTIFUL!!! FOLLOW ME!! PEE...,happiness


# Data Splitting

In [7]:
X=emotional_detection['text']
y=emotional_detection['label']

In [8]:
## split the data into training and testing to avoid data leakage
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, shuffle=True, random_state=42)

In [9]:
#train dataset
train_df = pd.concat([X_train, y_train], axis=1)
#test dataset
test_df = pd.concat([X_test, y_test], axis=1)

In [10]:
# Replace 'happiness' with 'joy' in the 'label' column since they are synonyms
train_df['label'] = train_df['label'].replace('happiness', 'joy')
test_df['label'] = test_df['label'].replace('happiness', 'joy')

In [11]:
def preprocess_text_column(df, text_column_name):
    # Make a copy of the DataFrame to avoid SettingWithCopyWarning
    df_copy = df.copy()

    # Initialize the WordNet lemmatizer
    lemmatizer = WordNetLemmatizer()

    # Define a regular expression pattern to remove punctuation
    punctuation_pattern = r'[^\w\s]'

    # Get the English stopwords list
    english_stopwords = set(stopwords.words('english'))

    # Define a function for cleaning, tokenizing, and lemmatizing text
    def preprocess_text(text):
        # Remove punctuation and lowercase text
        text = re.sub(punctuation_pattern, ' ', text.lower())

        # Tokenize the text
        words = text.split()

        # Remove stop words and lemmatize each word
        words = [lemmatizer.lemmatize(word) for word in words if word not in english_stopwords]

        # Join the words back into a sentence
        return ' '.join(words)

    # Apply the preprocessing function to the text column of the copy
    df_copy[text_column_name] = df_copy[text_column_name].apply(preprocess_text)

    return df_copy

In [12]:
train_preprocessed = preprocess_text_column(train_df, 'text')
test_preprocessed = preprocess_text_column(test_df, 'text')

# Feature Engineering

In [13]:
## Sentiment Tagging
def get_sentiment(text):
    # Create a TextBlob object for the input text
    blob = TextBlob(text)
    
    # Get the polarity (-1 to 1) where -1 is negative, 1 is positive, and 0 is neutral
    polarity = blob.sentiment.polarity
    
    # Determine the sentiment label based on polarity
    if polarity > 0:
        sentiment = 'positive'
    elif polarity < 0:
        sentiment = 'negative'
    else:
        sentiment = 'neutral'
    
    return sentiment, polarity

In [14]:
## Get Polarity Score
def get_text_polarity(text):
    # Create a TextBlob object for the input text
    blob = TextBlob(text)
    
    # Get the polarity (-1 to 1) where -1 is negative, 1 is positive, and 0 is neutral
    polarity = blob.sentiment.polarity
    return polarity

In [15]:
# Apply the get_text_polarity function to the 'Text' column and create a new column 'Polarity'
train_preprocessed['Polarity'] = train_preprocessed['text'].apply(get_text_polarity)
test_preprocessed['Polarity'] = test_preprocessed['text'].apply(get_text_polarity)

In [16]:
# Apply the get_sentiment function to the 'Text' column and create new columns 'Sentiment Label' and 'Sentiment Polarity'
train_preprocessed['Sentiment'], train_preprocessed['Polarity'] = zip(*train_preprocessed['text'].apply(get_sentiment))
test_preprocessed['Sentiment'], test_preprocessed['Polarity'] = zip(*test_preprocessed['text'].apply(get_sentiment))

In [17]:
# split the preprocessed data
X_train = train_preprocessed['text']
y_train = train_preprocessed['label']
X_test = test_preprocessed['text']
y_test = test_preprocessed['label']

In [18]:
from imblearn.over_sampling import SMOTENC

X_train = X_train.to_frame()
# Identify categorical features based on data types
categorical_features = [X_train[col].dtype == 'object' for col in X_train.columns]

# Now add False for numerical features
categorical_features += [False] * (len(X_train.columns) - sum(categorical_features))

# Instantiate SMOTENC
from imblearn.over_sampling import SMOTEN

# Instantiate SMOTEN
sampling_strategy = {
    'hate': 30000,
    'enthusiasm': 30000,
    'fun': 30000,
    'empty': 30000,
    'relief': 30000,
    'boredom': 30000,
    'worry': 20000,
    'surprise': 15000,
    'neutral': 15000,
}

smoten = SMOTEN(sampling_strategy=sampling_strategy, random_state=42)

# Apply SMOTEN to generate synthetic samples
X_train, y_train = smoten.fit_resample(X_train, y_train)

In [19]:
# # Flatten the NumPy array
X_train_flat = X_train.values.flatten()

# # Convert the flattened array to a Pandas Series
X_train = pd.Series(X_train_flat)

# Data Analysis

# BiLSTM Model

In [None]:
# Define a mapping for your labels
label_mapping = {'joy': 0, 'love': 1, 'fear': 2, 'anger': 3, 'neutral': 4, 'sadness': 5, 'surprise': 6, 'worry': 7, 
                 'hate':8, 'enthusiasm':9, 'fun':10, 'empty':11, 'relief':12, 'boredom':13}

# Convert string labels to numerical labels using the mapping
y_train_numeric = np.array([label_mapping[label] for label in y_train])
y_test_numeric = np.array([label_mapping[label] for label in y_test])

# Tokenize the text data
max_words = 5000
tokenizer = Tokenizer(num_words=max_words, oov_token="<OOV>")
tokenizer.fit_on_texts(X_train)

X_train_sequences = tokenizer.texts_to_sequences(X_train)
X_test_sequences = tokenizer.texts_to_sequences(X_test)

# Pad sequences to have the same length
max_sequence_length = max(len(seq) for seq in X_train_sequences)
X_train_padded = pad_sequences(X_train_sequences, maxlen=max_sequence_length, padding='post')
X_test_padded = pad_sequences(X_test_sequences, maxlen=max_sequence_length, padding='post')

# Convert the target labels to one-hot encoding
y_train_onehot = to_categorical(y_train_numeric)
y_test_onehot = to_categorical(y_test_numeric)

# Build the BiLSTM model
embedding_dim = 128
num_classes = y_train_onehot.shape[1]
model = Sequential()
model.add(Embedding(input_dim=max_words, output_dim=embedding_dim, input_length=max_sequence_length))
model.add(Bidirectional(LSTM(units=100)))
model.add(Dense(units=num_classes, activation='softmax'))

# Compile the BiLSTM model
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# Calculate class weights to address class imbalance
class_weights = compute_class_weight('balanced', classes=np.unique(y_train_numeric), y=y_train_numeric)
class_weight_dict = dict(enumerate(class_weights))

# Define early stopping callback
early_stopping = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)

# Train the BiLSTM model
history = model.fit(X_train_padded, y_train_onehot, validation_data=(X_test_padded, y_test_onehot),
                    epochs=20, batch_size=64, class_weight=class_weight_dict, callbacks=[early_stopping])

# Evaluate the BiLSTM model
test_loss, test_accuracy = model.evaluate(X_test_padded, y_test_onehot)
print(f"Test Loss: {test_loss}, Test Accuracy: {test_accuracy}")

# Predict on the test set with the BiLSTM model
y_pred = model.predict(X_test_padded)
y_pred_classes = y_pred.argmax(axis=-1)

# Convert numerical labels back to string labels using the reverse mapping
reverse_label_mapping = {v: k for k, v in label_mapping.items()}
y_test_original = np.array([reverse_label_mapping[label] for label in y_test_numeric])
y_pred_original = np.array([reverse_label_mapping[label] for label in y_pred_classes])


# Evaluate the BiLSTM model
test_loss, test_accuracy = model.evaluate(X_test_padded, y_test_onehot)
print(f"Test Loss: {test_loss}, Test Accuracy: {test_accuracy}")

# Print the classification report
print(classification_report(y_test_original, y_pred_original))



Epoch 1/20


Epoch 2/20
Epoch 3/20
Epoch 4/20

In [None]:
# Save the BiLSTM model
model.save('./models/BILSTM for Transfer Learning/bilstm_model.h5')

# Transfer Learning to Cyberbullying Data

In [None]:
# The data are sources from twitter, kaggle, wikepedia talk page, youtube
aggression = pd.read_csv('data/aggression_parsed_dataset.csv')
attack = pd.read_csv('data/attack_parsed_dataset.csv')
toxicity = pd.read_csv('data/toxicity_parsed_dataset.csv')
racism = pd.read_csv('data/twitter_racism_parsed_dataset.csv')
sexism = pd.read_csv('data/twitter_sexism_parsed_dataset.csv')
kaggle = pd.read_csv('data/kaggle_parsed_dataset.csv')
twitter = pd.read_csv('data/twitter_parsed_dataset.csv')
youtube = pd.read_csv('data/youtube_parsed_dataset.csv')

In [None]:
# Add a new column to each DataFrame indicating the source dataset
aggression['source'] = 'aggression'
attack['source'] = 'attack'
toxicity['source'] = 'toxicity'
racism['source'] = 'racism'
sexism['source'] = 'sexism'
kaggle['source'] = 'kaggle'
twitter['source'] = 'twitter'
youtube['source'] = 'youtube'

# Concatenate all DataFrames along with the newly added 'source' column
cyberbullying_data = pd.concat([aggression, attack, toxicity, racism, sexism, kaggle, twitter, youtube], ignore_index=True)

# Now you have a single DataFrame containing all the data from different sources
# with an additional 'source' column to indicate the dataset origin

In [None]:
# List of column names to drop
columns_to_drop = ['id', 'Annotation', 'Date', 'UserIndex',
                   'Number of Comments', 'Number of Subscribers', 'Membership Duration',
                   'Number of Uploads', 'Profanity in UserID', 'Age', 'index']


# Drop the specified columns from the DataFrame
cyberbullying_data = cyberbullying_data.drop(columns=columns_to_drop)

cyberbullying_data

# Data Cleaning and Preprocessing

In [None]:
# Remove duplicates based on the 'Text' column
cyberbullying_data = cyberbullying_data.drop_duplicates(subset=['Text'])

In [None]:
# Drop columns ed_label_0 and ed_label_1
cyberbullying_data = cyberbullying_data.drop(['ed_label_0', 'ed_label_1'], axis=1)

# Rename column oh_label to cyberbullying
cyberbullying_data = cyberbullying_data.rename(columns={'oh_label': 'cyberbullying'})

# Display the modified DataFrame
cyberbullying_data

In [None]:
cyberbullying_data = cyberbullying_data.dropna(subset=['Text'])

In [None]:
# Load the saved tokenizer
new_data_sequences = tokenizer.texts_to_sequences(cyberbullying_data['Text'])
new_data_padded = pad_sequences(new_data_sequences, maxlen=max_sequence_length, padding='post')

new_data_predictions = model.predict(new_data_padded)
new_data_predictions_classes = new_data_predictions.argmax(axis=-1)

emotion_cyberbullying_data = np.array([reverse_label_mapping[label] for label in new_data_predictions_classes])

In [None]:
cyberbullying_data.loc[:, 'Emotion_Label'] = emotion_cyberbullying_data
cyberbullying_data

In [None]:
cyberbullying_data_processed = preprocess_text_column(cyberbullying_data, 'Text')

In [None]:
cyberbullying_data_processed.dropna(inplace=True)

In [None]:
cyberbullying_data_processed.to_csv('./data/cyberbullying_preprocessed.csv')

# Data Analysis

In [None]:
# Split your data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(
    cyberbullying_data_processed['Text'],
    cyberbullying_data_processed[['cyberbullying', 'Emotion_Label']],
    test_size=0.2,
    random_state=42
)

# LSTM

In [None]:
# Assuming you have already imported and processed your data (X_train, X_test, y_train, y_test)

# Convert labels to strings
y_train['cyberbullying'] = y_train['cyberbullying'].astype(str)
y_train['Emotion_Label'] = y_train['Emotion_Label'].astype(str)

y_test['cyberbullying'] = y_test['cyberbullying'].astype(str)
y_test['Emotion_Label'] = y_test['Emotion_Label'].astype(str)

# Tokenize and pad sequences
max_words = 10000  # You can adjust this based on your data
max_len = 100  # You can adjust this based on your data

tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(X_train)

X_train_seq = tokenizer.texts_to_sequences(X_train)
X_test_seq = tokenizer.texts_to_sequences(X_test)

X_train_padded = pad_sequences(X_train_seq, maxlen=max_len, padding='post')
X_test_padded = pad_sequences(X_test_seq, maxlen=max_len, padding='post')

# Convert labels to binary format
mlb = MultiLabelBinarizer()
y_train_binary = mlb.fit_transform(y_train.values)
y_test_binary = mlb.transform(y_test.values)

# Build the neural network model (LSTM + CNN)
num_classes = len(mlb.classes_)
model = Sequential([
    Embedding(input_dim=max_words, output_dim=128, input_length=max_len),
    Bidirectional(LSTM(64, return_sequences=True, dropout=0.2, recurrent_dropout=0.2)),
    Conv1D(128, 5, activation='relu'),
    GlobalMaxPooling1D(),
    Dense(64, activation='relu'),
    Dropout(0.5),
    Dense(num_classes, activation='sigmoid')
])

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Define early stopping
early_stopping = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)

# Train the model with early stopping
model.fit(X_train_padded, y_train_binary, epochs=50, batch_size=32, validation_split=0.1, callbacks=[early_stopping])

# Evaluate the model on the test set
y_pred = model.predict(X_test_padded)
y_pred_binary = (y_pred > 0.5).astype(int)

# Convert predictions back to original labels
y_pred_labels = mlb.inverse_transform(y_pred_binary)

# Evaluate the model on the test set
y_test_binary = mlb.transform(y_test.values)
accuracy = accuracy_score(y_test_binary, y_pred_binary)
print(f"Test Accuracy: {accuracy}")

In [None]:
# Save the entire model to a HDF5 file
model.save("./models/emotioncyberbullying/NN_cyberbullying_emotion.h5")

# Save MultiLabelBinarizer
with open("./models/emotioncyberbullying/mlb.pkl", "wb") as mlb_file:
    pickle.dump(mlb, mlb_file)

# Save Tokenizer
with open("./models/emotioncyberbullying/tokenizer.pkl", "wb") as tokenizer_file:
    pickle.dump(tokenizer, tokenizer_file)

In [None]:
# Example sentence
new_sentence = "This is an example sentence for prediction."

# Preprocess and tokenize the new sentence
new_sentence_seq = tokenizer.texts_to_sequences([new_sentence])
new_sentence_padded = pad_sequences(new_sentence_seq, maxlen=max_len, padding='post')

# Predict using the trained model
new_sentence_pred = model.predict(new_sentence_padded)
new_sentence_pred_binary = (new_sentence_pred > 0.5).astype(int)

# Convert predictions back to original labels
new_sentence_labels = mlb.inverse_transform(new_sentence_pred_binary)

# Print the predicted labels
print("Predicted Labels:", new_sentence_labels)

In [None]:
# Example sentence
new_sentence = "I hate you"

# Preprocess and tokenize the new sentence
new_sentence_seq = tokenizer.texts_to_sequences([new_sentence])
new_sentence_padded = pad_sequences(new_sentence_seq, maxlen=max_len, padding='post')

# Predict using the trained model
new_sentence_pred = model.predict(new_sentence_padded)
new_sentence_pred_binary = (new_sentence_pred > 0.5).astype(int)

# Convert predictions back to original labels
new_sentence_labels = mlb.inverse_transform(new_sentence_pred_binary)

# Print the predicted labels
print("Predicted Labels:", new_sentence_labels)

In [None]:
# Example sentence
new_sentence = "what you are doing is bad"

# Preprocess and tokenize the new sentence
new_sentence_seq = tokenizer.texts_to_sequences([new_sentence])
new_sentence_padded = pad_sequences(new_sentence_seq, maxlen=max_len, padding='post')

# Predict using the trained model
new_sentence_pred = model.predict(new_sentence_padded)
new_sentence_pred_binary = (new_sentence_pred > 0.5).astype(int)

# Convert predictions back to original labels
new_sentence_labels = mlb.inverse_transform(new_sentence_pred_binary)

# Print the predicted labels
print("Predicted Labels:", new_sentence_labels)

In [None]:
# Example sentence
new_sentence = "I hate when I think about you"

# Preprocess and tokenize the new sentence
new_sentence_seq = tokenizer.texts_to_sequences([new_sentence])
new_sentence_padded = pad_sequences(new_sentence_seq, maxlen=max_len, padding='post')

# Predict using the trained model
new_sentence_pred = model.predict(new_sentence_padded)
new_sentence_pred_binary = (new_sentence_pred > 0.5).astype(int)

# Convert predictions back to original labels
new_sentence_labels = mlb.inverse_transform(new_sentence_pred_binary)

# Print the predicted labels
print("Predicted Labels:", new_sentence_labels)

In [None]:
# Example sentence
new_sentence = "I hate when I think about the future"

# Preprocess and tokenize the new sentence
new_sentence_seq = tokenizer.texts_to_sequences([new_sentence])
new_sentence_padded = pad_sequences(new_sentence_seq, maxlen=max_len, padding='post')

# Predict using the trained model
new_sentence_pred = model.predict(new_sentence_padded)
new_sentence_pred_binary = (new_sentence_pred > 0.5).astype(int)

# Convert predictions back to original labels
new_sentence_labels = mlb.inverse_transform(new_sentence_pred_binary)

# Print the predicted labels
print("Predicted Labels:", new_sentence_labels)

In [None]:
# Example sentence
new_sentence = "I am so happy today"

# Preprocess and tokenize the new sentence
new_sentence_seq = tokenizer.texts_to_sequences([new_sentence])
new_sentence_padded = pad_sequences(new_sentence_seq, maxlen=max_len, padding='post')

# Predict using the trained model
new_sentence_pred = model.predict(new_sentence_padded)
new_sentence_pred_binary = (new_sentence_pred > 0.5).astype(int)

# Convert predictions back to original labels
new_sentence_labels = mlb.inverse_transform(new_sentence_pred_binary)

# Print the predicted labels
print("Predicted Labels:", new_sentence_labels)

# Model Comparaison by Predicting on New Dataset

In [None]:
new_data = pd.read_csv("data/Twitter_CyberBullying_Comments_Unseen_Dataset.csv")
new_data.dropna(inplace=True)

In [None]:
# load the model
loaded_model = load_model('./models/emotioncyberbullying/NN_cyberbullying_emotion.h5')  

In [None]:
# Data Processing the new data
X_new_seq = tokenizer.texts_to_sequences(new_data['Text'])
X_new_padded = pad_sequences(X_new_seq, maxlen=max_len, padding='post')

In [None]:
# Predict on the new data
y_new_pred = loaded_model.predict(X_new_padded)
y_new_pred_binary = (y_new_pred > 0.5).astype(int)

# Convert predictions back to original labels
y_new_pred_labels = mlb.inverse_transform(y_new_pred_binary)

In [None]:
new_data['predicted_labels'] = y_new_pred_labels

In [None]:
new_data['predicted_labels']

In [None]:
new_data['CB_pred']=new_data['predicted_labels'].str[0]
new_data['emotion_pred']=new_data['predicted_labels'].str[1]

In [None]:
new_data

In [None]:
# Assuming new_data is your DataFrame
new_data['CB_pred'] = pd.to_numeric(new_data['CB_pred'], errors='coerce')
new_data['CB_pred'] = new_data['CB_pred'].fillna(0)

In [None]:
new_data.info()

In [None]:
#model 2 predictions
fairnesscyberbullying_pred=pd.read_csv('data/fairnesscyberbullying_pred.csv')
fairnesscyberbullying_pred

In [None]:
ieee_model_accuracy = accuracy_score(fairnesscyberbullying_pred['CB_Label'], fairnesscyberbullying_pred['fairnesscyberbullying_pred'])
ieee_model_accuracy

In [None]:
mymodel_accuracy = accuracy_score(new_data['CB_Label'], new_data['CB_pred'])
mymodel_accuracy

# Paired T-test

In [None]:
# Print the accuracy scores
print("IEEE Model:", ieee_model_accuracy)
print("My model:", mymodel_accuracy)

In [None]:
from scipy import stats

# Sample data for Model 1 and Model 2 (replace with your actual data)
model1_predictions = fairnesscyberbullying_pred['fairnesscyberbullying_pred']
model2_predictions = new_data['CB_pred']
true_labels = fairnesscyberbullying_pred['CB_Label']  # Assuming CB_Label is the true label column

# Calculate the differences in accuracy between Model 1 and Model 2
differences = ieee_model_accuracy - mymodel_accuracy

# Perform the Paired T-Test
t_statistic, p_value = stats.ttest_rel(model1_predictions, model2_predictions)

# Set your chosen significance level (alpha)
alpha = 0.05

# Print the Paired T-Test results
print("Paired T-Test Results:")
print(f"t-statistic: {t_statistic}")
print(f"p-value: {p_value}")

# Check if the p-value is less than the significance level
if p_value < alpha:
    print("Reject the null hypothesis: There is a significant difference in performance.")
else:
    print("Fail to reject the null hypothesis: There is no significant difference in performance.")

# McNemar's Test

In [None]:
from sklearn.metrics import confusion_matrix
from scipy.stats import chi2_contingency

new_model1_predictions = model1_predictions
new_model2_predictions = model2_predictions
new_true_labels = new_data['CB_Label']

# Convert predictions to binary (assuming they are probabilities)
threshold = 0.5
new_model1_binary = (new_model1_predictions > threshold).astype(int)
new_model2_binary = (new_model2_predictions > threshold).astype(int)

# Create a confusion matrix for the new data
new_conf_matrix = confusion_matrix(new_true_labels, new_model2_binary)  # Fix: Use new_model2_binary here

# Extract values from the new confusion matrix
new_a = new_conf_matrix[0, 0]
new_b = new_conf_matrix[0, 1]
new_c = new_conf_matrix[1, 0]
new_d = new_conf_matrix[1, 1]

# Perform McNemar's test for the new data
new_statistic = ((new_b - new_c) ** 2) / (new_b +  new_c)
new_p_value = chi2_contingency([[new_b, new_c], [new_d, new_a]])[1]

# Set your chosen significance level (alpha)
alpha = 0.05

# Print McNemar's test results for the new data
print("New McNemar's Test Results:")
print(f"Chi-squared statistic: {new_statistic}")
print(f"p-value: {new_p_value}")

# Check if the p-value is less than the significance level
if new_p_value < alpha:
    print("Reject the null hypothesis: There is a significant difference in predictions.")
else:
    print("Fail to reject the null hypothesis: There is no significant difference in predictions.")