# This code is for HOS using multilingual Embeddings for three Dravidian CodeMix languages

In [None]:
from google.colab import drive
drive.mount('/content/drive')

## Packages to be installed

In [None]:
%pip install -U sentence-transformers

## Import the required packages

In [None]:
# packages
import pandas as pd 
from collections import Counter
from sentence_transformers import SentenceTransformer 
import numpy as np
import sklearn
from sklearn.metrics import (
    precision_score, 
    recall_score, 
    f1_score, 
    accuracy_score, 
    mean_squared_error, 
    mean_absolute_error,
    classification_report,
    confusion_matrix
)

### Read the data

In [5]:
train = pd.read_csv("/content/drive/MyDrive/Colab Notebooks/reduced dataset/tiny/tiny train.csv")
test = pd.read_csv("/content/drive/MyDrive/Colab Notebooks/reduced dataset/tiny/tiny test.csv",header=None)

### Sperate the train and test senetnecs and labels to a list

In [None]:
train_sent_m=[]
train_label_m=[]
test_sent_m = []
test_label_m = []
print(train.columns)
print(test.columns)
for i in train['0']:
    train_sent_m.append(i.split('\t')[0])
    train_label_m.append(i.split('\t')[0])

for i in test[0]:
    test_sent_m.append(i.split('\t')[0])
    test_label_m.append(i.split('\t')[0])

In [None]:
print(f"Train sentences: {len(train_sent_m)}, Train labels: {len(train_label_m)}")
print(f"Test sentences: {len(test_sent_m)}, Test labels: {len(test_label_m)}")


### Label Encoding

In [11]:
from sklearn import preprocessing
le = preprocessing.LabelEncoder()
train_labels_encoded = le.fit_transform(train_label_m)
dev_labels_encoded = le.fit_transform(test_label_m)

### Get Embeddings

In [None]:
trans_model = SentenceTransformer('bert-base-multilingual-cased')
# here other multilingual embeddings can be loaded

In [13]:
train_sentence_embeddings = trans_model.encode(train_sent_m)
dev_sentence_embeddings = trans_model.encode(test_sent_m)

### Weight calculation

In [None]:
from sklearn.utils.class_weight import compute_class_weight
import numpy as np

# Correct usage
class_weights = compute_class_weight(
    class_weight='balanced',  # Strategy
    classes=np.unique(train_label_m),  # Unique class labels
    y=train_label_m  # Training labels
)

print("Class Weights:", class_weights)

### Classification

### Logistic regression

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score, classification_report
from sklearn.preprocessing import LabelEncoder
import pandas as pd

# Ensure LabelEncoder is fit on all unique labels across train and test sets
all_labels = list(set(train_label_m) | set(test_label_m))  # Union of train and test labels
le = LabelEncoder()
le.fit(all_labels)

# Encode train and test labels
train_labels_encoded = le.transform(train_label_m)
dev_labels_encoded = le.transform(test_label_m)

# Define class weights
class_weight = {0: 0.22607331, 1: 23.54117647, 2: 13.69505703, 3: 17.07014218, 4: 2.484}

# Train Logistic Regression model
model = LogisticRegression(class_weight=class_weight, max_iter=10000)
model.fit(train_sentence_embeddings, train_labels_encoded)

# Make predictions
predicted = model.predict(dev_sentence_embeddings)

# Evaluate metrics
print("Eval Scores:")
accuracy = accuracy_score(dev_labels_encoded, predicted)
recall = recall_score(dev_labels_encoded, predicted, average="macro", zero_division=1)
precision = precision_score(dev_labels_encoded, predicted, average="macro", zero_division=1)
f1 = f1_score(dev_labels_encoded, predicted, average="macro", zero_division=1)

print("Macro Metrics:")
print(f"Accuracy: {accuracy:.3f}")
print(f"Precision: {precision:.3f}")
print(f"Recall: {recall:.3f}")
print(f"F1 Score: {f1:.3f}")

# Generate classification report
unique_labels = sorted(set(dev_labels_encoded))  # Ensure we match actual labels present in the dataset
target_names = le.inverse_transform(unique_labels)  # Map encoded labels back to original labels

print("\nClassification Report:")
print(classification_report(dev_labels_encoded, predicted, labels=unique_labels, target_names=target_names, zero_division=1))

# Save predictions
predictions = list(le.inverse_transform(predicted))  # Decode predictions back to original labels
classified_df = pd.DataFrame({'tweets': test_sent_m, 'actual_label': test_label_m, 'predictions': predictions})
classified_df.to_csv('Logistic_regression.csv', index=False)
print("Predictions saved to 'Logistic_regression.csv'")


ValueError                                Traceback (most recent call last)
<ipython-input-18-2e0501ab8853> in <cell line: 20>()
     18 # Train Logistic Regression model
     19 model = LogisticRegression(class_weight=class_weight, max_iter=10000)
---> 20 model.fit(train_sentence_embeddings, train_labels_encoded)
     21 
     22 # Make predictions

8 frames
/usr/local/lib/python3.10/dist-packages/sklearn/utils/class_weight.py in compute_class_weight(class_weight, classes, y)
     85         if unweighted_classes and n_weighted_classes != len(class_weight):
     86             unweighted_classes_user_friendly_str = np.array(unweighted_classes).tolist()
---> 87             raise ValueError(
     88                 f"The classes, {unweighted_classes_user_friendly_str}, are not in"
     89                 " class_weight"

ValueError: The classes, [6, 7, 8, 9, 12, 14, 16, 17, 18, 23, 24, 27, 30, 33], are not in class_weight


### Confusion Matrix

In [None]:
import seaborn as sn
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix
import numpy as np
import pandas as pd

# Assuming 'expected' and 'predicted' are your true and predicted labels
cfm = confusion_matrix(expected, predicted)

# Get the unique classes from the actual and predicted labels
classes = np.unique(np.concatenate((expected, predicted)))

# Convert confusion matrix to a DataFrame for better visualization
df_cfm = pd.DataFrame(cfm, index=classes, columns=classes)

# Plot confusion matrix using seaborn heatmap
plt.figure(figsize=(7, 5))
cfm_plot = sn.heatmap(df_cfm, annot=True, fmt='g', cmap='Blues')

# Save the plot as a PNG file
cfm_plot.figure.savefig("cfm_LR.png")

# Show the plot
plt.show()


### Naive Bayse

In [None]:
# NAive Baise
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import ComplementNB
model = GaussianNB()
model.fit(train_sentence_embeddings, train_labels_encoded)


# make predictions
expected = dev_labels_encoded
predicted = model.predict(dev_sentence_embeddings)


print("eval scores")
accuracy = accuracy_score(expected, predicted)
recall = recall_score(expected, predicted , average="macro")
precision = precision_score(expected, predicted , average="macro")
f1 = f1_score(expected, predicted, average="macro")

print("macro")
print("accuracy")
print("%.3f" %accuracy)
print("precision")
print("%.3f" %precision)
print("racall")
print("%.3f" %recall)
print("f1score")
print("%.3f" %f1)



print("Classification report")
import sklearn
from sklearn.metrics import classification_report
sklearn.metrics.classification_report(expected, predicted)
target_names = ['Not_offensive', 'Offensive_Targeted_Insult_Group', 'Offensive_Targeted_Insult_Individual', 'Offensive_Untargetede', 'not-malayalam']
print(classification_report(expected, predicted, target_names=target_names))


#Saving the predictions
import csv
predictions = list(le.inverse_transform(predicted))
classified_df = pd.DataFrame( {'tweets': test_sent_m, 'actual_label': test_label_m, 'predictions': predictions})
classified_df.to_csv('Naive_baise.csv')
print("prediction saved")

## Confusion matrix

In [None]:
from sklearn.metrics import plot_confusion_matrix
import seaborn as sn
import matplotlib.pyplot as plt
from sklearn.metrics import classification_report, confusion_matrix
cfm= confusion_matrix(expected, predicted)
classes = np.unique(train_label_m)

df_cfm = pd.DataFrame(cfm, index = classes, columns = classes)
plt.figure(figsize = (7,5))
cfm_plot = sn.heatmap(df_cfm, annot=True,  fmt='g')
cfm_plot.figure.savefig("cfm_NB.png")

## Random Forest

In [None]:
#random forest
from sklearn.ensemble import RandomForestClassifier
class_weight = {0:0.22607331, 1:23.54117647 ,2:13.69505703 ,3:17.07014218 ,4:2.484}
model = RandomForestClassifier(n_estimators=100, class_weight=class_weight)
model = model.fit(train_sentence_embeddings, train_labels_encoded)

# make predictions
expected = dev_labels_encoded
predicted = model.predict(dev_sentence_embeddings)


print("eval scores")
accuracy = accuracy_score(expected, predicted)
recall = recall_score(expected, predicted , average="macro")
precision = precision_score(expected, predicted , average="macro")
f1 = f1_score(expected, predicted, average="macro")

print("macro")
print("accuracy")
print("%.3f" %accuracy)
print("precision")
print("%.3f" %precision)
print("racall")
print("%.3f" %recall)
print("f1score")
print("%.3f" %f1)



print("Classification report")
import sklearn
from sklearn.metrics import classification_report
sklearn.metrics.classification_report(expected, predicted)
target_names = ['Not_offensive', 'Offensive_Targeted_Insult_Group', 'Offensive_Targeted_Insult_Individual', 'Offensive_Untargetede', 'not-malayalam']
print(classification_report(expected, predicted, target_names=target_names))


#Saving the predictions
import csv
predictions = list(le.inverse_transform(predicted))
classified_df = pd.DataFrame( {'tweets': test_sent_m, 'actual_label': test_label_m, 'predictions': predictions})
classified_df.to_csv('RF.csv')
print("prediction saved")

## Confusion Matrix

In [None]:
from sklearn.metrics import plot_confusion_matrix
import seaborn as sn
import matplotlib.pyplot as plt
from sklearn.metrics import classification_report, confusion_matrix
cfm= confusion_matrix(expected, predicted)
classes = np.unique(train_label_m)

df_cfm = pd.DataFrame(cfm, index = classes, columns = classes)
plt.figure(figsize = (7,5))
cfm_plot = sn.heatmap(df_cfm, annot=True,  fmt='g')
cfm_plot.figure.savefig("cfm_RF.png")

## SVM RBF

In [None]:
# SVM rbf
from sklearn.svm import SVC
from sklearn import svm
class_weight = {0:0.22607331, 1:23.54117647 ,2:13.69505703 ,3:17.07014218 ,4:2.484}
model = svm.SVC(kernel='rbf',C = 1000, class_weight =class_weight)
model = model.fit(train_sentence_embeddings, train_labels_encoded)

# make predictions
expected = dev_labels_encoded
predicted = model.predict(dev_sentence_embeddings)


print("eval scores")
accuracy = accuracy_score(expected, predicted)
recall = recall_score(expected, predicted , average="macro")
precision = precision_score(expected, predicted , average="macro")
f1 = f1_score(expected, predicted, average="macro")

print("macro")
print("accuracy")
print("%.3f" %accuracy)
print("precision")
print("%.3f" %precision)
print("racall")
print("%.3f" %recall)
print("f1score")
print("%.3f" %f1)



print("Classification report")
import sklearn
from sklearn.metrics import classification_report
sklearn.metrics.classification_report(expected, predicted)
target_names = ['Not_offensive', 'Offensive_Targeted_Insult_Group', 'Offensive_Targeted_Insult_Individual', 'Offensive_Untargetede', 'not-malayalam']
print(classification_report(expected, predicted, target_names=target_names))


#Saving the predictions
import csv
predictions = list(le.inverse_transform(predicted))
classified_df = pd.DataFrame( {'tweets': test_sent_m, 'actual_label': test_label_m, 'predictions': predictions})
classified_df.to_csv('SVM_RBF.csv')
print("prediction saved")


## Confusion Matrix

In [None]:
from sklearn.metrics import plot_confusion_matrix
import seaborn as sn
import matplotlib.pyplot as plt
from sklearn.metrics import classification_report, confusion_matrix
cfm= confusion_matrix(expected, predicted)
classes = np.unique(train_label_m)

df_cfm = pd.DataFrame(cfm, index = classes, columns = classes)
plt.figure(figsize = (7,5))
cfm_plot = sn.heatmap(df_cfm, annot=True,  fmt='g')
cfm_plot.figure.savefig("cfm_SVM_rbf.png")

## SVM Poly

In [None]:
# SVM poly
from sklearn.svm import SVC
from sklearn import svm
model = svm.SVC(kernel='poly',C = 1000)
model = model.fit(train_sentence_embeddings, train_labels_encoded)

# make predictions
expected = dev_labels_encoded
predicted = model.predict(dev_sentence_embeddings)


print("eval scores")
accuracy = accuracy_score(expected, predicted)
recall = recall_score(expected, predicted , average="macro")
precision = precision_score(expected, predicted , average="macro")
f1 = f1_score(expected, predicted, average="macro")


print("macro")
print("accuracy")
print("%.3f" %accuracy)
print("precision")
print("%.3f" %precision)
print("racall")
print("%.3f" %recall)
print("f1score")
print("%.3f" %f1)



print("Classification report")
import sklearn
from sklearn.metrics import classification_report
sklearn.metrics.classification_report(expected, predicted)
target_names = ['Not_offensive', 'Offensive_Targeted_Insult_Group', 'Offensive_Targeted_Insult_Individual', 'Offensive_Untargetede', 'not-malayalam']
print(classification_report(expected, predicted, target_names=target_names))


#Saving the predictions
import csv
predictions = list(le.inverse_transform(predicted))
classified_df = pd.DataFrame( {'tweets': test_sent_m, 'actual_label': test_label_m, 'predictions': predictions})
classified_df.to_csv('SVM_poly_labse.csv')
print("prediction saved")

#Save model
import pickle
# Save the trained model as a pickle string.
pkl_filename = "poly_labse.pkl"
with open(pkl_filename, 'wb') as file:
    pickle.dump(model, file)
file.close()

## Confusion Matrix

In [None]:
from sklearn.metrics import plot_confusion_matrix
import seaborn as sn
import matplotlib.pyplot as plt
from sklearn.metrics import classification_report, confusion_matrix
cfm= confusion_matrix(expected, predicted)
classes = np.unique(train_label_m)

df_cfm = pd.DataFrame(cfm, index = classes, columns = classes)
plt.figure(figsize = (7,5))
cfm_plot = sn.heatmap(df_cfm, annot =True,  fmt='g')
cfm_plot.figure.savefig("cfm_SVM_poly_labse.png")

## SVM Linear

In [None]:
# SVM Linear
from sklearn.svm import SVC
from sklearn import svm
model = svm.SVC(kernel='linear',C = 10)
model = model.fit(train_sentence_embeddings, train_labels_encoded)

# make predictions
expected = dev_labels_encoded
predicted = model.predict(dev_sentence_embeddings)


print("eval scores")
accuracy = accuracy_score(expected, predicted)
recall = recall_score(expected, predicted , average="macro")
precision = precision_score(expected, predicted , average="macro")
f1 = f1_score(expected, predicted, average="macro")

print("macro")
print("accuracy")
print("%.3f" %accuracy)
print("precision")
print("%.3f" %precision)
print("racall")
print("%.3f" %recall)
print("f1score")
print("%.3f" %f1)



print("Classification report")
import sklearn
from sklearn.metrics import classification_report
sklearn.metrics.classification_report(expected, predicted)
target_names = ['Not_offensive', 'Offensive_Targeted_Insult_Group', 'Offensive_Targeted_Insult_Individual', 'Offensive_Untargetede', 'not-malayalam']
print(classification_report(expected, predicted, target_names=target_names))


#Saving the predictions
import csv
predictions = list(le.inverse_transform(predicted))
classified_df = pd.DataFrame( {'tweets': test_sent_m, 'actual_label': test_label_m, 'predictions': predictions})
classified_df.to_csv('SVM_linear_labse.csv')
print("prediction saved")

#Save model
import pickle
# Save the trained model as a pickle string.
pkl_filename = "linear_labse.pkl"
with open(pkl_filename, 'wb') as file:
    pickle.dump(model, file)
file.close()

## Confusion Matrix

In [None]:
from sklearn.metrics import plot_confusion_matrix
import seaborn as sn
import matplotlib.pyplot as plt
from sklearn.metrics import classification_report, confusion_matrix
cfm= confusion_matrix(expected, predicted)
classes = np.unique(train_label_m)

df_cfm = pd.DataFrame(cfm, index = classes, columns = classes)
plt.figure(figsize = (7,5))
cfm_plot = sn.heatmap(df_cfm, annot =True,  fmt='g')
cfm_plot.figure.savefig("cfm_SVM_linear_labse.png")

## Adaboost

In [None]:
from sklearn.ensemble import AdaBoostClassifier
model = AdaBoostClassifier(n_estimators=100)
model.fit(train_sentence_embeddings, train_labels_encoded)

# make predictions
expected = dev_labels_encoded
predicted = model.predict(dev_sentence_embeddings)

print("eval scores")
accuracy = accuracy_score(expected, predicted)
recall = recall_score(expected, predicted , average="macro")
precision = precision_score(expected, predicted , average="macro")
f1 = f1_score(expected, predicted, average="macro")

print("macro")
print("accuracy")
print("%.3f" %accuracy)
print("precision")
print("%.3f" %precision)
print("racall")
print("%.3f" %recall)
print("f1score")
print("%.3f" %f1)



print("Classification report")
import sklearn
from sklearn.metrics import classification_report
sklearn.metrics.classification_report(expected, predicted)
target_names = ['Not_offensive', 'Offensive_Targeted_Insult_Group', 'Offensive_Targeted_Insult_Individual', 'Offensive_Untargetede', 'not-malayalam']
print(classification_report(expected, predicted, target_names=target_names))


#Saving the predictions
import csv
predictions = list(le.inverse_transform(predicted))
classified_df = pd.DataFrame( {'tweets': test_sent_m, 'actual_label': test_label_m, 'predictions': predictions})
classified_df.to_csv('Adaboost.csv')
print("prediction saved")

## Confusion matrix

In [None]:
from sklearn.metrics import plot_confusion_matrix
import seaborn as sn
import matplotlib.pyplot as plt
from sklearn.metrics import classification_report, confusion_matrix
cfm= confusion_matrix(expected, predicted)
classes = np.unique(train_label_m)

df_cfm = pd.DataFrame(cfm, index = classes, columns = classes)
plt.figure(figsize = (7,5))
cfm_plot = sn.heatmap(df_cfm, annot=True,  fmt='g')
cfm_plot.figure.savefig("cfm_adaboost.png")

## KNN

In [None]:
from sklearn.neighbors import KNeighborsClassifier
model = KNeighborsClassifier(weights = 'distance')
model.fit(train_sentence_embeddings, train_labels_encoded)


# make predictions
expected = dev_labels_encoded
predicted = model.predict(dev_sentence_embeddings)


print("eval scores")
accuracy = accuracy_score(expected, predicted)
recall = recall_score(expected, predicted , average="macro")
precision = precision_score(expected, predicted , average="macro")
f1 = f1_score(expected, predicted, average="macro")

print("macro")
print("accuracy")
print("%.3f" %accuracy)
print("precision")
print("%.3f" %precision)
print("racall")
print("%.3f" %recall)
print("f1score")
print("%.3f" %f1)



print("Classification report")
import sklearn
from sklearn.metrics import classification_report
sklearn.metrics.classification_report(expected, predicted)
target_names = ['Not_offensive', 'Offensive_Targeted_Insult_Group', 'Offensive_Targeted_Insult_Individual', 'Offensive_Untargetede', 'not-malayalam']
print(classification_report(expected, predicted, target_names=target_names))


#Saving the predictions
import csv
predictions = list(le.inverse_transform(predicted))
classified_df = pd.DataFrame( {'tweets': test_sent_m, 'actual_label': test_label_m, 'predictions': predictions})
classified_df.to_csv('KNN.csv')
print("prediction saved")


## Confusion Matrix

In [None]:
from sklearn.metrics import plot_confusion_matrix
import seaborn as sn
import matplotlib.pyplot as plt
from sklearn.metrics import classification_report, confusion_matrix
cfm= confusion_matrix(expected, predicted)
classes = np.unique(train_label_m)

df_cfm = pd.DataFrame(cfm, index = classes, columns = classes)
plt.figure(figsize = (7,5))
cfm_plot = sn.heatmap(df_cfm, annot=True,  fmt='g')
cfm_plot.figure.savefig("cfm_KNN.png")

## Decision Tree

In [None]:
# Decision Tree
from sklearn.tree import DecisionTreeClassifier
class_weight = {0:0.22607331, 1:23.54117647 ,2:13.69505703 ,3:17.07014218 ,4:2.484}
model = DecisionTreeClassifier(class_weight=class_weight)
model.fit(train_sentence_embeddings, train_labels_encoded)


# make predictions
expected = dev_labels_encoded
predicted = model.predict(dev_sentence_embeddings)


print("eval scores")
accuracy = accuracy_score(expected, predicted)
recall = recall_score(expected, predicted , average="macro")
precision = precision_score(expected, predicted , average="macro")
f1 = f1_score(expected, predicted, average="macro")

print("macro")
print("accuracy")
print("%.3f" %accuracy)
print("precision")
print("%.3f" %precision)
print("racall")
print("%.3f" %recall)
print("f1score")
print("%.3f" %f1)



print("Classification report")
import sklearn
from sklearn.metrics import classification_report
sklearn.metrics.classification_report(expected, predicted)
target_names = ['Not_offensive', 'Offensive_Targeted_Insult_Group', 'Offensive_Targeted_Insult_Individual', 'Offensive_Untargetede', 'not-malayalam']
print(classification_report(expected, predicted, target_names=target_names))


#Saving the predictions
import csv
predictions = list(le.inverse_transform(predicted))
classified_df = pd.DataFrame( {'tweets': test_sent_m, 'actual_label': test_label_m, 'predictions': predictions})
classified_df.to_csv('DT.csv')
print("prediction saved")
