<a href="https://colab.research.google.com/github/mihneacoman/Sentiment-Analysis-IMDB-reviews/blob/main/Sentiment_Analysis_IMDB.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>



---

---




**Importing the csv file and renaming the columns.**

---

---





In [None]:
import numpy as np
import pandas as pd
import os

from google.colab import drive
drive.mount('/content/drive')
path = "/content/drive/MyDrive/Colab Notebooks/Proiect IMDB/"
df = pd.read_csv(path+'IMDB Dataset.csv')

Importing the required libraries.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import re
import spacy
import nltk
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
import nltk
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from bs4 import BeautifulSoup
from nltk.corpus import wordnet as wn
nltk.download('stopwords')
nltk.download('wordnet')

In [None]:
df = df.rename(columns={'review': 'text'})
df['sentiment'] = df['sentiment'].map({'negative':-1,'positive':1})
df



> Add blockquote



---


---


**Choosing a random sample of 2000 positive and 2000 negative reviews.**

---


---





In [None]:
print(df['sentiment'].value_counts())

# Impart setul de date dupa sentiment (negativ si pozitiv)
df_negative = df[df['sentiment'] == -1]
df_positive = df[df['sentiment'] == 1]

min_samples = 2000

# Selectam random 2000 de exemple din fiecare sentiment
df_negative_sampled = df_negative.sample(n=min_samples)
df_positive_sampled = df_positive.sample(n=min_samples)

# Le combinam intr-un singur dataframe
df_sampled = pd.concat([df_negative_sampled, df_positive_sampled])

# Amestecam randurile
df_sampled = df_sampled.sample(frac=1).reset_index(drop=True)

print(df_sampled['sentiment'].value_counts())

df_sampled




---



---
**Analiza setului de date**


---



---




In [None]:
label_map = {-1: "negativ", 1: "pozitiv"}
sentiment_counts = df["sentiment"].value_counts().sort_index()
sentiment_counts.index = sentiment_counts.index.map(label_map)
sentiment_counts.plot(kind="bar", color=["red", "green"])

In [None]:
import seaborn as sns

review_len = pd.Series([len(review.split()) for review in df['text']])

fig = plt.figure(figsize=(14,7))
df['length'] = df.text.str.split().apply(len)
ax1 = fig.add_subplot(122)
sns.histplot(df[df['sentiment']==1]['length'], ax=ax1,color='magenta')
describe = df.length[df.sentiment==1].describe().to_frame().round(2)

ax2 = fig.add_subplot(121)
ax2.axis('off')
font_size = 14
bbox = [0, 0, 1, 1]
table = ax2.table(cellText = describe.values, rowLabels = describe.index, bbox=bbox, colLabels=describe.columns)
table.set_fontsize(font_size)
fig.suptitle('Distribution of text length for positive sentiment reviews.', fontsize=16)

plt.show()

In [None]:
fig = plt.figure(figsize=(14,7))
df['length'] = df.text.str.split().apply(len)
ax1 = fig.add_subplot(122)
sns.histplot(df[df['sentiment']==-1]['length'], ax=ax1,color='magenta')
describe = df.length[df.sentiment==-1].describe().to_frame().round(2)

ax2 = fig.add_subplot(121)
ax2.axis('off')
font_size = 14
bbox = [0, 0, 1, 1]
table = ax2.table(cellText = describe.values, rowLabels = describe.index, bbox=bbox, colLabels=describe.columns)
table.set_fontsize(font_size)
fig.suptitle('Distribution of text length for negative sentiment reviews.', fontsize=16)

plt.show()



---



---


**TF_IDF vectorization and logistic regression**


---



---



In [31]:
# loading the spacy model for tokenization
nlp = spacy.load("en_core_web_sm")

lemmatizer = WordNetLemmatizer()

# list of common negations (we wish to keep them)
negations = {"not", "no", "n't", "isn't", "aren't", "won't", "can't", "never", "nothing"}

def review_to_words(review):

    # removing HTML tags and hashtags
    review = re.sub(r"<.*?>", " ", review)
    review = re.sub(r"(@[A-Za-z0-9_]+)|(#\S+)", " ", review)

    doc = nlp(review.lower())

    clean_tokens = []
    for token in doc:
        # removing punctuation, spaces and stopwords (excluding negations)
        if token.is_punct or token.is_space:
            continue
        if token.is_stop and token.text not in negations:
            continue
        # lematizare
        clean_tokens.append(token.lemma_)

    return clean_tokens

df_sampled['processed_text'] = df_sampled['text'].apply(review_to_words)

# checking that preprocessing worked
print(df_sampled[['text', 'processed_text']].head())

#transforming tokens into strings
df_sampled['processed_text'] = df_sampled['processed_text'].apply(lambda tokens: ' '.join(tokens))

#splitting the dataset
X_text_train, X_text_test, y_train, y_test = train_test_split(df_sampled['processed_text'], df_sampled['sentiment'], test_size=0.3)

# vectorization of X_train and X_test
vectorizer = TfidfVectorizer(max_features=5000, ngram_range=(1, 2))
X_train = vectorizer.fit_transform(X_text_train)

X_test = vectorizer.transform(X_text_test)

# training the Logistic Regression model
model = LogisticRegression(max_iter=1000, class_weight='balanced', C=1.0)
model.fit(X_train, y_train)

# predictions and evaluation
y_pred_lr = model.predict(X_test)
y_true_lr = y_test
print("Accuracy:", accuracy_score(y_test, y_pred_lr))
print("\nClassification Report:\n", classification_report(y_test, y_pred_lr))

                                                text  \
0  Ever since I remember, I have loved airplanes ...   
1  ...But not this one! I always wanted to know "...   
2  I have seen this film on 3 different occasions...   
3  Three of the things you can say about Spalding...   
4  This movie was the best movie I have ever seen...   

                                      processed_text  
0  [remember, love, airplane, fly, college, priva...  
1  [not, want, know, happen, never, know, sure, h...  
2  [see, film, 3, different, occasions.on, occasi...  
3  [thing, spalde, gray, certainly, march, beat, ...  
4  [movie, good, movie, see, lds, highly, recomme...  
Accuracy: 0.835

Classification Report:
               precision    recall  f1-score   support

          -1       0.87      0.79      0.83       604
           1       0.81      0.88      0.84       596

    accuracy                           0.83      1200
   macro avg       0.84      0.84      0.83      1200
weighted avg       0



---



---



**Verifying a new review.**


---



---



In [None]:
new_text = "I did not enjoy this movie"
print(f"The review is: {new_text}")
tokens = review_to_words(new_text)
prop = ' '.join(tokens)

X_new = vectorizer.transform([prop])

y_pred_new = model.predict(X_new)

print("Preprocessed review:", prop)
print("Prediction for this review:", y_pred_new)



---

False positives and false negatives


---



In [None]:
# resetting the index to match X_test
df_test = df_sampled.iloc[y_test.index].copy()
df_test["true_label"] = y_test.values
df_test["predicted_label"] = y_pred_lr

# false positives:
false_positives = df_test[(df_test["true_label"] == -1) & (df_test["predicted_label"] == 1)]

# false negatives:
false_negatives = df_test[(df_test["true_label"] == 1) & (df_test["predicted_label"] == -1)]

# printing a few examples
print("FALSE POSITIVES (predicted 1, true -1):")
print(false_positives[["text", "true_label", "predicted_label"]].head(5))

print("\nFALSE NEGATIVES (predicted -1, true 1):")
print(false_negatives[["text", "true_label", "predicted_label"]].head(5))
#pd.set_option("display.max_colwidth", None)

Handling negations.

In [33]:
lemmatizer = WordNetLemmatizer()
nlp = spacy.load("en_core_web_sm")

#function that replaces negated words with their antonyms
def Negation(sentence):

    i = 1
    while i < len(sentence):
        if sentence[i-1] in ['not', "n't"]:
            antonyms = []

            for syn in wn.synsets(sentence[i]):
                for lemma in syn.lemmas():
                    if lemma.antonyms():
                        antonyms.append(lemma.antonyms()[0].name())

            if 'bad' in antonyms:
                sentence[i] = 'bad'
            elif antonyms:
                sentence[i] = antonyms[0]
            sentence[i-1] = ''
        i += 1

    return ' '.join([word for word in sentence if word])

df_sampled = df_sampled.drop('processed_text', axis=1)

df_sampled['processed_text'] = df_sampled['text'].apply(review_to_words)
df_sampled['processed_text'] = df_sampled['processed_text'].apply(Negation)

X_text_train, X_text_test, y_train, y_test = train_test_split(df_sampled['processed_text'], df_sampled['sentiment'], test_size=0.3)

vectorizer = TfidfVectorizer(max_features=5000, ngram_range=(1, 2))
X_train = vectorizer.fit_transform(X_text_train)

X_test = vectorizer.transform(X_text_test)

# training the new model (that handles negations)
model = LogisticRegression(max_iter=1000, class_weight='balanced', C=1.0)
model.fit(X_train, y_train)

# evaluation
y_pred_lr = model.predict(X_test)
y_true_lr = y_test
print("Accuracy:", accuracy_score(y_test, y_pred_lr))
print("\nClassification Report:\n", classification_report(y_test, y_pred_lr))

Accuracy: 0.8566666666666667

Classification Report:
               precision    recall  f1-score   support

          -1       0.88      0.84      0.86       617
           1       0.84      0.88      0.86       583

    accuracy                           0.86      1200
   macro avg       0.86      0.86      0.86      1200
weighted avg       0.86      0.86      0.86      1200



Bag of Words + Logistic Regression

In [32]:
from sklearn.feature_extraction.text import CountVectorizer

X_text_train_bow, X_text_test_bow, y_train_bow, y_test_bow = train_test_split(df_sampled['processed_text'], df_sampled['sentiment'], test_size=0.3)

vectorizer = CountVectorizer(max_features=5000, ngram_range=(1, 2))
X_train_bow = vectorizer.fit_transform(X_text_train_bow)

X_test_bow = vectorizer.transform(X_text_test_bow)

model = LogisticRegression(max_iter=1000, class_weight='balanced', C=1.0)
model.fit(X_train_bow, y_train_bow)

y_pred_bow = model.predict(X_test_bow)
y_true_bow = y_test_bow
print("Accuracy:", accuracy_score(y_test_bow, y_pred_bow))
print("\nClassification Report:\n", classification_report(y_test_bow, y_pred_bow))

Accuracy: 0.8375

Classification Report:
               precision    recall  f1-score   support

          -1       0.83      0.85      0.84       607
           1       0.84      0.83      0.83       593

    accuracy                           0.84      1200
   macro avg       0.84      0.84      0.84      1200
weighted avg       0.84      0.84      0.84      1200



TF-IDF + Naive Bayes

In [34]:
from sklearn.naive_bayes import MultinomialNB

X_text_train_nb, X_text_test_nb, y_train_nb, y_test_nb = train_test_split(df_sampled['processed_text'], df_sampled['sentiment'], test_size=0.3)

vectorizer = TfidfVectorizer(max_features=5000, ngram_range=(1, 2))
X_train_nb = vectorizer.fit_transform(X_text_train_nb)

X_test_nb = vectorizer.transform(X_text_test_nb)

model = MultinomialNB()
model.fit(X_train_nb, y_train_nb)

y_pred_nb = model.predict(X_test_nb)
y_true_nb = y_test_nb
print("Accuracy:", accuracy_score(y_test_nb, y_pred_nb))
print("\nClassification Report:\n", classification_report(y_test_nb, y_pred_nb))

Accuracy: 0.8483333333333334

Classification Report:
               precision    recall  f1-score   support

          -1       0.82      0.87      0.85       583
           1       0.87      0.82      0.85       617

    accuracy                           0.85      1200
   macro avg       0.85      0.85      0.85      1200
weighted avg       0.85      0.85      0.85      1200



BoW cu Naive Bayes

In [35]:
X_text_train_nb, X_text_test_nb, y_train_nb, y_test_nb = train_test_split(df_sampled['processed_text'], df_sampled['sentiment'], test_size=0.3)

vectorizer = CountVectorizer(max_features=5000, ngram_range=(1, 2))
X_train_nb = vectorizer.fit_transform(X_text_train_nb)

X_test_nb = vectorizer.transform(X_text_test_nb)

model = MultinomialNB()
model.fit(X_train_nb, y_train_nb)

y_pred_nb = model.predict(X_test_nb)
y_true_nb = y_test_nb
print("Accuracy:", accuracy_score(y_test_nb, y_pred_nb))
print("\nClassification Report:\n", classification_report(y_test_nb, y_pred_nb))

Accuracy: 0.8341666666666666

Classification Report:
               precision    recall  f1-score   support

          -1       0.83      0.85      0.84       598
           1       0.84      0.82      0.83       602

    accuracy                           0.83      1200
   macro avg       0.83      0.83      0.83      1200
weighted avg       0.83      0.83      0.83      1200





---



---


**Adding a dependency tree derived feature.**


---


---




In [36]:
stop_words = set(stopwords.words("english"))
lemmatizer = WordNetLemmatizer()
nlp = spacy.load("en_core_web_sm")

def clean_text(text):
    text = BeautifulSoup(text, "html.parser").get_text()
    text = re.sub(r"[^a-zA-Z0-9]", " ", text)
    return text.lower()

# we extract SVO triplets
def extract_dependency_features(text):
    doc = nlp(text)
    triples = []
    for token in doc:
        if token.dep_ == "ROOT" and token.pos_ == "VERB":
            subject = [child for child in token.children if "subj" in child.dep_]
            obj = [child for child in token.children if "obj" in child.dep_ or "attr" in child.dep_]
            if subject and obj:
                s = subject[0].lemma_
                v = token.lemma_
                o = obj[0].lemma_
                triples.append(f"{s}_{v}_{o}")

    return " ".join(triples)

def preprocess_with_dependencies(text):
    text_clean = clean_text(text)
    tokens = text_clean.split()
    tokens_neg_handled = Negation(tokens)
    text_neg_handled = tokens_neg_handled
    text_lemmas = [lemmatizer.lemmatize(word) for word in text_neg_handled.split() if word not in stop_words]
    triple_features = extract_dependency_features(text_neg_handled)
    return " ".join(text_lemmas) + " " + triple_features

df_sampled['processed_text_tree'] = df_sampled['text'].apply(preprocess_with_dependencies)

X_train, X_test, y_train, y_test = train_test_split(df_sampled['processed_text_tree'], df_sampled['sentiment'], test_size=0.3)

vectorizer = TfidfVectorizer(max_features=5000, ngram_range=(1, 2))
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

model = LogisticRegression(max_iter=1000, class_weight='balanced', C=1.0)
model.fit(X_train_vec, y_train)

# Predicții și metrice
y_pred_tree = model.predict(X_test_vec)
y_true_tree=y_test
print("Accuracy:", accuracy_score(y_test, y_pred_tree))
print("\nClassification Report:\n")
print(classification_report(y_test, y_pred_tree))

Accuracy: 0.8675

Classification Report:

              precision    recall  f1-score   support

          -1       0.89      0.84      0.87       616
           1       0.84      0.89      0.87       584

    accuracy                           0.87      1200
   macro avg       0.87      0.87      0.87      1200
weighted avg       0.87      0.87      0.87      1200



In [None]:
df_test_tree = df_sampled.iloc[y_test.index].copy()
df_test_tree["true_label"] = y_test.values
df_test_tree["predicted_label"] = y_pred_tree

# False positives
false_positives = df_test_tree[(df_test_tree["true_label"] == -1) & (df_test_tree["predicted_label"] == 1)]

# False negatives
false_negatives = df_test_tree[(df_test_tree["true_label"] == 1) & (df_test_tree["predicted_label"] == -1)]

# Printing a few examples
print("FALSE POSITIVES (predicted 1, true -1):")
print(false_positives[["text", "true_label", "predicted_label"]].head(5))

print("\nFALSE NEGATIVES (predicted -1, true 1):")
print(false_negatives[["text", "true_label", "predicted_label"]].head(5))



---



---
Transformers


---



---




In [None]:
!pip uninstall -y transformers
!pip install transformers datasets -q
from transformers import TrainingArguments
import transformers
print(transformers.__version__)

In [37]:
import os
import pandas as pd
from datasets import Dataset
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from transformers import DataCollatorWithPadding
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

# desabling wandb (external logging)
os.environ["WANDB_DISABLED"] = "true"

# we need the labels to be 0 and 1
df_test_bert = df_sampled.copy()
df_test_bert['label'] = df_test_bert['sentiment'].map({-1: 0, 1: 1})

# train/test split
train_df, test_df = train_test_split(df_test_bert, test_size=0.2, stratify=df_test_bert['label'])

train_dataset = Dataset.from_pandas(train_df[['text', 'label']])
test_dataset = Dataset.from_pandas(test_df[['text', 'label']])

# tokenizer
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

def tokenize_function(example):
    return tokenizer(example["text"], truncation=True, padding=True)

train_dataset = train_dataset.map(tokenize_function, batched=True)
test_dataset = test_dataset.map(tokenize_function, batched=True)

# model and data collator
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2)
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# hyperparameters
training_args = TrainingArguments(
    output_dir="./results",
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=10
)

# trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
)

trainer.train()

# evaluation
preds_output = trainer.predict(test_dataset)
y_pred_bert = preds_output.predictions.argmax(-1)
y_true_bert = preds_output.label_ids

df_test_bert = test_df.copy()

df_test_bert["true_label"] = y_true_bert
df_test_bert["predicted_label"] = y_pred_bert

df_test_bert["true_label"] = df_test_bert["true_label"].map({0: -1, 1: 1})
df_test_bert["predicted_label"] = df_test_bert["predicted_label"].map({0: -1, 1: 1})

print("\nclassification report:")
print(classification_report(y_true_bert, y_pred_bert, target_names=["Negative", "Positive"]))

Map:   0%|          | 0/3200 [00:00<?, ? examples/s]

Map:   0%|          | 0/800 [00:00<?, ? examples/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
  trainer = Trainer(


Step,Training Loss
10,0.6907
20,0.7096
30,0.6725
40,0.5792
50,0.3964
60,0.329
70,0.2744
80,0.3893
90,0.3699
100,0.2809



classification report:
              precision    recall  f1-score   support

    Negative       0.90      0.89      0.90       400
    Positive       0.89      0.91      0.90       400

    accuracy                           0.90       800
   macro avg       0.90      0.90      0.90       800
weighted avg       0.90      0.90      0.90       800



In [None]:
from sklearn.metrics import ConfusionMatrixDisplay
import matplotlib.pyplot as plt

ConfusionMatrixDisplay.from_predictions(y_true_bert, y_pred_bert, display_labels=["Negative", "Positive"])
plt.title("Confusion Matrix - Transformers (BERT)")
plt.show()


Training loss plot.

In [None]:
import matplotlib.pyplot as plt

log_history = trainer.state.log_history

steps = []
losses = []

for entry in log_history:
    if "loss" in entry and "epoch" in entry:
        steps.append(entry["step"])
        losses.append(entry["loss"])

plt.figure(figsize=(8, 5))
plt.plot(steps, losses, marker='o')
plt.title("Training Loss per Step")
plt.xlabel("Step")
plt.ylabel("Loss")
plt.grid(True)
plt.tight_layout()
plt.show()


---



**Comparing the models.**

---



In [None]:
import pandas as pd
from sklearn.metrics import classification_report

y_pred_lr = y_pred_lr.ravel()
y_pred_tree = y_pred_tree.ravel()
y_pred_bert = y_pred_bert.ravel()

report_lr = classification_report(y_true_lr, y_pred_lr, target_names=["Negative", "Positive"], output_dict=True)
report_tree = classification_report(y_true_tree, y_pred_tree, target_names=["Negative", "Positive"], output_dict=True)
report_bert = classification_report(y_true_bert, y_pred_bert, target_names=["Negative", "Positive"], output_dict=True)

def extract_metrics(report, model_name):
    rows = []
    for cls in ["Negative", "Positive"]:
        row = {
            "Model": model_name,
            "Class": cls,
            "Precision": round(report[cls]["precision"], 2),
            "Recall": round(report[cls]["recall"], 2),
            "F1-Score": round(report[cls]["f1-score"], 2),
            "Support": int(report[cls]["support"])
        }
        rows.append(row)
    return rows

data = []
data += extract_metrics(report_lr, "Logistic Regression")
data += extract_metrics(report_tree, "Dependency Tree")
data += extract_metrics(report_bert, "Transformers")

comparison_df = pd.DataFrame(data)

print(comparison_df)

In [None]:
import pandas as pd
import matplotlib.pyplot as plt

false_positives_lr = df_test[(df_test["true_label"] == -1) & (df_test["predicted_label"] == 1)]
false_negatives_lr = df_test[(df_test["true_label"] == 1) & (df_test["predicted_label"] == -1)]

false_positives_tree = df_test_tree[(df_test_tree["true_label"] == -1) & (df_test_tree["predicted_label"] == 1)]
false_negatives_tree = df_test_tree[(df_test_tree["true_label"] == 1) & (df_test_tree["predicted_label"] == -1)]

false_positives_bert = df_test_bert[(df_test_bert["true_label"] == -1) & (df_test_bert["predicted_label"] == 1)]
false_negatives_bert = df_test_bert[(df_test_bert["true_label"] == 1) & (df_test_bert["predicted_label"] == -1)]

#total number of errors
false_pos_lr_count = len(false_positives_lr)
false_neg_lr_count = len(false_negatives_lr)
false_pos_tree_count = len(false_positives_tree)
false_neg_tree_count = len(false_negatives_tree)
false_pos_bert_count = len(false_positives_bert)
false_neg_bert_count = len(false_negatives_bert)

# comparative dataframe
comparison_data = {
    "Model": ["Logistic Regression", "Logistic Regression",
              "Dependency Tree", "Dependency Tree",
              "Transformers", "Transformers"],
    "Error Type": ["False Positives", "False Negatives",
                   "False Positives", "False Negatives",
                   "False Positives", "False Negatives"],
    "Count": [false_pos_lr_count, false_neg_lr_count,
              false_pos_tree_count, false_neg_tree_count,
              false_pos_bert_count, false_neg_bert_count]
}

comparison_df = pd.DataFrame(comparison_data)

print(comparison_df)

fig, ax = plt.subplots(1, 2, figsize=(14, 6))

ax[0].bar(['LR', 'Tree', 'BERT'],
          [false_pos_lr_count, false_pos_tree_count, false_pos_bert_count],
          color=['blue', 'orange', 'green'])
ax[0].set_title("False Positives Comparison")
ax[0].set_ylabel("Count")
ax[0].set_xlabel("Model")

ax[1].bar(['LR', 'Tree', 'BERT'],
          [false_neg_lr_count, false_neg_tree_count, false_neg_bert_count],
          color=['blue', 'orange', 'green'])
ax[1].set_title("False Negatives Comparison")
ax[1].set_ylabel("Count")
ax[1].set_xlabel("Model")

plt.tight_layout()
plt.show()

# pie chart
labels = ['LR FP', 'Tree FP', 'BERT FP', 'LR FN', 'Tree FN', 'BERT FN']
sizes = [false_pos_lr_count, false_pos_tree_count, false_pos_bert_count,
         false_neg_lr_count, false_neg_tree_count, false_neg_bert_count]
colors = ['#1f77b4', '#ff7f0e', '#2ca02c', '#aec7e8', '#ffbb78', '#98df8a']

fig, ax = plt.subplots(figsize=(8, 8))
ax.pie(sizes, labels=labels, autopct='%1.1f%%', startangle=90, colors=colors, wedgeprops={'edgecolor': 'black'})
ax.set_title("Error Type Distribution across Models")
plt.show()