# Detailed analysis of results

Here, the datasets are investigated in detail to potentially highlight why the performance of the models differs from topic to topic.

In [None]:
# Mount Google Drive at /content/drive (Colab only); the later cells read their
# CSV inputs from, and write their outputs to, paths under "/content/drive/My Drive/".
from google.colab import drive
drive.mount('/content/drive')

## Text Length

Investigate correctness for different text lengths.

In [None]:
import pandas as pd

In [None]:
# Read the predictions CSV from Drive and preview its first five rows.
all_pred_path = "/content/drive/My Drive/BA THESIS/all_predictions.csv"
all_pred_df = pd.read_csv(filepath_or_buffer=all_pred_path)
all_pred_df.head(n=5)

Add a column for the word count. Also add the `NB_right` and `BERT_right` columns; these are `True` for correct classifications and `False` for incorrect ones.

In [None]:
# Split every text on whitespace and store the number of resulting tokens per row.
text_tokens = all_pred_df['Text'].str.split()
all_pred_df['word_count'] = text_tokens.str.len()

In [None]:
# Flag, per model, whether the predicted label equals the actual label.
actual_labels = all_pred_df['Actual']
all_pred_df['NB_right'] = actual_labels.eq(all_pred_df['NB_pred'])
all_pred_df['BERT_right'] = actual_labels.eq(all_pred_df['BERT_pred'])

In [None]:
# Show the first ten rows, now including the word count and correctness flags.
all_pred_df.head(n=10)

Now the distribution of word counts is compared for incorrect and correct classifications.

In [None]:
def classification_by_length(predictions=None):
  """Summarise word counts of correctly vs. incorrectly classified texts.

  For each outcome (correct first, then incorrect) and each model flag column
  ("NB_right", "BERT_right"), the rows whose flag equals that outcome are
  selected and their 'word_count' values are summarised.

  Args:
    predictions: DataFrame with the columns 'word_count', 'NB_right' and
      'BERT_right'. Defaults to the notebook-level `all_pred_df`, so existing
      calls without arguments keep working.

  Returns:
    DataFrame with one row per (outcome, flag column) pair and the columns
    'classification' (the `(outcome, flag column)` tuple), 'average_words',
    'median_words', 'shortest' and 'longest'. The input frame is not modified.
  """
  if predictions is None:
    # Fall back to the global frame prepared in the earlier cells.
    predictions = all_pred_df

  summary_columns = ['classification', 'average_words', 'median_words', 'shortest', 'longest']
  records = []
  for outcome in (True, False):
    for flag_column in ("NB_right", "BERT_right"):
      lengths = predictions.loc[predictions[flag_column] == outcome, 'word_count']
      records.append({
          'classification': (outcome, flag_column),
          'average_words': lengths.mean(),
          'median_words': lengths.median(),
          'shortest': lengths.min(),
          'longest': lengths.max(),
      })

  # Build the frame once instead of enlarging an empty object-dtype frame row
  # by row with .loc, which is slow and leaves dtype inference to chance.
  return pd.DataFrame(records, columns=summary_columns)


In [None]:
# Compute the length summary, save it to the analysis folder on Drive, and display it.
performance_by_length = classification_by_length()
performance_by_length.to_csv(
    path_or_buf="/content/drive/My Drive/BA THESIS/analysis/performance_by_length.csv"
)
performance_by_length

There seems to be no significant correlation between misclassification and text length. The median word counts of misclassified and correctly classified inputs are almost the same.

In [None]:
# Drive locations of the train and test CSV files (the *_BERT versions)
file_path_train = "/content/drive/My Drive/BA THESIS/data/train_BERT.csv"
file_path_test = "/content/drive/My Drive/BA THESIS/data/test_BERT.csv"

# Read both splits, train first, into separate DataFrames
train_df, test_df = (
    pd.read_csv(csv_path) for csv_path in (file_path_train, file_path_test)
)

In [None]:
# Stack the train and test rows into a single frame with a fresh 0..n-1 index.
all_df = pd.concat(objs=[train_df, test_df], ignore_index=True)

In [None]:
def input_lengths(data):
  """Summarise the number of whitespace-separated words per text for each topic.

  Side effect: a 'word_count' column is added to (or overwritten in) `data`
  in place. This is kept on purpose, because the boxplot cell further down
  reads `word_count` from the same frame that is passed in here.

  Args:
    data: DataFrame with a 'text' column (strings) and a 'topic' column.

  Returns:
    DataFrame with one row per distinct topic, in order of first appearance,
    and the columns 'topic', 'average_words', 'median_words', 'shortest' and
    'longest'. An input without rows yields an empty frame with these columns.
  """
  data['word_count'] = data['text'].str.split().str.len()

  summary_columns = ['topic', 'average_words', 'median_words', 'shortest', 'longest']
  records = []
  for topic in data['topic'].unique():
    lengths = data.loc[data['topic'] == topic, 'word_count']
    records.append({
        'topic': topic,
        'average_words': lengths.mean(),
        'median_words': lengths.median(),
        'shortest': lengths.min(),
        'longest': lengths.max(),
    })

  # Build the frame once instead of enlarging an empty object-dtype frame row
  # by row with .loc, which copies the frame on every iteration.
  return pd.DataFrame(records, columns=summary_columns)


In [None]:
# Summarise word counts per topic over train+test, save the table to Drive, and display it.
word_counts = input_lengths(data=all_df)
word_counts.to_csv(
    path_or_buf="/content/drive/My Drive/BA THESIS/analysis/word_counts_train_and_test.csv"
)
word_counts

In [None]:
# Boxplot of word count

import seaborn as sns
import pandas as pd
import matplotlib.pyplot as plt

# One box per topic showing the spread of per-text word counts.
# `word_count` is the column that `input_lengths` added to `all_df`.
boxplot_ax = sns.boxplot(data=all_df, x='topic', y='word_count')
boxplot_ax.set_title('Word Count by Topic')
plt.xticks(rotation=45)  # tilt the topic labels
plt.tight_layout()
plt.show()

## Actual Class

Here it is investigated which class is most often misclassified.

In [None]:
# Keep only the rows where the Naive Bayes prediction differs from the actual label.
nb_is_wrong = all_pred_df['Actual'].ne(all_pred_df['NB_pred'])
misclassified_NB = all_pred_df.loc[nb_is_wrong]

In [None]:

import matplotlib.pyplot as plt

# Share of each actual class among the texts Naive Bayes misclassified.
nb_error_counts = misclassified_NB['Actual'].value_counts()
colors = ['#ADD8E6', '#6495ED', '#1E90FF']  # light blue, medium blue, dark blue

plt.figure(figsize=(8, 8))
plt.pie(
    nb_error_counts,
    labels=nb_error_counts.index,
    colors=colors,
    autopct='%1.1f%%',
    startangle=90,
)
plt.axis('equal')
plt.title('Distribution of Actual Classes in Misclassified Samples (Naive Bayes)')
# Save before show(), so the figure is written while it is still the current one.
plt.savefig("/content/drive/My Drive/BA THESIS/analysis/misclassified_NB_pie.png")
plt.show()


It is clear that Naive Bayes has a particularly hard time correctly classifying neutral texts.

In [None]:
# Keep only the rows where the BERT prediction differs from the actual label.
bert_is_wrong = all_pred_df['Actual'].ne(all_pred_df['BERT_pred'])
misclassified_BERT = all_pred_df.loc[bert_is_wrong]

In [None]:
# Share of each actual class among the texts BERT misclassified.
bert_error_counts = misclassified_BERT['Actual'].value_counts()
colors = ['lightgreen', 'green', 'darkgreen']  # light green, medium green, dark green

plt.figure(figsize=(8, 8))
plt.pie(
    bert_error_counts,
    labels=bert_error_counts.index,
    colors=colors,
    autopct='%1.1f%%',
    startangle=90,
)
plt.axis('equal')
plt.title('Distribution of Actual Classes in Misclassified Samples (BERT)')
# Save before show(), so the figure is written while it is still the current one.
plt.savefig("/content/drive/My Drive/BA THESIS/analysis/misclassified_BERT_pie.png")
plt.show()


Unlike Naive Bayes, BERT shows a relatively uniform distribution of misclassified texts across all three sentiments.

## Vocabulary

Here it is investigated how much repetition there is in words for each topic.

In [None]:
# Drive locations of the train and test CSV files (the *_NB_lem versions).
# Note: this rebinds file_path_train/file_path_test and train_df/test_df from the earlier cells.
file_path_train = "/content/drive/My Drive/BA THESIS/data/train_NB_lem.csv"
file_path_test = "/content/drive/My Drive/BA THESIS/data/test_NB_lem.csv"

# Read both splits, train first, into separate DataFrames
train_df, test_df = (
    pd.read_csv(csv_path) for csv_path in (file_path_train, file_path_test)
)

In [None]:
# Stack the train and test rows into a single frame with a fresh 0..n-1 index
# (replaces the `all_df` built from the *_BERT files above).
all_df = pd.concat(objs=[train_df, test_df], ignore_index=True)

In [None]:

def analyze_vocabulary(df):
    """Count distinct whitespace-separated words per topic.

    Args:
        df: DataFrame with a 'topic' column and a 'text' column. Non-string
            text entries (e.g. NaN) add no words but still count as
            observations.

    Returns:
        DataFrame with one row per distinct topic, in order of first
        appearance, and the columns 'topic', 'unique_words', 'observations'
        and 'unique_words_per_observation' (0 when a topic has no rows).
    """
    column_names = ['topic', 'unique_words', 'observations', 'unique_words_per_observation']
    rows = []
    for topic_name in df['topic'].unique():
        topic_texts = df.loc[df['topic'] == topic_name, 'text']
        # Skip non-string entries such as NaN when collecting the vocabulary
        vocabulary = {
            token
            for entry in topic_texts
            if isinstance(entry, str)
            for token in entry.split()
        }
        n_observations = len(topic_texts)
        if n_observations > 0:
            words_per_observation = len(vocabulary) / n_observations
        else:
            words_per_observation = 0
        rows.append([topic_name, len(vocabulary), n_observations, words_per_observation])

    return pd.DataFrame(rows, columns=column_names)

# Run the vocabulary analysis on the combined train+test frame and display the table.
vocabulary_analysis_result = analyze_vocabulary(df=all_df)
vocabulary_analysis_result


In [None]:
# Save the per-topic vocabulary table to the analysis folder on Drive.
vocabulary_analysis_result.to_csv(
    path_or_buf="/content/drive/My Drive/BA THESIS/analysis/vocabulary_rep_analysis_result.csv"
)