In [None]:
!pip install pyspellchecker
!pip install vaderSentiment



In [None]:
#Import the library
from datasets import load_dataset
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter
import nltk
from spellchecker import SpellChecker
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
import re
from nltk.corpus import stopwords
from transformers import AutoTokenizer

In [None]:
# Pretrained "bert-base-uncased" tokenizer, used later for the OOV-like analysis.
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

# Make sure the NLTK stopword corpus is available, then keep the English
# stopwords as a set for fast membership checks.
nltk.download('stopwords')
stop_words = {word for word in stopwords.words('english')}

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
# Load the train and test splits of the climate sentiment dataset.
train_dataset, test_dataset = (
    load_dataset("climatebert/climate_sentiment", split=split_name)
    for split_name in ("train", "test")
)

In [None]:
# Convert the training split to a pandas DataFrame for the analysis below.
train_df = train_dataset.to_pandas()

In [None]:
# Convert the test split to a pandas DataFrame for the analysis below.
test_df = test_dataset.to_pandas()

# Data exploration

## Dataset and text overview

In [None]:
# Number of samples in each split
train_len, test_len = len(train_df), len(test_df)
print('Number of training samples: ', train_len)
print('Number of testing samples: ', test_len)

Number of training samples:  1000
Number of testing samples:  320


In [None]:
# Label distribution of the training split: absolute counts plus percentages
train_label_stats = (
    train_df['label']
    .value_counts()
    .to_frame('count')
    .assign(percentage=train_df['label'].value_counts(normalize=True) * 100)
)

In [None]:
# Label distribution of the test split: absolute counts plus percentages
test_label_stats = (
    test_df['label']
    .value_counts()
    .to_frame('count')
    .assign(percentage=test_df['label'].value_counts(normalize=True) * 100)
)

In [None]:
# Place the train and test label statistics side by side, indexed by label.
# The outer join keeps a label even if it only occurs in one of the splits.
label_compare = train_label_stats.join(test_label_stats, how="outer", lsuffix="_train", rsuffix="_test")

label_compare

Unnamed: 0_level_0,count_train,percentage_train,count_test,percentage_test
label,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,342,34.2,106,33.125
1,408,40.8,163,50.9375
2,250,25.0,51,15.9375


There is a distribution mismatch between the training and testing sets:

* The test set has a much larger share of Label 1 samples than the training set (50.9% vs. 40.8%).

* The test set has significantly fewer Label 2 samples.

* Label 0 remains balanced across both splits.

This label imbalance could affect evaluation for labels 1 and 2:

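* Aggregate metrics such as accuracy weight Label 1 more heavily in the test set, so a model that does well on Label 1 may appear better overall.

* Metrics for Label 2 rest on only 51 test examples, so they are noisier and the model may appear weaker on Label 2 than it really is.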

In [None]:
# Count missing values per column in each split
train_null, test_null = (frame.isna().sum() for frame in (train_df, test_df))

In [None]:
# Report the per-column null counts of each split. The second line previously
# carried the "train set" label even though it prints the test-set counts.
print('Null data of train set: ', train_null)
print('Null data of test set: ', test_null)

Null data of train set:  text     0
label    0
dtype: int64
Null data of train set:  text     0
label    0
dtype: int64


In [None]:
# Text length per sample: number of characters and number of
# whitespace-separated words, stored as new columns on each split.
train_df['char_len'] = train_df['text'].str.len()
train_df['word_len'] = train_df['text'].str.split().map(len)

test_df['char_len'] = test_df['text'].str.len()
test_df['word_len'] = test_df['text'].str.split().map(len)

In [None]:
# One-row summary table with the mean character and word length of each split.
length_compare = pd.DataFrame([
    {
        'train_char_mean': train_df['char_len'].mean(),
        'test_char_mean': test_df['char_len'].mean(),
        'train_word_mean': train_df['word_len'].mean(),
        'test_word_mean': test_df['word_len'].mean(),
    }
])

length_compare

Unnamed: 0,train_char_mean,test_char_mean,train_word_mean,test_word_mean
0,479.103,530.296875,72.224,79.2125


The test set has slightly longer texts than the training set. The average character length increases from 479 in train to 530 in test, and the average word count increases from 72 to 79. This suggests that the test set is more verbose, which represents a mild distribution shift that could impact evaluation if the model performs differently on longer inputs.

In [None]:
# Thresholds for "very short" / "very long" texts: the 10th and 90th
# percentiles of the word count in the training split.
short_thresh, long_thresh = (
    train_df['word_len'].quantile(q) for q in (0.10, 0.90)
)

print("Short threshold:", short_thresh)
print("Long threshold:", long_thresh)

Short threshold: 37.900000000000006
Long threshold: 113.0


In [None]:
def proportion_short_long(df, short_thresh, long_thresh):
    """Return the percentage of rows that are very short and very long.

    Args:
        df: DataFrame with a numeric ``word_len`` column.
        short_thresh: rows with ``word_len <= short_thresh`` count as short.
        long_thresh: rows with ``word_len >= long_thresh`` count as long.

    Returns:
        Tuple ``(short_pct, long_pct)``, both expressed in percent.
    """
    word_counts = df['word_len']
    is_short = word_counts.le(short_thresh)
    is_long = word_counts.ge(long_thresh)
    return is_short.mean() * 100, is_long.mean() * 100

# Apply the same (training-derived) thresholds to both splits.
(train_short, train_long), (test_short, test_long) = (
    proportion_short_long(frame, short_thresh, long_thresh)
    for frame in (train_df, test_df)
)

print("Train very short:", round(train_short, 2), "%")
print("Train very long:", round(train_long, 2), "%")
print("Test very short:", round(test_short, 2), "%")
print("Test very long:", round(test_long, 2), "%")

Train very short: 10.0 %
Train very long: 10.3 %
Test very short: 9.69 %
Test very long: 14.06 %


The test set contains a notably higher proportion of very long samples (14.06%) than the training set (10.30%). This implies that the test data includes more unusually long and potentially more complex texts. Such a distribution shift in the upper tail may influence model evaluation — longer texts often contain more complex structures and may require more contextual capacity.

In [None]:
# Type–Token Ratio to check the vocabulary diversity
def compute_ttr(df):
    """Return the type-token ratio (unique tokens / total tokens) of ``df['text']``.

    All texts are joined and split on whitespace, so tokens are compared
    case-sensitively and keep any attached punctuation.

    NOTE(review): raw TTR generally decreases as the number of tokens grows,
    so values computed on corpora of different sizes are not directly
    comparable.

    Args:
        df: DataFrame with a string ``text`` column.

    Returns:
        The ratio as a float in [0, 1]; 0.0 when there are no tokens at all
        (instead of raising ZeroDivisionError).
    """
    tokens = " ".join(df['text']).split()
    if not tokens:
        # Empty frame or only blank texts: no ratio can be computed.
        return 0.0
    return len(set(tokens)) / len(tokens)

# Type-token ratio of each split
train_ttr, test_ttr = (compute_ttr(frame) for frame in (train_df, test_df))

print("Train TTR:", train_ttr)
print("Test TTR:", test_ttr)

Train TTR: 0.15453311918475854
Test TTR: 0.23062963547419915


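This difference suggests that the test set contains richer and less repetitive language compared to the training data. In contrast, the training set appears to have more repeated vocabulary or more formulaic expressions. However, TTR naturally decreases as a corpus grows, and the training set has roughly three times as many tokens as the test set (72,224 vs. 25,348), so part of this gap is likely a corpus-size effect rather than a genuine difference in vocabulary diversity.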

## Vocabulary Insight

In [None]:
def analyze_words(df):
    """Compute word-level vocabulary statistics for ``df['text']``.

    Texts are split on whitespace. Tokens are counted case-sensitively and keep
    attached punctuation; only the stopword check lower-cases the token before
    looking it up in the module-level ``stop_words`` set.

    Args:
        df: DataFrame with a string ``text`` column.

    Returns:
        dict with the keys:
            "most_common": the 20 most frequent ``(token, count)`` pairs,
            "num_rare_words": number of tokens that occur exactly once,
            "stopword_percentage": share of token occurrences that are
                stopwords, in percent (0.0 when there are no tokens),
            "vocab_size": number of distinct tokens,
            "total_tokens": number of token occurrences,
            "rare_words": list of the tokens that occur exactly once.
    """
    # Flatten the per-text token lists into one list of token occurrences.
    all_tokens = [tok for row in df['text'].str.split() for tok in row]
    total_tokens = len(all_tokens)

    freq = Counter(all_tokens)

    # Tokens that appear only once in the whole split.
    rare = [word for word, count in freq.items() if count == 1]

    # Count stopword occurrences from the frequency table so each distinct
    # token is lower-cased and looked up only once.
    stopword_count = sum(
        count for word, count in freq.items() if word.lower() in stop_words
    )
    # Guard against an empty split, which would otherwise divide by zero.
    stopword_pct = stopword_count / total_tokens * 100 if total_tokens else 0.0

    return {
        "most_common": freq.most_common(20),
        "num_rare_words": len(rare),
        "stopword_percentage": stopword_pct,
        "vocab_size": len(freq),
        "total_tokens": total_tokens,
        "rare_words": rare,   # kept so the rare tokens can be inspected
    }

In [None]:
# Vocabulary statistics for each split
train_stats, test_stats = (analyze_words(frame) for frame in (train_df, test_df))

In [None]:
# Print the same vocabulary report for the train split and then the test split.
for _header, _stats in (("=== TRAIN SET ===", train_stats), ("=== TEST SET ===", test_stats)):
    print(_header)
    print("Most common words:", _stats["most_common"])
    print("Number of rare words:", _stats["num_rare_words"])
    print("Stopword proportion:", _stats["stopword_percentage"], "%")
    print("Vocabulary size:", _stats["vocab_size"])
    print("Total tokens:", _stats["total_tokens"])

=== TRAIN SET ===
Most common words: [('the', 3214), ('and', 2957), ('to', 2366), ('of', 2250), ('in', 1537), ('a', 1069), ('our', 829), ('for', 679), ('on', 633), ('is', 572), ('with', 568), ('by', 516), ('as', 481), ('climate', 474), ('The', 419), ('are', 417), ('we', 414), ('that', 406), ('from', 341), ('energy', 297)]
Number of rare words: 6234
Stopword proportion: 36.82570890562694 %
Vocabulary size: 11161
Total tokens: 72224
=== TEST SET ===
Most common words: [('the', 1159), ('and', 1156), ('to', 825), ('of', 819), ('in', 485), ('our', 390), ('a', 308), ('for', 235), ('as', 218), ('on', 214), ('are', 200), ('is', 187), ('we', 184), ('with', 169), ('that', 159), ('or', 151), ('by', 148), ('The', 145), ('climate', 133), ('from', 127)]
Number of rare words: 3378
Stopword proportion: 37.789963705223286 %
Vocabulary size: 5846
Total tokens: 25348


In [None]:
def compute_oov_like(df, tokenizer):
    """Return the percentage of word occurrences the tokenizer splits into pieces.

    A whitespace-separated word counts as "OOV-like" when
    ``tokenizer.tokenize(word)`` returns more than one sub-token.

    NOTE(review): words keep their attached punctuation (e.g. "climate,"), so
    a tokenizer that separates punctuation would flag such words as well;
    confirm whether that is intended.

    Args:
        df: DataFrame with a string ``text`` column.
        tokenizer: object exposing ``tokenize(str) -> list``.

    Returns:
        OOV-like rate in percent; 0.0 when there are no words at all (instead
        of raising ZeroDivisionError).
    """
    # Count occurrences per distinct word so each word is tokenized only once.
    word_counts = Counter(tok for row in df['text'].str.split() for tok in row)
    total = sum(word_counts.values())
    if total == 0:
        return 0.0

    oov_like = sum(
        count
        for word, count in word_counts.items()
        if len(tokenizer.tokenize(word)) > 1   # broken into multiple pieces
    )
    return oov_like / total * 100

# OOV-like rate of each split under the same tokenizer
train_oov, test_oov = (
    compute_oov_like(frame, tokenizer) for frame in (train_df, test_df)
)

print("Train OOV-like rate:", train_oov, "%")
print("Test OOV-like rate:", test_oov, "%")

Train OOV-like rate: 16.656513070447495 %
Test OOV-like rate: 16.55751933091368 %


The train and test sets have very similar lexical profiles. The OOV-like rates are almost identical (~16.6%), stopword proportions differ by only about 1 percentage point (36.8% vs. 37.8%), and both splits follow the same long-tailed vocabulary distribution with many rare words. This indicates no major vocabulary or stylistic shift, meaning models trained on the train set should generalize fairly well to the test set.

## Text Quality Analysis

In [None]:
#Check spelling error
# Shared spell-checker instance, created once and used by spelling_error_rate below.
spell = SpellChecker()

def spelling_error_rate(df):
    """Return the mean per-text rate of words unknown to the spell checker, in percent.

    For every non-empty text the rate is ``len(spell.unknown(words)) / len(words)``
    where ``words`` are the whitespace-separated tokens; texts without any
    words are skipped. The per-text rates are then averaged.

    NOTE(review): ``spell.unknown`` appears to return the set of distinct
    unknown words, so repeated unknown words count once while the denominator
    counts every token. Tokens also keep attached punctuation, and domain
    terms or names may be flagged. Treat the result as a rough upper-bound
    indicator rather than a true misspelling rate; confirm against the
    pyspellchecker documentation.

    Args:
        df: DataFrame with a string ``text`` column.

    Returns:
        Mean rate in percent; 0.0 when no text contains any word (instead of
        NaN plus a NumPy RuntimeWarning from averaging an empty list).
    """
    error_rates = []
    for text in df['text']:
        words = text.split()
        if not words:
            continue
        error_rates.append(len(spell.unknown(words)) / len(words))

    if not error_rates:
        return 0.0
    return np.mean(error_rates) * 100

In [None]:
# Mean spelling-error rate of each split
train_spelling, test_spelling = (
    spelling_error_rate(frame) for frame in (train_df, test_df)
)

print("Train spelling error rate:", train_spelling, "%")
print("Test spelling error rate:", test_spelling, "%")

Train spelling error rate: 13.748991763428062 %
Test spelling error rate: 14.01883642913296 %


In [None]:
# A "special" character is anything other than ASCII letters, digits,
# whitespace and the punctuation marks . , ! ? '
special_pattern = re.compile(r"[^a-zA-Z0-9\s.,!?']")

def special_char_proportion(df):
    """Return the percentage of texts in ``df['text']`` with at least one special character."""
    has_special = df['text'].map(
        lambda text: special_pattern.search(text) is not None
    )
    return has_special.mean() * 100

In [None]:
# Share of texts containing special characters, per split
train_special, test_special = (
    special_char_proportion(frame) for frame in (train_df, test_df)
)

print("Train special-char proportion:", train_special, "%")
print("Test special-char proportion:", test_special, "%")

Train special-char proportion: 82.89999999999999 %
Test special-char proportion: 83.125 %


Both the training and testing sets contain a high proportion of texts with special characters (~83%), indicating that such characters are a natural part of the dataset rather than noise. The almost identical proportions between the two splits show that the character-level structure of the writing is consistent across the dataset.

In [None]:
#Check repetition
def char_repetition(text):
    """Return True if any single character occurs 3 or more times in a row.

    The pattern matches every character except a newline, so runs of digits
    or punctuation (e.g. "000" or "...") count as repetition too.
    """
    match = re.search(r"(.)\1{2,}", text)
    return match is not None

In [None]:
def word_repetition(text):
    """Return True if the same word appears three times in a row (case-insensitive).

    Words are the whitespace-separated tokens of the lower-cased text.
    """
    tokens = text.lower().split()
    return any(
        first == second == third
        for first, second, third in zip(tokens, tokens[1:], tokens[2:])
    )

In [None]:
def repetition_proportion(df):
    """Return ``(char_rep, word_rep)``: the percentage of texts in ``df['text']``
    flagged by ``char_repetition`` and by ``word_repetition`` respectively.
    """
    texts = df['text']
    return tuple(
        texts.map(check).mean() * 100
        for check in (char_repetition, word_repetition)
    )

In [None]:
# Character- and word-level repetition rates of each split
(train_char_rep, train_word_rep), (test_char_rep, test_word_rep) = (
    repetition_proportion(frame) for frame in (train_df, test_df)
)

print("Train char repetition:", train_char_rep, "%")
print("Train word repetition:", train_word_rep, "%")
print("Test char repetition:", test_char_rep, "%")
print("Test word repetition:", test_word_rep, "%")

Train char repetition: 6.7 %
Train word repetition: 0.1 %
Test char repetition: 4.0625 %
Test word repetition: 0.0 %


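Both training and testing sets show low levels of repeated characters and almost no repeated words. This indicates that the dataset does not contain informal or exaggerated writing patterns, and the stylistic consistency between train and test is high. Note that the character-repetition check matches any character repeated three or more times, including digits and punctuation (e.g. "000" or "..."), so part of the flagged texts may simply contain numbers rather than exaggerated spelling. The slight difference in character repetition (6.7% vs. 4.06%) is minimal and does not suggest any meaningful shift in writing style across the splits.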

# Conclusion

The dataset consists of 1000 training samples and 320 testing samples, with two notable distribution shifts between splits. First, there is a clear label distribution mismatch: the test set contains disproportionately more Label 1 samples and significantly fewer Label 2 samples than the training set, which may bias evaluation by inflating performance on Label 1 and suppressing it for Label 2. Second, the test set includes longer and potentially more complex texts, with a higher average length and a larger proportion of very long samples, introducing a mild difficulty shift that could affect models sensitive to input length. Aside from these differences, the two splits are highly consistent: both exhibit similar OOV-like rates (~16.6%), stopword proportions (~37%), long-tailed rare-word distributions, and an unusually high but stable share of texts containing special characters (~83%), indicating that the writing style and lexical structure are uniform; vocabulary sizes differ (11,161 vs. 5,846 unique tokens), as expected given the different split sizes. Repetition levels are also low across both sets, suggesting clean and formal text with little noise.


