In [1]:
import json
import pandas as pd
import re
import nltk
from nltk import ngrams
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import spacy
from collections import Counter
import torch
from transformers import pipeline
from itertools import combinations

# Seed PyTorch's random number generator.
torch.manual_seed(42)

# `device` is 0 when CUDA is available and -1 otherwise; it is passed as the
# `device` argument of the transformers pipelines created later on.
device = 0 if torch.cuda.is_available() else -1
print(f"The code is running using '{device}'")

# Fetch the NLTK resources, one download call per package, in a fixed order.
for nltk_resource in (
    'stopwords',
    'wordnet',
    'punkt',
    'averaged_perceptron_tagger',
    'punkt_tab',
    'averaged_perceptron_tagger_eng',
):
    nltk.download(nltk_resource)

# spaCy English model; used by split_sentences for sentence segmentation.
nlp = spacy.load('en_core_web_sm')

stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

# Read the notebook configuration; "workspace" is the path prefix that is
# concatenated directly with each CSV file name below.
with open('config-biometric.json', 'r') as f:
    config = json.load(f)

workspace = config["workspace"]

# Column names; this list is not referenced by the cells visible here.
col = [
    "id", "label", "statement", "date", "subject", "speaker",
    "speaker_description", "state_info",
    "true_counts", "mostly_true_counts", "half_true_counts",
    "mostly_false_counts", "false_counts", "pants_on_fire_counts",
    "context", "justification",
]

# Load the three dataset splits.
train_data = pd.read_csv(f"{workspace}train.csv")
test_data = pd.read_csv(f"{workspace}test.csv")
val_data = pd.read_csv(f"{workspace}valid.csv")


  from .autonotebook import tqdm as notebook_tqdm


The code is running using '0'


[nltk_data] Downloading package stopwords to /home/btlab/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /home/btlab/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /home/btlab/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/btlab/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package punkt_tab to /home/btlab/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /home/btlab/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger_eng.zip.


In [3]:
# Show which columns the training DataFrame currently has.
train_data.columns

Index(['id', 'label', 'statement', 'date', 'subject', 'speaker',
       'speaker_description', 'state_info', 'true_counts',
       'mostly_true_counts', 'half_true_counts', 'mostly_false_counts',
       'false_counts', 'pants_on_fire_counts', 'context', 'justification',
       'statement-clean', 'frequent_trigrams', 'ttr', 'exclamation_count',
       'adjective_count', 'sentiment_label', 'sentiment_score',
       'subjectivity_score', 'contradiction_score'],
      dtype='object')

In [None]:
# Text-classification pipeline backed by the HCKLab/BiBert-Subjectivity model,
# placed on `device`.
subjectivity_classifier = pipeline(
    task="text-classification",
    model="HCKLab/BiBert-Subjectivity",
    device=device,
)
def get_subjectivity(text):
    """
    Run ``subjectivity_classifier`` on ``text`` and return the predicted label.

    Returns the ``'label'`` entry of the first prediction, or ``None`` when the
    classifier returns an empty/falsy result.
    """
    predictions = subjectivity_classifier(text)
    if not predictions:
        return None
    return predictions[0]['label']


Device set to use cuda:0


In [23]:
# Try the classifier helper on a single example sentence.
get_subjectivity("the population of Canada was estimated to be 38 million in 2024")
# Disabled experiment: label every training statement, then recode the labels to 0/1.
# NOTE(review): the recode below maps 'objective'/'subjective', while the scoring code
# in this notebook compares the model label against 'LABEL_1' - confirm the label
# names before re-enabling these lines.
# train_data['subjectivity'] = train_data['statement'].apply(get_subjectivity)
# train_data['subjectivity'].value_counts()
# train_data['subjectivity'] = train_data['subjectivity'].replace({'objective': 0, 'subjective': 1})
# train_data['subjectivity'].value_counts()

[{'label': 'LABEL_0', 'score': 0.9666300415992737}]

In [26]:
# # Subjectivity Analysis

# The default sentiment-analysis pipeline is disabled in this cell:
# sentiment_classifier = pipeline("sentiment-analysis", device=device)
# Load the subjectivity classifier on `device`. The identical pipeline(...) call also
# appears in the earlier cell that defines get_subjectivity, so this rebinds the same name.
subjectivity_classifier = pipeline("text-classification", model="HCKLab/BiBert-Subjectivity", device=device)
def split_sentences(text):
    """
    Segment ``text`` into sentences using the spaCy pipeline ``nlp``.

    The text is lower-cased before it is parsed, so the returned list holds
    the lower-cased text of each sentence, in document order.
    """
    sentence_texts = []
    for span in nlp(text.lower()).sents:
        sentence_texts.append(span.text)
    return sentence_texts

# Sentence-level subjectivity score computed with the transformer classifier
def subjectivity_calculation(text, subjective_label='LABEL_1'):
    """
    Return the fraction of sentences in ``text`` classified as subjective.

    The text is split with ``split_sentences`` and every sentence is passed to
    ``subjectivity_classifier``. A sentence counts as subjective when the label
    of its first prediction equals ``subjective_label``.

    Parameters:
        text: Statement to score. Non-string values (for example ``None`` or a
            NaN coming from a pandas column) and blank strings are scored as
            0.0 without calling the model.
        subjective_label: Classifier label that marks a sentence as
            subjective. Defaults to ``'LABEL_1'``.

    Returns:
        float: Number of subjective sentences divided by the total number of
        sentences, or 0.0 when there is nothing to score.
    """
    # Missing or blank statements would otherwise fail on ``.lower()`` inside
    # split_sentences or send an empty string to the model.
    if not isinstance(text, str) or not text.strip():
        return 0.0

    sents = split_sentences(text)
    if not sents:
        return 0.0

    subj_count = 0
    for sent in sents:
        result = subjectivity_classifier(sent)
        if result and result[0]['label'] == subjective_label:
            subj_count += 1

    return subj_count / len(sents)

# Build the per-statement result record (subjectivity only)
def subjectivity_analysis(text):
    """
    Score ``text`` with ``subjectivity_calculation`` and wrap the value in a dict.

    Returns a one-entry dict whose key is ``"subjectivity_score"``.
    """
    score = subjectivity_calculation(text)
    return dict(subjectivity_score=score)

# Add the subjectivity feature column to a DataFrame
def analyze_dataframe(df, text_column):
    """
    Add a ``subjectivity_score`` column computed from ``df[text_column]``.

    Each value of ``text_column`` is passed to ``subjectivity_analysis`` and the
    ``"subjectivity_score"`` entry of the returned dict is stored in the new
    column. An existing ``subjectivity_score`` column is overwritten.

    Parameters:
        df: DataFrame to annotate. It is modified in place.
        text_column: Name of the column holding the text to score.

    Returns:
        The same ``df`` object, with the ``subjectivity_score`` column set.
    """
    # Pull the scalar out of each result dict directly. Expanding every dict
    # through ``.apply(pd.Series)`` builds one Series per row and yields no
    # columns at all for an empty frame, which makes the assignment fail.
    scores = df[text_column].apply(
        lambda text: subjectivity_analysis(text)["subjectivity_score"]
    )
    # Cast so the column is float even when the frame is empty.
    df['subjectivity_score'] = scores.astype(float)
    return df

# Score the 'statement' column of every split (train, then test, then validation).
train_data, test_data, val_data = (
    analyze_dataframe(split, 'statement')
    for split in (train_data, test_data, val_data)
)

# Persist the annotated splits. NOTE: these are the same paths the splits were
# read from at the top of the notebook, so the input CSV files are overwritten.
train_data.to_csv(f"{workspace}train.csv", index=False)
test_data.to_csv(f"{workspace}test.csv", index=False)
val_data.to_csv(f"{workspace}valid.csv", index=False)

print("Checkpoint: Subjectivity score added")

No model was supplied, defaulted to distilbert/distilbert-base-uncased-finetuned-sst-2-english and revision 714eb0f (https://huggingface.co/distilbert/distilbert-base-uncased-finetuned-sst-2-english).
Using a pipeline without specifying a model name and revision in production is not recommended.
Device set to use cuda:0
Device set to use cuda:0


Checkpoint: Subjectivity score added


In [30]:
# Count how often each subjectivity score occurs in the training DataFrame.
train_data['subjectivity_score'].value_counts()

subjectivity_score
0.000000    14624
1.000000     2956
0.500000      546
0.333333      134
0.666667       56
0.250000       22
0.750000        7
0.400000        6
0.200000        6
0.600000        4
0.166667        2
0.142857        1
0.800000        1
0.285714        1
0.300000        1
0.375000        1
0.428571        1
Name: count, dtype: int64