# simple model
This is a simple NN model. It takes in a long paragraph, break into separate sentences, predict probability of mistakes for each sentence, and combine the probability with OR-like function to get the final probability of whether the paragraph contains misconception/vague knowledge. Then, the result is compared to the paragraph-level label and do backward propagation. Use BERT-base-uncased as base model

### install packages

In [7]:
!pip install pandas
!pip install transformers
!pip install python-docx

Collecting python-docx
  Using cached python_docx-1.1.2-py3-none-any.whl.metadata (2.0 kB)
Collecting lxml>=3.1.0 (from python-docx)
  Using cached lxml-5.3.0-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (3.8 kB)
Collecting typing-extensions>=4.9.0 (from python-docx)
  Using cached typing_extensions-4.12.2-py3-none-any.whl.metadata (3.0 kB)
Using cached python_docx-1.1.2-py3-none-any.whl (244 kB)
Using cached lxml-5.3.0-cp311-cp311-manylinux_2_28_x86_64.whl (5.0 MB)
Using cached typing_extensions-4.12.2-py3-none-any.whl (37 kB)
Installing collected packages: typing-extensions, lxml, python-docx
  Attempting uninstall: typing-extensions
    Found existing installation: typing_extensions 4.8.0
    Uninstalling typing_extensions-4.8.0:
      Successfully uninstalled typing_extensions-4.8.0
Successfully installed lxml-5.3.0 python-docx-1.1.2 typing-extensions-4.12.2


### import packages

In [1]:
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup
import docx
import json
import os.path


import torch
from transformers import AutoTokenizer, AutoModel, AutoModelForSequenceClassification
from torch.utils.data import DataLoader, Dataset
import torch.optim as optim
import torch.nn.functional as F
import torch.nn as nn

In [2]:
torch.cuda.is_available()

True

In [3]:
### helper functions
# verify dir name, make sure it ends with one '/'
def dir_valid(input_dir):
  if input_dir[-1]!='/':
    input_dir = input_dir + '/'
  return input_dir

# generate file name from year and month
def get_file_unit(yr, mt, header='RS_'):
  if isinstance(mt, int):
    file_unit = header + str(yr) + '-' + str(mt).zfill(2)
  else:
    file_unit = header + str(yr) + '-' + mt
  return file_unit



In [6]:
# input
# read from file RS_2023-04_records.json
file_unit = get_file_unit('2023', 4)
file_json = file_unit + '_records.json'

df = pd.read_json(file_json, orient='records', lines=True)

In [25]:
df_sub = df.iloc[:160,]

In [24]:
df.shape

(197, 13)

In [26]:
# Example: Paragraphs and labels
paragraphs = df_sub['selftext'].values
labels = []

for ii in range(df_sub.shape[0]):
    misconception_label = df_sub.at[ii,'misconception']
    unclear_knowledge_label = df_sub.at[ii, 'unclear knowledge']
    if (misconception_label != 'n/a') or (unclear_knowledge_label != 'n/a'):
        label = 1
    else:
        label = 0
    labels.append(label)


In [27]:
# Initialize tokenizer and model
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
base_model = AutoModel.from_pretrained("bert-base-uncased")

# Custom model with sentence and paragraph-level prediction
class SentenceToParagraphModel(nn.Module):
    def __init__(self, base_model, hidden_dim=768):
        super(SentenceToParagraphModel, self).__init__()
        self.base_model = base_model
        self.sentence_fc = nn.Linear(hidden_dim, 1)  # Sentence-level prediction (binary)
        self.paragraph_fc = nn.Linear(hidden_dim, 1)  # Paragraph-level prediction (binary)

    def forward(self, input_ids, attention_mask):
        # BERT model output for each token in each sentence
        outputs = self.base_model(input_ids=input_ids, attention_mask=attention_mask)
        sentence_embedding = outputs.last_hidden_state[:, 0, :]  # CLS token as sentence embedding

        # Sentence-level predictions
        sentence_logit = self.sentence_fc(sentence_embedding).squeeze(-1)
        return sentence_logit

In [28]:
# Initialize the custom model
model = SentenceToParagraphModel(base_model)
criterion = nn.BCEWithLogitsLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=2e-5)

In [29]:
model.to('cuda')

SentenceToParagraphModel(
  (base_model): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1

In [22]:
print(f"Model is on device: {next(model.parameters()).device}")

Model is on device: cuda:0


In [30]:
# Example: Training loop for paragraph-level supervision
model.train()
for epoch in range(10):  # Example for 3 epochs
    total_loss = 0
    for paragraph, label in zip(paragraphs, labels):
        # Split paragraph into sentences
        sentences = paragraph.split('.')  # Simplified sentence splitting
        sentences = [s.strip() for s in sentences if s]

        sentence_probs = []
        
        # Predict each sentence separately
        for sentence in sentences:
            # Tokenize the sentence
            inputs = tokenizer(sentence, padding=True, truncation=True, return_tensors="pt", max_length=128)
            input_ids = inputs["input_ids"].to("cuda")  # Move input to GPU
            attention_mask = inputs["attention_mask"].to("cuda")
            
            # Forward pass for sentence
            #sentence_logit = model(input_ids=inputs["input_ids"], attention_mask=inputs["attention_mask"])
            sentence_logit = model(input_ids, attention_mask)
            sentence_prob = torch.sigmoid(sentence_logit)
            sentence_probs.append(sentence_prob)

        # Combine sentence probabilities (using OR-like aggregation)
        paragraph_prob = 1 - torch.prod(1 - torch.stack(sentence_probs))
        # Ensure paragraph_prob is reshaped to match label_tensor shape
        paragraph_prob = paragraph_prob.view(1)  # Reshape to (1,) if it's a scalar

        # Compute loss using the paragraph label
        label_tensor = torch.tensor([label], dtype=torch.float).to(paragraph_prob.device)
        paragraph_loss = criterion(paragraph_prob, label_tensor)
        total_loss += paragraph_loss.item()

        # Backpropagation
        paragraph_loss.backward()
        optimizer.step()
        optimizer.zero_grad()

    print(f"Epoch {epoch + 1}, Loss: {total_loss / len(paragraphs)}")

Epoch 1, Loss: 0.8279798725619912
Epoch 2, Loss: 0.695451258122921
Epoch 3, Loss: 0.6947487272322178
Epoch 4, Loss: 0.6944276183843613
Epoch 5, Loss: 0.694217799976468
Epoch 6, Loss: 0.6940627809613943
Epoch 8, Loss: 0.6938336815685033
Epoch 9, Loss: 0.6937485881149769
Epoch 10, Loss: 0.6936761248856783


In [36]:
model.eval()  # Step 2: Set the model to evaluation mode

# Example input (a long text you want to predict on)
input_text = df.at[191,'selftext']

# Step 3: Tokenize and prepare the input data
sentences = input_text.split('.')  # Break into sentences
sentences = [s.strip() for s in sentences if s]

# Store predictions
predictions = []

with torch.no_grad():  # Disable gradient calculations for inference
    for sentence in sentences:
        inputs = tokenizer(sentence, padding=True, truncation=True, return_tensors="pt", max_length=128)
        
        # Move inputs to GPU if applicable
        input_ids = inputs["input_ids"].to("cuda")
        attention_mask = inputs["attention_mask"].to("cuda")

        # Step 4: Make predictions
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        
        # Apply sigmoid to get probabilities (if using binary classification)
        probabilities = torch.sigmoid(outputs)

        # Store the predictions
        predictions.append(probabilities.item())

# Step 5: Interpret the output
# For example, if your threshold for labeling a sentence as having a misconception is 0.5:
#labels = ["Mistake" if prob > 0.5 else "No Mistake" for prob in predictions]

# Output the results
#for sentence, prob, label in zip(sentences, predictions, labels):
#    print(f"Sentence: '{sentence}' - Probability: {prob:.4f} - Label: {label}")
paragraph_prob = 1 - np.prod(1 - np.array(predictions))

In [37]:
paragraph_prob

1.1966353667958174e-05

In [38]:
df.columns

Index(['author', 'created_utc', 'title', 'selftext', 'note', 'jurisdictions',
       'relevance', 'poster's legal status', 'misconception',
       'unclear knowledge', 'category', 'background', 'underlined'],
      dtype='object')

In [39]:
df.at[191,'misconception']

'n/a'

In [40]:
df.at[191,'unclear knowledge']

'n/a'

In [None]:
# Save model and optimizer states
torch.save({
    'model_state_dict': model.state_dict(),
    'optimizer_state_dict': optimizer.state_dict(),
    'epoch': epoch,  # Optionally save the current epoch
}, 'model_checkpoint.pth')


In [None]:
# Load the model and optimizer states
checkpoint = torch.load('model_checkpoint.pth')

# Load the model state
model.load_state_dict(checkpoint['model_state_dict'])

# Load the optimizer state
optimizer.load_state_dict(checkpoint['optimizer_state_dict'])

# Optionally load the last epoch
start_epoch = checkpoint['epoch']

# Set the model to training mode
model.train()

# then, continue training with more code