In [1]:
!pip install transformers



In [2]:
!pip install accelerate>=0.26.0

In [3]:
pip install --upgrade transformers

Note: you may need to restart the kernel to use updated packages.


In [10]:
from transformers import AutoTokenizer
from transformers import AutoModelForSequenceClassification
import torch
import pandas as pd

In [12]:
df = pd.read_csv('/workspaces/misinformation/df_2829_manualCoding.csv')

In [9]:
# List of moral dimensions (foundations)
FOUNDATIONS = ["care", "fairness", "loyalty", "authority", "sanctity"]
MODEL_BASE = "joshnguyen/mformer-"


# Load the tokenizer and model for the given moral dimensions
tokenizer = AutoTokenizer.from_pretrained(MODEL_BASE + FOUNDATIONS[0])  # Assuming the same tokenizer can be used
models = {foundation: AutoModelForSequenceClassification.from_pretrained(MODEL_BASE + foundation, device_map="auto") for foundation in FOUNDATIONS}



In [13]:
def classify_text(text, tokenizer, models):
    # Tokenize the text
    inputs = tokenizer(
        text,
        padding=True,
        truncation=True,
        return_tensors='pt'
    ).to('cuda' if torch.cuda.is_available() else 'cpu')

    # Dictionary to store probabilities for each foundation
    results = {}

    for foundation in FOUNDATIONS:
        model = models[foundation]
        model.eval()

        # Forward pass
        with torch.no_grad():
            outputs = model(**inputs)

        # Calculate class probabilities
        probs = torch.softmax(outputs.logits, dim=1)[:, 1]  # Assuming binary classification, class 1 represents the foundation
        results[foundation] = probs.item()  # Store as a float

    return results

In [17]:
subset_df = df.head(10)

In [18]:
subset_df[FOUNDATIONS] = subset_df['text'].apply(lambda x: pd.Series(classify_text(x, tokenizer, models)))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  subset_df[FOUNDATIONS] = subset_df['text'].apply(lambda x: pd.Series(classify_text(x, tokenizer, models)))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  subset_df[FOUNDATIONS] = subset_df['text'].apply(lambda x: pd.Series(classify_text(x, tokenizer, models)))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-

In [19]:
subset_df.head()

Unnamed: 0.1,origin,Unnamed: 0,id,text,retweetCount,CreatedAt,author.id,author.userName,author.followers,author.following,...,date,createdDate,misinformation,stance,isUSRelated,care,fairness,loyalty,authority,sanctity
0,T2,1224387,1479667332293160967,Fox News: Harris comms chief apologizes after ...,0,Sat Jan 08 04:12:37 +0000 2022,303850691,01splcheck,7806,6911,...,2022-01-07 20:12:37,2022-01-07,T,U,1,0.245578,0.802129,0.281535,0.167433,0.02956
1,T4,332659,1778181695575105704,@Cartel_Cal @RepRobertGarcia It seems he's in ...,0,Wed Apr 10 22:02:07 +0000 2024,303850691,01splcheck,7789,6899,...,2024-04-10 15:02:07,2024-04-10,CF,A,1,0.424815,0.639714,0.178733,0.58772,0.133115
2,T3,1567887,1656440842830376960,@NatlyDenise_ Ready to bring illegal immigrant...,0,Wed May 10 23:27:26 +0000 2023,303850691,01splcheck,7814,6921,...,2023-05-10 16:27:26,2023-05-10,CF,A,1,0.146103,0.761928,0.12126,0.367018,0.308492
3,T2,1303080,1477882502807052290,"@cassstastrophe Well, you can take comfort tha...",0,Mon Jan 03 06:00:21 +0000 2022,303850691,01splcheck,7806,6911,...,2022-01-02 22:00:21,2022-01-02,E,A,1,0.484881,0.741862,0.054717,0.463428,0.167075
4,T3,1578404,1656441829464870912,I see a press conference with Arizona official...,1,Wed May 10 23:31:22 +0000 2023,303850691,01splcheck,7814,6921,...,2023-05-10 16:31:22,2023-05-10,UV,A,1,0.191054,0.355418,0.295396,0.501547,0.328975
