In [1]:
import pandas as pd
import numpy as np
import torch

In [2]:
df = pd.read_csv('train.tsv', delimiter='\t', header = None)

In [3]:
df.rename({0: 'id', 1: 'label', 2: 'statement', 3: 'subject', 4: 'speaker', 5: 'job-title',
           6: 'state_info', 7: 'party_affiliation', 8: 'barely_true_counts', 9: 'false_counts',
           10: 'half_true_counts', 11: 'mostly_true_counts', 12: 'pants_on_fire_counts', 13: 'context'
          }, axis = 1, inplace = True)

In [4]:
#make bias column: function of party affiliation and magnitude of falsity 

In [4]:
def make_bias(row):
    if row['label'] == 'true':
        return '(1) factual ' + row['party_affiliation']
    elif row['label'] == 'pants-fire':
        return '(6) very strongly biased ' + row['party_affiliation']
    
    elif row['label'] == 'false':
        return f"(5) biased {row['party_affiliation']}"
    
    elif row['label'] == 'barely-true':
        return f"(4) somewhat biased {row['party_affiliation']}"
    
    elif row['label'] == 'half-true':
        return f"(3) half biased {row['party_affiliation']}"
    
    elif row['label'] == 'mostly-true':
        return f"(2) minimal bias {row['party_affiliation']}"
    

In [5]:
#df['party_affiliation'].value_counts()/df.shape[0]

In [6]:
df['political bias'] = df.apply(make_bias, axis = 1)

In [7]:
#df['political bias'].value_counts()

# Feature extraction

In [7]:
#import nltk
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import f1_score

In [9]:
#first i should do preprocessing to trim stopwords, lowercase?

In [10]:
tfidf_vectorizer = TfidfVectorizer(max_features=1000)
tfidf_vectorizer.fit(df['statement'])

tfidf_features = tfidf_vectorizer.transform(df['statement'])

In [11]:
#i should maybe be using a different sentiment thing
from nltk.sentiment.vader import SentimentIntensityAnalyzer

analyzer = SentimentIntensityAnalyzer()

In [12]:
def calculate_sentiment(text):
    sentiment_scores = analyzer.polarity_scores(text)
    return sentiment_scores['compound']

# Apply the function to your DataFrame and create a new column for sentiment scores
df['sentiment_scores'] = df['statement'].apply(calculate_sentiment)

In [13]:
#df['sentiment_scores']

### BERT

In [8]:
import torch
import transformers as ppb
import warnings

In [9]:
model_class, tokenizer_class, pretrained_weights = (ppb.DistilBertModel, 
                                                    ppb.DistilBertTokenizer, 'distilbert-base-uncased')

In [10]:
tokenizer = tokenizer_class.from_pretrained(pretrained_weights, truncation=True, max_length= 512)
model = model_class.from_pretrained(pretrained_weights, max_length= 512)

In [11]:
batch_1 = df[:1000]

In [12]:
bert_tokenized = batch_1['statement'].apply(lambda x: tokenizer.encode(x, add_special_tokens=True, 
                                                                  truncation = True,
                                                                  max_length=512))

In [21]:
#bert_tokenized

In [13]:
max_len = 0
for i in bert_tokenized.values:
    if len(i) > max_len:
        max_len = len(i)

padded = np.array([i + [0]*(max_len-len(i)) for i in bert_tokenized.values])

In [14]:
attention_mask = np.where(padded != 0, 1, 0)

**Model**

In [15]:
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")

In [16]:
input_ids = torch.tensor(padded).to(device)  
attention_mask = torch.tensor(attention_mask).to(device)
model = model.to(device)

with torch.no_grad():
    last_hidden_states = model(input_ids, attention_mask=attention_mask)
    

In [17]:
features = last_hidden_states[0][:,0,:].to('cpu').numpy()

**Baseline model**

In [16]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

In [17]:
to_replace = {
    'republican': 1, 'democrat': 2, 'none': 3, 'organization': 4, 'independent': 5,
       'columnist': 6, 'activist': 7, 'talk-show-host': 8, 'libertarian': 9,
       'newsmaker': 10, 'journalist': 11, 'labor-leader': 12, 'state-official': 13
}

In [18]:
y = batch_1['party_affiliation'].replace(to_replace)

In [19]:
train_features, test_features, train_labels, test_labels = train_test_split(features, y)

In [26]:
baseline = RandomForestClassifier(max_depth = 30)

In [27]:
baseline.fit(train_features, train_labels)

RandomForestClassifier(max_depth=30)

In [28]:
baseline_train_preds = baseline.predict(train_features)
(baseline_train_preds == train_labels).mean()

1.0

In [29]:
(baseline.predict(test_features) == test_labels).mean()

0.448

In [30]:
from sklearn.model_selection import GridSearchCV

In [31]:
param_grid = {
    'n_estimators': [10, 50, 100], 
    'max_depth': [None, 10, 20, 30],   
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4], 
    'max_features': ['auto', 'sqrt', 'log2'], 
}

In [33]:
rf_classifier = RandomForestClassifier(random_state=42)
grid_search = GridSearchCV(rf_classifier, param_grid, cv=5, n_jobs=-1, verbose=2)

grid_search.fit(train_features, train_labels)

Fitting 5 folds for each of 324 candidates, totalling 1620 fits




[CV] END max_depth=None, max_features=auto, min_samples_leaf=1, min_samples_split=2, n_estimators=10; total time=   0.1s
[CV] END max_depth=None, max_features=auto, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   1.3s
[CV] END max_depth=None, max_features=auto, min_samples_leaf=1, min_samples_split=5, n_estimators=50; total time=   0.7s
[CV] END max_depth=None, max_features=auto, min_samples_leaf=1, min_samples_split=10, n_estimators=10; total time=   0.1s
[CV] END max_depth=None, max_features=auto, min_samples_leaf=1, min_samples_split=10, n_estimators=50; total time=   0.7s
[CV] END max_depth=None, max_features=auto, min_samples_leaf=1, min_samples_split=10, n_estimators=50; total time=   0.6s
[CV] END max_depth=None, max_features=auto, min_samples_leaf=2, min_samples_split=2, n_estimators=10; total time=   0.1s
[CV] END max_depth=None, max_features=auto, min_samples_leaf=2, min_samples_split=2, n_estimators=10; total time=   0.1s
[CV] END max_depth=None, max

GridSearchCV(cv=5, estimator=RandomForestClassifier(random_state=42), n_jobs=-1,
             param_grid={'max_depth': [None, 10, 20, 30],
                         'max_features': ['auto', 'sqrt', 'log2'],
                         'min_samples_leaf': [1, 2, 4],
                         'min_samples_split': [2, 5, 10],
                         'n_estimators': [10, 50, 100]},
             verbose=2)

In [34]:
best_estimator = grid_search.best_estimator_
baseline_pred = best_estimator.predict(train_features)


In [35]:
(baseline_pred == train_labels).mean()

0.9853333333333333

In [152]:
best_estimator

RandomForestClassifier(max_depth=10, max_features='log2', random_state=42)

In [37]:
(best_estimator.predict(test_features) == test_labels).mean()

0.436

In [130]:
f1_score(best_estimator.predict(test_features), test_labels, average = 'weighted')

0.5329064617251491

**LSTM**

In [38]:
import tensorflow as tf

2023-10-26 12:24:09.156664: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [39]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LSTM, MaxPooling1D, Embedding, Flatten, Dropout
from tensorflow.keras.optimizers import Adam
from sklearn.model_selection import train_test_split

In [40]:
train_features.shape

(750, 768)

In [41]:
test_features.shape

(250, 768)

In [42]:
len(batch_1['party_affiliation'].unique())

13

In [43]:
print("Num GPUs Available: ", len(tf.config.list_physical_devices('GPU')))

Num GPUs Available:  1


In [44]:
train_features = train_features.reshape(train_features.shape[0], train_features.shape[1], 1)

In [83]:
from keras.utils import to_categorical

train_labels_onehot = to_categorical(train_labels, num_classes=14)
test_labels_onehot = to_categorical(test_labels, num_classes=14)

In [182]:
with tf.device('/GPU:0'):

    model = Sequential([
        LSTM(256, return_sequences=True, input_shape=(train_features.shape[1], 1)),
        Flatten(),
        Dense(256, activation='relu'),
        Dense(128, activation='relu'),
        Dropout(0.5),
        Dense(14, activation='softmax')
        
        
    ])


#model.add(Dense(128, activation='relu', input_dim=train_features.shape[1])) # Fully Connected Layer

#model.add(LSTM(128, return_sequences=True, input_shape=(train_features.shape[1], 1)))  # LSTM Layer

# model.add(MaxPooling1D(pool_size=2))
# model.add(Flatten())


# model.add(Dense(128, activation='relu'))
# model.add(Dropout(0.5))

# model.add(Dense(14, activation='relu')) # Output Layer, 13 parties 

# Compile the model
model.compile(optimizer=Adam(learning_rate=0.001), loss='categorical_crossentropy', metrics=['accuracy'])

In [183]:
#lr = 0.01, 128 neurons with lstm layer and only one dense output gives 41% val acc --> predicting 1 for everything

In [185]:
history = model.fit(train_features, train_labels_onehot, validation_data=(test_features, test_labels_onehot),
                    epochs=8, batch_size=32)

# Evaluate the model
loss, accuracy = model.evaluate(test_features, test_labels_onehot)
print(f"Validation loss: {loss}, Validation accuracy: {accuracy}")

Epoch 1/8
Epoch 2/8
Epoch 3/8
Epoch 4/8
Epoch 5/8
Epoch 6/8
Epoch 7/8
Epoch 8/8
Validation loss: 1.3411492109298706, Validation accuracy: 0.4399999976158142


In [190]:
lstm_preds = np.argmax(model.predict(test_features), axis = -1)



In [191]:
f1_score(lstm_preds, test_labels, average = 'weighted')

0.5423094924588971

In [162]:
test_labels.value_counts() / test_labels.value_counts().sum()

party_affiliation
1    0.416
2    0.340
3    0.188
4    0.020
5    0.020
6    0.008
9    0.004
7    0.004
Name: count, dtype: float64

**BI-LSTM**

In [198]:
bilstm_model = Sequential([
        tf.keras.layers.Bidirectional(LSTM(256, return_sequences=True), input_shape=(train_features.shape[1], 1)),
        Flatten(),
        Dense(256, activation='relu'),
        Dense(128, activation='relu'),
        Dropout(0.5),
        Dense(14, activation='softmax')
    ])

# Compile the model
bilstm_model.compile(optimizer=Adam(learning_rate=0.001), loss='categorical_crossentropy', metrics=['accuracy'])

In [199]:
history = bilstm_model.fit(train_features, train_labels_onehot, validation_data=(test_features, test_labels_onehot),
                    epochs=20, batch_size=32)

# Evaluate the model
loss, accuracy = bilstm_model.evaluate(test_features, test_labels_onehot)
print(f"Validation loss: {loss}, Validation accuracy: {accuracy}")

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
Validation loss: 1.3320956230163574, Validation accuracy: 0.42800000309944153


In [203]:
bilstm_preds = np.argmax(bilstm_model.predict(test_features), axis = -1)
f1_score(bilstm_preds, test_labels, average = 'weighted')



0.4969508480696088

**Simple MLP**

In [104]:
train_features.shape[1]

768

In [158]:
mlp = Sequential([
    Dense(128, activation='relu', input_shape=(train_features.shape[1],)),
    Dropout(0.5),
    Dense(64, activation='relu'),
    Dropout(0.5),
    Dense(14, activation='softmax')
    
])

mlp.compile(optimizer=Adam(learning_rate=0.001), loss='categorical_crossentropy', metrics=['accuracy'])

In [159]:
history = mlp.fit(train_features, train_labels_onehot, validation_data=(test_features, test_labels_onehot),
                    epochs=15, batch_size=32)

# Evaluate the model
loss, accuracy = mlp.evaluate(test_features, test_labels_onehot)
print(f"Validation loss: {loss}, Validation accuracy: {accuracy}")

Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15
Validation loss: 1.2791374921798706, Validation accuracy: 0.4359999895095825


In [160]:
mlp_predictions = np.argmax(mlp.predict(test_features), axis = -1)



In [162]:
mlp_predictions

array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 2, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 2, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 2, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1])

In [161]:
f1_score(mlp_predictions, test_labels, average = 'weighted')

0.5754704167381238

**Wider MLP**

In [135]:
wide_mlp = Sequential()
wide_mlp.add(Dense(512, activation='relu', input_shape=(train_features.shape[1],)))
wide_mlp.add(Dropout(0.5))
wide_mlp.add(Dense(256, activation='relu'))
wide_mlp.add(Dropout(0.5))
wide_mlp.add(Dense(128, activation='relu'))
wide_mlp.add(Dropout(0.5))
wide_mlp.add(Dense(14, activation='softmax'))

wide_mlp.compile(optimizer=Adam(learning_rate=0.01), loss='categorical_crossentropy', metrics=['accuracy'])

In [137]:
history = wide_mlp.fit(train_features, train_labels_onehot, validation_data=(test_features, test_labels_onehot),
                    epochs=20, batch_size=64)

# Evaluate the model
loss, accuracy = mlp.evaluate(test_features, test_labels_onehot)
print(f"Validation loss: {loss}, Validation accuracy: {accuracy}")

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
Validation loss: 1.2850334644317627, Validation accuracy: 0.4359999895095825


In [138]:
f1_score(
    np.argmax(wide_mlp.predict(test_features),
              axis = -1), test_labels, average = 'weighted'
)




0.5955056179775281

**old**

In [2]:
model_name = "bert-base-uncased"  # change model
tokenizer = BertTokenizer.from_pretrained(model_name)
model = BertModel.from_pretrained(model_name)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [3]:
def extract_bert_embeddings(text):
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True)
    with torch.no_grad():
        outputs = model(**inputs)
    embeddings = outputs.last_hidden_state.mean(dim=1)  # Average over tokens to get sentence-level representation
    return embeddings

In [15]:
#df['bert_embeddings'] = df['statement'].apply(extract_bert_embeddings)


In [None]:
features_df = pd.concat([pd.DataFrame(tfidf_features.toarray()),
                         df['sentiment_scores']], axis=1)

**Fitting baseline model**

In [36]:
X = features_df.to_numpy()
y = df['political bias']

In [31]:
from sklearn.ensemble import RandomForestClassifier

In [63]:
baseline = RandomForestClassifier(max_depth = 30)

In [64]:
baseline.fit(X, y)

In [65]:
baseline_train_preds = baseline.predict(X)

In [66]:
(baseline_train_preds == y).mean()

0.70087890625

**Evaluating baseline**

In [67]:
df_val = pd.read_csv('valid.tsv', delimiter='\t', header = None)
df_val.rename({0: 'id', 1: 'label', 2: 'statement', 3: 'subject', 4: 'speaker', 5: 'job-title',
           6: 'state_info', 7: 'party_affiliation', 8: 'barely_true_counts', 9: 'false_counts',
           10: 'half_true_counts', 11: 'mostly_true_counts', 12: 'pants_on_fire_counts', 13: 'context'
          }, axis = 1, inplace = True)

df_val['political bias'] = df_val.apply(make_bias, axis = 1)

In [68]:
tfidf_features_val = tfidf_vectorizer.transform(df_val['statement'])

df_val['sentiment_scores'] = df_val['statement'].apply(calculate_sentiment)

features_df_val = pd.concat([pd.DataFrame(tfidf_features_val.toarray()),
                         df_val['sentiment_scores']], axis=1)

In [69]:
X_val = features_df_val.to_numpy()
y_val = df_val['political bias']

In [70]:
baseline_preds = baseline.predict(X_val)
(baseline_preds == y_val).mean()

0.1394080996884735

________