In [1]:
from main import DataImporter
from models.modelhelper import ModelHelper
import numpy as np

# Build the project's DataImporter and ask it to load the data before any
# split is requested.
data_importer = DataImporter()
data_importer.import_data()

# Fetch the three splits from the importer. The `.info()` output further down
# shows the training split is a pandas DataFrame.
ds_train = data_importer.get_train_data()
ds_test = data_importer.get_test_data()
ds_validation = data_importer.get_validation_data()

In [2]:
# Summarise each split (columns, dtypes, non-null counts) and leave a blank
# separator after every report.
for split_frame in (ds_train, ds_test, ds_validation):
    split_frame.info()
    print('\n')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10269 entries, 0 to 10268
Data columns (total 14 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   id                    10269 non-null  object 
 1   label                 10269 non-null  int64  
 2   statement             10269 non-null  object 
 3   subject               10269 non-null  object 
 4   speaker               10269 non-null  object 
 5   job_title             10269 non-null  object 
 6   state_info            10269 non-null  object 
 7   party_affiliation     10269 non-null  object 
 8   barely_true_counts    10269 non-null  float32
 9   false_counts          10269 non-null  float32
 10  half_true_counts      10269 non-null  float32
 11  mostly_true_counts    10269 non-null  float32
 12  pants_on_fire_counts  10269 non-null  float32
 13  context               10269 non-null  object 
dtypes: float32(5), int64(1), object(8)
memory usage: 922.7+ KB


<class 'p

In [None]:
# Drop specified columns from each dataset.
# The result is assigned back to the same names, so running this cell a second
# time would otherwise raise KeyError (the columns are already gone).
# `errors="ignore"` makes the cell safe to re-run in any kernel state.
columns_to_drop = ['id',  'job_title', 'state_info']

ds_train = ds_train.drop(columns=columns_to_drop, errors="ignore")
ds_test = ds_test.drop(columns=columns_to_drop, errors="ignore")
ds_validation = ds_validation.drop(columns=columns_to_drop, errors="ignore")

# Display the first rows of every split to confirm the remaining columns.
print("Training Dataset:")
display(ds_train.head())
print("\nTest Dataset:")
display(ds_test.head())
print("\nValidation Dataset:")
display(ds_validation.head())

Training Dataset:


Unnamed: 0,label,statement,barely_true_counts,false_counts,half_true_counts,mostly_true_counts,pants_on_fire_counts
0,0,Says the Annies List political group supports ...,0.0,1.0,0.0,0.0,0.0
1,1,When did the decline of coal start? It started...,0.0,0.0,1.0,1.0,0.0
2,2,"Hillary Clinton agrees with John McCain ""by vo...",70.0,71.0,160.0,163.0,9.0
3,0,Health care reform legislation is likely to ma...,7.0,19.0,3.0,5.0,44.0
4,1,The economic turnaround started at the end of ...,15.0,9.0,20.0,19.0,2.0



Test Dataset:


Unnamed: 0,label,statement,barely_true_counts,false_counts,half_true_counts,mostly_true_counts,pants_on_fire_counts
0,3,Building a wall on the U.S.-Mexico border will...,30.0,30.0,42.0,23.0,18.0
1,0,Wisconsin is on pace to double the number of l...,2.0,1.0,0.0,0.0,0.0
2,0,Says John McCain has done nothing to help the ...,63.0,114.0,51.0,37.0,61.0
3,1,Suzanne Bonamici supports a plan that will cut...,1.0,1.0,3.0,1.0,1.0
4,5,When asked by a reporter whether hes at the ce...,5.0,7.0,2.0,2.0,7.0



Validation Dataset:


Unnamed: 0,label,statement,barely_true_counts,false_counts,half_true_counts,mostly_true_counts,pants_on_fire_counts
0,4,We have less Americans working now than in the...,1.0,0.0,1.0,0.0,0.0
1,5,"When Obama was sworn into office, he DID NOT u...",11.0,43.0,8.0,5.0,105.0
2,0,Says Having organizations parading as being so...,0.0,1.0,1.0,1.0,0.0
3,1,Says nearly half of Oregons children are poor.,0.0,1.0,1.0,1.0,0.0
4,1,On attacks by Republicans that various program...,70.0,71.0,160.0,163.0,9.0


In [6]:
# Locate the row with the longest statement (by character count) once, then
# read both the text and its label from that same row.
statement_lengths = ds_train['statement'].str.len()
longest_row = statement_lengths.idxmax()
longest_statement = ds_train.loc[longest_row, 'statement']
longest_statement_label = ds_train.loc[longest_row, 'label']

# Word count = number of whitespace-separated tokens.
word_count = len(longest_statement.split())

print(f"Longest statement ({len(longest_statement)} characters):")
print(longest_statement)
print(f'Longest statement word count: {word_count}')
print(f"Longest statement label: {longest_statement_label}")

Longest statement (395 characters):
Barbara Buono by the numbers: As a Trenton politician, she voted to raise taxes 154 times. Under her, property taxes up 70 percent. Backed a 16 percent sales tax increase. Utilities, nursing homes, cell phones, parking lots, lottery wins, gyms She taxed them all. Architect of Corzines budget, she drove New Jersey $2 billion into debt. Barbara Buono by the numbers: taking New Jersey backwards.
Longest statement word count: 65
Longest statement label: 2


# Getting Data and Model ready
## Load model_helper

In [7]:
# Initialize ModelHelper, the project helper this notebook uses for text
# vectorization and for building, training, saving and loading the model.
model_helper = ModelHelper()

## Create Vectorizer and start preparing data

In [8]:
# Create the vectorizer from the training statements, with a sequence length
# of 60.
model_helper.create_vectorizer(ds_train['statement'], max_sequence_length=60)

# Vectorize the statements of every split (train, test, validation order).
train_sequences, test_sequences, val_sequences = (
    model_helper.preprocess_text(split_frame['statement'].tolist())
    for split_frame in (ds_train, ds_test, ds_validation)
)

## Get Truthfulness Columns

In [None]:
# Get truthfulness columns as exposed by ModelHelper; the length of this
# collection is used as the number of output classes when the model is built.
truthfulness_columns = model_helper.truthfulness_columns

## Normalizing data

In [10]:
# Run every split through ModelHelper.normalize_counts to obtain the label
# arrays that are later paired with the vectorized statements.
train_labels, test_labels, val_labels = (
    model_helper.normalize_counts(split_frame)
    for split_frame in (ds_train, ds_test, ds_validation)
)

## Create classification model

In [11]:
# Create text classification model
# NOTE(review): vocab_size is meant to equal the vectorizer's max_tokens —
# confirm against ModelHelper, the value is not visible in this notebook.
vocab_size = 10000
embedding_dim = 100
# Must equal the sequence length given to create_vectorizer (60): the prepared
# datasets carry tensors of shape (None, 60), so any other value here
# disagrees with the data actually fed to the model.
max_sequence_length = 60
num_classes = len(truthfulness_columns)  # Number of truthfulness categories

model = model_helper.create_text_classification_model(
    vocab_size=vocab_size,
    embedding_dim=embedding_dim,
    max_sequence_length=max_sequence_length,
    num_classes=num_classes
)

## Prepare Datasets

In [12]:
# Pair each split's vectorized statements with its labels, then let
# ModelHelper turn them into datasets batched 32 at a time.
split_arrays = {
    "train_sequences": train_sequences,
    "train_labels": train_labels,
    "val_sequences": val_sequences,
    "val_labels": val_labels,
    "test_sequences": test_sequences,
    "test_labels": test_labels,
}
train_dataset, val_dataset, test_dataset = model_helper.prepare_datasets(
    **split_arrays,
    batch_size=32,
)

## Training the model

In [13]:
%load_ext tensorboard
%tensorboard --logdir models/logs/fit

# Train the model using ModelHelper's train_model method
history = model_helper.train_model(
    model=model,
    train_data=train_dataset,
    validation_data=val_dataset,
    epochs=15,
    batch_size=32
)

Reusing TensorBoard on port 6006 (pid 16988), started 20:54:42 ago. (Use '!kill 16988' to kill it.)

Epoch 1/15
[1m321/321[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m69s[0m 183ms/step - accuracy: 0.2695 - categorical_accuracy: 0.2695 - loss: 2.3390 - precision: 0.7463 - recall: 0.1336 - val_accuracy: 0.2508 - val_categorical_accuracy: 0.2508 - val_loss: 1.8003 - val_precision: 0.7442 - val_recall: 0.0083 - learning_rate: 0.0010
Epoch 2/15
[1m321/321[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m58s[0m 182ms/step - accuracy: 0.2844 - categorical_accuracy: 0.2844 - loss: 1.7846 - precision: 0.7836 - recall: 0.0163 - val_accuracy: 0.3715 - val_categorical_accuracy: 0.3715 - val_loss: 1.6742 - val_precision: 0.0000e+00 - val_recall: 0.0000e+00 - learning_rate: 0.0010
Epoch 3/15
[1m321/321[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m58s[0m 181ms/step - accuracy: 0.3208 - categorical_accuracy: 0.3208 - loss: 1.6675 - precision: 0.8002 - recall: 3.3350e-04 - val_accuracy: 0.3692 - val_categorical_accuracy: 0.3692 - val_loss: 1.6491 - val_precision: 0.0000e+00 - val_recall: 0.00

## Evaluate Model

In [14]:
# Evaluate on test set.
# `evaluate` returns bare numbers without names. The progress-bar output
# suggests the order is loss first, then the compiled metrics —
# TODO confirm with `model.metrics_names`.
test_metrics = model.evaluate(test_dataset)
print(test_metrics)

[1m41/41[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 62ms/step - accuracy: 0.3959 - categorical_accuracy: 0.3959 - loss: 1.6385 - precision: 0.0000e+00 - recall: 0.0000e+00
[1.6480544805526733, 0.3764614164829254, 0.3764614164829254, 0.0, 0.0]


## Save model

In [None]:
# Save the trained model through ModelHelper under the name
# "text_classification_model" (the same name is used to load it again below).
model_helper.save_model(model, "text_classification_model")

  return saving_lib.save_model(model, filepath)


True

## Load and Use Saved model

In [None]:
# Load the saved model
loaded_model = model_helper.load_model("text_classification_model")
test_string = 'Barbara Buono by the numbers: As a Trenton politician, she voted to raise taxes 154 times. Under her, property taxes up 70 percent. Backed a 16 percent sales tax increase. Utilities, nursing homes, cell phones, parking lots, lottery wins, gyms She taxed them all. Architect of Corzines budget, she drove New Jersey $2 billion into debt. Barbara Buono by the numbers: taking New Jersey backwards.'
# Vectorize the statement once, so the tensor that is printed is exactly the
# tensor handed to the model (and the text is not preprocessed twice).
test_sequence = model_helper.preprocess_text(test_string)
print(test_sequence)
prediction = loaded_model.predict(test_sequence)
print(prediction)

tf.Tensor(
[[1.940e+03 3.843e+03 2.100e+01 2.000e+00 5.150e+02 2.600e+01 6.000e+00
  3.372e+03 1.608e+03 2.150e+02 7.300e+01 5.000e+00 2.100e+02 6.900e+01
  1.330e+02 7.000e+01 2.260e+02 3.360e+02 6.900e+01 8.000e+01 1.500e+01
  1.670e+03 6.000e+00 1.500e+01 4.420e+02 2.800e+01 1.620e+02 2.909e+03
  1.618e+03 8.130e+02 1.934e+03 2.666e+03 5.523e+03 2.204e+03 2.022e+03
  2.891e+03 1.000e+00 2.150e+02 2.126e+03 1.690e+02 6.500e+01 6.610e+03
  4.000e+00 1.000e+00 8.200e+01 2.150e+02 2.062e+03 5.400e+01 3.380e+02
  6.200e+01 1.010e+02 1.190e+02 1.940e+03 3.843e+03 2.100e+01 2.000e+00
  5.150e+02 4.990e+02 5.400e+01 3.380e+02]], shape=(1, 60), dtype=float32)
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 607ms/step
[[0.1984394  0.37139696 0.2557743  0.17438938]]


## Use prepared test data to get Sample Predictions

In [17]:
# Make predictions on test dataset with the reloaded model
print(test_dataset)
predictions = loaded_model.predict(test_dataset)
# Convert predictions to class labels: the arg-max over each row of scores
predicted_classes = np.argmax(predictions, axis=1)

# Print sample predictions. The sample size is capped by the number of
# predictions so a test split with fewer than five rows cannot raise IndexError.
# NOTE(review): comparing against test_labels[i] assumes the test dataset
# yields rows in the same order as test_labels (i.e. is not shuffled) — confirm.
sample_count = min(5, len(predictions))
print("\nSample predictions:")
for i in range(sample_count):
    print(f"Example {i+1}:")
    print(f"Predicted probabilities: {predictions[i]}")
    print(f"Predicted class: {predicted_classes[i]}")
    print(f"Actual class: {np.argmax(test_labels[i])}\n")


<_PrefetchDataset element_spec=(TensorSpec(shape=(None, 60), dtype=tf.float32, name=None), TensorSpec(shape=(None, 4), dtype=tf.float32, name=None))>
[1m41/41[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 63ms/step

Sample predictions:
Example 1:
Predicted probabilities: [0.24489218 0.35927385 0.20038381 0.19545014]
Predicted class: 1
Actual class: 1

Example 2:
Predicted probabilities: [0.24471992 0.36011055 0.20068415 0.1944854 ]
Predicted class: 1
Actual class: 0

Example 3:
Predicted probabilities: [0.24017778 0.36065787 0.20118512 0.19797912]
Predicted class: 1
Actual class: 1

Example 4:
Predicted probabilities: [0.24273504 0.3594236  0.20100954 0.19683182]
Predicted class: 1
Actual class: 2

Example 5:
Predicted probabilities: [0.2552857  0.35438573 0.19070457 0.19962399]
Predicted class: 1
Actual class: 1



In [18]:
# Ask ModelHelper for the models it knows about and print the result; the
# recorded output is a list of saved model names.
models_list = model_helper.list_models()
print(models_list)

['text_classification_model', 'text_classification_model_fourcol', 'text_classification_model_fourcol_b', 'text_classification_model_threecol', 'text_classification_model_threecolB']
