In [1]:
from main import DataImporter
from models.modelhelper import ModelHelper
import numpy as np

# Create the project importer and run its import step before asking for any split.
data_importer = DataImporter()
data_importer.import_data()

# Fetch the three splits; the calls run in train / test / validation order.
ds_train, ds_test, ds_validation = (
    data_importer.get_train_data(),
    data_importer.get_test_data(),
    data_importer.get_validation_data(),
)

In [2]:
# Print the schema summary of each split, with a blank-line separator after each one.
for split_frame in (ds_train, ds_test, ds_validation):
    split_frame.info()
    print('\n')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10269 entries, 0 to 10268
Data columns (total 14 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   id                    10269 non-null  object 
 1   label                 10269 non-null  int64  
 2   statement             10269 non-null  object 
 3   subject               10269 non-null  object 
 4   speaker               10269 non-null  object 
 5   job_title             10269 non-null  object 
 6   state_info            10269 non-null  object 
 7   party_affiliation     10269 non-null  object 
 8   barely_true_counts    10269 non-null  float32
 9   false_counts          10269 non-null  float32
 10  half_true_counts      10269 non-null  float32
 11  mostly_true_counts    10269 non-null  float32
 12  pants_on_fire_counts  10269 non-null  float32
 13  context               10269 non-null  object 
dtypes: float32(5), int64(1), object(8)
memory usage: 922.7+ KB


<class 'p

In [3]:
# Drop specified columns from each dataset
columns_to_drop = ['id',  'job_title', 'state_info']

# The splits are re-bound under their original names, so on a second run of this
# cell the columns are already gone. errors='ignore' makes the drop a no-op in that
# case instead of raising KeyError, which keeps the cell safe to re-execute.
ds_train = ds_train.drop(columns=columns_to_drop, errors='ignore')
ds_test = ds_test.drop(columns=columns_to_drop, errors='ignore')
ds_validation = ds_validation.drop(columns=columns_to_drop, errors='ignore')

# Display results: a heading followed by the first rows of each split.
for heading, split_frame in (
    ("Training Dataset:", ds_train),
    ("\nTest Dataset:", ds_test),
    ("\nValidation Dataset:", ds_validation),
):
    print(heading)
    display(split_frame.head())


Training Dataset:


Unnamed: 0,label,statement,subject,speaker,party_affiliation,barely_true_counts,false_counts,half_true_counts,mostly_true_counts,pants_on_fire_counts,context
0,0,Says the Annies List political group supports ...,abortion,dwayne-bohac,republican,0.0,1.0,0.0,0.0,0.0,a mailer
1,1,When did the decline of coal start? It started...,"energy,history,job-accomplishments",scott-surovell,democrat,0.0,0.0,1.0,1.0,0.0,a floor speech.
2,2,"Hillary Clinton agrees with John McCain ""by vo...",foreign-policy,barack-obama,democrat,70.0,71.0,160.0,163.0,9.0,Denver
3,0,Health care reform legislation is likely to ma...,health-care,blog-posting,none,7.0,19.0,3.0,5.0,44.0,a news release
4,1,The economic turnaround started at the end of ...,"economy,jobs",charlie-crist,democrat,15.0,9.0,20.0,19.0,2.0,an interview on CNN



Test Dataset:


Unnamed: 0,label,statement,subject,speaker,party_affiliation,barely_true_counts,false_counts,half_true_counts,mostly_true_counts,pants_on_fire_counts,context
0,3,Building a wall on the U.S.-Mexico border will...,immigration,rick-perry,republican,30.0,30.0,42.0,23.0,18.0,Radio interview
1,0,Wisconsin is on pace to double the number of l...,jobs,katrina-shankland,democrat,2.0,1.0,0.0,0.0,0.0,a news conference
2,0,Says John McCain has done nothing to help the ...,"military,veterans,voting-record",donald-trump,republican,63.0,114.0,51.0,37.0,61.0,comments on ABC's This Week.
3,1,Suzanne Bonamici supports a plan that will cut...,"medicare,message-machine-2012,campaign-adverti...",rob-cornilles,republican,1.0,1.0,3.0,1.0,1.0,a radio show
4,5,When asked by a reporter whether hes at the ce...,"campaign-finance,legal-issues,campaign-adverti...",state-democratic-party-wisconsin,democrat,5.0,7.0,2.0,2.0,7.0,a web video



Validation Dataset:


Unnamed: 0,label,statement,subject,speaker,party_affiliation,barely_true_counts,false_counts,half_true_counts,mostly_true_counts,pants_on_fire_counts,context
0,4,We have less Americans working now than in the...,"economy,jobs",vicky-hartzler,republican,1.0,0.0,1.0,0.0,0.0,an interview with ABC17 News
1,5,"When Obama was sworn into office, he DID NOT u...","obama-birth-certificate,religion",chain-email,none,11.0,43.0,8.0,5.0,105.0,
2,0,Says Having organizations parading as being so...,"campaign-finance,congress,taxes",earl-blumenauer,democrat,0.0,1.0,1.0,1.0,0.0,a U.S. Ways and Means hearing
3,1,Says nearly half of Oregons children are poor.,poverty,jim-francesconi,none,0.0,1.0,1.0,1.0,0.0,an opinion article
4,1,On attacks by Republicans that various program...,"economy,stimulus",barack-obama,democrat,70.0,71.0,160.0,163.0,9.0,interview with CBS News


# Getting the data and model ready
## Preparing the data and loading model_helper

In [4]:
# Create the helper that owns text preprocessing, label handling and model utilities.
model_helper = ModelHelper()

# First pass: run preprocess_text on every individual statement of each split
# (train, then test, then validation).
train_texts, test_texts, val_texts = (
    split_frame['statement'].apply(model_helper.preprocess_text)
    for split_frame in (ds_train, ds_test, ds_validation)
)

# Second pass: run preprocess_text once more on each resulting Series as a whole.
# NOTE(review): preprocess_text is therefore applied twice on this path, while the
# single-string prediction later in the notebook applies it only once. Confirm the
# double application is intended and that both paths produce comparable model input.
train_sequences, test_sequences, val_sequences = (
    model_helper.preprocess_text(split_texts)
    for split_texts in (train_texts, test_texts, val_texts)
)

# Keep a handle on the helper's truthfulness column list; its length is used later
# as the number of model outputs.
truthfulness_columns = model_helper.truthfulness_columns


## Normalizing data

In [5]:
# Build the label arrays for each split by passing the frame through
# ModelHelper.normalize_counts (train, then test, then validation).
# NOTE(review): the targets come from normalize_counts rather than from the `label`
# column shown in the frames — presumably derived from the *_counts columns; confirm
# this is the intended prediction target.
train_labels, test_labels, val_labels = (
    model_helper.normalize_counts(split_frame)
    for split_frame in (ds_train, ds_test, ds_validation)
)

## Create classification model

In [6]:
# Model hyperparameters.
# vocab_size is meant to match max_tokens in preprocess_text (per the original note);
# NOTE(review): max_sequence_length presumably has to match the padded sequence
# length produced by preprocessing — confirm against ModelHelper.
vocab_size, embedding_dim, max_sequence_length = 10000, 100, 200

# One output unit per truthfulness category.
num_classes = len(truthfulness_columns)

# Build the text classification model through the helper.
model = model_helper.create_text_classification_model(
    vocab_size=vocab_size, embedding_dim=embedding_dim,
    max_sequence_length=max_sequence_length, num_classes=num_classes,
)

## Prepare Datasets

In [7]:
# Hand the sequence/label pairs of every split to the helper, which returns the
# train, validation and test datasets (in that order) built with a batch size of 32.
train_dataset, val_dataset, test_dataset = model_helper.prepare_datasets(
    train_sequences=train_sequences, train_labels=train_labels,
    val_sequences=val_sequences, val_labels=val_labels,
    test_sequences=test_sequences, test_labels=test_labels,
    batch_size=32,
)

## Training the model

In [8]:
%load_ext tensorboard
%tensorboard --logdir models/logs/fit

# Train the model using ModelHelper's train_model method
history = model_helper.train_model(
    model=model,
    train_data=train_dataset,
    validation_data=val_dataset,
    epochs=15,
    batch_size=32
)

Reusing TensorBoard on port 6006 (pid 14640), started 3 days, 23:49:02 ago. (Use '!kill 14640' to kill it.)

Epoch 1/15
[1m321/321[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m87s[0m 255ms/step - accuracy: 0.1976 - categorical_accuracy: 0.1976 - loss: 2.5312 - precision: 0.6922 - recall: 0.0836 - val_accuracy: 0.2936 - val_categorical_accuracy: 0.2936 - val_loss: 1.8853 - val_precision: 0.0000e+00 - val_recall: 0.0000e+00 - learning_rate: 0.0010
Epoch 2/15
[1m321/321[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m86s[0m 267ms/step - accuracy: 0.2314 - categorical_accuracy: 0.2314 - loss: 1.9240 - precision: 0.7182 - recall: 0.0088 - val_accuracy: 0.2157 - val_categorical_accuracy: 0.2157 - val_loss: 1.8286 - val_precision: 0.0000e+00 - val_recall: 0.0000e+00 - learning_rate: 0.0010
Epoch 3/15
[1m321/321[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m88s[0m 274ms/step - accuracy: 0.2363 - categorical_accuracy: 0.2363 - loss: 1.8211 - precision: 0.8478 - recall: 5.2802e-05 - val_accuracy: 0.2157 - val_categorical_accuracy: 0.2157 - val_loss: 1.7906 - val_precision: 0.0000e+00 - val_reca

## Evaluate Model

In [9]:
# Evaluate the trained model on the held-out test dataset and print the raw result.
# model.evaluate returns the metric values as a plain list here; judging by the
# recorded run the order appears to be loss, accuracy, categorical_accuracy,
# precision, recall — TODO confirm against the metrics the model was compiled with.
test_metrics = model.evaluate(test_dataset)
print(test_metrics)

[1m41/41[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 106ms/step - accuracy: 0.2133 - categorical_accuracy: 0.2133 - loss: 1.6246 - precision: 0.0000e+00 - recall: 0.0000e+00
[1.6228994131088257, 0.21901792287826538, 0.21901792287826538, 0.0, 0.0]


## Save model

In [10]:
# Persist the trained model through the helper under the given name; the bare call is
# the cell's last expression, so its return value is shown as the cell output.
# NOTE(review): the name is given without an extension here but the model is loaded
# later as "text_classification_model.keras" — presumably save_model appends the
# suffix; confirm.
model_helper.save_model(model, "text_classification_model")

True

## Load and use the saved model

In [11]:
# Sentence used for a quick single-input prediction.
test_string = 'The president is a good man'

# Reload the model from its saved file, preprocess the sentence with the same helper
# and print the model's output for it.
loaded_model = model_helper.load_model("text_classification_model.keras")
prediction = loaded_model.predict(model_helper.preprocess_text(test_string))
print(prediction)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 424ms/step
[[0.18289709 0.0951431  0.24033666 0.2468163  0.23480688]]


## Use the prepared test data to get sample predictions

In [12]:
# Make predictions on the test dataset (its description is printed first for reference).
print(test_dataset)
predictions = loaded_model.predict(test_dataset)
# Convert predictions to class labels: index of the highest probability in each row.
predicted_classes = np.argmax(predictions, axis=1)

# Print sample predictions.
# The number of samples is capped at the number of predictions actually produced, so
# a test set with fewer than five rows no longer raises IndexError.
sample_count = min(5, len(predictions))
print("\nSample predictions:")
for i in range(sample_count):
    print(f"Example {i+1}:")
    print(f"Predicted probabilities: {predictions[i]}")
    print(f"Predicted class: {predicted_classes[i]}")
    # Assumes test_labels[i] is the label vector of the i-th row and that the dataset
    # yields rows in that same order (i.e. it is not shuffled) — TODO confirm.
    print(f"Actual class: {np.argmax(test_labels[i])}\n")


<_PrefetchDataset element_spec=(TensorSpec(shape=(None, 200), dtype=tf.float32, name=None), TensorSpec(shape=(None, 5), dtype=tf.float32, name=None))>
[1m41/41[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 108ms/step

Sample predictions:
Example 1:
Predicted probabilities: [0.18290547 0.09526586 0.2402285  0.24685962 0.2347406 ]
Predicted class: 3
Actual class: 3

Example 2:
Predicted probabilities: [0.18290547 0.09526586 0.2402285  0.24685962 0.2347406 ]
Predicted class: 3
Actual class: 0

Example 3:
Predicted probabilities: [0.18290547 0.09526586 0.2402285  0.24685962 0.2347406 ]
Predicted class: 3
Actual class: 4

Example 4:
Predicted probabilities: [0.18290547 0.09526586 0.2402285  0.24685962 0.2347406 ]
Predicted class: 3
Actual class: 3

Example 5:
Predicted probabilities: [0.18290547 0.09526586 0.2402285  0.24685962 0.2347406 ]
Predicted class: 3
Actual class: 1



In [13]:
# Ask the helper for its list of models and print it.
# NOTE(review): in the recorded run this printed only 'logs' and '__pycache__' and not
# the model saved earlier in the notebook — confirm which directory list_models scans
# and where save_model writes.
models_list = model_helper.list_models()
print(models_list)

['logs', '__pycache__']
