In [1]:
from main import DataImporter
from models.modelhelper import ModelHelper
import numpy as np

data_importer = DataImporter()
data_importer.import_data()

ds_train = data_importer.get_train_data()
ds_test = data_importer.get_test_data()
ds_validation = data_importer.get_validation_data()




In [2]:
ds_train.info()
print('\n')
ds_test.info()
print('\n')
ds_validation.info()
print('\n')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10269 entries, 0 to 10268
Data columns (total 14 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   id                    10269 non-null  object 
 1   label                 10269 non-null  int64  
 2   statement             10269 non-null  object 
 3   subject               10269 non-null  object 
 4   speaker               10269 non-null  object 
 5   job_title             10269 non-null  object 
 6   state_info            10269 non-null  object 
 7   party_affiliation     10269 non-null  object 
 8   barely_true_counts    10269 non-null  float32
 9   false_counts          10269 non-null  float32
 10  half_true_counts      10269 non-null  float32
 11  mostly_true_counts    10269 non-null  float32
 12  pants_on_fire_counts  10269 non-null  float32
 13  context               10269 non-null  object 
dtypes: float32(5), int64(1), object(8)
memory usage: 922.7+ KB


<class 'p

In [3]:
# Drop specified columns from each dataset
columns_to_drop = ['id',  'job_title', 'state_info', 'context']

ds_train = ds_train.drop(columns=columns_to_drop)
ds_test = ds_test.drop(columns=columns_to_drop) 
ds_validation = ds_validation.drop(columns=columns_to_drop)

# Display results
print("Training Dataset:")
display(ds_train.head())
print("\nTest Dataset:") 
display(ds_test.head())
print("\nValidation Dataset:")
display(ds_validation.head())

Training Dataset:


Unnamed: 0,label,statement,subject,speaker,party_affiliation,barely_true_counts,false_counts,half_true_counts,mostly_true_counts,pants_on_fire_counts
0,0,Says the Annies List political group supports ...,abortion,dwayne-bohac,republican,0.0,1.0,0.0,0.0,0.0
1,1,When did the decline of coal start? It started...,"energy,history,job-accomplishments",scott-surovell,democrat,0.0,0.0,1.0,1.0,0.0
2,2,"Hillary Clinton agrees with John McCain ""by vo...",foreign-policy,barack-obama,democrat,70.0,71.0,160.0,163.0,9.0
3,0,Health care reform legislation is likely to ma...,health-care,blog-posting,none,7.0,19.0,3.0,5.0,44.0
4,1,The economic turnaround started at the end of ...,"economy,jobs",charlie-crist,democrat,15.0,9.0,20.0,19.0,2.0



Test Dataset:


Unnamed: 0,label,statement,subject,speaker,party_affiliation,barely_true_counts,false_counts,half_true_counts,mostly_true_counts,pants_on_fire_counts
0,3,Building a wall on the U.S.-Mexico border will...,immigration,rick-perry,republican,30.0,30.0,42.0,23.0,18.0
1,0,Wisconsin is on pace to double the number of l...,jobs,katrina-shankland,democrat,2.0,1.0,0.0,0.0,0.0
2,0,Says John McCain has done nothing to help the ...,"military,veterans,voting-record",donald-trump,republican,63.0,114.0,51.0,37.0,61.0
3,1,Suzanne Bonamici supports a plan that will cut...,"medicare,message-machine-2012,campaign-adverti...",rob-cornilles,republican,1.0,1.0,3.0,1.0,1.0
4,5,When asked by a reporter whether hes at the ce...,"campaign-finance,legal-issues,campaign-adverti...",state-democratic-party-wisconsin,democrat,5.0,7.0,2.0,2.0,7.0



Validation Dataset:


Unnamed: 0,label,statement,subject,speaker,party_affiliation,barely_true_counts,false_counts,half_true_counts,mostly_true_counts,pants_on_fire_counts
0,4,We have less Americans working now than in the...,"economy,jobs",vicky-hartzler,republican,1.0,0.0,1.0,0.0,0.0
1,5,"When Obama was sworn into office, he DID NOT u...","obama-birth-certificate,religion",chain-email,none,11.0,43.0,8.0,5.0,105.0
2,0,Says Having organizations parading as being so...,"campaign-finance,congress,taxes",earl-blumenauer,democrat,0.0,1.0,1.0,1.0,0.0
3,1,Says nearly half of Oregons children are poor.,poverty,jim-francesconi,none,0.0,1.0,1.0,1.0,0.0
4,1,On attacks by Republicans that various program...,"economy,stimulus",barack-obama,democrat,70.0,71.0,160.0,163.0,9.0


In [4]:
# Find longest statement
longest_statement = ds_train.loc[ds_train['statement'].str.len().idxmax(), 'statement']
# Get the labels for the longest statement
longest_statement_label = ds_train.loc[ds_train['statement'].str.len().idxmax(), 'label']

# Get word count by splitting on whitespace and counting tokens
word_count = len(longest_statement.split())


print(f"Longest statement ({len(longest_statement)} characters):")
print(longest_statement)
print(f'Longest statement word count: {word_count}')
print(f"Longest statement label: {longest_statement_label}")

Longest statement (395 characters):
Barbara Buono by the numbers: As a Trenton politician, she voted to raise taxes 154 times. Under her, property taxes up 70 percent. Backed a 16 percent sales tax increase. Utilities, nursing homes, cell phones, parking lots, lottery wins, gyms She taxed them all. Architect of Corzines budget, she drove New Jersey $2 billion into debt. Barbara Buono by the numbers: taking New Jersey backwards.
Longest statement word count: 65
Longest statement label: 2


# Getting Data and Model ready
## Load model_helper

In [5]:
# Initialize ModelHelper
model_helper = ModelHelper()

## Create Vectorizor and start preparing data

In [6]:
#Create Vectorizer
model_helper.create_vectorizer(ds_train['statement'], max_sequence_length=60)

# Preprocess text data
train_sequences = model_helper.preprocess_text(ds_train['statement'].tolist())
test_sequences = model_helper.preprocess_text(ds_test['statement'].tolist())
val_sequences = model_helper.preprocess_text(ds_validation['statement'].tolist())





## Get Sequence Columns

In [7]:
# Get truthfulness columns
truthfulness_columns = model_helper.truthfulness_columns

## Normalizing data

In [8]:
# Get the raw count values for training
train_labels = model_helper.normalize_counts(ds_train)
test_labels = model_helper.normalize_counts(ds_test)
val_labels = model_helper.normalize_counts(ds_validation)

## Create classification model

In [10]:
# Create text classification model
vocab_size = 10000  # Matches max_tokens in preprocess_text
embedding_dim = 100
max_sequence_length = 200
num_classes = len(truthfulness_columns)  # Number of truthfulness categories

model = model_helper.create_text_classification_model(
    num_classes=num_classes
)

## Prepare Datasets

In [11]:
# Prepare datasets
train_dataset, val_dataset, test_dataset = model_helper.prepare_datasets(
    train_sequences=train_sequences,
    train_labels=train_labels,
    val_sequences=val_sequences,
    val_labels=val_labels,
    test_sequences=test_sequences,
    test_labels=test_labels,
    batch_size=32
)

## Training the model

In [16]:
%load_ext tensorboard
%tensorboard --logdir models/logs/fit

# Train the model using ModelHelper's train_model method
history = model_helper.train_model(
    model=model,
    train_data=train_dataset,
    validation_data=val_dataset,
    epochs=100,
    batch_size=32
)

The tensorboard extension is already loaded. To reload it, use:
  %reload_ext tensorboard


Reusing TensorBoard on port 6006 (pid 14640), started 4 days, 23:36:49 ago. (Use '!kill 14640' to kill it.)

Epoch 1/150
Epoch 2/150
Epoch 3/150
Epoch 4/150
Epoch 5/150
Epoch 6/150
Epoch 7/150
Epoch 8/150
Epoch 9/150
Epoch 10/150
Epoch 11/150
Epoch 12/150
Epoch 13/150
Epoch 14/150
Epoch 15/150
Epoch 16/150
Epoch 17/150
Epoch 18/150
Epoch 19/150
Epoch 20/150
Epoch 21/150
Epoch 22/150
Epoch 23/150
Epoch 24/150
Epoch 25/150
Epoch 26/150
Epoch 27/150
Epoch 28/150
Epoch 29/150
Epoch 30/150
Epoch 31/150
Epoch 32/150
Epoch 33/150
Epoch 34/150
Epoch 35/150
Epoch 36/150
Epoch 37/150
Epoch 38/150
Epoch 39/150
Epoch 40/150
Epoch 41/150
Epoch 42/150
Epoch 43/150
Epoch 44/150
Epoch 45/150
Epoch 46/150
Epoch 47/150
Epoch 48/150
Epoch 49/150
Epoch 50/150
Epoch 51/150
Epoch 52/150
Epoch 53/150
Epoch 54/150
Epoch 55/150
Epoch 56/150
Epoch 57/150
Epoch 58/150
Epoch 59/150
Epoch 60/150
Epoch 61/150
Epoch 62/150
Epoch 63/150
Epoch 64/150
Epoch 65/150
Epoch 66/150
Epoch 67/150
Epoch 68/150
Epoch 69/150
Epoch 70/150
Epoch 71/150
Epoch 72/150
Epoch 73/150
Epoch 74/150
Epoch 75/150
Epoch 76/150
Epoch 77/150
Epoch 78

## Evaluate Model

In [13]:
# Evaluate on test set
test_metrics = model.evaluate(test_dataset)
print(test_metrics)

[2.284449577331543, 0.2813717722892761, 0.2813717722892761, 1.0, 0.00021992521942593157]


## Save model

In [14]:
# Save the model
model_helper.save_model(model, "text_classification_model")

Error saving model: Save or restore weights that is not an instance of `tf.Variable` is not supported in h5, use `save_format='tf'` instead. Received a model or layer TextVectorization with weights [<tf_keras.src.layers.preprocessing.index_lookup.VocabWeightHandler object at 0x00000210C9A95460>]


False

## Load and Use Saved model

In [15]:
# Load the saved model
loaded_model = model_helper.load_model("text_classification_model")
test_string = 'Barbara Buono by the numbers: As a Trenton politician, she voted to raise taxes 154 times. Under her, property taxes up 70 percent. Backed a 16 percent sales tax increase. Utilities, nursing homes, cell phones, parking lots, lottery wins, gyms She taxed them all. Architect of Corzines budget, she drove New Jersey $2 billion into debt. Barbara Buono by the numbers: taking New Jersey backwards.'
print(model_helper.preprocess_text(test_string))
prediction = loaded_model.predict(model_helper.preprocess_text(test_string))
print(prediction)

Error loading model: Save or restore weights that is not an instance of `tf.Variable` is not supported in h5, use `save_format='tf'` instead. Received a model or layer TextVectorization with weights [<tf_keras.src.layers.preprocessing.index_lookup.VocabWeightHandler object at 0x0000021085AE2EA0>]
[[ 7.80787924e-03 -1.44528849e-02  6.46416424e-03 -2.56793685e-02
  -9.65327397e-02  8.58551189e-02  6.98082447e-02  3.99074377e-03
  -8.74180868e-02  2.87988503e-02  2.87528969e-02  1.01409713e-02
   2.93969605e-02 -8.59811381e-02 -3.12866308e-02  7.26936832e-02
   4.58125258e-03  8.62116888e-02  2.10175058e-03  1.01940013e-01
   7.18423799e-02  4.75919712e-03 -9.46561049e-05  5.05877845e-02
   1.56101644e-01  4.36919257e-02 -1.51563315e-02  2.40388163e-03
  -1.58895310e-02 -2.51775812e-02 -1.06737919e-01  7.07412809e-02
   4.94894832e-02  3.08563840e-02  2.91129984e-02 -3.40982117e-02
   1.11657195e-01  4.70892824e-02 -6.71625230e-03  2.94581391e-02
  -4.25266698e-02 -3.54633946e-03 -1.84288

AttributeError: 'NoneType' object has no attribute 'predict'

## Use prepared test data to get Sample Predictions

In [15]:
# Make predictions on test dataset
print(test_dataset)
predictions = loaded_model.predict(test_dataset)
# Convert predictions to class labels
predicted_classes = np.argmax(predictions, axis=1)

# Print sample predictions
print("\nSample predictions:")
for i in range(5):
    print(f"Example {i+1}:")
    print(f"Predicted probabilities: {predictions[i]}")
    print(f"Predicted class: {predicted_classes[i]}")
    print(f"Actual class: {np.argmax(test_labels[i])}\n")


<_PrefetchDataset element_spec=(TensorSpec(shape=(None, 60), dtype=tf.float32, name=None), TensorSpec(shape=(None, 5), dtype=tf.float32, name=None))>
[1m41/41[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 33ms/step

Sample predictions:
Example 1:
Predicted probabilities: [0.22542152 0.07708871 0.25169054 0.25748774 0.18831144]
Predicted class: 3
Actual class: 3

Example 2:
Predicted probabilities: [0.20950253 0.07118634 0.2756604  0.26801345 0.17563727]
Predicted class: 2
Actual class: 0

Example 3:
Predicted probabilities: [0.23635894 0.09474026 0.2315793  0.23723659 0.20008487]
Predicted class: 3
Actual class: 4

Example 4:
Predicted probabilities: [0.23215993 0.08755633 0.23721452 0.24035522 0.202714  ]
Predicted class: 3
Actual class: 3

Example 5:
Predicted probabilities: [0.21936993 0.06663408 0.2585142  0.26102304 0.1944587 ]
Predicted class: 3
Actual class: 1



In [16]:
models_list = model_helper.list_models()
print(models_list)

['text_classification_model']
