<a href="https://colab.research.google.com/github/mkoskinas/project-2-nlp/blob/main/project_2_nlp.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

# 1. Read and explore the data

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
import re


In [3]:
## Read Data for the Twitter Fake News Challenge

data = pd.read_csv("data/training_data_lowercase.csv", encoding='latin-1', delimiter='\t', names=['label', 'tweet'], skiprows=1)
print(data.head())
print(data.shape)

   label                                              tweet
0      0  drunk bragging trump staffer started russian c...
1      0  sheriff david clarke becomes an internet joke ...
2      0  trump is so obsessed he even has obamaâs nam...
3      0  pope francis just called out donald trump duri...
4      0  racist alabama cops brutalize black boy while ...
(34151, 2)


In [4]:
# Sample a few tweets for inspection
sample_tweets = data['tweet'].sample(10)

# Print the length and full content of the sampled tweets
for tweet in sample_tweets:
    print(f"Length: {len(tweet)}, Content: {tweet}")

Length: 58, Content: u.s. makes lower trade deficit top priority in nafta talks
Length: 67, Content: justice department names new acting head of drug enforcement agency
Length: 46, Content: cbo posts review of republican healthcare plan
Length: 86, Content: fbi and doj refusing to comply with subpoena over trump dossierâhello jeff sessions?
Length: 72, Content: house dem wants gop on record: stop govât spending at trump properties
Length: 79, Content: sister of ny attack suspect says he may have been brainwashed; appeals to trump
Length: 85, Content: cambodia pm hun sen says 2018 election result does not need international recognition
Length: 80, Content: judges uphold bosnian croat convictions in last verdict of yugoslav war tribunal
Length: 83, Content: the purge: nyc mayor de blasio to review âall symbols of hateâ on city property
Length: 58, Content: house speaker tells trump healthcare bill lacks votes: cnn


In [5]:
# Display class distribution
print("Class Distribution:")
print(data['label'].value_counts(normalize=True))

# Basic dataset info
print("\nDataset Info:")
print(data.info())

# Check unique values in label column
print("\nUnique Labels:")
print(data['label'].unique())

Class Distribution:
label
0    0.514509
1    0.485491
Name: proportion, dtype: float64

Dataset Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 34151 entries, 0 to 34150
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   label   34151 non-null  int64 
 1   tweet   34151 non-null  object
dtypes: int64(1), object(1)
memory usage: 533.7+ KB
None

Unique Labels:
[0 1]


# 2. Divide the dataset training, validation, and test set



In [6]:
# First split: Create a holdout test set (20% of data)
train_val_data, test_data = train_test_split(
    data,
    test_size=0.2,  # 20% for final testing
    random_state=42,  # for reproducibility
    stratify=data['label']  # ensure balanced classes
)

# Second split: Create validation set from remaining data
train_data, val_data = train_test_split(
    train_val_data,
    test_size=0.2,  # 20% of remaining data (16% of original data)
    random_state=42,
    stratify=train_val_data['label']
)

# Print split sizes
print(f"Total samples: {len(data)}")
print(f"Training samples: {len(train_data)} ({len(train_data)/len(data):.1%})")
print(f"Validation samples: {len(val_data)} ({len(val_data)/len(data):.1%})")
print(f"Test samples: {len(test_data)} ({len(test_data)/len(data):.1%})")

# Save splits separately to prevent data leakage
train_data.to_csv('data/train_data.csv', index=False)
val_data.to_csv('data/validation_data.csv', index=False)
test_data.to_csv('data/test_data.csv', index=False)

Total samples: 34151
Training samples: 21856 (64.0%)
Validation samples: 5464 (16.0%)
Test samples: 6831 (20.0%)


In [7]:
# Check for missing values in the 'tweet' column
num_missing_tweets = val_data['tweet'].isnull().sum()

# Print the result
print(f"Number of missing tweets: {num_missing_tweets}")

Number of missing tweets: 0


# 3. Data preprocessing

In [8]:
# HTML Content Analysis
def check_html_presence(data, text_column='tweet'):
    """
    Analyzes presence of HTML content in the dataset
    """
    # Initialize counters
    html_stats = {
        'total_tweets': len(data),
        'contains_html_tags': 0,
        'contains_scripts': 0,
        'contains_styles': 0,
        'contains_comments': 0
    }

    # Sample tweets with HTML for inspection
    html_examples = []

    for text in data[text_column]:
        # Check for HTML tags
        if re.search(r'<[^>]+>', str(text)):
            html_stats['contains_html_tags'] += 1

            # Check for specific elements
            if re.search(r'<script[^>]*>', str(text)):
                html_stats['contains_scripts'] += 1
            if re.search(r'<style[^>]*>', str(text)):
                html_stats['contains_styles'] += 1
            if re.search(r'<!--', str(text)):
                html_stats['contains_comments'] += 1

            # Store example if it's one of first 5 found
            if len(html_examples) < 5:
                html_examples.append(text)

    # Calculate percentages
    for key in ['contains_html_tags', 'contains_scripts', 'contains_styles', 'contains_comments']:
        percentage = (html_stats[key] / html_stats['total_tweets']) * 100
        html_stats[f'{key}_percentage'] = f"{percentage:.2f}%"

    return {
        'stats': html_stats,
        'examples': html_examples
    }

# Run the analysis
html_analysis = check_html_presence(data)

# Print results
print("HTML Content Analysis:")
print("-" * 50)
print(f"Total tweets analyzed: {html_analysis['stats']['total_tweets']}")
print(f"Tweets containing HTML tags: {html_analysis['stats']['contains_html_tags']} ({html_analysis['stats']['contains_html_tags_percentage']})")
print(f"Tweets containing scripts: {html_analysis['stats']['contains_scripts']} ({html_analysis['stats']['contains_scripts_percentage']})")
print(f"Tweets containing styles: {html_analysis['stats']['contains_styles']} ({html_analysis['stats']['contains_styles_percentage']})")
print(f"Tweets containing comments: {html_analysis['stats']['contains_comments']} ({html_analysis['stats']['contains_comments_percentage']})")

if html_analysis['examples']:
    print("\nExample tweets containing HTML:")
    for i, example in enumerate(html_analysis['examples'], 1):
        print(f"\nExample {i}:")
        print(example[:200] + "..." if len(example) > 200 else example)
else:
    print("\nNo HTML content found in tweets")

HTML Content Analysis:
--------------------------------------------------
Total tweets analyzed: 34151
Tweets containing HTML tags: 0 (0.00%)
Tweets containing scripts: 0 (0.00%)
Tweets containing styles: 0 (0.00%)
Tweets containing comments: 0 (0.00%)

No HTML content found in tweets


In [15]:
# Tweet Elements Analysis
def analyze_tweet_elements(data, text_column='tweet'):
    """
    Analyzes presence of Twitter-specific elements in the dataset
    """
    stats = {
        'total_tweets': len(data),
        'contains_mentions': 0,
        'contains_hashtags': 0,
        'contains_urls': 0,
        'contains_rt': 0,
        'contains_emojis': 0,
        'contains_numbers': 0
    }

    # Store examples for each element
    examples = {k: [] for k in stats.keys() if k != 'total_tweets'}

    for text in data[text_column]:
        text = str(text)  # Ensure text is string

        # Check for mentions (@)
        if re.search(r'@\w+', text):
            stats['contains_mentions'] += 1
            if len(examples['contains_mentions']) < 3:
                examples['contains_mentions'].append(text)

        # Check for hashtags (#)
        if re.search(r'#\w+', text):
            stats['contains_hashtags'] += 1
            if len(examples['contains_hashtags']) < 3:
                examples['contains_hashtags'].append(text)

        # Check for URLs
        if re.search(r'http\S+|www\S+|https\S+', text):
            stats['contains_urls'] += 1
            if len(examples['contains_urls']) < 3:
                examples['contains_urls'].append(text)

        # Check for retweets
        if re.search(r'rt @\w+:', text, re.IGNORECASE):
            stats['contains_rt'] += 1
            if len(examples['contains_rt']) < 3:
                examples['contains_rt'].append(text)

        # Check for emojis (basic check)
        if re.search(r'[^\w\s.,!?\'\"@]', text):
            stats['contains_emojis'] += 1
            if len(examples['contains_emojis']) < 3:
                examples['contains_emojis'].append(text)

        # Check for numbers
        if re.search(r'\d+', text):
            stats['contains_numbers'] += 1
            if len(examples['contains_numbers']) < 3:
                examples['contains_numbers'].append(text)

    # Calculate percentages - Iterate over a copy of keys or use list comprehension
    # This avoids modifying the dictionary during iteration
    for key in list(stats.keys()):  # Or: for key in stats:
        if key != 'total_tweets':
            percentage = (stats[key] / stats['total_tweets']) * 100
            stats[f'{key}_percentage'] = f"{percentage:.2f}%"

    return {'stats': stats, 'examples': examples}

# Run the analysis
tweet_analysis = analyze_tweet_elements(data)

# Print results
print("Tweet Elements Analysis:")
print("-" * 50)
print(f"Total tweets analyzed: {tweet_analysis['stats']['total_tweets']}")
for key in tweet_analysis['stats'].keys():
    if key != 'total_tweets' and not key.endswith('_percentage'):
        print(f"\n{key.replace('_', ' ').title()}: {tweet_analysis['stats'][key]} ({tweet_analysis['stats'][key + '_percentage']})")
        if tweet_analysis['examples'][key]:
            print("Examples:")
            for i, example in enumerate(tweet_analysis['examples'][key], 1):
                print(f"{i}. {example}")

Tweet Elements Analysis:
--------------------------------------------------
Total tweets analyzed: 34151

Contains Mentions: 6 (0.02%)
Examples:
1. christian âprophetâ literally loses his @ss when he takes on wild lions for jesus
2. @ammon_bundyâs ridiculous late night twitter rant has everyone talking (tweets)
3. american workers scr@wed over by outsourcing jobs get their day in court

Contains Hashtags: 253 (0.74%)
Examples:
1. protesters welcome trump home to his golden tower with the best #resistance display yet (image)
2. #trumpchicken is now trending and these tweets are hilarious (images)
3. #bringbackobama hashtag blows up on twitter as americans share memories (tweets)

Contains Urls: 7 (0.02%)
Examples:
1. https://100percentfedup.com/served-roy-moore-vietnamletter-veteran-sets-record-straight-honorable-decent-respectable-patriotic-commander-soldier/
2. https://100percentfedup.com/video-hillary-asked-about-trump-i-just-want-to-eat-some-pie/
3. https://100percentfedup.com

In [17]:
# Perform the necessary cleaning based on the analysis

def clean_tweet(text):
    """
    Optimized cleaning function that:
    - Removes emojis and special characters
    - Preserves numbers (for fact checking)
    - Preserves basic punctuation
    - Handles hashtags
    """
    text = str(text)

    # Remove emojis and special characters but keep numbers and basic punctuation
    text = re.sub(r'[^a-zA-Z0-9\s.,!?\'\"@]', ' ', text)

    # Handle hashtags
    text = re.sub(r'#(\w+)', r'\1', text)

    # Remove extra whitespace
    text = re.sub(r'\s+', ' ', text)

    return text.strip()

# Test the function with examples from the dataset
test_cases = [
    "sheriff david clarke becomes an internet joke for threatening to poke people âin the eyeâ",
    "cnn calls it: a democrat will represent alabama in the senate for the first time in 25 years",
    "john mccain wanted another 74 twitter followers"
]

print("Testing cleaning function:")
for i, test in enumerate(test_cases, 1):
    print(f"\nTest {i}:")
    print("Original:", test)
    print("Cleaned:", clean_tweet(test))

# Apply cleaning to each split separately
train_data['cleaned_tweet'] = train_data['tweet'].apply(clean_tweet)
val_data['cleaned_tweet'] = val_data['tweet'].apply(clean_tweet)
test_data['cleaned_tweet'] = test_data['tweet'].apply(clean_tweet)

# 3. Save the cleaned splits
train_data.to_csv('data/cleaned_train_data.csv', index=False)
val_data.to_csv('data/cleaned_validation_data.csv', index=False)
test_data.to_csv('data/cleaned_test_data.csv', index=False)

# 4. Print some statistics for verification
print("Dataset Split Sizes:")
print("-" * 50)
print(f"Training samples: {len(train_data)} ({len(train_data)/len(data):.1%})")
print(f"Validation samples: {len(val_data)} ({len(val_data)/len(data):.1%})")
print(f"Test samples: {len(test_data)} ({len(test_data)/len(data):.1%})")

# 5. Show examples from training set
print("\nTraining Set Cleaning Examples:")
print("-" * 50)
for i, (orig, cleaned) in enumerate(zip(train_data['tweet'].head(), train_data['cleaned_tweet'].head()), 1):
    print(f"\nExample {i}:")
    print(f"Original: {orig}")
    print(f"Cleaned:  {cleaned}")

Testing cleaning function:

Test 1:
Original: sheriff david clarke becomes an internet joke for threatening to poke people âin the eyeâ
Cleaned: sheriff david clarke becomes an internet joke for threatening to poke people in the eye

Test 2:
Original: cnn calls it: a democrat will represent alabama in the senate for the first time in 25 years
Cleaned: cnn calls it a democrat will represent alabama in the senate for the first time in 25 years

Test 3:
Original: john mccain wanted another 74 twitter followers
Cleaned: john mccain wanted another 74 twitter followers
Dataset Split Sizes:
--------------------------------------------------
Training samples: 21856 (64.0%)
Validation samples: 5464 (16.0%)
Test samples: 6831 (20.0%)

Training Set Cleaning Examples:
--------------------------------------------------

Example 1:
Original: bombshell report: nsa offered to give hillaryâs emails to fbiâjames comey rejected them
Cleaned:  bombshell report nsa offered to give hillary s emails to f

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import tensorflow as tf
from transformers import AutoTokenizer, TFAutoModelForSequenceClassification
import nltk
from sklearn.metrics import classification_report
import re

# 1. Load the data
train_data = pd.read_csv('data/cleaned_train_data.csv')
val_data = pd.read_csv('data/cleaned_validation_data.csv')

# 2. Create Model Pipeline
def create_fake_news_detector():
    # Use BERT model specifically fine-tuned for fake news
    model_name = "bert-base-uncased"
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = TFAutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)

    # Create text preprocessing layers
    text_input = tf.keras.layers.Input(shape=(), dtype=tf.string, name='text')
    preprocessed_text = tf.keras.layers.Lambda(clean_tweet)(text_input)

    # Create tokenizer layer
    encoder = tokenizer(
        preprocessed_text,
        padding='max_length',
        truncation=True,
        max_length=512,
        return_tensors='tf'
    )

    # BERT outputs
    outputs = model(encoder)

    # Add custom layers on top of BERT
    x = tf.keras.layers.Dense(64, activation='relu')(outputs.logits)
    x = tf.keras.layers.Dropout(0.2)(x)
    outputs = tf.keras.layers.Dense(1, activation='sigmoid')(x)

    # Create final model
    model = tf.keras.Model(inputs=[text_input], outputs=outputs)

    return model, tokenizer

# 3. Training Configuration
def train_model(model, X_train, y_train, X_val, y_val):
    model.compile(
        optimizer=tf.keras.optimizers.Adam(learning_rate=2e-5),
        loss='binary_crossentropy',
        metrics=['accuracy', tf.keras.metrics.AUC()]
    )

    # Add callbacks
    callbacks = [
        tf.keras.callbacks.EarlyStopping(
            monitor='val_loss',
            patience=3,
            restore_best_weights=True
        ),
        tf.keras.callbacks.ReduceLROnPlateau(
            monitor='val_loss',
            factor=0.2,
            patience=2
        )
    ]

    # Train
    history = model.fit(
        X_train,
        y_train,
        validation_data=(X_val, y_val),
        epochs=10,
        batch_size=16,
        callbacks=callbacks
    )

    return history

# 4. Evaluation and Analysis
def evaluate_model(model, X_test, y_test):
    # Get predictions
    y_pred = model.predict(X_test)
    y_pred_classes = (y_pred > 0.5).astype(int)

    # Print classification report
    print(classification_report(y_test, y_pred_classes))

    # Analyze errors
    errors = pd.DataFrame({
        'text': X_test[y_test != y_pred_classes],
        'true_label': y_test[y_test != y_pred_classes],
        'predicted_label': y_pred_classes[y_test != y_pred_classes],
        'confidence': y_pred[y_test != y_pred_classes]
    })

    return errors

# 5. Feature Importance Analysis
def analyze_feature_importance(model, tokenizer, text):
    # Use SHAP values or attention weights
    from transformers import pipeline
    explainer = pipeline("sentiment-analysis", model=model, tokenizer=tokenizer)

    # Get token attributions
    attribution = explainer(text, return_tensors=True)

    return attribution

In [None]:
 from transformers import AutoTokenizer, TFAutoModelForSequenceClassification
import tensorflow as tf
import numpy as np

def preprocess_for_bert(text):
    """
    Minimal preprocessing for BERT
    """
    # Convert to string
    text = str(text)

    # Remove excessive whitespace
    text = ' '.join(text.split())

    # Remove URLs (optional, but often helpful)
    text = re.sub(r'http\S+|www\S+|https\S+', '', text)

    return text.strip()

def prepare_bert_data(texts, tokenizer, max_length=128):
    """
    Encode texts for BERT
    """
    return tokenizer(
        texts,
        padding='max_length',
        truncation=True,
        max_length=max_length,
        return_tensors='tf'
    )

# 1. Prepare the data
print("Preparing data...")
train_data['processed_text'] = train_data['text'].apply(preprocess_for_bert)
val_data['processed_text'] = val_data['text'].apply(preprocess_for_bert)
test_data['processed_text'] = test_data['text'].apply(preprocess_for_bert)

# 2. Initialize BERT tokenizer and model
print("Initializing BERT...")
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')
model = TFAutoModelForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=1)

# 3. Encode data
print("Encoding data...")
train_encodings = prepare_bert_data(train_data['processed_text'].tolist(), tokenizer)
val_encodings = prepare_bert_data(val_data['processed_text'].tolist(), tokenizer)
test_encodings = prepare_bert_data(test_data['processed_text'].tolist(), tokenizer)

# 4. Prepare labels
y_train = train_data['label'].values
y_val = val_data['label'].values
y_test = test_data['label'].values

# 5. Configure training parameters
optimizer = tf.keras.optimizers.Adam(learning_rate=2e-5)
loss = tf.keras.losses.BinaryCrossentropy(from_logits=True)
metrics = ['accuracy']

# 6. Compile model
model.compile(
    optimizer=optimizer,
    loss=loss,
    metrics=metrics
)

# 7. Setup callbacks
callbacks = [
    tf.keras.callbacks.EarlyStopping(
        monitor='val_loss',
        patience=2,
        restore_best_weights=True
    ),
    tf.keras.callbacks.ModelCheckpoint(
        'best_bert_model.h5',
        monitor='val_accuracy',
        save_best_only=True
    ),
    tf.keras.callbacks.ReduceLROnPlateau(
        monitor='val_loss',
        factor=0.5,
        patience=1
    )
]

# 8. Train model
print("Training model...")
history = model.fit(
    train_encodings,
    y_train,
    validation_data=(val_encodings, y_val),
    epochs=3,  # Start with small number of epochs
    batch_size=16,  # Smaller batch size for better generalization
    callbacks=callbacks
)

# 9. Evaluate model
print("\nEvaluating model...")
test_results = model.evaluate(test_encodings, y_test)
print(f"Test accuracy: {test_results[1]:.4f}")

# 10. Save model and tokenizer
print("\nSaving model and tokenizer...")
model.save_pretrained('fake_news_bert_model')
tokenizer.save_pretrained('fake_news_bert_tokenizer')

# 11. Plot training history
import matplotlib.pyplot as plt

plt.figure(figsize=(12, 4))

plt.subplot(1, 2, 1)
plt.plot(history.history['loss'], label='Training Loss')
plt.plot(history.history['val_loss'], label='Validation Loss')
plt.title('Model Loss')
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.legend()

plt.subplot(1, 2, 2)
plt.plot(history.history['accuracy'], label='Training Accuracy')
plt.plot(history.history['val_accuracy'], label='Validation Accuracy')
plt.title('Model Accuracy')
plt.xlabel('Epoch')
plt.ylabel('Accuracy')
plt.legend()

plt.tight_layout()
plt.show()

# 12. Function for making predictions
def predict_fake_news(text, model=model, tokenizer=tokenizer):
    """
    Make prediction on new text
    """
    # Preprocess
    processed_text = preprocess_for_bert(text)

    # Encode
    encoding = prepare_bert_data([processed_text], tokenizer)

    # Predict
    prediction = model.predict(encoding)[0]
    probability = tf.sigmoid(prediction).numpy()[0]

    return {
        'text': text,
        'probability_fake': float(probability),
        'prediction': 'FAKE' if probability > 0.5 else 'REAL'
    }

# Test prediction function
sample_texts = [
    "Breaking: Scientists discover miracle cure for all diseases!",
    "New study shows correlation between exercise and health benefits"
]

print("\nTesting predictions:")
for text in sample_texts:
    result = predict_fake_news(text)
    print(f"\nText: {result['text']}")
    print(f"Prediction: {result['prediction']}")
    print(f"Probability of being fake: {result['probability_fake']:.2%}")