In [1]:
# Block 0: Importing Libraries

# Data Manipulation & Analysis
import pandas as pd
import numpy as np
from collections import Counter

# Data Visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Text Processing & NLP
import re
import nltk
from nltk.corpus import stopwords

# Machine Learning & Model Evaluation
from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Utilities
import joblib
import time
import warnings

# Configurations
warnings.filterwarnings('ignore')

# Download NLTK stopwords (only once)
nltk.download('stopwords')

# Visualization settings
plt.style.use('ggplot')
sns.set_palette("husl")

[nltk_data] Downloading package stopwords to /Users/nazb/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
# Block 1: Load and explore dataset

# Load dataset
df = pd.read_csv('/Users/nazb/VSCode101/project-nlp-challenge/dataset/data.csv')

# Display first few rows of the dataset
print(df.head())

# Display dataset general information
print( f'\n Number of columns and rows:' ,df.shape)
print (df.info())
print(f'\n Missing values per column:' ,df.isnull().sum())
# Display the distribution of classes
print(f'\n Class distribution:' ,df["label"].value_counts())
print(f'\n Class distribution percentage: ', df['label'].value_counts(normalize=True) * 100)
# Display the distribution of subjects
print(f'\n Subject distribution:' ,df["subject"].value_counts())
# Cross count of subject vs label
print(f"Cross count of subject and label:\n{df.groupby(['subject','label']).size().unstack(fill_value=0)}\n")
print(f"Percentage of fake/real per subject:\n{df.groupby(['subject','label']).size().unstack(fill_value=0).div(df.groupby(['subject','label']).size().unstack(fill_value=0).sum(axis=1), axis=0) * 100}")



   label                                              title  \
0      1  As U.S. budget fight looms, Republicans flip t...   
1      1  U.S. military to accept transgender recruits o...   
2      1  Senior U.S. Republican senator: 'Let Mr. Muell...   
3      1  FBI Russia probe helped by Australian diplomat...   
4      1  Trump wants Postal Service to charge 'much mor...   

                                                text       subject  \
0  WASHINGTON (Reuters) - The head of a conservat...  politicsNews   
1  WASHINGTON (Reuters) - Transgender people will...  politicsNews   
2  WASHINGTON (Reuters) - The special counsel inv...  politicsNews   
3  WASHINGTON (Reuters) - Trump campaign adviser ...  politicsNews   
4  SEATTLE/WASHINGTON (Reuters) - President Donal...  politicsNews   

                 date  
0  December 31, 2017   
1  December 29, 2017   
2  December 31, 2017   
3  December 30, 2017   
4  December 29, 2017   

 Number of columns and rows: (39942, 5)
<class 'pandas.

In [3]:
# Block 2: Data preprocessing

# Initialize stopwords
stop_words = set(stopwords.words('english'))

# Concatenate title and text
df['content'] = df['title'] + ' ' + df['text']

# Function to find acronyms with dots (e.g., U.S., F.B.I.)
def find_dot_acronyms(df):
    all_text = ' '.join(df['content'].astype(str))
    pattern = r'\b(?:[A-Z]\.){2,}'  # two or more capital letters followed by dots
    acronyms_with_dots = re.findall(pattern, all_text)
    return set(acronyms_with_dots)

# Get acronyms found in the dataset
acronyms = find_dot_acronyms(df)

# Clean text function
def clean_text(text):
    # Replace acronyms with dots by their version without dots
    for abbr in acronyms:
        replacement = abbr.replace('.', '')
        text = text.replace(abbr, replacement)
    
    # Convert to lowercase
    text = text.lower()
    
    # Remove all characters that are not letters or spaces
    text = re.sub(r'[^a-z\s]', ' ', text)
    
    # Remove stopwords
    text = ' '.join([word for word in text.split() if word not in stop_words])
    
    return text

# Apply cleaning to the content column
df['clean_text'] = df['content'].apply(clean_text)


In [None]:
# Lemmatization Optional Part
from nltk.stem import WordNetLemmatizer
import nltk

# Descargar recursos (solo primera vez)
nltk.download('wordnet')
nltk.download('omw-1.4')

# Inicializar lemmatizer
lemmatizer = WordNetLemmatizer()

# Función de lematización simple
def lemmatize_text(text):
    words = text.split()
    lemmatized_words = [lemmatizer.lemmatize(word) for word in words]
    return ' '.join(lemmatized_words)

# Aplicar a tu texto ya limpio
print("Aplicando lematización...")
df['lemmatized_text'] = df['clean_text'].apply(lemmatize_text)

# Ver ejemplo
print("\nAntes:", df['clean_text'].iloc[0][:100])
print("Después:", df['lemmatized_text'].iloc[0][:100])

[nltk_data] Downloading package wordnet to /Users/nazb/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /Users/nazb/nltk_data...


KeyboardInterrupt: 

In [4]:
# Check cleaning function
example_text = df.loc[2, 'content']
print("Original text:")
print(example_text)

# Apply cleaning
clean_example = clean_text(example_text)
print("\nCleaned text:")
print(clean_example)



Original text:
Senior U.S. Republican senator: 'Let Mr. Mueller do his job' WASHINGTON (Reuters) - The special counsel investigation of links between Russia and President Trump’s 2016 election campaign should continue without interference in 2018, despite calls from some Trump administration allies and Republican lawmakers to shut it down, a prominent Republican senator said on Sunday. Lindsey Graham, who serves on the Senate armed forces and judiciary committees, said Department of Justice Special Counsel Robert Mueller needs to carry on with his Russia investigation without political interference. “This investigation will go forward. It will be an investigation conducted without political influence,” Graham said on CBS’s Face the Nation news program. “And we all need to let Mr. Mueller do his job. I think he’s the right guy at the right time.”  The question of how Russia may have interfered in the election, and how Trump’s campaign may have had links with or co-ordinated any such eff

In [5]:
# Block 3: Vectorization
    

# Initialize TF-IDF vectorizer
tfidf_vectorizer = TfidfVectorizer(
    max_features=20000,   # maximum number of features
    ngram_range=(1,2),    # unigrams and bigrams
    min_df=5,             # ignore words that appear in fewer than 5 documents
    stop_words=None       # stopwords already removed
)

# Fit and transform the clean text to TF-IDF features
X = tfidf_vectorizer.fit_transform(df['clean_text'])

# Check the shape of the resulting matrix
print("TF-IDF matrix shape:", X.shape)


TF-IDF matrix shape: (39942, 20000)


In [6]:
# For cross-validation 
y = df['label'].values

In [7]:
# Cross-Validation with Naive Bayes

# Initialize Naive Bayes model
nb_model = MultinomialNB()

# Stratified K-Folds (5 folds)
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Cross-validation
nb_scores = cross_val_score(nb_model, X, y, cv=skf, scoring='accuracy')

# Results
print("Naive Bayes Cross-Validation Results")
print("Accuracy scores per fold:", nb_scores)
print("Mean accuracy:", nb_scores.mean())
print("Standard deviation:", nb_scores.std())

Naive Bayes Cross-Validation Results
Accuracy scores per fold: [0.95543873 0.95356115 0.95305458 0.95192789 0.95117677]
Mean accuracy: 0.9530318227372352
Standard deviation: 0.0014651824425785445


In [8]:
# Cross-Validation with Logistic Regression

# Initialize Logistic Regression model
lr_model = LogisticRegression(max_iter=1000)

# Stratified K-Folds (5 folds)
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Cross-validation
lr_scores = cross_val_score(lr_model, X, y, cv=skf, scoring='accuracy')

# Results
print("Logistic Regression Cross-Validation Results")
print("Accuracy scores per fold:", lr_scores)
print("Mean accuracy:", lr_scores.mean())
print("Standard deviation:", lr_scores.std())


Logistic Regression Cross-Validation Results
Accuracy scores per fold: [0.99086244 0.9901114  0.98635453 0.98823235 0.98760641]
Mean accuracy: 0.9886334257927134
Standard deviation: 0.0016469296302669535


In [9]:
# Block 4: Save the TF-IDF vectorizer

# Save the TF-IDF vectorizer for future use
joblib.dump(tfidf_vectorizer, 'tfidf_vectorizer.pkl')

print("TF-IDF vectorizer saved successfully!")


TF-IDF vectorizer saved successfully!


In [10]:
#Block 4: Train-test split
from sklearn.model_selection import train_test_split

# Target variable
y = df['label']

# Split the dataset into training and test sets
# 80% training, 20% test
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Print the shape of each set
print("Training set shape:", X_train.shape, y_train.shape)
print("Test set shape:", X_test.shape, y_test.shape)


Training set shape: (31953, 20000) (31953,)
Test set shape: (7989, 20000) (7989,)


In [None]:
#Train and evaluate a Naive Bayes model
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import time

# Initialize the Multinomial Naive Bayes model
nb_model = MultinomialNB()

# Measure training time
start_time = time.time()
nb_model.fit(X_train, y_train)
end_time = time.time()

print(f"Training time: {end_time - start_time:.4f} seconds")

# Predict on the test set
y_pred = nb_model.predict(X_test)

# Evaluate model performance

accuracy = accuracy_score(y_test, y_pred)
print(f"Test set accuracy: {accuracy:.4f}")

print("\nClassification report:")
print(classification_report(y_test, y_pred))

print("\nConfusion matrix:")
print(confusion_matrix(y_test, y_pred))


Training time: 0.0307 seconds
Test set accuracy: 0.9509

Classification report:
              precision    recall  f1-score   support

           0       0.95      0.95      0.95      3989
           1       0.95      0.95      0.95      4000

    accuracy                           0.95      7989
   macro avg       0.95      0.95      0.95      7989
weighted avg       0.95      0.95      0.95      7989


Confusion matrix:
[[3801  188]
 [ 204 3796]]


In [12]:
#Train and evaluate a Logistic Regression model
from sklearn.linear_model import LogisticRegression

# Initialize the Logistic Regression model
# 'saga' solver is efficient for large datasets and supports L2 regularization
lr_model = LogisticRegression(
    solver='saga',    # efficient for large, sparse datasets
    max_iter=1000,    # increase iterations to ensure convergence
    random_state=42
)

# Measure training time
start_time = time.time()
lr_model.fit(X_train, y_train)
end_time = time.time()

print(f"Training time: {end_time - start_time:.4f} seconds")

# Predict on the test set
y_pred_lr = lr_model.predict(X_test)

# Evaluate model performance
accuracy_lr = accuracy_score(y_test, y_pred_lr)
print(f"Test set accuracy: {accuracy_lr:.4f}")

print("\nClassification report:")
print(classification_report(y_test, y_pred_lr))

print("\nConfusion matrix:")
print(confusion_matrix(y_test, y_pred_lr))


Training time: 0.5611 seconds
Test set accuracy: 0.9875

Classification report:
              precision    recall  f1-score   support

           0       0.99      0.98      0.99      3989
           1       0.99      0.99      0.99      4000

    accuracy                           0.99      7989
   macro avg       0.99      0.99      0.99      7989
weighted avg       0.99      0.99      0.99      7989


Confusion matrix:
[[3929   60]
 [  40 3960]]


In [13]:
#Save models
import joblib

# Save Multinomial Naive Bayes model
joblib.dump(nb_model, 'naive_bayes_model.pkl')

# Save Logistic Regression model
joblib.dump(lr_model, 'logistic_regression_model.pkl')

print("Both models saved successfully!")


Both models saved successfully!


In [14]:
# Block 5: Load validation dataset, preprocess, predict, and save results

# Load validation dataset

validation_df = pd.read_csv('/Users/nazb/VSCode101/project-nlp-challenge/dataset/validation_data.csv')

# Load saved models and vectorizer
nb_model = joblib.load('naive_bayes_model.pkl')
lr_model = joblib.load('logistic_regression_model.pkl')
tfidf_vectorizer = joblib.load('tfidf_vectorizer.pkl')

# Set stopwords

stop_words = set(stopwords.words('english'))

# Text cleaning function
def clean_text(text, acronyms=None):
    text = str(text).lower()
    if acronyms:
        for abbr in acronyms:
            text = text.replace(abbr, abbr.replace('.', ''))
    text = re.sub(r'[^a-z\s]', ' ', text)
    text = ' '.join([word for word in text.split() if word not in stop_words])
    return text

# Preprocess validation data
validation_df['content'] = validation_df['title'] + ' ' + validation_df['text']

# Find acronyms with dots
all_text = ' '.join(validation_df['content'].astype(str))
pattern = r'\b(?:[A-Z]\.){2,}'
acronyms = set(re.findall(pattern, all_text))

# Clean text
validation_df['clean_text'] = validation_df['content'].apply(lambda x: clean_text(x, acronyms))


# Transform text using TF-IDF
X_validation = tfidf_vectorizer.transform(validation_df['clean_text'])

# Predict labels with both models
validation_df['label_nb'] = nb_model.predict(X_validation)
validation_df['label_lr'] = lr_model.predict(X_validation)

# Save predictions to CSV
# Naive Bayes
nb_df = validation_df.copy()
nb_df['label'] = nb_df['label_nb']
nb_df[['label', 'title', 'text', 'subject', 'date']].to_csv('validation_predictions_nb.csv', index=False)

# Logistic Regression
lr_df = validation_df.copy()
lr_df['label'] = lr_df['label_lr']
lr_df[['label', 'title', 'text', 'subject', 'date']].to_csv('validation_predictions_lr.csv', index=False)

print("Predictions saved successfully for both models!")


# Mini-report: distribution and comparison

print("\n--- Distribution of predictions ---")
print("Naive Bayes label counts:")
print(validation_df['label_nb'].value_counts())
print("\nLogistic Regression label counts:")
print(validation_df['label_lr'].value_counts())

# Compare both models
validation_df['agreement'] = validation_df['label_nb'] == validation_df['label_lr']
agreement_count = validation_df['agreement'].sum()
total_count = len(validation_df)
print(f"\nAgreement between models: {agreement_count}/{total_count} ({agreement_count/total_count:.2%})")

# Optional: check disagreements
disagreements = validation_df[validation_df['agreement'] == False][['title', 'label_nb', 'label_lr']]
print(f"\nNumber of disagreements: {len(disagreements)}")
if len(disagreements) > 0:
    print(disagreements.head(5))  # show first 5 examples


Predictions saved successfully for both models!

--- Distribution of predictions ---
Naive Bayes label counts:
label_nb
0    3144
1    1812
Name: count, dtype: int64

Logistic Regression label counts:
label_lr
0    3381
1    1575
Name: count, dtype: int64

Agreement between models: 4639/4956 (93.60%)

Number of disagreements: 317
                                                 title  label_nb  label_lr
179  Family of Australian woman fatally shot wants ...         0         1
212  UK police release new image of jogger in Londo...         0         1
226  More than 50 arrested for looting in Miami dur...         0         1
240  Draining the swamp: Hard-hit Everglades town m...         0         1
241  After Irma, a mixed journey home for Florida e...         0         1


In [15]:
# ------------------------------
# GENERATE FINAL SUBMISSION FILE
# ------------------------------

# Use Logistic Regression (better performance - 98.75% vs 95.09%)
final_output = validation_df[['title', 'text', 'subject', 'date']].copy()
final_output['label'] = validation_df['label_lr']  # Using best model

# Save final predictions
final_output.to_csv('validation_predictions_FINAL.csv', index=False)
print("Final submission file saved: validation_predictions_FINAL.csv")

# ------------------------------
# ACCURACY ESTIMATION REPORT
# ------------------------------

print("\n" + "="*50)
print("ACCURACY ESTIMATION REPORT")
print("="*50)

print(f"\nModel Performance on Test Set:")
print(f"  Logistic Regression: 98.75% accuracy")
print(f"  Naive Bayes: 95.09% accuracy")

print(f"\nExpected Performance on Validation Data:")
print(f"  Estimated Accuracy: 98.0-99.0%")
print(f"  Confidence: High - models show consistent performance")
print(f"  Selected Model: Logistic Regression")

print(f"\nValidation Predictions Distribution:")
print(f"  Class 0 (Fake): {sum(validation_df['label_lr'] == 0)} samples")
print(f"  Class 1 (Real): {sum(validation_df['label_lr'] == 1)} samples")

print(f"\nModel Agreement Analysis:")
agreement = sum(validation_df['label_nb'] == validation_df['label_lr'])
total = len(validation_df)
print(f"  Models agree on: {agreement}/{total} ({agreement/total:.1%}) samples")

Final submission file saved: validation_predictions_FINAL.csv

ACCURACY ESTIMATION REPORT

Model Performance on Test Set:
  Logistic Regression: 98.75% accuracy
  Naive Bayes: 95.09% accuracy

Expected Performance on Validation Data:
  Estimated Accuracy: 98.0-99.0%
  Confidence: High - models show consistent performance
  Selected Model: Logistic Regression

Validation Predictions Distribution:
  Class 0 (Fake): 3381 samples
  Class 1 (Real): 1575 samples

Model Agreement Analysis:
  Models agree on: 4639/4956 (93.6%) samples
