In [2]:
from keras.models import Model
from keras.layers import Input, Dense, Concatenate
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score
from sklearn.preprocessing import OneHotEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.sparse import hstack
import numpy as np
import pandas as pd
import warnings
from sklearn.exceptions import DataConversionWarning

# Suppress scikit-learn and other warnings
warnings.filterwarnings(action='ignore', category=UserWarning)

2023-10-20 10:58:25.328559: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [3]:
df = pd.read_csv("train.csv")
df['author'].fillna('Unknown', inplace=True)
df['title'].fillna('Ambiguous', inplace=True)
df['text'].fillna('Ambiguous', inplace=True)
df.drop_duplicates(inplace=True)

# TF-IDF Vectorization for text and title
vectorizer_text = TfidfVectorizer(max_features=5000,ngram_range=(1, 3))
X_text = vectorizer_text.fit_transform(df['text']).toarray()

vectorizer_title = TfidfVectorizer(max_features=1000,ngram_range=(1, 3))
X_title = vectorizer_title.fit_transform(df['title']).toarray()

# One-hot encoding for authors
encoder = OneHotEncoder()
X_author = encoder.fit_transform(df[['author']]).toarray()


In [5]:
# Train-test split
y = df['label'].values
X_train_text, X_test_text, X_train_title, X_test_title, X_train_author, X_test_author, y_train, y_test = train_test_split(
    X_text, X_title, X_author, y, test_size=0.2, random_state=42)

# Neural Network Architecture
input_text = Input(shape=(X_train_text.shape[1],))
input_title = Input(shape=(X_train_title.shape[1],))
input_author = Input(shape=(X_train_author.shape[1],))

# Layers for text
x1 = Dense(128, activation='relu')(input_text)
x1 = Dense(64, activation='relu')(x1)

# Layers for title
x2 = Dense(128, activation='relu')(input_title)
x2 = Dense(64, activation='relu')(x2)

# Layers for author
x3 = Dense(128, activation='relu')(input_author)
x3 = Dense(64, activation='relu')(x3)

# Concatenate
concat = Concatenate()([x1, x2, x3])

# Final layers
out = Dense(64, activation='relu')(concat)
out = Dense(1, activation='sigmoid')(out)

# Compile model
model = Model(inputs=[input_text, input_title, input_author], outputs=out)
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Train model
model.fit([X_train_text, X_train_title, X_train_author], y_train, epochs=40, batch_size=32)

# Evaluate model
score = model.evaluate([X_test_text, X_test_title, X_test_author], y_test)
print(f"Test Accuracy: {score[1]}")

y_pred = model.predict([X_test_text, X_test_title, X_test_author])
y_pred = np.round(y_pred).flatten()  # Round the probabilities to get binary class labels

# Print classification report
print(classification_report(y_test, y_pred))

Epoch 1/40
Epoch 2/40
Epoch 3/40
Epoch 4/40
Epoch 5/40
Epoch 6/40
Epoch 7/40
Epoch 8/40
Epoch 9/40
Epoch 10/40
Epoch 11/40
Epoch 12/40
Epoch 13/40
Epoch 14/40
Epoch 15/40
Epoch 16/40
Epoch 17/40
Epoch 18/40
Epoch 19/40
Epoch 20/40
Epoch 21/40
Epoch 22/40
Epoch 23/40
Epoch 24/40
Epoch 25/40
Epoch 26/40
Epoch 27/40
Epoch 28/40
Epoch 29/40
Epoch 30/40
Epoch 31/40
Epoch 32/40
Epoch 33/40
Epoch 34/40
Epoch 35/40
Epoch 36/40
Epoch 37/40
Epoch 38/40
Epoch 39/40
Epoch 40/40
Test Accuracy: 0.998317301273346
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      2132
           1       1.00      1.00      1.00      2028

    accuracy                           1.00      4160
   macro avg       1.00      1.00      1.00      4160
weighted avg       1.00      1.00      1.00      4160



In [6]:
print(f"Test Accuracy: {score[1]}")

Test Accuracy: 0.998317301273346


In [26]:
def predict_article(title, text, author, model, vectorizer_text, vectorizer_title, encoder):
    # Transform title and text using the respective TfidfVectorizers
    X_title = vectorizer_title.transform([title]).toarray()
    X_text = vectorizer_text.transform([text]).toarray()

    # Transform the author using OneHotEncoder
    try:
        X_author = encoder.transform([[author]]).toarray()
    except:
        # If the author is not recognized from the training data
        X_author = np.zeros((1, len(encoder.categories_[0])))

    # Predict using the trained model
    prediction_prob = model.predict([X_text, X_title, X_author])
    
    # Return the binary prediction (0 for real news and 1 for fake news)
    return int(np.round(prediction_prob)[0][0])

# Example usage
title = "Aliens Land in Central Park!"
text = ("In a surprising turn of events, extraterrestrial beings made contact with Earth by landing their spaceship in New York's Central Park. "
       "Thousands of onlookers watched in awe as the unidentified creatures emerged, announcing their peaceful intentions. "
       "Authorities have quarantined the area and are in talks with the visitors.")
author = "Jacob"

prediction = predict_article(title, text, author, model, vectorizer_text, vectorizer_title, encoder)
print("Fake News" if prediction else "Real News")

Fake News


In [27]:
import numpy as np
import pandas as pd

def predict_all_articles(df, model, vectorizer_text, vectorizer_title, encoder):
    df['author'].fillna('Unknown', inplace=True)
    df['title'].fillna('Ambiguous', inplace=True)
    df['text'].fillna('Ambiguous', inplace=True)
    
    # Transform title and text for all articles
    X_title = vectorizer_title.transform(df['title']).toarray()
    X_text = vectorizer_text.transform(df['text']).toarray()

    # Transform the authors
    try:
        X_author = encoder.transform(df[['author']]).toarray()
    except:
        # If the author is not recognized from the training data
        X_author = np.zeros((df.shape[0], len(encoder.categories_[0])))
    
    # Make batch predictions
    prediction_probs = model.predict([X_text, X_title, X_author])
    
    # Round the probabilities to get binary class labels
    predictions = np.round(prediction_probs).flatten().astype(int)
    
    # Map 0 and 1 to "Real News" and "Fake News"
    prediction_labels = ["Real News" if p == 0 else "Fake News" for p in predictions]
    
    # Add a new column to the original DataFrame to store predictions
    df['Prediction'] = prediction_labels
    
    return df

def get_accuracy(df):
    # Map "Real News" and "Fake News" back to 0 and 1
    df['PredictionLabel'] = df['Prediction'].map({"Real News": 0, "Fake News": 1})
    
    # Calculate the number of correct predictions
    correct_predictions = df[df['label'] == df['PredictionLabel']].shape[0]
    
    # Calculate the total number of predictions
    total_predictions = df.shape[0]
    
    # Calculate the accuracy
    accuracy = (correct_predictions / total_predictions) * 100
    
    print(f"Accuracy: {accuracy}%")


In [28]:
df_test = pd.read_csv("test.csv")
df_submit = pd.read_csv("submit.csv")
# Combine test and submit datasets based on 'id'
df_test = pd.merge(df_test, df_submit, on='id')

df_with_predictions = predict_all_articles(df_test, model, vectorizer_text, vectorizer_title, encoder)
get_accuracy(df_with_predictions)


Accuracy: 67.75%


In [29]:
df_test_1 = pd.read_csv("Combined_modified.csv")
df_with_predictions_1 = predict_all_articles(df_test_1, model, vectorizer_text, vectorizer_title, encoder)
get_accuracy(df_with_predictions_1)

Accuracy: 52.360906944630045%
