In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the input directory tree and print the full path of every file it contains.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
import joblib
import optuna
from optuna.samplers import TPESampler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score

import warnings
# Silence Python warnings for the rest of the session.
# NOTE(review): no category or module filter is given, so this hides every
# warning, including any raised by the estimators used later — consider
# narrowing it to the specific warnings that are known to be noise.
warnings.filterwarnings("ignore")

# Raise Optuna's log threshold to WARNING so lower-severity messages are not printed
optuna.logging.set_verbosity(optuna.logging.WARNING)

/kaggle/input/source-based-news-classification/news_articles.csv


## Load dataset

In [2]:
# Load the raw news-article table from the Kaggle input directory
data = pd.read_csv(
    '/kaggle/input/source-based-news-classification/news_articles.csv'
)
# Bare final expression so the notebook renders the frame
data

Unnamed: 0,author,published,title,text,language,site_url,main_img_url,type,label,title_without_stopwords,text_without_stopwords,hasImage
0,Barracuda Brigade,2016-10-26T21:41:00.000+03:00,muslims busted they stole millions in govt ben...,print they should pay all the back all the mon...,english,100percentfedup.com,http://bb4sp.com/wp-content/uploads/2016/10/Fu...,bias,Real,muslims busted stole millions govt benefits,print pay back money plus interest entire fami...,1.0
1,reasoning with facts,2016-10-29T08:47:11.259+03:00,re why did attorney general loretta lynch plea...,why did attorney general loretta lynch plead t...,english,100percentfedup.com,http://bb4sp.com/wp-content/uploads/2016/10/Fu...,bias,Real,attorney general loretta lynch plead fifth,attorney general loretta lynch plead fifth bar...,1.0
2,Barracuda Brigade,2016-10-31T01:41:49.479+02:00,breaking weiner cooperating with fbi on hillar...,red state \nfox news sunday reported this mor...,english,100percentfedup.com,http://bb4sp.com/wp-content/uploads/2016/10/Fu...,bias,Real,breaking weiner cooperating fbi hillary email ...,red state fox news sunday reported morning ant...,1.0
3,Fed Up,2016-11-01T05:22:00.000+02:00,pin drop speech by father of daughter kidnappe...,email kayla mueller was a prisoner and torture...,english,100percentfedup.com,http://100percentfedup.com/wp-content/uploads/...,bias,Real,pin drop speech father daughter kidnapped kill...,email kayla mueller prisoner tortured isis cha...,1.0
4,Fed Up,2016-11-01T21:56:00.000+02:00,fantastic trumps point plan to reform healthc...,email healthcare reform to make america great ...,english,100percentfedup.com,http://100percentfedup.com/wp-content/uploads/...,bias,Real,fantastic trumps point plan reform healthcare ...,email healthcare reform make america great sin...,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...
2091,-NO AUTHOR-,2016-10-27T15:36:10.573+03:00,teens walk free after gangrape conviction,,english,wnd.com,http://www.wnd.com/files/2016/10/hillary_haunt...,bias,Real,good samaritan wearing indian headdress disarm...,,1.0
2092,-NO AUTHOR-,2016-10-27T15:36:10.671+03:00,school named for munichmassacre mastermind,,english,wnd.com,http://www.wnd.com/files/2016/10/rambo_richard...,bias,Real,skype sex scam fortune built shame,,1.0
2093,-NO AUTHOR-,2016-10-27T13:30:00.000+03:00,russia unveils satan missile,,english,wnd.com,http://www.wnd.com/files/2016/10/skype_sex_sca...,bs,Fake,cannabis aficionados develop thca crystalline ...,,1.0
2094,-NO AUTHOR-,2016-10-27T15:58:41.935+03:00,check out hillarythemed haunted house,,english,wnd.com,http://worldtruth.tv/wp-content/uploads/2016/1...,bs,Fake,title,,0.0


## Data preprocessing

In [3]:
# Narrow the frame to the article text and its label, then discard any row
# in which either of those two values is missing
data = data.loc[:, ['text', 'label']].dropna(axis=0, how='any')
data

Unnamed: 0,text,label
0,print they should pay all the back all the mon...,Real
1,why did attorney general loretta lynch plead t...,Real
2,red state \nfox news sunday reported this mor...,Real
3,email kayla mueller was a prisoner and torture...,Real
4,email healthcare reform to make america great ...,Real
...,...,...
2045,check out hillarythemed haunted house anticlin...,Real
2046,good samaritan wearing indian headdress disarm...,Real
2047,skype sex scam a fortune built on shame moroc...,Real
2048,posted by eddie while the skyhigh potency may ...,Real


## Splitting data into training and testing sets


In [4]:
# Features are the raw article text; the target is the label column
X, y = data['text'], data['label']

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Echo the four pieces as a tuple so the notebook displays them
(X_train, X_test, y_train, y_test)

(1857    i think megan kelly has screwed everyone neces...
 570     email \n\naccording to the jasta law which all...
 926     home  news  trump tsunami incoming what trump ...
 670     email \nget ready for the most cringeworthy st...
 1674    at the third debate donald trump was asked if ...
                               ...                        
 1724    gentlemen donald j trump is the new presidente...
 1095    time investigating hillary is an attack on all...
 1130    the unraveling of hillarys corrupt sphere of i...
 1294    contaminated food from china now entering the ...
 860     october  daily contrarian reads by david stock...
 Name: text, Length: 1640, dtype: object,
 1808    home  be the change  government corruption  pr...
 694     podcast play in new window  download  embed \n...
 906     new leaked clinton emails came from the device...
 544     email \n\nhillary supporter robert dougherty f...
 1847    id love to see clinton spend all her money and...
              

## Define the model optimization process using Optuna


In [5]:
def objective(trial):
    """Score one hyperparameter candidate for the TF-IDF + logistic-regression pipeline.

    Two values are drawn from ``trial``: ``C`` (log-uniform between 1e-5 and
    10.0) and ``max_df`` (uniform between 0.5 and 1.0). A pipeline built from
    them is evaluated with 3-fold cross-validation on the notebook-level
    ``X_train`` / ``y_train``.

    Parameters
    ----------
    trial
        Object exposing ``suggest_float``; supplied by ``study.optimize``.

    Returns
    -------
    The mean cross-validated accuracy, which the study maximizes.
    """
    # Draw C first and max_df second so seeded sampling order is preserved
    reg_strength = trial.suggest_float("C", 1e-5, 10.0, log=True)
    df_ceiling = trial.suggest_float("max_df", 0.5, 1.0)

    # Assemble the two pipeline stages from the sampled values
    vectorizer = TfidfVectorizer(stop_words='english', max_df=df_ceiling)
    classifier = LogisticRegression(C=reg_strength, max_iter=1000, random_state=42)
    candidate = Pipeline([('tfidf', vectorizer), ('log_reg', classifier)])

    # Average accuracy across the three folds is the trial's objective value
    fold_scores = cross_val_score(
        candidate, X_train, y_train, n_jobs=-1, cv=3, scoring="accuracy"
    )
    return fold_scores.mean()

# Seed the TPE sampler so the sequence of suggested hyperparameters is repeatable
sampler = TPESampler(seed=42)
# Higher cross-validated accuracy is better, so the study maximizes the objective
study = optuna.create_study(sampler=sampler, direction="maximize")
# Evaluate 30 candidate configurations
study.optimize(func=objective, n_trials=30)


## Best hyperparameters


In [6]:
# Hyperparameter values ("C" and "max_df") from the study's best trial
best_params = study.best_params
# Bare final expression so the notebook displays the dictionary
best_params

{'C': 8.467210363394905, 'max_df': 0.502909431035011}

## Build the final TF-IDF features using the best hyperparameters


In [7]:
# Re-create the vectorizer with the tuned document-frequency ceiling
tfidf_vectorizer = TfidfVectorizer(
    max_df=best_params['max_df'],
    stop_words='english',
)
# Fit on the training text only, then apply the fitted vectorizer to the
# held-out text so both matrices share one vocabulary
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

## Train the Logistic Regression model with optimized hyperparameters


In [8]:
# Final classifier configured with the tuned value of C
model = LogisticRegression(
    random_state=42,
    max_iter=1000,
    C=best_params['C'],
)
# Fit on the TF-IDF training matrix
model.fit(X_train_tfidf, y_train)


## Model evaluation

In [9]:
# Predict on the held-out split and compute both summary metrics
y_pred = model.predict(X_test_tfidf)
accuracy, report = (
    accuracy_score(y_test, y_pred),
    classification_report(y_test, y_pred),
)
# One print call, newline-separated, emitting the same three pieces of text
print(f'Accuracy: {accuracy}', 'Classification Report:', report, sep='\n')

Accuracy: 0.7634146341463415
Classification Report:
              precision    recall  f1-score   support

        Fake       0.76      0.89      0.82       249
        Real       0.77      0.57      0.65       161

    accuracy                           0.76       410
   macro avg       0.77      0.73      0.74       410
weighted avg       0.76      0.76      0.75       410



## Save the model and vectorizer

In [10]:
# Persist the fitted classifier and the fitted vectorizer as two separate files
# (relative paths, so they land in the current working directory). Prediction
# needs both: text must go through this vectorizer before reaching the model.
joblib.dump(model, 'optimized_fake_news_detection_model.pkl')
joblib.dump(tfidf_vectorizer, 'optimized_tfidf_vectorizer.pkl')


['optimized_tfidf_vectorizer.pkl']

## Testing function to predict new examples

In [11]:
def test_model(examples):
    examples_tfidf = tfidf_vectorizer.transform(examples)
    predictions = model.predict(examples_tfidf)
    return predictions


## Example test over the dataset

In [12]:
# Take the first five held-out articles by position and classify them
test_examples = X_test.iloc[:5]
predicted_labels = test_model(test_examples)

## Display test results

In [13]:
# Line up each sampled article with the model's prediction and the true label
test_results = pd.DataFrame(
    {
        'Text': test_examples,
        'Predicted Label': predicted_labels,
        'Actual Label': y_test.iloc[:5].values,
    }
)

print(test_results)

                                                   Text Predicted Label  \
1808  home  be the change  government corruption  pr...            Fake   
694   podcast play in new window  download  embed \n...            Fake   
906   new leaked clinton emails came from the device...            Fake   
544   email \n\nhillary supporter robert dougherty f...            Fake   
1847  id love to see clinton spend all her money and...            Fake   

     Actual Label  
1808         Real  
694          Fake  
906          Fake  
544          Fake  
1847         Fake  


## Save the test results

In [14]:
# Write the comparison table to CSV, leaving out the index column
test_results.to_csv('test_results.csv', index=False)

# Confirmation message naming the model and vectorizer files
print("Model and vectorizer saved as 'optimized_fake_news_detection_model.pkl' and 'optimized_tfidf_vectorizer.pkl'.")


Model and vectorizer saved as 'optimized_fake_news_detection_model.pkl' and 'optimized_tfidf_vectorizer.pkl'.
