In [None]:
import re
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
import nltk
from nltk.corpus import stopwords
import string

# Download stopwords for preprocessing
nltk.download('stopwords')


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [None]:
# Fetch the UCI "smsspamcollection.zip" archive and unpack it into the working directory.
# NOTE(review): the following cell reads 'news_dataset.csv', not the contents of this
# archive — confirm whether this download is still needed.
!wget https://archive.ics.uci.edu/ml/machine-learning-databases/00228/smsspamcollection.zip
!unzip smsspamcollection.zip


In [None]:
# Read the labelled news articles into a DataFrame and preview the first rows
news_csv_path = 'news_dataset.csv'
df = pd.read_csv(news_csv_path)
df.head()


Unnamed: 0,Labels,Articles
0,fake,A longtime champion of the homeless and batter...
1,fake,"Tucked away in the Marais, two warring groups ..."
2,fake,There are plenty of things that can impede wom...
3,fake,"New York City is home to more than 2,500 tiny ..."
4,fake,A man wearing a hat emblazoned with the words ...


### **Text Preprocessing**

Before we can apply machine learning algorithms to our dataset, we need to clean the text data. Raw text often contains noise, such as punctuation, stopwords, and case differences, which can affect model performance. In this step, we'll define a function to:

- Remove punctuation.
- Convert all text to lowercase.
- Remove common English stopwords that do not contribute meaningful information.

We will apply this preprocessing function to the news articles to prepare them for feature extraction.

In [None]:
# Define a function to clean the text

# Translation table that deletes every character listed in string.punctuation
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)

# Filled on first use so the NLTK stopword corpus is read once, not once per article
_ENGLISH_STOP_WORDS = None


def _english_stop_words():
    """Return NLTK's English stopword list as a frozenset, loading it on first use."""
    global _ENGLISH_STOP_WORDS
    if _ENGLISH_STOP_WORDS is None:
        _ENGLISH_STOP_WORDS = frozenset(stopwords.words('english'))
    return _ENGLISH_STOP_WORDS


def text_preprocessing(text, stop_words=None):
    """Strip punctuation, lowercase, and drop stopwords from a piece of text.

    Parameters
    ----------
    text : str
        Raw text to clean. Any non-string value (for example the NaN pandas
        uses for an empty CSV cell) is treated as empty text.
    stop_words : iterable of str, optional
        Lowercase words to remove. When omitted, NLTK's English stopword list
        is used (loaded once and cached).

    Returns
    -------
    str
        The remaining words joined by single spaces; '' when nothing is left.

    Notes
    -----
    Only the characters in ``string.punctuation`` are removed, so typographic
    quotes such as ’ and “ stay in the output. Punctuation is stripped before
    the stopword comparison, so a stopword that itself contains punctuation
    cannot match.
    """
    if not isinstance(text, str):
        return ''

    if stop_words is None:
        stop_words = _english_stop_words()
    elif not isinstance(stop_words, (set, frozenset)):
        # A set gives constant-time membership tests in the filter below
        stop_words = frozenset(stop_words)

    # Remove punctuation
    text = text.translate(_PUNCTUATION_TABLE)

    # Convert to lowercase
    text = text.lower()

    # Remove stopwords
    return ' '.join(word for word in text.split() if word not in stop_words)

# Clean every article body and store the result in a new column next to the raw text
cleaned_articles = df['Articles'].apply(text_preprocessing)
df['Cleaned_Articles'] = cleaned_articles
df.head()


Unnamed: 0,Labels,Articles,Cleaned_Articles
0,fake,A longtime champion of the homeless and batter...,longtime champion homeless battered times unio...
1,fake,"Tucked away in the Marais, two warring groups ...",tucked away marais two warring groups compete ...
2,fake,There are plenty of things that can impede wom...,plenty things impede women’s road career succe...
3,fake,"New York City is home to more than 2,500 tiny ...",new york city home 2500 tiny churches yearroun...
4,fake,A man wearing a hat emblazoned with the words ...,man wearing hat emblazoned words “driving libe...


### **Convert Text Data to Bag of Words**

In this step, we will convert the preprocessed text data into numerical features using the **Bag of Words (BoW)** technique. For this, we will use `CountVectorizer` from the `scikit-learn` library.

- The `CountVectorizer` will tokenize the cleaned text data and convert each article into a vector of word counts.
- We will also convert the labels from categorical ('real' or 'fake') into binary values (0 for 'real' and 1 for 'fake') to prepare them for model training.


In [None]:
# Bag-of-words encoder with scikit-learn's default settings
vectorizer = CountVectorizer()

# Learn the vocabulary from the cleaned articles and build the word-count matrix
cleaned_corpus = df['Cleaned_Articles']
X = vectorizer.fit_transform(cleaned_corpus)

# Binary target: 1 when the label is 'fake', 0 for anything else
y = df['Labels'].eq('fake').astype('int64')


In [None]:
# Display the DataFrame, which by this point also carries the Cleaned_Articles column
df

In [None]:
# Keyword alternation used to score how "political" an article is.
# Written as adjacent raw-string segments (one per topic group) that Python joins
# into a single regex, so the literal never has to wrap in the middle of a phrase
# such as 'global warming'. Alternatives and their order are unchanged.
# NOTE(review): the alternatives have no word-boundary anchors, so they also match
# inside longer words (e.g. 'act' within 'fact', 'war' within 'warming'), and the
# '+' in 'LGBTQ+' acts as a regex quantifier rather than a literal plus sign.
pattern = (
    # government and law-making
    r'government|administration|parliament|congress|senate|legislature|cabinet|policy|governance|regulation|reform|executive order|law|bill|amendment|ruling|constitution|judiciary|Supreme Court|'
    # elections
    r'election|campaign|vote|ballot|voting|polling|referendum|'
    # ideologies
    r'conservative|liberal|democrat|republican|socialist|communist|progressive|nationalist|populist|libertarian|left-wing|right-wing|centrist|far-right|far-left|'
    # offices and roles
    r'president|prime minister|chancellor|senator|representative|governor|mayor|lawmaker|candidate|nominee|politician|leader|opposition|incumbent|ambassador|secretary|minister|'
    # foreign affairs
    r'foreign policy|diplomacy|trade|treaty|sanction|embargo|negotiation|alliance|international relations|United Nations|NATO|summit|global|foreign aid|bilateral|multilateral|conflict|war|peace talks|'
    # scandals and investigations
    r'corruption|scandal|impeachment|investigation|probe|inquiry|ethics|fraud|bribery|indictment|collusion|abuse of power|obstruction|cover-up|whistleblower|leak|conspiracy|allegations|'
    # legislation and domestic policy
    r'legislation|act|veto|budget|appropriations|funding|taxes|tax cut|tax reform|healthcare|education|immigration|infrastructure|environmental policy|social security|welfare|climate change|defense policy|'
    # economy
    r'economy|recession|inflation|deficit|debt|spending|stimulus|tariffs|trade agreement|jobs|employment|labor|min wage|wealth inequality|economic policy|federal reserve|central bank|interest rates|GDP|'
    # defense and security
    r'defense|military|security|national security|Pentagon|armed forces|terrorism|homeland security|intelligence|cybersecurity|counterterrorism|military operations|veterans|weapons|nuclear|missile|army|navy|air force|'
    # rights and social issues
    r'civil rights|human rights|equality|social justice|race relations|gender equality|LGBTQ+|voting rights|gun control|abortion|privacy|surveillance|law enforcement|protest|demonstration|activism|'
    # international bodies and agreements
    r'EU|WTO|IMF|World Bank|G7|G20|climate accord|peace treaty|international agreement|'
    # courts and legal process
    r'lawsuit|litigation|trial|judge|verdict|justice|attorney general|prosecutor|indictment|judicial review|civil rights case|legal battle|'
    # climate and environment
    r'climate change|global warming|carbon emissions|environmental regulation|green energy|renewable energy|pollution|EPA|Paris Agreement|conservation|deforestation|fossil fuels|'
    # polling and public opinion
    r'poll|polling|approval rating|public opinion|survey|margin of error|exit poll|opinion poll|focus group|electorate|'
    # institutions and process
    r'bureaucracy|bipartisanship|partisan|polarization|lobbyist|lobbying|caucus|convention|executive branch|judicial branch|legislative branch|federal|state|local government'
)

def keyword_count(text, pattern):
    """Count the non-overlapping, case-insensitive matches of ``pattern`` in ``text``.

    Parameters
    ----------
    text : str
        Text to search. Any non-string value (for example the NaN pandas uses
        for an empty CSV cell) counts as zero matches instead of raising.
    pattern : str
        Regular expression to search for; it is applied with ``re.IGNORECASE``.

    Returns
    -------
    int
        Number of matches found while scanning ``text`` from left to right.
    """
    if not isinstance(text, str):
        return 0
    # Count lazily instead of building the full list of matched strings
    return sum(1 for _ in re.finditer(pattern, text, flags=re.IGNORECASE))

# Score each raw article by its number of keyword hits
df['keyword_matches'] = df['Articles'].apply(keyword_count, args=(pattern,))

# Keep only the articles with at least 10 hits as the political subset
keyword_hits = df['keyword_matches']
political_articles = df[keyword_hits >= 10]

In [None]:
# Balance the two classes by truncating each one to the size of the smaller class.
# Rows are taken in their existing order (no shuffling); the per-class cap is
# derived from the data instead of a hard-coded row count.
fake_political = political_articles[political_articles['Labels'] == "fake"]
real_political = political_articles[political_articles['Labels'] == "real"]
rows_per_class = min(len(fake_political), len(real_political))
even_political_articles = pd.concat([fake_political[:rows_per_class], real_political[:rows_per_class]])

In [None]:
# Persist the balanced subset to CSV, leaving out the DataFrame index
balanced_csv_path = "balanced_political_articles.csv"
even_political_articles.to_csv(balanced_csv_path, index=False)

### **Train-Test Split**

To evaluate the performance of our machine learning model, we need to split the dataset into two parts:
- **Training set**: Used to train the model.
- **Test set**: Used to evaluate the model's performance on unseen data.

We will use the `train_test_split` function from `scikit-learn` to split the data into training and test sets, with 80% of the data for training and 20% for testing. The `random_state` parameter ensures reproducibility of the results.


In [None]:
# Refit the bag-of-words encoder on the balanced political subset only
balanced_corpus = even_political_articles['Cleaned_Articles']
X = vectorizer.fit_transform(balanced_corpus)

# Binary target: 1 when the label is 'fake', 0 for anything else
y = even_political_articles['Labels'].eq('fake').astype('int64')

In [None]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Show the dimensions of the training and test matrices
tuple(part.shape for part in (X_train, X_test))


((19136, 171001), (4784, 171001))

### **Model Training: Logistic Regression**

In this step, we will initialize and train a **Logistic Regression** classifier. Logistic Regression is a widely used algorithm for binary classification tasks, such as distinguishing between "real" and "fake" news articles.

- The model is initialized with a maximum iteration parameter (`max_iter=1000`) to ensure the algorithm has enough iterations to converge during training.
- We then fit the model to the training data (`X_train`, `y_train`) to learn the patterns and relationships in the text features.



In [None]:
# Logistic Regression with a raised iteration cap; raise it further if the solver
# reports that it did not converge
lr = LogisticRegression(max_iter=1000)

# Fit the model on the training split
lr.fit(X=X_train, y=y_train)


### **Model Evaluation**

After training the Logistic Regression model, we will now evaluate its performance on the test set.

- **Predictions**: Using the trained model, we make predictions (`y_pred`) on the test set (`X_test`).
- **Accuracy**: We calculate the accuracy of the model, which is the percentage of correct predictions out of the total number of predictions.
- **Classification Report**: We generate a detailed classification report, which includes precision, recall, f1-score, and support for both classes (real and fake).

The evaluation results will help us understand how well the model generalizes to unseen data.


In [None]:
# Score the held-out split with the fitted model
y_pred = lr.predict(X_test)

# Overall accuracy plus the per-class precision / recall / F1 table
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
report = classification_report(y_true=y_test, y_pred=y_pred)

# Show the accuracy as a percentage, then the report under its heading
print(f'Accuracy: {accuracy * 100:.2f}%')
print('Classification Report:', report, sep='\n')


Accuracy: 96.99%
Classification Report:
              precision    recall  f1-score   support

           0       0.96      0.98      0.97      2367
           1       0.98      0.96      0.97      2417

    accuracy                           0.97      4784
   macro avg       0.97      0.97      0.97      4784
weighted avg       0.97      0.97      0.97      4784



### **Model Performance**

The Logistic Regression model achieved the following performance on the test set:

- **Accuracy**: 96.99%

#### **Classification Report**:

| Class | Precision | Recall | F1-Score | Support |
|-------|-----------|--------|----------|---------|
| Real (0) | 0.96 | 0.98 | 0.97 | 2367 |
| Fake (1) | 0.98 | 0.96 | 0.97 | 2417 |

- **Macro Avg**:
  - Precision: 0.97
  - Recall: 0.97
  - F1-Score: 0.97
- **Weighted Avg**:
  - Precision: 0.97
  - Recall: 0.97
  - F1-Score: 0.97

### **Conclusion**

The Logistic Regression model performed exceptionally well with an accuracy of 96.99%. The high precision and recall for the "real" class demonstrate that the model is very effective at correctly identifying real news articles. For the "fake" class, the model still shows strong performance, as reflected by the F1-score of 0.97. Overall, this model is highly reliable for fake-news detection in political news articles.


**Support Vector Machine Algorithm**

In [1]:
from statistics import mean

import matplotlib.pyplot as plt
import pandas as pd
from sklearn.datasets import make_classification
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC

In [None]:
# Read the political-news CSV into a DataFrame and preview the first rows
political_news_path = "politicalnews.csv"
political_news = pd.read_csv(political_news_path)
political_news.head()

In [None]:
# Balance the two classes by truncating each one to the size of the smaller class.
# Rows are taken in their existing order (no shuffling); the per-class cap is
# derived from the data instead of a hard-coded row count.
fake_news = political_news[political_news['Labels'] == "fake"]
real_news = political_news[political_news['Labels'] == "real"]
rows_per_class = min(len(fake_news), len(real_news))
even_political_articles = pd.concat([fake_news[:rows_per_class], real_news[:rows_per_class]])

In [None]:
# Display the combined fake/real subset assembled in the previous cell
even_political_articles

In [None]:
# Build a fresh bag-of-words encoder and fit it on the cleaned article text
vectorizer = CountVectorizer()
svm_corpus = even_political_articles['Cleaned_Articles']
X = vectorizer.fit_transform(svm_corpus)

# Binary target: 1 when the label is 'fake', 0 for anything else
y = even_political_articles['Labels'].eq('fake').astype('int64')

In [None]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Show the dimensions of the training and test matrices
tuple(part.shape for part in (X_train, X_test))


In [None]:
# Support Vector Classifier using the linear kernel, seeded with random_state=42
svm_model = SVC(kernel='linear', random_state=42)

# Fit the classifier on the training split
svm_model.fit(X=X_train, y=y_train)

In [None]:
# Classify the held-out articles with the fitted SVM
y_pred = svm_model.predict(X_test)

# Report the share of correct predictions as a percentage
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f"Accuracy: {accuracy * 100:.2f}%")

In [None]:
# Support Vector Classifier using the polynomial ('poly') kernel, seeded with random_state=42
svm_model = SVC(kernel='poly', random_state=42)

# Fit the classifier on the training split
svm_model.fit(X=X_train, y=y_train)

In [None]:
# Classify the held-out articles with the most recently fitted SVM
y_pred = svm_model.predict(X_test)

# Report the share of correct predictions as a percentage
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f"Accuracy: {accuracy * 100:.2f}%")

In [None]:
# Cross Validation
# Compare three SVM kernels and Logistic Regression with cross_val_score on the
# full feature matrix X and label vector y.

svm_model_linear = SVC(kernel='linear', random_state=42)
svm_model_poly = SVC(kernel='poly', random_state=42)
svm_model_rbf = SVC(kernel='rbf', random_state=42)
# LogisticRegression is provided by sklearn.linear_model via this section's import cell
linear_model = LogisticRegression(max_iter=1000)

# scoring is spelled out because the summary line below reports accuracy
svm_linear_scores = cross_val_score(svm_model_linear, X, y, scoring='accuracy')
svm_poly_scores = cross_val_score(svm_model_poly, X, y, scoring='accuracy')
svm_rbf_scores = cross_val_score(svm_model_rbf, X, y, scoring='accuracy')
linear_scores = cross_val_score(linear_model, X, y, scoring='accuracy')

# Printed order: linear SVM, polynomial SVM, RBF SVM, Logistic Regression
print(f"Mean accuracy for all models: {mean(svm_linear_scores)}, {mean(svm_poly_scores)}, {mean(svm_rbf_scores)}, {mean(linear_scores)}.")