# Fake news detection using Support Vector Machine (SVM) 
This model checks whether a news article is fake or real. In this case, LinearSVC **(Linear Support Vector Classifier)** is used

In [1]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.svm import LinearSVC
import pandas as pd
import numpy as np
import random

In [2]:
dataset = pd.read_csv('dataset/news-articles.csv')

In [3]:
dataset.head(n=10)

Unnamed: 0,id,title,author,text,label
0,0,House Dem Aide: We Didn’t Even See Comey’s Let...,Darrell Lucus,House Dem Aide: We Didn’t Even See Comey’s Let...,1
1,1,"FLYNN: Hillary Clinton, Big Woman on Campus - ...",Daniel J. Flynn,Ever get the feeling your life circles the rou...,0
2,2,Why the Truth Might Get You Fired,Consortiumnews.com,"Why the Truth Might Get You Fired October 29, ...",1
3,3,15 Civilians Killed In Single US Airstrike Hav...,Jessica Purkiss,Videos 15 Civilians Killed In Single US Airstr...,1
4,4,Iranian woman jailed for fictional unpublished...,Howard Portnoy,Print \nAn Iranian woman has been sentenced to...,1
5,5,Jackie Mason: Hollywood Would Love Trump if He...,Daniel Nussbaum,"In these trying times, Jackie Mason is the Voi...",0
6,6,Life: Life Of Luxury: Elton John’s 6 Favorite ...,,Ever wonder how Britain’s most iconic pop pian...,1
7,7,Benoît Hamon Wins French Socialist Party’s Pre...,Alissa J. Rubin,"PARIS — France chose an idealistic, traditi...",0
8,8,Excerpts From a Draft Script for Donald Trump’...,,Donald J. Trump is scheduled to make a highly ...,0
9,9,"A Back-Channel Plan for Ukraine and Russia, Co...",Megan Twohey and Scott Shane,A week before Michael T. Flynn resigned as nat...,0


### Dataset features and target

In [4]:
dataset.shape

(20800, 5)

### Check for missing values

In [5]:
dataset.drop(columns='id').isnull().sum()

title      558
author    1957
text        39
label        0
dtype: int64

### Cleaning data
Replacing null values with an empty string

In [6]:
dataset = dataset.fillna('')

In [7]:
# Check for any missing values after data cleaning

dataset.drop(columns='id').isnull().sum()

title     0
author    0
text      0
label     0
dtype: int64

## Separate feature names and target

In [8]:
X = dataset["text"]
y = dataset["label"]

In [9]:
print(X)

0        House Dem Aide: We Didn’t Even See Comey’s Let...
1        Ever get the feeling your life circles the rou...
2        Why the Truth Might Get You Fired October 29, ...
3        Videos 15 Civilians Killed In Single US Airstr...
4        Print \nAn Iranian woman has been sentenced to...
                               ...                        
20795    Rapper T. I. unloaded on black celebrities who...
20796    When the Green Bay Packers lost to the Washing...
20797    The Macy’s of today grew from the union of sev...
20798    NATO, Russia To Hold Parallel Exercises In Bal...
20799      David Swanson is an author, activist, journa...
Name: text, Length: 20800, dtype: object


In [10]:
print(y)

0        1
1        0
2        1
3        1
4        1
        ..
20795    0
20796    0
20797    0
20798    1
20799    1
Name: label, Length: 20800, dtype: int64


## Split data into training data and testing data

In [11]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=2)

### Check how much features are used for training and testing

In [12]:
print(X.shape, X_train.shape, X_test.shape)

(20800,) (16640,) (4160,)


### **NOTE**

**16640** is used to train the model (training data)
**4160** is used to test the model (testing data)

### Vectorize training data and testing data using **TfidfVectorizer**

In [13]:
vectorizer = TfidfVectorizer(stop_words="english")

X_train_vectorized = vectorizer.fit_transform(X_train)
X_test_vectorized = vectorizer.transform(X_test)

### creating an instance of LinearSVC 

In [14]:
model = LinearSVC(dual='auto')

In [15]:
model.fit(X_train_vectorized, y_train)

### Accuracy score for the model

In [16]:
# 1. Accuracy score (training data)
X_train_prediction = model.predict(X_train_vectorized)
training_data_accuracy = accuracy_score(X_train_prediction, y_train)

In [17]:
print(f'Accuracy score (training data): {training_data_accuracy}')

Accuracy score (training data): 0.9993990384615384


### The code block below can also be used to measure accuracy score

In [18]:
model.score(X_train_vectorized, y_train)

0.9993990384615384

In [19]:
# 2. Accuracy score (testing data)
model.score(X_test_vectorized, y_test)

0.9574519230769231

### Check how many articles used as training data were classified correctly.

In [20]:
# training data

acc_score_ = model.score(X_train_vectorized, y_train)  # accuracy score
classified_articles = round(X_train.shape[0] * acc_score_)
total_articles = X_train.shape[0]  # return the first item in the tuple

print(f'Articles: {classified_articles} articles out of {total_articles} articles')  # articles classified correctly
print()
print(f'Articles classified incorrectly: {total_articles - classified_articles} articles.')

Articles: 16630 articles out of 16640 articles

Articles classified incorrectly: 10 articles.


### Check how many articles used as testing data were classified correctly.

In [21]:
# testing data


acc_score_ = model.score(X_test_vectorized, y_test)  # accuracy score
classified_articles = round(X_test.shape[0] * acc_score_)
total_articles = X_test.shape[0]  # return the first item in the tuple

print(f'Articles: {classified_articles} articles out of {total_articles} articles')  # articles classified correctly
print()
print(f'Articles classified incorrectly: {total_articles - classified_articles} articles.')

Articles: 3983 articles out of 4160 articles

Articles classified incorrectly: 177 articles.


## Build a predictive system

In [42]:
random_article = random.randrange(0, X.shape[0])

print(f'Random article: {random_article} | {X.shape[0]}')

# pick a random news article and save it in a text file
with open('test-articles.txt', 'w', encoding='utf-8') as file:
    file.write(X_test.iloc[random_article])

with open('test-articles.txt', 'r', encoding='utf-8') as file:
    news_article = file.read()

prediction = vectorizer.transform([news_article])

result = model.predict(prediction)

print(result)

if result[0] == 0:
    print("Authentic news article.")
else:
    print("FAKE news article!!!")

Random article: 993 | 20800
[0]
Authentic news article.


In [43]:
X_test.iloc[random_article]

'You must lose weight, a doctor told Sarah Bramblette, advising a     diet. But Ms. Bramblette had a basic question: How much do I weigh? The doctor’s scale went up to 350 pounds, and she was heavier than that. If she did not know the number, how would she know if the diet was working? The doctor had no answer. So Ms. Bramblette, 39, who lived in Ohio at the time, resorted to a solution that made her burn with shame. She drove to a nearby junkyard that had a scale that could weigh her. She was 502 pounds. One in three Americans is obese, a rate that has been steadily growing for more than two decades, but the health care system  —   in its attitudes, equipment and common practices  —   is ill prepared, and its practitioners are often unwilling, to treat the rising population of fat patients. The difficulties range from scales and scanners, like M. R. I. machines that are not built big enough for very heavy people, to surgeons who categorically refuse to give knee or hip replacements to

In [None]:
y_test.iloc[random_article]