Install required packages

In [None]:
!pip install scikit-learn
!pip install pandas

Imports

In [2]:
import pandas as pd

# Data processing
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

# Model analysis
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report

# Classifiers
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression

## Data processing

In [3]:
# Read in the CSVs
real = pd.read_csv('True.csv')
fake = pd.read_csv('Fake.csv')

In [4]:
# Quick look
real.head(6)

Unnamed: 0,title,text,subject,date
0,"As U.S. budget fight looms, Republicans flip t...",WASHINGTON (Reuters) - The head of a conservat...,politicsNews,"December 31, 2017"
1,U.S. military to accept transgender recruits o...,WASHINGTON (Reuters) - Transgender people will...,politicsNews,"December 29, 2017"
2,Senior U.S. Republican senator: 'Let Mr. Muell...,WASHINGTON (Reuters) - The special counsel inv...,politicsNews,"December 31, 2017"
3,FBI Russia probe helped by Australian diplomat...,WASHINGTON (Reuters) - Trump campaign adviser ...,politicsNews,"December 30, 2017"
4,Trump wants Postal Service to charge 'much mor...,SEATTLE/WASHINGTON (Reuters) - President Donal...,politicsNews,"December 29, 2017"
5,"White House, Congress prepare for talks on spe...","WEST PALM BEACH, Fla./WASHINGTON (Reuters) - T...",politicsNews,"December 29, 2017"


In [5]:
# Assign categorisation
real['real'] = True
fake['real'] = False

In [6]:
# Concat datasets
data = pd.concat([real, fake], axis=0).reset_index(drop=True)

In [7]:
data.tail(5)

Unnamed: 0,title,text,subject,date,real
44893,McPain: John McCain Furious That Iran Treated ...,21st Century Wire says As 21WIRE reported earl...,Middle-east,"January 16, 2016",False
44894,JUSTICE? Yahoo Settles E-mail Privacy Class-ac...,21st Century Wire says It s a familiar theme. ...,Middle-east,"January 16, 2016",False
44895,Sunnistan: US and Allied ‘Safe Zone’ Plan to T...,Patrick Henningsen 21st Century WireRemember ...,Middle-east,"January 15, 2016",False
44896,How to Blow $700 Million: Al Jazeera America F...,21st Century Wire says Al Jazeera America will...,Middle-east,"January 14, 2016",False
44897,10 U.S. Navy Sailors Held by Iranian Military ...,21st Century Wire says As 21WIRE predicted in ...,Middle-east,"January 12, 2016",False


In [8]:
# Shuffle
data = data.sample(frac=1)

## Modelling

In [10]:

x = data['title'].to_list()
y = data['real'].to_list()

In [11]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.1)

In [12]:
vectorization = TfidfVectorizer()
xv_train = vectorization.fit_transform(x_train)
xv_test = vectorization.transform(x_test)

###### Decision tree classifier

In [13]:
DT = DecisionTreeClassifier()
DT.fit(xv_train, y_train)

In [57]:
pred_dt = DT.predict(xv_test)

In [14]:
DT.score(xv_test, y_test)

0.9229398663697105

###### Gradient boost classifier

In [15]:
GB = GradientBoostingClassifier(random_state = 0)
GB.fit(xv_train, y_train)

In [62]:
pred_gb = GB.predict(xv_test)

In [16]:
GB.score(xv_test, y_test)

0.8879732739420936

###### Random forest classifier

In [17]:
RF = RandomForestClassifier(random_state = 0)
RF.fit(xv_train, y_train)

In [46]:
pred_rf = RF.predict(xv_test)

In [18]:
RF.score(xv_test, y_test)

0.9543429844097996

###### Logistic Regression Classifier

In [19]:
LR = LogisticRegression()
LR.fit(xv_train, y_train)

In [None]:
pred_lr = LR.predict(xv_test)

In [20]:
LR.score(xv_test, y_test)

0.9523385300668151

In [30]:
title = "Hillary Clinton suggests jailing Americans for posting 'misinformation'"
inpt = vectorization.transform([title])
RF.predict(inpt)

np.False_

In [29]:
title = "Starmer says US 'backstop' needed for Ukraine peace deal"
inpt = vectorization.transform([title])
RF.predict(inpt)

array([ True])