## Setting up all the datasets from Kaggle

In [1]:
# Mount Google Drive at /content/gdrive; the later cells read and write under
# this path.
from google.colab import drive
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [2]:
import os
# Tell the Kaggle CLI which directory to use for its configuration.
# NOTE(review): presumably this Drive folder holds the kaggle.json API token —
# confirm before running on a fresh account.
os.environ["KAGGLE_CONFIG_DIR"] = '/content/gdrive/MyDrive/Kaggle'

In [3]:
%cd /content/gdrive/MyDrive/Kaggle

/content/gdrive/MyDrive/Kaggle


In [4]:
!kaggle competitions download -c fake-news

fake-news.zip: Skipping, found more recently modified local copy (use --force to force download)


In [5]:
!unzip fake-news.zip -d fake-news

Archive:  fake-news.zip
replace fake-news/submit.csv? [y]es, [n]o, [A]ll, [N]one, [r]ename: 

In [6]:
%cd fake-news

/content/gdrive/MyDrive/Kaggle/fake-news


## Importing libraries

In [7]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

## Importing the datasets

In [8]:
train = pd.read_csv('train.csv')

## Getting some information about the train dataset

In [9]:
train.shape

(20800, 5)

In [10]:
train.head()

Unnamed: 0,id,title,author,text,label
0,0,House Dem Aide: We Didn’t Even See Comey’s Let...,Darrell Lucus,House Dem Aide: We Didn’t Even See Comey’s Let...,1
1,1,"FLYNN: Hillary Clinton, Big Woman on Campus - ...",Daniel J. Flynn,Ever get the feeling your life circles the rou...,0
2,2,Why the Truth Might Get You Fired,Consortiumnews.com,"Why the Truth Might Get You Fired October 29, ...",1
3,3,15 Civilians Killed In Single US Airstrike Hav...,Jessica Purkiss,Videos 15 Civilians Killed In Single US Airstr...,1
4,4,Iranian woman jailed for fictional unpublished...,Howard Portnoy,Print \nAn Iranian woman has been sentenced to...,1


## The type of each column in the train dataset

In [11]:
train.dtypes

id         int64
title     object
author    object
text      object
label      int64
dtype: object

## The number of missing values in each column of the train dataset

In [12]:
train.isna().sum()

id           0
title      558
author    1957
text        39
label        0
dtype: int64

## Handling missing values in the train dataset

In [13]:
# Keep only the rows that have a value in every column
train = train.dropna(axis='index')

In [14]:
train.isna().sum()

id        0
title     0
author    0
text      0
label     0
dtype: int64

In [15]:
train.shape

(18285, 5)

## The number of words in each textual column of the train dataset

In [16]:
# Count whitespace-separated words in every cell of the three text columns.
# Series.map is applied column by column because DataFrame.applymap is
# deprecated in recent pandas releases; this form works on old and new versions.
word_count = train[['title', 'author', 'text']].apply(
  lambda column: column.map(lambda value: len(str(value).split()))
)

In [17]:
word_count

Unnamed: 0,title,author,text
0,14,2,820
1,9,3,710
2,7,1,1266
3,10,2,557
4,14,2,154
...,...,...,...
20795,9,2,307
20796,11,2,1120
20797,15,8,801
20798,8,2,267


## Dropping unused columns from the train dataset

In [18]:
# Rebind rather than mutate in place, and tolerate columns that are already
# gone, so re-running this cell does not raise KeyError for 'id' / 'text'.
train = train.drop(columns=['id', 'text'], errors='ignore')

In [19]:
train.shape

(18285, 3)

## Cleaning textual columns of the train dataset

In [20]:
import re
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

# Lazily filled by _cleaning_resources(); holds the stop-word set and stemmer
# so they are built once instead of once per word / per call.
_CLEANING_CACHE = {}


def _cleaning_resources():
  """Return the cached (stop-word frozenset, stemmer) pair, building it on first use."""
  if 'pair' not in _CLEANING_CACHE:
    _CLEANING_CACHE['pair'] = (
      frozenset(stopwords.words('english')),
      PorterStemmer(),
    )
  return _CLEANING_CACHE['pair']


def text_cleaning(text):
  """Normalise a string before vectorisation.

  Every character that is not an ASCII letter is replaced by a space, the
  text is lower-cased and split on whitespace, English stop words are
  removed and the remaining words are stemmed.

  Parameters
  ----------
  text : str
    Raw text to clean.

  Returns
  -------
  str
    The stemmed, stop-word-free words joined by single spaces (empty string
    when nothing is left).
  """
  stop_words, stemmer = _cleaning_resources()
  words = re.sub(r'[^a-zA-Z]', ' ', text).lower().split()
  # Set membership is O(1); the original re-read the stop-word list for every word.
  return ' '.join(stemmer.stem(word) for word in words if word not in stop_words)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [21]:
# Clean both text columns. Series.map is used per column because
# DataFrame.applymap is deprecated in recent pandas releases, and the function
# is passed directly instead of through a redundant lambda.
train[['title', 'author']] = train[['title', 'author']].apply(
  lambda column: column.map(text_cleaning)
)

In [22]:
train.head()

Unnamed: 0,title,author,label
0,hous dem aid even see comey letter jason chaff...,darrel lucu,1
1,flynn hillari clinton big woman campu breitbart,daniel j flynn,0
2,truth might get fire,consortiumnew com,1
3,civilian kill singl us airstrik identifi,jessica purkiss,1
4,iranian woman jail fiction unpublish stori wom...,howard portnoy,1


## The independent and the dependent variable

In [23]:
# Target: the label column. Features: cleaned title and author joined by a
# single space, one string per row.
y = train['label'].to_numpy()
X = (train['title'] + ' ' + train['author']).to_numpy()

In [24]:
# One shape per line: features first, then labels.
print(X.shape, y.shape, sep='\n')

(18285,)
(18285,)


## Converting the textual data to numerical data

In [25]:
from sklearn.feature_extraction.text import TfidfVectorizer
# Turn each cleaned string into a row of TF-IDF weights; X is rebound from the
# array of strings to the resulting matrix.
# NOTE(review): the vectorizer is fitted on every row before the train/test
# split performed later in the notebook, so the held-out rows influence the
# vocabulary and IDF weights — consider fitting on the training portion only.
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(X)

In [26]:
print(X)

  (0, 8310)	0.3609049070394367
  (0, 3359)	0.3609049070394367
  (0, 14626)	0.2853880981846006
  (0, 2312)	0.3745612250433202
  (0, 7190)	0.24556189342497173
  (0, 8048)	0.29347549279156676
  (0, 2757)	0.2466340295002162
  (0, 12567)	0.25566372256502734
  (0, 4637)	0.23016077319140021
  (0, 247)	0.26982554594264346
  (0, 3543)	0.2684494960336511
  (0, 6552)	0.21745594418933306
  (1, 3328)	0.2623789770430963
  (1, 1764)	0.15099851642776987
  (1, 2066)	0.3819189043603919
  (1, 15663)	0.30530279633389806
  (1, 1391)	0.2961798071396214
  (1, 2619)	0.1936832753563371
  (1, 6377)	0.19285723710368194
  (1, 5140)	0.7119376870709987
  (2, 2743)	0.31437590974242646
  (2, 2895)	0.4581003415623782
  (2, 5031)	0.38709995799949964
  (2, 5579)	0.3490632212946542
  (2, 8973)	0.4948460479407663
  :	:
  (18282, 1194)	0.33325787344622426
  (18282, 12239)	0.2527439079680461
  (18282, 11515)	0.2748252773264482
  (18282, 13966)	0.30927367322621613
  (18282, 11321)	0.24588400571511218
  (18282, 8879)	0.292964

## Splitting the dataset into the training set and the test set

In [27]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; stratify keeps the label proportions
# equal in both parts and the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
  X, y,
  test_size=0.2,
  random_state=2,
  stratify=y,
)

In [28]:
# One shape per line: train/test features, then train/test labels.
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape, sep='\n')

(14628, 15960)
(3657, 15960)
(14628,)
(3657,)


## Training the logistic regression model on the training set

In [29]:
from sklearn.linear_model import LogisticRegression
# Fit a logistic regression with the library's default settings on the
# training rows produced by the split above.
classifier = LogisticRegression()
classifier.fit(X_train, y_train)

## Evaluating the model on the training set

In [33]:
from sklearn.metrics import accuracy_score
# Accuracy on the rows the model was trained on.
# accuracy_score's signature is (y_true, y_pred): ground truth goes first.
y_pred_train = classifier.predict(X_train)
score_train = accuracy_score(y_train, y_pred_train)
print(score_train)

0.9901558654634947


## Evaluating the model on the test set

In [34]:
# Accuracy on the held-out rows.
# accuracy_score's signature is (y_true, y_pred): ground truth goes first.
y_pred_test = classifier.predict(X_test)
score_test = accuracy_score(y_test, y_pred_test)
print(score_test)

0.9827727645611156


## Making a predictive system for one example

In [35]:
# Classify a single held-out row. predict() returns a one-element array here,
# so read the scalar explicitly instead of relying on array truthiness.
X_new = X_test[0]
y_pred = classifier.predict(X_new)
if y_pred[0] == 0:
  print("The news is real!")
else:
  print("The news is fake!")

The news is fake!
