# Fake And Real News Detection Data Preprocessing

In [2]:
# Importing the required libraries
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
import re
import pickle


In [3]:
# importing the dataset
data = pd.read_csv('../data/fake_and_real_data.csv')
data.head()

Unnamed: 0,Text,label
0,Top Trump Surrogate BRUTALLY Stabs Him In The...,Fake
1,U.S. conservative leader optimistic of common ...,Real
2,"Trump proposes U.S. tax overhaul, stirs concer...",Real
3,Court Forces Ohio To Allow Millions Of Illega...,Fake
4,Democrats say Trump agrees to work on immigrat...,Real


In [4]:
data.shape
data.describe()
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9900 entries, 0 to 9899
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Text    9900 non-null   object
 1   label   9900 non-null   object
dtypes: object(2)
memory usage: 154.8+ KB


In [5]:
# Checking for missing values

data.isnull().sum()



Text     0
label    0
dtype: int64

In [6]:
# Checking for duplicates

data.duplicated().sum()

35

In [7]:
# handle duplicated values
data.drop_duplicates(inplace=True)


data['is_fake'] = 0


data.head

<bound method NDFrame.head of                                                    Text label  is_fake
0      Top Trump Surrogate BRUTALLY Stabs Him In The...  Fake        0
1     U.S. conservative leader optimistic of common ...  Real        0
2     Trump proposes U.S. tax overhaul, stirs concer...  Real        0
3      Court Forces Ohio To Allow Millions Of Illega...  Fake        0
4     Democrats say Trump agrees to work on immigrat...  Real        0
...                                                 ...   ...      ...
9895   Wikileaks Admits To Screwing Up IMMENSELY Wit...  Fake        0
9896  Trump consults Republican senators on Fed chie...  Real        0
9897  Trump lawyers say judge lacks jurisdiction for...  Real        0
9898   WATCH: Right-Wing Pastor Falsely Credits Trum...  Fake        0
9899   Sean Spicer HILARIOUSLY Branded As Chickensh*...  Fake        0

[9865 rows x 3 columns]>

In [8]:
# Preprocessing the data
data['is_fake'] = (data['label'] == 'Fake').astype(int)


data.drop(['label'], axis=1, inplace=True)

In [9]:
# Dropping the unnecessary columns
data.head()
# converting the text to lowercase
def preprocess(text):
    text = re.sub(r'[^\w\s\']', ' ', text)
    text = re.sub(r' +', ' ', text)
    return text.strip().lower()

data['Text'] = data['Text'].apply(preprocess)
data



Unnamed: 0,Text,is_fake
0,top trump surrogate brutally stabs him in the ...,1
1,u s conservative leader optimistic of common g...,0
2,trump proposes u s tax overhaul stirs concerns...,0
3,court forces ohio to allow millions of illegal...,1
4,democrats say trump agrees to work on immigrat...,0
...,...,...
9895,wikileaks admits to screwing up immensely with...,1
9896,trump consults republican senators on fed chie...,0
9897,trump lawyers say judge lacks jurisdiction for...,0
9898,watch right wing pastor falsely credits trump ...,1


In [10]:
# Splitting Data to train and test

x_train, x_test, y_train, y_test = train_test_split(data['Text'], data['is_fake'], test_size=0.3, random_state=0)


x_train.head()


1247    russian lawmaker says u s sanctions hits joint...
9865    turkey summons u s consulate worker for questi...
2718    trump wants 4 billion more for missile defense...
7026    white house spokesman does not rule out trump ...
2853    trump says concerns about iran driving israel ...
Name: Text, dtype: object

In [11]:
# Saving data

x_train_path = './x_train.pkl'
x_test_path = './x_test.pkl'
y_train_path = './y_train.pkl'
y_test_path = './y_test.pkl'

with open(x_train_path, 'wb') as file:
    pickle.dump(x_train, file)

with open(x_test_path, 'wb') as file:
    pickle.dump(x_test, file)

with open(y_train_path, 'wb') as file:
    pickle.dump(y_train, file)

with open(y_test_path, 'wb') as file:
    pickle.dump(y_test, file)

In [5]:
# it needs to be becuase vectorizer works only per fitting
from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer()
pickle.dump(vectorizer, open("./vectorizer.pickle", "wb")) #Save vectorizer
