Importing dependencies

In [42]:
import numpy as np #for conversion to array
import pandas as pd #for dataframes
import re #for regular expression as we are converting text to numbers
import nltk #for natural language processing
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [24]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [25]:
print(stopwords.words('english'))

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

these stopwords dont actually contribute anything to our data thats why we use these to remove these words from the dataset

Data preprocessing

In [26]:
#loading the dataset
news_data = pd.read_csv('/content/train.csv')
#printing the first five rows
news_data.head()
#the shape of dataset in form(rows,colums)
news_data.shape

(20800, 5)

0-> real news

1-> fake news

In [27]:
news_data['label'].value_counts()

#checking for missing values
news_data.isnull().sum()

#in case of numerical dataset we can replace the value with mean value
#but since this is a text dataset so we will fill with empty string
news_data = news_data.fillna('')

we will now merge the author name and news title

In [28]:
#lets merge the author and news title
news_data['content'] = news_data['author']+' '+news_data['title']
news_data.shape

(20800, 6)

Seperating features and target

In [29]:
X = news_data.drop(columns='label',axis=1)
Y = news_data['label']
print(X)
print(Y)
#

          id                                              title  \
0          0  House Dem Aide: We Didn’t Even See Comey’s Let...   
1          1  FLYNN: Hillary Clinton, Big Woman on Campus - ...   
2          2                  Why the Truth Might Get You Fired   
3          3  15 Civilians Killed In Single US Airstrike Hav...   
4          4  Iranian woman jailed for fictional unpublished...   
...      ...                                                ...   
20795  20795  Rapper T.I.: Trump a ’Poster Child For White S...   
20796  20796  N.F.L. Playoffs: Schedule, Matchups and Odds -...   
20797  20797  Macy’s Is Said to Receive Takeover Approach by...   
20798  20798  NATO, Russia To Hold Parallel Exercises In Bal...   
20799  20799                          What Keeps the F-35 Alive   

                                          author  \
0                                  Darrell Lucus   
1                                Daniel J. Flynn   
2                             Consortiu

here we dont have to do anything to Y as it only has numerical values

but in case of x we have to process the text data

we will do it by Stemming:

Stemming is the process of reducing a word to its keyword or rootword

we will be using porter stemmer


In [30]:
#loading the stemmer function
port_stem = PorterStemmer()
port_stem.stem('enjoying') #example

'enjoy'

for the stemming function

first remove all punctuation

then convert all words to lowercase

then split all words

all the words which arenot in stopwords then they go through stemming

then join them




In [31]:
#making a stemming function
def stemming(content):
  stemmed_content = re.sub('[^a-zA-z]',' ',content)
  stemmed_content = stemmed_content.lower()
  stemmed_content = stemmed_content.split()
  stemmed_content = [port_stem.stem(word) for word in stemmed_content if not word in stopwords.words('english')]
  stemmed_content = ' '.join(stemmed_content)
  return stemmed_content

In [33]:
news_data['content'] = news_data['content'].apply(stemming)

In [34]:
print(news_data['content'])

0        darrel lucu hou dem aid even see comey letter ...
1        daniel j flynn flynn hillari clinton big woman...
2                   consortiumnew com truth might get fire
3        jessica purkiss civilian kill singl us airstri...
4        howard portnoy iranian woman jail fiction unpu...
                               ...                        
20795    jerom hudson rapper trump poster child white s...
20796    benjamin hoffman n f l playoff schedul matchup...
20797    michael j de la merc rachel abram maci said re...
20798    alex ansari nato russia hold parallel exerci b...
20799                            david swanson keep f aliv
Name: content, Length: 20800, dtype: object


In [37]:
X = news_data['content'].values
Y = news_data['label'].values
# print(X)
# print(Y)

print(type(X))
print(type(Y))

print(X.shape)
print(Y.shape)

<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
(20800,)
(20800,)


converting the textual data to feature vectors

load TfidVectorizer

term frequeny and Inverse docuemnt frequency

fit the vectorizer with X

then transform X

In [38]:
vectorizer = TfidfVectorizer()
vectorizer.fit(X)
X = vectorizer.transform(X)
print(X)

  (0, 15566)	0.28485063562728646
  (0, 13357)	0.2565896679337957
  (0, 8829)	0.3635963806326075
  (0, 8553)	0.29212514087043684
  (0, 7619)	0.24785219520671603
  (0, 6938)	0.21874169089359144
  (0, 4924)	0.233316966909351
  (0, 3762)	0.2705332480845492
  (0, 3570)	0.3598939188262559
  (0, 2936)	0.2468450128533713
  (0, 2466)	0.3676519686797209
  (0, 264)	0.27010124977708766
  (1, 16671)	0.3025156488372128
  (1, 6752)	0.19152496072048605
  (1, 5448)	0.7186013955384664
  (1, 3538)	0.2653147533915268
  (1, 2793)	0.19208753385709676
  (1, 2207)	0.36915639258038363
  (1, 1878)	0.15614790568229528
  (1, 1482)	0.2957471154505952
  (2, 15491)	0.41544962664721613
  (2, 9538)	0.49351492943649944
  (2, 5911)	0.3474613386728292
  (2, 5334)	0.3866530551182615
  (2, 3075)	0.46097489583229645
  :	:
  (20797, 13006)	0.2483705036831893
  (20797, 12235)	0.27276402145717243
  (20797, 12030)	0.24790022252744132
  (20797, 10222)	0.0804189541935242
  (20797, 9506)	0.17463635692029988
  (20797, 9437)	0.29394

Splitting our data into training and test dataset

Stratify :- When you use stratify=Y, the split is performed in such a way that each subset (training and test sets) has approximately the same distribution of classes as the original dataset. This means that if your original dataset has a certain percentage of each class, the training and test sets will maintain that same percentage.


When you don't specify the stratify parameter, or explicitly set it to None, train_test_split will randomly sample the data to create the training and test sets without considering the class distribution.
This can lead to imbalanced classes in the training or test set if the class distribution in the dataset is skewed. For instance, if one class is underrepresented, you might end up with very few samples of that class in one of the splits, which can impact model training and evaluation.


The stratify parameter is specifically designed to work with the target variable Y and not with the features X. If you pass X to the stratify parameter, it will cause an error because stratify expects a 1D array-like structure that represents class labels, not a 2D feature matrix.

In [40]:
X_train,X_test,Y_train,Y_test = train_test_split(X,Y,test_size=0.2,stratify=Y,random_state=2)

Training the model: Logistic Regression

In logistic regression, a graph is plotted which uses a sigmoid function which is
Y = 1/(1+e^-z)

 z = w.x+b

 X - input features

Y - Prediction Probability

w - weights -> how important will that particular column be

b - biases
which gives us the Y-value if this value is greater than the threshold value of 0.5
then the label is 1 which in our case is that the news is false and if the threshold value is less than 0.5
then the label is 0 which in our case is that the news is real


1st -> take model from LR

2nd -> fit the training models in LR

3rd-> find accuracy score

In [43]:
model = LogisticRegression()
model.fit(X_train,Y_train)

In [44]:
#finding accuracy score for X_train
X_train_prediction = model.predict(X_train)
training_data_accuracy = accuracy_score(X_train_prediction,Y_train)
# This line calculates the accuracy score of the model's predictions on the training data.
# Details: accuracy_score is a function from the sklearn.metrics module that compares the predicted labels (X_train_prediction)
#  to the actual labels (Y_train). It computes the ratio of the number of correct predictions to the total number of predictions,
#  giving a measure of how well the model performed on the training set. This ratio is stored in training_data_accuracy.
print('Accuracy score of the training data : ', training_data_accuracy)

Accuracy score of the training data :  0.9865384615384616


In [46]:
#finding accuracy score for X_test
X_test_prediction = model.predict(X_test)
test_data_accuracy = accuracy_score(X_test_prediction,Y_test)
print('Accuracy score of the test data : ', test_data_accuracy)

Accuracy score of the test data :  0.9790865384615385


lets make and test our predictive system

In [50]:
X_new = X_test[1]


prediction = model.predict(X_new)
print(prediction)

if (prediction[0]==0):
  print('The news is Real')
else:
  print('The news is Fake')

[0]
The news is Real


In [51]:
print(Y_test[1])

0
