In [None]:
import numpy as np
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score,confusion_matrix

Stemming takes a word, strips its prefixes and suffixes, and returns the root word.

In [None]:
# Fetch NLTK's 'stopwords' corpus, which stopwords.words('english') reads in later cells.
nltk.download('stopwords')

In [None]:
# Print NLTK's English stop-word list; these are the words the stemming step filters out.
print(stopwords.words('english'))

Data preprocessing

In [None]:
# Load the training set from train.csv (path is relative to the working directory).
# Later cells read its 'author', 'title' and 'label' columns.
news = pd.read_csv('train.csv')

In [None]:
# (rows, columns) of the loaded dataframe
news.shape

In [None]:
# Display the first 5 rows of the dataframe
news.head()

In [None]:
# Count the missing values in each column of the dataset
news.isnull().sum()

In [None]:
# Replace every null value, in every column, with an empty string so that the
# author + title concatenation in a later cell never has to combine text with a null.
news = news.fillna('')

In [None]:
# Re-check the per-column null counts after the fill
news.isnull().sum()

In [None]:
# Combine the author name and the news title, separated by a single space,
# into a new 'content' column
news['content'] = news['author']+' '+news['title']


In [None]:
# Inspect the combined author + title text
print(news['content'])

In [None]:
# Separate the target column from the remaining feature columns.
# Note: X and Y are reassigned further down, once 'content' has been stemmed.
Y = news['label']
X = news.drop(columns='label')

In [None]:
# Show the feature frame, then the label series
print(X)
print(Y)

Stemming: the process of reducing a word to its root word

e.g. actor, actress, acting... the root word is "act"

In [None]:
# Single Porter stemmer instance; stemming() calls its .stem() method on each word
port_stem = PorterStemmer()

In [None]:
# Text-cleaning helper applied to every row of the 'content' column
def stemming(content, stemmer=None, stop_words=None):
  """Normalise a piece of text into space-separated stemmed tokens.

  Every character that is not an ASCII letter is replaced by a space, the
  text is lower-cased and split on whitespace, stop words are dropped, the
  remaining words are stemmed, and the stems are joined with single spaces.

  Args:
    content: the raw text to clean (a str).
    stemmer: object exposing ``stem(word)``; defaults to the notebook-level
      ``port_stem``.
    stop_words: iterable of lower-case words to discard; defaults to NLTK's
      English stop-word list.

  Returns:
    A str of stemmed tokens separated by single spaces, or '' when no word
    survives the filtering.
  """
  if stemmer is None:
    stemmer = port_stem
  if stop_words is None:
    stop_words = stopwords.words('english')
  # Build the lookup once per call instead of re-reading the corpus list for
  # every word; a set also makes each membership test constant-time.
  stop_words = frozenset(stop_words)

  # '^' (ASCII caret) negates the class, so this targets everything that is
  # NOT a letter. Replacing with a space rather than '' keeps neighbouring
  # words from being fused together.
  letters_only = re.sub('[^a-zA-Z]', ' ', content)
  words = letters_only.lower().split()
  return ' '.join(stemmer.stem(word) for word in words if word not in stop_words)

1. Creating a stemming function; `content` is the input text.
2. Using the `re` (regular expression) module to search the text and remove every character that is not a letter of the alphabet.
3. Converting all the letters to lower case.
4. Splitting the text into a list of words.
5. Looping over the words with `for`, skipping any word that is a stop word; stemming is performed on each remaining word.
6. Joining all the stemmed words back into a single string.
7. Finally, returning the stemmed content.

In [None]:
# Replace each row's author + title text with the result of stemming().
# NOTE(review): this overwrites 'content' in place, so re-running this cell feeds already-processed text back through stemming().
news['content'] = news['content'].apply(stemming)

In [None]:
# Inspect the 'content' column after stemming() has been applied
print(news['content'])

In [None]:
# Display the first rows again, now including the processed 'content' column
news.head()

In [None]:
# Features are the processed 'content' text; the target is the 'label' column.
# Both are pulled out of the dataframe via .values.
X, Y = news['content'].values, news['label'].values

In [None]:
# Inspect the text features
print(X)

In [None]:
# Inspect the labels
print(Y)

In [None]:
# Shape of the label array
Y.shape

Converting textual data to numerical data

In [None]:
# Fit the TF-IDF vectoriser on the text and convert it to numeric features in one step.
# NOTE(review): the vectoriser is fitted on the whole dataset before the
# train/test split in a later cell, so its statistics also cover the test rows;
# consider fitting on the training split only.
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(X)

In [None]:
# Inspect the vectorised features returned by the TF-IDF vectoriser
print(X)

# Splitting our dataset into train & test data and fitting it to our machine learning model

In [None]:
# Hold out 20% of the rows for testing.
# stratify=Y splits the dataset so that the train and test sets keep the same
# proportion of examples in each class as the original dataset;
# random_state=2 fixes the seed used for the split.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    stratify=Y,
    random_state=2,
)

Logistic regression model

In [None]:
# Logistic regression classifier with scikit-learn's default settings
model = LogisticRegression()

In [None]:
# Train the classifier on the training split only
model.fit(X_train, y_train)

In [None]:
# Accuracy score on the training data.
# accuracy_score's signature is (y_true, y_pred), so the ground-truth labels go first.
X_train_prediction = model.predict(X_train)
training_data_accuracy = accuracy_score(y_train, X_train_prediction)

In [None]:
# Report the accuracy measured on the training split
print('Accuracy score of the training data : ', training_data_accuracy)

In [None]:
# Accuracy score on the held-out test data.
# accuracy_score's signature is (y_true, y_pred), so the ground-truth labels go first.
X_test_prediction = model.predict(X_test)
test_data_accuracy = accuracy_score(y_test, X_test_prediction)

In [None]:
# Report the accuracy measured on the test split
print('Accuracy score of the testing data : ', test_data_accuracy)

# Making a predictive system

In [None]:
# Use the first row of the test split as a stand-in for a new news item.
X_new = X_test[0]

prediction = model.predict(X_new)
print(prediction)

# A predicted label of 0 is reported as real; any other label as fake.
print('The news is Real' if prediction[0] == 0 else 'The news is Fake')

In [None]:
# True label of test sample 0, for comparison with the model's prediction
print(y_test[0])

The output shows the model predicting this item as fake news.

In [None]:
# Repeat the check with the second row of the test split.
X_new = X_test[1]

prediction = model.predict(X_new)
print(prediction)

# A predicted label of 0 is reported as real; any other label as fake.
print('The news is Real' if prediction[0] == 0 else 'The news is Fake')

In [None]:
# True label of test sample 1, for comparison with the model's prediction
print(y_test[1])

The output shows the model predicting this item as real (true) news.