## Importing Required Libraries 
and installing Dependencies

In [None]:
import numpy as np
import pandas as pd
import re #re = regular expression ; for searching words in a text or paragraph
import nltk #nltk = natural language toolkit
from nltk.corpus import stopwords #stopwords are words which do not add much value to the sentence or paragraph
from nltk.stem.porter import PorterStemmer #to get the root word for a particular word
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

import joblib

: 

In [None]:
# Download NLTK's 'stopwords' corpus so that stopwords.words() can be used
nltk.download('stopwords')

: 

In [None]:
# Print the list of English stopwords
print(stopwords.words('english'))

: 

## Data Pre-processing

In [None]:
# Load the dataset from 'fake_news_data.csv' into a pandas DataFrame
fakenews_data = pd.read_csv('fake_news_data.csv')
fakenews_data  # display the DataFrame

: 

In [None]:
fakenews_data.isnull().sum()  # count the missing values in each column

: 

In [None]:
fakenews_data = fakenews_data.fillna('')  # replace missing values with an empty string
fakenews_data.isnull().sum()  # re-check the missing-value counts for confirmation

: 

In [None]:
# Add a 'content' column that joins 'author' and 'title' (in that order) with a space
fakenews_data['content'] = fakenews_data['author'] + ' ' + fakenews_data['title']
fakenews_data['content']  # display the new column

: 

## "Stemming" the Dataset :
### Stemming is the process of reducing a word to its Root word
#### example: acting, acted, acts  =>  act

In [None]:
# Porter stemmer instance used by stemming() to reduce words to their stems
port_stem = PorterStemmer()

: 

In [None]:
_ENGLISH_STOPWORDS = None  # lazily-built cache, filled on first use


def _english_stopwords():
    """Return the English stopwords as a frozenset, loading them only once."""
    global _ENGLISH_STOPWORDS
    if _ENGLISH_STOPWORDS is None:
        # stopwords.words() rebuilds a list on every call; caching it as a
        # frozenset avoids re-reading it and makes membership tests O(1).
        _ENGLISH_STOPWORDS = frozenset(stopwords.words('english'))
    return _ENGLISH_STOPWORDS


def stemming(content):
    """Normalise a text string into space-separated stemmed tokens.

    Every non-letter character is replaced by a space, the text is
    lower-cased and split on whitespace, English stopwords are dropped, and
    each remaining word is stemmed with ``port_stem``.

    Args:
        content: The raw text to clean.

    Returns:
        The stemmed, stopword-free words joined by single spaces.
    """
    words = re.sub(r'[^a-zA-Z]', ' ', content).lower().split()
    stop_words = _english_stopwords()
    return ' '.join(port_stem.stem(word) for word in words if word not in stop_words)

: 

In [None]:
fakenews_data['content'] = fakenews_data['content'].apply(stemming)  # replace each entry with its stemmed, stopword-free form

: 

In [None]:
fakenews_data['content']  # display the processed 'content' column

: 

## Separating the Features,Content,Data (Input = x) from the Labels (Output = y)
#### In output , y : 0 = Accurate News  ;  1 = Fake News

In [None]:
# Features (x): the processed 'content' text; targets (y): the 'label' column
x = fakenews_data['content'].values
y = fakenews_data['label'].values

: 

In [None]:
# Report the shapes of the feature and label arrays
print(" x_shape =", x.shape, "\ty_shape =",y.shape)

: 

In [None]:
# Show the processed text samples
print(x)

: 

### Converting the Textual Data to Numerical Data

In [None]:
# Learn the TF-IDF vocabulary and weights from the text, then replace x with
# the resulting feature matrix (fit and transform done in a single step).
# NOTE(review): the vectorizer is fitted on the full dataset, before the
# train/test split, so test-set text influences the learned IDF weights.
vectorizer = TfidfVectorizer()
x = vectorizer.fit_transform(x)

: 

In [None]:
# Show the TF-IDF feature matrix
print (x)

: 

## Splitting the data into Train (80% x_train) and Test (20% x_test)

In [None]:
# Hold out 20% of the samples for testing. Stratifying on y keeps the label
# proportions the same in both splits; the fixed seed makes the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    stratify=y,
    random_state=2,
)

: 

## Training the Model
Using LogisticRegression

In [None]:
# Logistic-regression classifier created with scikit-learn's default settings
model = LogisticRegression()

: 

In [None]:
# Fit the classifier on the training split
model.fit(x_train, y_train)

: 

## Evaluating the Model and Checking its Accuracy Score

In [None]:
# Predict labels for the held-out test split
prediction = model.predict(x_test)
prediction  # display the predicted labels

: 

In [None]:
# accuracy_score expects (y_true, y_pred). Accuracy is symmetric, so the value
# is unchanged, but the conventional order keeps the call correct if an
# order-sensitive metric is substituted later.
score = accuracy_score(y_test, prediction)
print("Accuracy =",score*100,"%")

: 

## Saving the trained model for further use

In [None]:
# The classifier only accepts TF-IDF features, so the fitted vectorizer is
# saved as well; without it, raw text cannot be transformed for a reloaded model.
joblib.dump(vectorizer, 'Fake News Vectorizer.joblib')
# Persist the trained classifier (kept last so the cell output is unchanged).
joblib.dump(model, 'Fake News Prediction.joblib')
# To reload later:
# load_model = joblib.load('Fake News Prediction.joblib')
# load_vectorizer = joblib.load('Fake News Vectorizer.joblib')

: 

## < Building a Predictive System >

In [None]:
# Take one already-vectorized sample (index 8 of the test split) and classify it.
x_new = x_test[8]

prediction = model.predict(x_new)
print("Prediction Vaue = ",prediction)

# Label 0 is reported as real news; any other label is reported as fake.
print('The news is Real' if prediction[0] == 0 else 'The news is Fake')

: 

: 