In [1]:
import numpy as np
import pandas as pd
import re
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.pipeline import make_pipeline

In [2]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Vogie\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [3]:
news_dataset = pd.read_csv('Welfake_Dataset.csv')
news_dataset.head()

Unnamed: 0.1,Unnamed: 0,title,text,label
0,0,LAW ENFORCEMENT ON HIGH ALERT Following Threat...,No comment is expected from Barack Obama Membe...,1
1,1,,Did they post their votes for Hillary already?,1
2,2,UNBELIEVABLE! OBAMA’S ATTORNEY GENERAL SAYS MO...,"Now, most of the demonstrators gathered last ...",1
3,3,"Bobby Jindal, raised Hindu, uses story of Chri...",A dozen politically active pastors came here f...,0
4,4,SATAN 2: Russia unvelis an image of its terrif...,"The RS-28 Sarmat missile, dubbed Satan 2, will...",1


In [4]:
news_dataset.info

<bound method DataFrame.info of        Unnamed: 0                                              title  \
0               0  LAW ENFORCEMENT ON HIGH ALERT Following Threat...   
1               1                                                NaN   
2               2  UNBELIEVABLE! OBAMA’S ATTORNEY GENERAL SAYS MO...   
3               3  Bobby Jindal, raised Hindu, uses story of Chri...   
4               4  SATAN 2: Russia unvelis an image of its terrif...   
...           ...                                                ...   
72129       72129  Russians steal research on Trump in hack of U....   
72130       72130   WATCH: Giuliani Demands That Democrats Apolog...   
72131       72131  Migrants Refuse To Leave Train At Refugee Camp...   
72132       72132  Trump tussle gives unpopular Mexican leader mu...   
72133       72133  Goldman Sachs Endorses Hillary Clinton For Pre...   

                                                    text  label  
0      No comment is expected from Ba

In [5]:
news_dataset = news_dataset.drop(['Unnamed: 0', 'title'], axis=1)

In [7]:
news_dataset = news_dataset.dropna()
news_dataset.head()

Unnamed: 0,text,label
0,No comment is expected from Barack Obama Membe...,1
1,Did they post their votes for Hillary already?,1
2,"Now, most of the demonstrators gathered last ...",1
3,A dozen politically active pastors came here f...,0
4,"The RS-28 Sarmat missile, dubbed Satan 2, will...",1


In [8]:
port_stem = PorterStemmer()
def stemming(content):
    stemmed_content = re.sub('[^a-zA-Z]',' ',content)
    stemmed_content = stemmed_content.lower()
    stemmed_content = stemmed_content.split()
    stemmed_content = [port_stem.stem(word) for word in stemmed_content if not word in stopwords.words('english')]
    stemmed_content = ' '.join(stemmed_content)
    return stemmed_content

In [9]:
news_dataset['text'] = news_dataset['text'].apply(stemming)

In [10]:
print(news_dataset['text'])

0        comment expect barack obama member fyf fukyofl...
1                                post vote hillari alreadi
2        demonstr gather last night exercis constitut p...
3        dozen polit activ pastor came privat dinner fr...
4        rs sarmat missil dub satan replac ss fli mile ...
                               ...                        
72129    washington reuter hacker believ work russian g...
72130    know fantasyland republican never question cit...
72131    migrant refus leav train refuge camp hungari t...
72132    mexico citi reuter donald trump comb style buf...
72133    goldman sach endors hillari clinton presid gol...
Name: text, Length: 72095, dtype: object


In [11]:
X = news_dataset['text']
Y = news_dataset['label']

In [12]:
print(X)

0        comment expect barack obama member fyf fukyofl...
1                                post vote hillari alreadi
2        demonstr gather last night exercis constitut p...
3        dozen polit activ pastor came privat dinner fr...
4        rs sarmat missil dub satan replac ss fli mile ...
                               ...                        
72129    washington reuter hacker believ work russian g...
72130    know fantasyland republican never question cit...
72131    migrant refus leav train refuge camp hungari t...
72132    mexico citi reuter donald trump comb style buf...
72133    goldman sach endors hillari clinton presid gol...
Name: text, Length: 72095, dtype: object


In [13]:
print(Y)

0        1
1        1
2        1
3        0
4        1
        ..
72129    0
72130    1
72131    0
72132    0
72133    1
Name: label, Length: 72095, dtype: int64


In [14]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.2, stratify=Y, random_state=2)

In [15]:
vectorizer = TfidfVectorizer()
tfid_X_train = vectorizer.fit_transform(X_train)
tfid_X_test = vectorizer.transform(X_test)

In [16]:
model = LogisticRegression()
model.fit(tfid_X_train, Y_train)

In [17]:
X_train_prediction = model.predict(tfid_X_train)
training_data_accuracy = accuracy_score(X_train_prediction, Y_train)

In [18]:
print('Accuracy score of the training data : ', training_data_accuracy)

Accuracy score of the training data :  0.9587003259588044


In [19]:
X_test_prediction = model.predict(tfid_X_test)
test_data_accuracy = accuracy_score(X_test_prediction, Y_test)
print(X_test_prediction)

[1 0 0 ... 0 0 1]


In [20]:
print('Accuracy score of the test data : ', test_data_accuracy)

Accuracy score of the test data :  0.9460434149386227


In [21]:
x_test_proba = model.predict_proba(tfid_X_test)
print(x_test_proba)

[[3.15146661e-02 9.68485334e-01]
 [5.00213235e-01 4.99786765e-01]
 [9.83506933e-01 1.64930672e-02]
 ...
 [9.99298994e-01 7.01005829e-04]
 [9.87417796e-01 1.25822045e-02]
 [2.86399544e-01 7.13600456e-01]]


In [22]:
X_new = tfid_X_test[3]

prediction = model.predict(X_new)
probabilities = model.predict_proba(X_new)
print(prediction)

if (prediction[0]==1):
  print('The news is Real')
else:
  print('The news is Fake')

#Pro Feature
print("Real%: " + str(probabilities[0][1]))
print("Fake%: " + str(probabilities[0][0]))

[0]
The news is Fake
Real%: 0.1366948950522414
Fake%: 0.8633051049477586


In [23]:
def fake_news_det(news):
    input_data = [news]
    vectorized_input_data = vectorizer.transform(input_data)
    prediction = model.predict(vectorized_input_data)
    if (prediction[0]==1):
        pred_final = "REAL"
    elif (prediction[0]==0):
        pred_final = "FAKE"
    print (pred_final)

In [24]:
fake_news_det("daniel greenfield shillman journal fellow free.")

REAL


In [25]:
import pickle

In [26]:
news_dataset.to_csv("Cleaned_News_Dataset.csv")

In [27]:
pickle.dump(model, open('FakeNewsmodel.pkl', 'wb'))