In [16]:
import numpy as np
import pandas as pd
import re
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [17]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Vogie\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [18]:
news_dataset = pd.read_csv('Cleaned_News_Dataset.csv')
news_dataset.head()

Unnamed: 0.1,Unnamed: 0,text,label
0,0,comment expect barack obama member fyf fukyofl...,1
1,1,post vote hillari alreadi,1
2,2,demonstr gather last night exercis constitut p...,1
3,3,dozen polit activ pastor came privat dinner fr...,0
4,4,rs sarmat missil dub satan replac ss fli mile ...,1


In [19]:
news_dataset.info

<bound method DataFrame.info of        Unnamed: 0                                               text  label
0               0  comment expect barack obama member fyf fukyofl...      1
1               1                          post vote hillari alreadi      1
2               2  demonstr gather last night exercis constitut p...      1
3               3  dozen polit activ pastor came privat dinner fr...      0
4               4  rs sarmat missil dub satan replac ss fli mile ...      1
...           ...                                                ...    ...
72090       72129  washington reuter hacker believ work russian g...      0
72091       72130  know fantasyland republican never question cit...      1
72092       72131  migrant refus leav train refuge camp hungari t...      0
72093       72132  mexico citi reuter donald trump comb style buf...      0
72094       72133  goldman sach endors hillari clinton presid gol...      1

[72095 rows x 3 columns]>

In [20]:
news_dataset = news_dataset.drop(['Unnamed: 0'], axis=1)

In [21]:
news_dataset = news_dataset.dropna()
news_dataset.head()

Unnamed: 0,text,label
0,comment expect barack obama member fyf fukyofl...,1
1,post vote hillari alreadi,1
2,demonstr gather last night exercis constitut p...,1
3,dozen polit activ pastor came privat dinner fr...,0
4,rs sarmat missil dub satan replac ss fli mile ...,1


In [8]:
port_stem = PorterStemmer()
def stemming(content):
    stemmed_content = re.sub('[^a-zA-Z]',' ',content)
    stemmed_content = stemmed_content.lower()
    stemmed_content = stemmed_content.split()
    stemmed_content = [port_stem.stem(word) for word in stemmed_content if not word in stopwords.words('english')]
    stemmed_content = ' '.join(stemmed_content)
    return stemmed_content

In [9]:
news_dataset['text'] = news_dataset['text'].apply(stemming)

In [22]:
print(news_dataset['text'])

0        comment expect barack obama member fyf fukyofl...
1                                post vote hillari alreadi
2        demonstr gather last night exercis constitut p...
3        dozen polit activ pastor came privat dinner fr...
4        rs sarmat missil dub satan replac ss fli mile ...
                               ...                        
72090    washington reuter hacker believ work russian g...
72091    know fantasyland republican never question cit...
72092    migrant refus leav train refuge camp hungari t...
72093    mexico citi reuter donald trump comb style buf...
72094    goldman sach endors hillari clinton presid gol...
Name: text, Length: 71302, dtype: object


In [23]:
X = news_dataset['text']
Y = news_dataset['label']

In [24]:
print(X)

0        comment expect barack obama member fyf fukyofl...
1                                post vote hillari alreadi
2        demonstr gather last night exercis constitut p...
3        dozen polit activ pastor came privat dinner fr...
4        rs sarmat missil dub satan replac ss fli mile ...
                               ...                        
72090    washington reuter hacker believ work russian g...
72091    know fantasyland republican never question cit...
72092    migrant refus leav train refuge camp hungari t...
72093    mexico citi reuter donald trump comb style buf...
72094    goldman sach endors hillari clinton presid gol...
Name: text, Length: 71302, dtype: object


In [25]:
print(Y)

0        1
1        1
2        1
3        0
4        1
        ..
72090    0
72091    1
72092    0
72093    0
72094    1
Name: label, Length: 71302, dtype: int64


In [26]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.2, stratify=Y, random_state=2)

In [27]:
vectorizer = TfidfVectorizer()
tfid_X_train = vectorizer.fit_transform(X_train)
tfid_X_test = vectorizer.transform(X_test)

In [28]:
model = LogisticRegression()
model.fit(tfid_X_train, Y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [29]:
X_train_prediction = model.predict(tfid_X_train)
training_data_accuracy = accuracy_score(X_train_prediction, Y_train)

In [30]:
print('Accuracy score of the training data : ', training_data_accuracy)

Accuracy score of the training data :  0.9586437825423818


In [31]:
X_test_prediction = model.predict(tfid_X_test)
test_data_accuracy = accuracy_score(X_test_prediction, Y_test)
print(X_test_prediction)

[0 0 0 ... 1 0 1]


In [34]:
print('Accuracy score of the test data : ', test_data_accuracy)

Accuracy score of the test data :  0.946707804501788


In [33]:
x_test_proba = model.predict_proba(tfid_X_test)
print(x_test_proba)

[[0.74620574 0.25379426]
 [0.97456748 0.02543252]
 [0.98361241 0.01638759]
 ...
 [0.05523026 0.94476974]
 [0.99457502 0.00542498]
 [0.27278816 0.72721184]]


In [22]:
X_new = tfid_X_test[3]

prediction = model.predict(X_new)
probabilities = model.predict_proba(X_new)
print(prediction)

if (prediction[0]==1):
  print('The news is Real')
else:
  print('The news is Fake')

#Pro Feature
print("Real%: " + str(probabilities[0][1]))
print("Fake%: " + str(probabilities[0][0]))

[0]
The news is Fake
Real%: 0.1366948950522414
Fake%: 0.8633051049477586


In [35]:
def fake_news_det(news):
    input_data = [news]
    vectorized_input_data = vectorizer.transform(input_data)
    prediction = model.predict(vectorized_input_data)
    if (prediction[0]==1):
        pred_final = "REAL"
    elif (prediction[0]==0):
        pred_final = "FAKE"
    print (pred_final)

In [36]:
fake_news_det("daniel greenfield shillman journal fellow free.")

REAL


In [37]:
import pickle

In [26]:
news_dataset.to_csv("Cleaned_News_Dataset.csv")

In [38]:
pickle.dump(model, open('FakeNewsmodel.pkl', 'wb'))

In [39]:
pickle.dump(vectorizer, open('vectorizer.pkl', 'wb'))