In [1]:
import numpy as np
import pandas as pd
import re
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [2]:
import nltk
nltk.download('stopwords')


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Maitreyee\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [3]:
print(stopwords.words('english'))

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

In [4]:
true_data=pd.read_csv('True.csv')
fake_data=pd.read_csv('Fake.csv')

In [5]:
true_data.shape
fake_data.shape

(23481, 4)

In [6]:
#printing first five rows of the dataset
true_data.head()

Unnamed: 0,title,text,subject,date
0,"As U.S. budget fight looms, Republicans flip t...",WASHINGTON (Reuters) - The head of a conservat...,politicsNews,"December 31, 2017"
1,U.S. military to accept transgender recruits o...,WASHINGTON (Reuters) - Transgender people will...,politicsNews,"December 29, 2017"
2,Senior U.S. Republican senator: 'Let Mr. Muell...,WASHINGTON (Reuters) - The special counsel inv...,politicsNews,"December 31, 2017"
3,FBI Russia probe helped by Australian diplomat...,WASHINGTON (Reuters) - Trump campaign adviser ...,politicsNews,"December 30, 2017"
4,Trump wants Postal Service to charge 'much mor...,SEATTLE/WASHINGTON (Reuters) - President Donal...,politicsNews,"December 29, 2017"


In [7]:
#printing first five rows of the dataset
fake_data.head()

Unnamed: 0,title,text,subject,date
0,Donald Trump Sends Out Embarrassing New Year’...,Donald Trump just couldn t wish all Americans ...,News,"December 31, 2017"
1,Drunk Bragging Trump Staffer Started Russian ...,House Intelligence Committee Chairman Devin Nu...,News,"December 31, 2017"
2,Sheriff David Clarke Becomes An Internet Joke...,"On Friday, it was revealed that former Milwauk...",News,"December 30, 2017"
3,Trump Is So Obsessed He Even Has Obama’s Name...,"On Christmas day, Donald Trump announced that ...",News,"December 29, 2017"
4,Pope Francis Just Called Out Donald Trump Dur...,Pope Francis used his annual Christmas Day mes...,News,"December 25, 2017"


In [8]:
print(true_data['text'][0])
true_data.columns

WASHINGTON (Reuters) - The head of a conservative Republican faction in the U.S. Congress, who voted this month for a huge expansion of the national debt to pay for tax cuts, called himself a “fiscal conservative” on Sunday and urged budget restraint in 2018. In keeping with a sharp pivot under way among Republicans, U.S. Representative Mark Meadows, speaking on CBS’ “Face the Nation,” drew a hard line on federal spending, which lawmakers are bracing to do battle over in January. When they return from the holidays on Wednesday, lawmakers will begin trying to pass a federal budget in a fight likely to be linked to other issues, such as immigration policy, even as the November congressional election campaigns approach in which Republicans will seek to keep control of Congress. President Donald Trump and his Republicans want a big budget increase in military spending, while Democrats also want proportional increases for non-defense “discretionary” spending on programs that support educati

Index(['title', 'text', 'subject', 'date'], dtype='object')

In [9]:
fake_data['label'] = 1
true_data['label']=0
print(true_data.head())
fake_data.head()

                                               title  \
0  As U.S. budget fight looms, Republicans flip t...   
1  U.S. military to accept transgender recruits o...   
2  Senior U.S. Republican senator: 'Let Mr. Muell...   
3  FBI Russia probe helped by Australian diplomat...   
4  Trump wants Postal Service to charge 'much mor...   

                                                text       subject  \
0  WASHINGTON (Reuters) - The head of a conservat...  politicsNews   
1  WASHINGTON (Reuters) - Transgender people will...  politicsNews   
2  WASHINGTON (Reuters) - The special counsel inv...  politicsNews   
3  WASHINGTON (Reuters) - Trump campaign adviser ...  politicsNews   
4  SEATTLE/WASHINGTON (Reuters) - President Donal...  politicsNews   

                 date  label  
0  December 31, 2017       0  
1  December 29, 2017       0  
2  December 31, 2017       0  
3  December 30, 2017       0  
4  December 29, 2017       0  


Unnamed: 0,title,text,subject,date,label
0,Donald Trump Sends Out Embarrassing New Year’...,Donald Trump just couldn t wish all Americans ...,News,"December 31, 2017",1
1,Drunk Bragging Trump Staffer Started Russian ...,House Intelligence Committee Chairman Devin Nu...,News,"December 31, 2017",1
2,Sheriff David Clarke Becomes An Internet Joke...,"On Friday, it was revealed that former Milwauk...",News,"December 30, 2017",1
3,Trump Is So Obsessed He Even Has Obama’s Name...,"On Christmas day, Donald Trump announced that ...",News,"December 29, 2017",1
4,Pope Francis Just Called Out Donald Trump Dur...,Pope Francis used his annual Christmas Day mes...,News,"December 25, 2017",1


In [10]:
all_data=pd.concat([fake_data, true_data])   #concatenates two DataFrames, fake_data and true_data
random_permutation = np.random.permutation(len(all_data))  #permutation will be used to shuffle the rows of all_data
all_data= all_data.iloc[random_permutation]   #uses iloc[] to index and rearrange the rows based on the shuffled order
print(all_data.columns)  # gives you a list of all the columns present in the combined DataFrame after concatenation and shuffling
all_data.head()

Index(['title', 'text', 'subject', 'date', 'label'], dtype='object')


Unnamed: 0,title,text,subject,date,label
10273,WATCH REAGAN WARN US AND DRAW BATTLE LINES…Tru...,President Reagan Warned Of The Dangers Of Libe...,politics,"Jul 30, 2017",1
1820,Jeff Sessions’ Written Immigration Speech Put...,The Trump administration is working as hard as...,News,"April 11, 2017",1
22115,The Existential Question Of Whom To Trust,21st Century Wire says Investigative reporter ...,US_News,"May 1, 2017",1
3214,WATCH: CNN Host HUMILIATES Trump Supporter Fo...,CNN host Kate Bolduan sat quietly as a conserv...,News,"December 31, 2016",1
18408,BROKE City of Chicago Spends Taxpayer Money St...,The Windy City is under fire for turning publi...,left-news,"Jun 30, 2017",1


In [11]:
all_data.shape

(44898, 5)

In [12]:
filtered_data=all_data.loc[:, ['title', 'text', "subject", 'label']]
filtered_data.head()

Unnamed: 0,title,text,subject,label
10273,WATCH REAGAN WARN US AND DRAW BATTLE LINES…Tru...,President Reagan Warned Of The Dangers Of Libe...,politics,1
1820,Jeff Sessions’ Written Immigration Speech Put...,The Trump administration is working as hard as...,News,1
22115,The Existential Question Of Whom To Trust,21st Century Wire says Investigative reporter ...,US_News,1
3214,WATCH: CNN Host HUMILIATES Trump Supporter Fo...,CNN host Kate Bolduan sat quietly as a conserv...,News,1
18408,BROKE City of Chicago Spends Taxpayer Money St...,The Windy City is under fire for turning publi...,left-news,1


In [13]:
filtered_data.shape

(44898, 4)

In [14]:
filtered_data.isnull().sum()

title      0
text       0
subject    0
label      0
dtype: int64

In [15]:
filtered_data['content']=filtered_data['title']+' '+filtered_data['text']+' '+filtered_data['subject']
filtered_data.head()

Unnamed: 0,title,text,subject,label,content
10273,WATCH REAGAN WARN US AND DRAW BATTLE LINES…Tru...,President Reagan Warned Of The Dangers Of Libe...,politics,1,WATCH REAGAN WARN US AND DRAW BATTLE LINES…Tru...
1820,Jeff Sessions’ Written Immigration Speech Put...,The Trump administration is working as hard as...,News,1,Jeff Sessions’ Written Immigration Speech Put...
22115,The Existential Question Of Whom To Trust,21st Century Wire says Investigative reporter ...,US_News,1,The Existential Question Of Whom To Trust 21st...
3214,WATCH: CNN Host HUMILIATES Trump Supporter Fo...,CNN host Kate Bolduan sat quietly as a conserv...,News,1,WATCH: CNN Host HUMILIATES Trump Supporter Fo...
18408,BROKE City of Chicago Spends Taxpayer Money St...,The Windy City is under fire for turning publi...,left-news,1,BROKE City of Chicago Spends Taxpayer Money St...


In [16]:
#filtered_data['content'] = filtered_data['title']+' '+filtered_data['subject']
#filtered_data.head()

In [17]:
print(filtered_data['content'])

10273    WATCH REAGAN WARN US AND DRAW BATTLE LINES…Tru...
1820      Jeff Sessions’ Written Immigration Speech Put...
22115    The Existential Question Of Whom To Trust 21st...
3214      WATCH: CNN Host HUMILIATES Trump Supporter Fo...
18408    BROKE City of Chicago Spends Taxpayer Money St...
                               ...                        
19170    German Social Democrats vow to rebuild in oppo...
15927    HANNITY TEARS IT UP IN HIS BEST EVER RANT: ‘Hi...
16761    DISTURBING TRUTH ABOUT How The UN Decides Whic...
3226      Sarah Palin Gets Her A** Handed To Her For Ca...
9331     Combative Trump says he raised $5.6 million fo...
Name: content, Length: 44898, dtype: object


In [18]:
X= filtered_data['content']
Y = filtered_data['label']

In [19]:
print(X)
print(Y)

10273    WATCH REAGAN WARN US AND DRAW BATTLE LINES…Tru...
1820      Jeff Sessions’ Written Immigration Speech Put...
22115    The Existential Question Of Whom To Trust 21st...
3214      WATCH: CNN Host HUMILIATES Trump Supporter Fo...
18408    BROKE City of Chicago Spends Taxpayer Money St...
                               ...                        
19170    German Social Democrats vow to rebuild in oppo...
15927    HANNITY TEARS IT UP IN HIS BEST EVER RANT: ‘Hi...
16761    DISTURBING TRUTH ABOUT How The UN Decides Whic...
3226      Sarah Palin Gets Her A** Handed To Her For Ca...
9331     Combative Trump says he raised $5.6 million fo...
Name: content, Length: 44898, dtype: object
10273    1
1820     1
22115    1
3214     1
18408    1
        ..
19170    0
15927    1
16761    1
3226     1
9331     0
Name: label, Length: 44898, dtype: int64


In [20]:
port_stem = PorterStemmer()

In [21]:
def stemming(content):   #stemming is a function here
  stemmed_content = re.sub('[^a-zA-z]',' ',content)       #removes everything that is not btw aA-zZ(all numbers and punctuations replaced by space)
  stemmed_content = stemmed_content.lower()                #converts all uc to lc to avoid problem during processing
  stemmed_content = stemmed_content.split()                #all words and text will be splitted and converted to list
  stemmed_content = [port_stem.stem(word) for word in stemmed_content if not word in stopwords.words('english')]
  stemmed_content = ' '.join(stemmed_content)
  return stemmed_content

In [22]:

#filtered_data['content'] = filtered_data['content'].apply(stemming)

#################
# commenting to avoid running it again as output is already saved in a .pkl file 

################

In [23]:
'''import pickle

# Assuming 'fake_news_data_cleaned' is your DataFrame containing the stemmed content

# Define the output file path
output_file = "stemmed_news_dataset_LR2.pkl"

# Save the DataFrame containing the stemmed content using pickle
with open(output_file, "wb") as f:
    pickle.dump(filtered_data, f)

print("Stemmed fake news data saved to:", output_file)
'''

#############
#   commenting to avoid running it again as output is already saved in a .pkl file 

#############

'import pickle\n\n# Assuming \'fake_news_data_cleaned\' is your DataFrame containing the stemmed content\n\n# Define the output file path\noutput_file = "stemmed_news_dataset_LR2.pkl"\n\n# Save the DataFrame containing the stemmed content using pickle\nwith open(output_file, "wb") as f:\n    pickle.dump(filtered_data, f)\n\nprint("Stemmed fake news data saved to:", output_file)\n'

In [24]:
import pickle

# Define the path to the pickle file
pickle_file_path = "stemmed_news_dataset_LR2.pkl"

# Load the DataFrame from the pickle file
with open(pickle_file_path, "rb") as f:
    stemmed_news_dataset_LR2 = pickle.load(f)

# Now you can use 'stemmed_fake_news_data' as your DataFrame containing the stemmed content
# For example, you can access columns, perform further analysis, or save it to another format
print(stemmed_news_dataset_LR2.head())


                                                   title  \
7252   U.S. says war crimes probe of U.S. forces in A...   
9779   HYSTERICAL! TUCKER AND STEYN On Leftist Teache...   
7344   Trump strongly considering naming campaign chi...   
20777  WOW! VIDEO SURFACES OF BERNIE SANDERS Praising...   
12392  Tillerson says U.S. ready to talk to North Kor...   

                                                    text       subject  label  \
7252   WASHINGTON (Reuters) - An International Crimin...  politicsNews      0   
9779   Tucker Carlson and Mark Steyn  should be on to...      politics      1   
7344   WASHINGTON (Reuters) - President-elect Donald ...  politicsNews      0   
20777  Please share this everywhere! Especially to an...     left-news      1   
12392  WASHINGTON/SEOUL (Reuters) - U.S. Secretary of...     worldnews      0   

                                                 content  
7252   u say war crime probe u forc afghanistan unwar...  
9779   hyster tucker steyn leftist

In [25]:
print(filtered_data['content'])

10273    WATCH REAGAN WARN US AND DRAW BATTLE LINES…Tru...
1820      Jeff Sessions’ Written Immigration Speech Put...
22115    The Existential Question Of Whom To Trust 21st...
3214      WATCH: CNN Host HUMILIATES Trump Supporter Fo...
18408    BROKE City of Chicago Spends Taxpayer Money St...
                               ...                        
19170    German Social Democrats vow to rebuild in oppo...
15927    HANNITY TEARS IT UP IN HIS BEST EVER RANT: ‘Hi...
16761    DISTURBING TRUTH ABOUT How The UN Decides Whic...
3226      Sarah Palin Gets Her A** Handed To Her For Ca...
9331     Combative Trump says he raised $5.6 million fo...
Name: content, Length: 44898, dtype: object


In [26]:
#num_rows = int(len(filtered_data) * 0.8)

In [27]:
#seperating the data and the label
X = filtered_data['content'].values
Y = filtered_data['label'].values

In [28]:
print(X)

['WATCH REAGAN WARN US AND DRAW BATTLE LINES…Trump is Finishing the Battle Against the “Liberal Fascists” [Video] President Reagan Warned Of The Dangers Of Liberalism, And He Was Eerily Prophetic. He really knew what he was talking about when it came to liberalism is, and what is has become in our day. He did not hesitate to describe liberalism as essentially fascism in a 1975 interview with CBS s 60 Minutes, a political characterization that few, if any, public leaders seem willing to utter, let alone discuss, today.In a Dec. 14, 1975 interview with 60 Minutes correspondent Mike Wallace, Reagan discussed his political philosophy, saying that  the heart of my philosophy is much more libertarianism, than  .  Wallace then interrupted,  Well, that s the fashionable word these days, I guess. A conservative is no longer just that, he s a libertarian. Reagan continued,  It always has been. How do we call a liberal? You know, someone very profoundly once said many years ago that if fascism ev

In [29]:
print(Y)

[1 1 1 ... 1 1 0]


In [30]:
#converting the textual data to numerical data
vectorizer = TfidfVectorizer()
vectorizer.fit(X)

X=vectorizer.transform(X)

In [31]:
print(X)

  (0, 121199)	0.030803303116334613
  (0, 120809)	0.036551758181997254
  (0, 119492)	0.06550753352090988
  (0, 119116)	0.037565090427019006
  (0, 118767)	0.0661776824793437
  (0, 118721)	0.02592674885593699
  (0, 118296)	0.02819361018794806
  (0, 118247)	0.057695104508935294
  (0, 117945)	0.039124660291396315
  (0, 117640)	0.025491894863194114
  (0, 117397)	0.043943249918857605
  (0, 117318)	0.0404514014064927
  (0, 117259)	0.062273159386208234
  (0, 117258)	0.09134804602978333
  (0, 117039)	0.19929657263211698
  (0, 115842)	0.03648816195456681
  (0, 115626)	0.03906894056745907
  (0, 114743)	0.09666445648567035
  (0, 114534)	0.03986313090638226
  (0, 111257)	0.025508596955012626
  (0, 109669)	0.048973641615422524
  (0, 109627)	0.06313198172836725
  (0, 108576)	0.0394683889458225
  (0, 108445)	0.03863994670854608
  (0, 108257)	0.09291703444684327
  :	:
  (44897, 11371)	0.03014385055825708
  (44897, 11326)	0.023363792943762302
  (44897, 11255)	0.028188906321305485
  (44897, 11039)	0.01161

In [32]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.2, stratify=Y, random_state=2)

In [33]:
model = LogisticRegression()

In [34]:
model.fit(X_train, Y_train)

In [35]:
'''# Model accuracy on the test set
test_y_hat=model.predict(X_test)
print(accuracy_score(test_y_hat, Y_test))

# Model Accuracy on  training set
train_y_hat = model.predict(X_train)
print(accuracy_score(train_y_hat,Y_train))   '''

'# Model accuracy on the test set\ntest_y_hat=model.predict(X_test)\nprint(accuracy_score(test_y_hat, Y_test))\n\n# Model Accuracy on  training set\ntrain_y_hat = model.predict(X_train)\nprint(accuracy_score(train_y_hat,Y_train))   '

In [36]:
'''def predict(input_data):

    y_hat= model.predict(input_data)
    if y_hat==0:
        return "The article is Fake"
    else:
        return 'The article is not Fake'   '''

'def predict(input_data):\n\n    y_hat= model.predict(input_data)\n    if y_hat==0:\n        return "The article is Fake"\n    else:\n        return \'The article is not Fake\'   '

In [37]:
'''print(predict(X_test[2000]))
model.predict(X_test[2000])[0]   '''

'print(predict(X_test[2000]))\nmodel.predict(X_test[2000])[0]   '

In [38]:
''' from sklearn.metrics import precision_score, recall_score, f1_score

print (precision_score(Y_train, train_y_hat,))
print (recall_score(Y_train, train_y_hat,))
print(f1_score(Y_train, train_y_hat,))


print (precision_score(Y_test, test_y_hat,))
print (recall_score(Y_test, test_y_hat,))
print(f1_score(Y_test, test_y_hat,))    '''

' from sklearn.metrics import precision_score, recall_score, f1_score\n\nprint (precision_score(Y_train, train_y_hat,))\nprint (recall_score(Y_train, train_y_hat,))\nprint(f1_score(Y_train, train_y_hat,))\n\n\nprint (precision_score(Y_test, test_y_hat,))\nprint (recall_score(Y_test, test_y_hat,))\nprint(f1_score(Y_test, test_y_hat,))    '

In [39]:
'''from sklearn.metrics import confusion_matrix
print ([['TP', 'FP'],['FN', 'TN']])
print(confusion_matrix(Y_test, test_y_hat,))
print (confusion_matrix(Y_train, train_y_hat,))    '''

"from sklearn.metrics import confusion_matrix\nprint ([['TP', 'FP'],['FN', 'TN']])\nprint(confusion_matrix(Y_test, test_y_hat,))\nprint (confusion_matrix(Y_train, train_y_hat,))    "

In [40]:
# accuracy score on the training data
X_train_prediction = model.predict(X_train)
training_data_accuracy = accuracy_score(X_train_prediction, Y_train)

In [41]:
print('Accuracy score of the training data : ', training_data_accuracy)

Accuracy score of the training data :  0.9949050615290383


In [42]:
#accuracy score on the test data
X_test_prediction = model.predict(X_test)
test_data_accuracy = accuracy_score(X_test_prediction, Y_test)

In [43]:
print('Accuracy score of the test data : ', test_data_accuracy)

Accuracy score of the test data :  0.9913140311804008


In [44]:
X_new = X_test[1897]

prediction = model.predict(X_new)
print(prediction)

if (prediction[0]==0):
  print('The news is Real')
else:
  print('The news is Fake')

[1]
The news is Fake


In [45]:
print (Y_test[1897])

1


Precision for dataset 2 using logistic regression

In [46]:
from sklearn.metrics import precision_score

# Assuming you have already split your data into training and testing sets and trained your model

# Make predictions on the testing data
y_test_prediction = model.predict(X_test)

# Evaluate precision on testing data
test_precision = precision_score(Y_test, y_test_prediction)
print("Testing Precision:", test_precision)

Testing Precision: 0.9927443448570209


F1 score and recall for datase 2 using logistic regression

In [47]:
from sklearn.metrics import f1_score, recall_score

# Assuming you have true labels 'y_true' and predicted labels 'y_pred'

# Calculate F1 score
f1 = f1_score(Y_test, y_test_prediction)
print("Testing F1 score: ", f1)
# Calculate recall
recall = recall_score(Y_test, y_test_prediction)
print("Testing recall: ", recall)

Testing F1 score:  0.9916862076316351
Testing recall:  0.9906303236797275


Cross-validation

In [48]:
#performing cross validation
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
# Define k-fold cross-validation
k_fold = KFold(n_splits=5, shuffle=True, random_state=42)

# Perform cross-validation
scores = cross_val_score(model, X, Y, cv=k_fold, scoring='accuracy')

# Print average performance
print("Average Accuracy:", np.mean(scores))

Average Accuracy: 0.9915363713053795
