In [170]:
import numpy as np
import pandas as pd
import re
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [171]:
import pickle

In [172]:
import nltk
nltk.download('stopwords')


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Maitreyee\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [173]:
print(stopwords.words('english'))

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

Data Pre-processing

In [174]:
# loading the dataset to a pandas data frame
news_dataset= pd.read_csv('train.csv')

In [175]:
news_dataset.shape


(20800, 5)

In [176]:
#printing first five rows of the dataset
news_dataset.head()

Unnamed: 0,id,title,author,text,label
0,0,House Dem Aide: We Didn’t Even See Comey’s Let...,Darrell Lucus,House Dem Aide: We Didn’t Even See Comey’s Let...,1
1,1,"FLYNN: Hillary Clinton, Big Woman on Campus - ...",Daniel J. Flynn,Ever get the feeling your life circles the rou...,0
2,2,Why the Truth Might Get You Fired,Consortiumnews.com,"Why the Truth Might Get You Fired October 29, ...",1
3,3,15 Civilians Killed In Single US Airstrike Hav...,Jessica Purkiss,Videos 15 Civilians Killed In Single US Airstr...,1
4,4,Iranian woman jailed for fictional unpublished...,Howard Portnoy,Print \nAn Iranian woman has been sentenced to...,1


In [177]:
# checking the number of missing values in the dataset
news_dataset.isnull().sum()

id           0
title      558
author    1957
text        39
label        0
dtype: int64

In [178]:
news_dataset_cleaned= news_dataset.dropna(subset=['text', 'title'])

In [179]:
# checking the number of missing values in the dataset
news_dataset_cleaned.isnull().sum()

id           0
title        0
author    1918
text         0
label        0
dtype: int64

In [180]:
news_dataset_cleaned['label'].value_counts()

label
0    10387
1     9816
Name: count, dtype: int64

In [181]:
#replacing the null values with empty string
news_dataset_cleaned = news_dataset_cleaned.fillna('')

In [182]:
# checking the number of missing values in the dataset
news_dataset_cleaned.isnull().sum()

id        0
title     0
author    0
text      0
label     0
dtype: int64

In [183]:
news_dataset_cleaned['content'] =  news_dataset_cleaned['title']+' '+ news_dataset_cleaned['text']

In [184]:
print(news_dataset_cleaned['content'])

0        House Dem Aide: We Didn’t Even See Comey’s Let...
1        FLYNN: Hillary Clinton, Big Woman on Campus - ...
2        Why the Truth Might Get You Fired Why the Trut...
3        15 Civilians Killed In Single US Airstrike Hav...
4        Iranian woman jailed for fictional unpublished...
                               ...                        
20795    Rapper T.I.: Trump a ’Poster Child For White S...
20796    N.F.L. Playoffs: Schedule, Matchups and Odds -...
20797    Macy’s Is Said to Receive Takeover Approach by...
20798    NATO, Russia To Hold Parallel Exercises In Bal...
20799    What Keeps the F-35 Alive   David Swanson is a...
Name: content, Length: 20203, dtype: object


In [185]:
#seperating the data & label
X = news_dataset_cleaned.drop(columns='label', axis=1)
Y=news_dataset_cleaned['label']

In [186]:
print(X)
print(Y)

          id                                              title  \
0          0  House Dem Aide: We Didn’t Even See Comey’s Let...   
1          1  FLYNN: Hillary Clinton, Big Woman on Campus - ...   
2          2                  Why the Truth Might Get You Fired   
3          3  15 Civilians Killed In Single US Airstrike Hav...   
4          4  Iranian woman jailed for fictional unpublished...   
...      ...                                                ...   
20795  20795  Rapper T.I.: Trump a ’Poster Child For White S...   
20796  20796  N.F.L. Playoffs: Schedule, Matchups and Odds -...   
20797  20797  Macy’s Is Said to Receive Takeover Approach by...   
20798  20798  NATO, Russia To Hold Parallel Exercises In Bal...   
20799  20799                          What Keeps the F-35 Alive   

                                          author  \
0                                  Darrell Lucus   
1                                Daniel J. Flynn   
2                             Consortiu

stemming:
processs  of reducing a word to its root word.
eg:
actor,actress,acting for all this words root word in act

In [187]:
port_stem = PorterStemmer()

In [188]:
def stemming(content):   #stemming is a function here
  stemmed_content = re.sub('[^a-zA-z]',' ',content)       #removes everything that is not btw aA-zZ(all numbers and punctuations replaced by space)
  stemmed_content = stemmed_content.lower()                #converts all uc to lc to avoid problem during processing
  stemmed_content = stemmed_content.split()                #all words and text will be splitted and converted to list
  stemmed_content = [port_stem.stem(word) for word in stemmed_content if not word in stopwords.words('english')]
  stemmed_content = ' '.join(stemmed_content)
  return stemmed_content

In [189]:
#news_dataset_cleaned['content'] = news_dataset_cleaned['content'].apply(stemming)

###############
# commenting for now because the output is already saved in a .pkl file
###############

In [190]:
'''import pickle

# Assuming 'fake_news_data_cleaned' is your DataFrame containing the stemmed content

# Define the output file path
output_file = "stemmed_news_dataset.pkl"

# Save the DataFrame containing the stemmed content using pickle
with open(output_file, "wb") as f:
    pickle.dump(news_dataset_cleaned, f)

print("Stemmed fake news data saved to:", output_file)
'''
#############
# commenting for now because the output is already saved in a .pkl file
##############

'import pickle\n\n# Assuming \'fake_news_data_cleaned\' is your DataFrame containing the stemmed content\n\n# Define the output file path\noutput_file = "stemmed_news_dataset.pkl"\n\n# Save the DataFrame containing the stemmed content using pickle\nwith open(output_file, "wb") as f:\n    pickle.dump(news_dataset_cleaned, f)\n\nprint("Stemmed fake news data saved to:", output_file)\n'

In [191]:
import pickle

# Define the path to the pickle file
pickle_file_path = "stemmed_news_dataset.pkl"

# Load the DataFrame from the pickle file
with open(pickle_file_path, "rb") as f:
    stemmed_news_dataset = pickle.load(f)

# Now you can use 'stemmed_fake_news_data' as your DataFrame containing the stemmed content
# For example, you can access columns, perform further analysis, or save it to another format
print(stemmed_news_dataset.head())


   id                                              title              author  \
0   0  House Dem Aide: We Didn’t Even See Comey’s Let...       Darrell Lucus   
1   1  FLYNN: Hillary Clinton, Big Woman on Campus - ...     Daniel J. Flynn   
2   2                  Why the Truth Might Get You Fired  Consortiumnews.com   
3   3  15 Civilians Killed In Single US Airstrike Hav...     Jessica Purkiss   
4   4  Iranian woman jailed for fictional unpublished...      Howard Portnoy   

                                                text  label  \
0  House Dem Aide: We Didn’t Even See Comey’s Let...      1   
1  Ever get the feeling your life circles the rou...      0   
2  Why the Truth Might Get You Fired October 29, ...      1   
3  Videos 15 Civilians Killed In Single US Airstr...      1   
4  Print \nAn Iranian woman has been sentenced to...      1   

                                             content  
0  hous dem aid even see comey letter jason chaff...  
1  flynn hillari clinton big w

In [192]:
print(stemmed_news_dataset['content'])

0        hous dem aid even see comey letter jason chaff...
1        flynn hillari clinton big woman campu breitbar...
2        truth might get fire truth might get fire octo...
3        civilian kill singl us airstrik identifi video...
4        iranian woman jail fiction unpublish stori wom...
                               ...                        
20795    rapper trump poster child white supremaci rapp...
20796    n f l playoff schedul matchup odd new york tim...
20797    maci said receiv takeov approach hudson bay ne...
20798    nato russia hold parallel exercis balkan nato ...
20799    keep f aliv david swanson author activist jour...
Name: content, Length: 20203, dtype: object


In [193]:
#num_rows = int(len(stemmed_news_dataset) * 0.8)

In [194]:
#seperating the data and the label
X = stemmed_news_dataset['content'].values
Y = stemmed_news_dataset['label'].values

In [195]:
print(X)

['hous dem aid even see comey letter jason chaffetz tweet hous dem aid even see comey letter jason chaffetz tweet darrel lucu octob subscrib jason chaffetz stump american fork utah imag courtesi michael jolley avail creativ common licens apolog keith olbermann doubt worst person world week fbi director jame comey accord hous democrat aid look like also know second worst person well turn comey sent infam letter announc fbi look email may relat hillari clinton email server rank democrat relev committe hear comey found via tweet one republican committe chairmen know comey notifi republican chairmen democrat rank member hous intellig judiciari oversight committe agenc review email recent discov order see contain classifi inform long letter went oversight committe chairman jason chaffetz set polit world ablaz tweet fbi dir inform fbi learn exist email appear pertin investig case reopen jason chaffetz jasoninthehous octob cours know case comey actual say review email light unrel case know an

In [196]:
print(Y)

[1 0 1 ... 0 1 1]


In [197]:
Y.shape

(20203,)

In [198]:
#converting the textual data to numerical data
vectorizer = TfidfVectorizer()
vectorizer.fit(X)

X=vectorizer.transform(X)

In [199]:
print(X)

  (0, 109850)	0.05001417052334059
  (0, 109798)	0.01915628925310019
  (0, 108846)	0.04478795063167457
  (0, 108842)	0.09611164254336423
  (0, 108798)	0.03794631396404217
  (0, 108758)	0.011387610991409091
  (0, 108113)	0.017291964598269833
  (0, 107299)	0.017297834920508945
  (0, 107211)	0.01266591458237952
  (0, 107126)	0.029608099123504796
  (0, 107047)	0.012966066245168469
  (0, 106844)	0.011872414352282803
  (0, 105998)	0.02609196335184901
  (0, 105961)	0.03174488605043788
  (0, 104949)	0.02182175419223626
  (0, 103529)	0.06646832547938372
  (0, 102837)	0.033699691105283196
  (0, 102582)	0.016567056391623123
  (0, 101809)	0.03876549054522208
  (0, 101168)	0.011154955480457762
  (0, 101158)	0.04359773862590039
  (0, 101105)	0.13782544837424862
  (0, 100957)	0.07197372452095199
  (0, 99668)	0.04010226844336061
  (0, 99099)	0.027514323610713202
  :	:
  (20202, 7569)	0.010619425755705077
  (20202, 7242)	0.028186435237583223
  (20202, 6948)	0.03996247007915948
  (20202, 6910)	0.02551695

Splitting the dataset into training and text data

In [200]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.2, stratify=Y, random_state=2)

Training the Model: LogisticRegression

In [201]:
model = LogisticRegression()

In [202]:
model.fit(X_train, Y_train)

Evaluation


accuracy score

In [203]:
# accuracy score on the training data
X_train_prediction = model.predict(X_train)
training_data_accuracy = accuracy_score(X_train_prediction, Y_train)

In [204]:
print('Accuracy score of the training data : ', training_data_accuracy)

Accuracy score of the training data :  0.9764261848781092


In [205]:
#accuracy score on the test data
X_test_prediction = model.predict(X_test)
test_data_accuracy = accuracy_score(X_test_prediction, Y_test)

In [206]:
print('Accuracy score of the test data : ', test_data_accuracy)

Accuracy score of the test data :  0.9505073001732245


Making a Predicitive System

In [207]:
X_new = X_test[15]

prediction = model.predict(X_new)
print(prediction)

if (prediction[0]==0):
  print('The news is Real')
else:
  print('The news is Fake')

[0]
The news is Real


In [208]:
print (Y_test[15])

0


Precision for dataset 1 using Logistic Regression 

In [209]:
from sklearn.metrics import precision_score

# Assuming you have already split your data into training and testing sets and trained your model

# Make predictions on the testing data
y_test_prediction = model.predict(X_test)

# Evaluate precision on testing data
test_precision = precision_score(Y_test, y_test_prediction)
print("Testing Precision:", test_precision)

Testing Precision: 0.9499744767738643


F1 score and recall value for dataset 1 using Logistic Regression


In [210]:
from sklearn.metrics import f1_score, recall_score

# Assuming you have true labels 'y_true' and predicted labels 'y_pred'

# Calculate F1 score
f1 = f1_score(Y_test, y_test_prediction)
print("Testing F1 score: ", f1)
# Calculate recall
recall = recall_score(Y_test, y_test_prediction)
print("Testing recall: ", recall)

Testing F1 score:  0.9490056093829679
Testing recall:  0.9480387162506367


Cross- validation

In [212]:
#performing cross validation
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
# Define k-fold cross-validation
k_fold = KFold(n_splits=5, shuffle=True, random_state=42)

# Perform cross-validation
scores = cross_val_score(model, X, Y, cv=k_fold, scoring='accuracy')

# Print average performance
print("Average Accuracy:", np.mean(scores))

Average Accuracy: 0.9542643718714856
