In [1]:
import numpy as np 
import pandas as pd 
import re  
from nltk.corpus import stopwords 
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score 


In [2]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Mytek\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [3]:
print(stopwords.words('english'))

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

In [6]:
news_dataset=pd.read_csv('train.csv')
news_dataset.shape

(20800, 5)

In [7]:
news_dataset.isnull().sum()

id           0
title      558
author    1957
text        39
label        0
dtype: int64

In [9]:
news_dataset=news_dataset.fillna(' ')


id        0
title     0
author    0
text      0
label     0
dtype: int64

In [11]:
news_dataset['content']=news_dataset['author'] + news_dataset['title'] 
news_dataset['content']

0        Darrell LucusHouse Dem Aide: We Didn’t Even Se...
1        Daniel J. FlynnFLYNN: Hillary Clinton, Big Wom...
2        Consortiumnews.comWhy the Truth Might Get You ...
3        Jessica Purkiss15 Civilians Killed In Single U...
4        Howard PortnoyIranian woman jailed for fiction...
                               ...                        
20795    Jerome HudsonRapper T.I.: Trump a ’Poster Chil...
20796    Benjamin HoffmanN.F.L. Playoffs: Schedule, Mat...
20797    Michael J. de la Merced and Rachel AbramsMacy’...
20798    Alex AnsaryNATO, Russia To Hold Parallel Exerc...
20799               David SwansonWhat Keeps the F-35 Alive
Name: content, Length: 20800, dtype: object

seperating the data & the label


In [12]:
X=news_dataset.drop(columns='label',axis=1)
Y=news_dataset['label']

0        1
1        0
2        1
3        1
4        1
        ..
20795    0
20796    0
20797    0
20798    1
20799    1
Name: label, Length: 20800, dtype: int64


Stemming

In [15]:
port_stem=PorterStemmer()



In [16]:
def stemming(content):
    stemmed_content=re.sub('[^a-zA-z]',' ',content)
    stemmed_content=stemmed_content.lower()
    stemmed_content=stemmed_content.split()
    stemmed_content=[port_stem.stem(word) for word in stemmed_content if not word in stopwords.words('english')]
    stemmed_content=' '.join(stemmed_content)
    return stemmed_content


In [17]:
news_dataset['content']=news_dataset['content'].apply(stemming)

In [19]:
X=news_dataset['content'].values
Y=news_dataset['label'].values


array(['darrel lucushous dem aid even see comey letter jason chaffetz tweet',
       'daniel j flynnflynn hillari clinton big woman campu breitbart',
       'consortiumnew comwhi truth might get fire', ...,
       'michael j de la merc rachel abramsmaci said receiv takeov approach hudson bay new york time',
       'alex ansarynato russia hold parallel exercis balkan',
       'david swansonwhat keep f aliv'], dtype=object)

In [21]:
vectorizer=TfidfVectorizer()
vectorizer.fit(X)
X=vectorizer.transform(X)

In [26]:
X

<20800x28445 sparse matrix of type '<class 'numpy.float64'>'
	with 198318 stored elements in Compressed Sparse Row format>

In [27]:
X_train,X_test,Y_train,Y_test=train_test_split(X,Y,test_size=0.2,stratify=Y,random_state=2)

In [28]:
model=LogisticRegression()
model.fit(X_train,Y_train)


In [31]:
y_pred=model.predict(X_train)
training_data_accuracy=accuracy_score(y_pred,Y_train)
training_data_accuracy

0.9719951923076923

In [33]:
y_pred=model.predict(X_test)
testing_data_accuracy=accuracy_score(y_pred,Y_test)
testing_data_accuracy

0.9548076923076924

In [37]:
X_news=X_test[0]
prediction=model.predict(X_news)
if prediction[0]==0:
    print('the news is real')
else:
    print('the news is fake')  
print(Y_test[0])      
    

the news is fake
1
