In [4]:
import pandas as pd
import numpy as np
import re
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score


In [6]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Indumathi\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


True

In [10]:
print(stopwords.words('english'))

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

In [13]:
data=pd.read_csv('C:/Users/Indumathi/Desktop/testcases/train.csv')
df=pd.DataFrame(data)

In [15]:
df.shape

(20800, 5)

In [16]:
df.columns

Index(['id', 'title', 'author', 'text', 'label'], dtype='object')

In [19]:
df.describe()

Unnamed: 0,id,label
count,20800.0,20800.0
mean,10399.5,0.500625
std,6004.587135,0.500012
min,0.0,0.0
25%,5199.75,0.0
50%,10399.5,1.0
75%,15599.25,1.0
max,20799.0,1.0


In [18]:
df.isnull().sum() # no missing values

id           0
title      558
author    1957
text        39
label        0
dtype: int64

In [20]:
df=df.fillna('')

In [21]:
df.isnull().sum()

id        0
title     0
author    0
text      0
label     0
dtype: int64

In [27]:
# combineing title and author
df['content']=df['author']+'-'+df['title']


In [30]:
df['content']

0        Darrell Lucus-House Dem Aide: We Didn’t Even S...
1        Daniel J. Flynn-FLYNN: Hillary Clinton, Big Wo...
2        Consortiumnews.com-Why the Truth Might Get You...
3        Jessica Purkiss-15 Civilians Killed In Single ...
4        Howard Portnoy-Iranian woman jailed for fictio...
                               ...                        
20795    Jerome Hudson-Rapper T.I.: Trump a ’Poster Chi...
20796    Benjamin Hoffman-N.F.L. Playoffs: Schedule, Ma...
20797    Michael J. de la Merced and Rachel Abrams-Macy...
20798    Alex Ansary-NATO, Russia To Hold Parallel Exer...
20799              David Swanson-What Keeps the F-35 Alive
Name: content, Length: 20800, dtype: object

In [29]:
x=df.drop(columns='label',axis=1)
y=df['label']

In [31]:
x

Unnamed: 0,id,title,author,text,content
0,0,House Dem Aide: We Didn’t Even See Comey’s Let...,Darrell Lucus,House Dem Aide: We Didn’t Even See Comey’s Let...,Darrell Lucus-House Dem Aide: We Didn’t Even S...
1,1,"FLYNN: Hillary Clinton, Big Woman on Campus - ...",Daniel J. Flynn,Ever get the feeling your life circles the rou...,"Daniel J. Flynn-FLYNN: Hillary Clinton, Big Wo..."
2,2,Why the Truth Might Get You Fired,Consortiumnews.com,"Why the Truth Might Get You Fired October 29, ...",Consortiumnews.com-Why the Truth Might Get You...
3,3,15 Civilians Killed In Single US Airstrike Hav...,Jessica Purkiss,Videos 15 Civilians Killed In Single US Airstr...,Jessica Purkiss-15 Civilians Killed In Single ...
4,4,Iranian woman jailed for fictional unpublished...,Howard Portnoy,Print \nAn Iranian woman has been sentenced to...,Howard Portnoy-Iranian woman jailed for fictio...
...,...,...,...,...,...
20795,20795,Rapper T.I.: Trump a ’Poster Child For White S...,Jerome Hudson,Rapper T. I. unloaded on black celebrities who...,Jerome Hudson-Rapper T.I.: Trump a ’Poster Chi...
20796,20796,"N.F.L. Playoffs: Schedule, Matchups and Odds -...",Benjamin Hoffman,When the Green Bay Packers lost to the Washing...,"Benjamin Hoffman-N.F.L. Playoffs: Schedule, Ma..."
20797,20797,Macy’s Is Said to Receive Takeover Approach by...,Michael J. de la Merced and Rachel Abrams,The Macy’s of today grew from the union of sev...,Michael J. de la Merced and Rachel Abrams-Macy...
20798,20798,"NATO, Russia To Hold Parallel Exercises In Bal...",Alex Ansary,"NATO, Russia To Hold Parallel Exercises In Bal...","Alex Ansary-NATO, Russia To Hold Parallel Exer..."


In [32]:
y

0        1
1        0
2        1
3        1
4        1
        ..
20795    0
20796    0
20797    0
20798    1
20799    1
Name: label, Length: 20800, dtype: int64

In [34]:
#stemming-reducing to its root word
ps=PorterStemmer()

In [38]:
def stemming(content):
    stem_content=re.sub('[^a-zA-Z]','',content)
    stem_content=stem_content.lower()
    stem_content=stem_content.split()
    stem_content=[ps.stem(word) for word in stem_content if not word in stopwords.words('english')]
    stem_content=''.join(stem_content)
    return stem_content

In [39]:
df['content']=df['content'].apply(stemming)

In [40]:
df['content']

0        darrelllucushousedemaidewedidntevenseecomeysle...
1        danieljflynnflynnhillaryclintonbigwomanoncampu...
2               consortiumnewscomwhythetruthmightgetyoufir
3        jessicapurkisscivilianskilledinsingleusairstri...
4        howardportnoyiranianwomanjailedforfictionalunp...
                               ...                        
20795    jeromehudsonrappertitrumpaposterchildforwhites...
20796    benjaminhoffmannflplayoffsschedulematchupsando...
20797    michaeljdelamercedandrachelabramsmacysissaidto...
20798    alexansarynatorussiatoholdparallelexercisesinb...
20799                          davidswansonwhatkeepsthefal
Name: content, Length: 20800, dtype: object

In [41]:
x=df['content'].values
y=df['label'].values

In [44]:
y.shape

(20800,)

In [46]:
# convertimg text to  numerical values
v=TfidfVectorizer()
v.fit(x)
x=v.transform(x)

In [49]:
x

<20800x20137 sparse matrix of type '<class 'numpy.float64'>'
	with 20659 stored elements in Compressed Sparse Row format>

In [59]:
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.2,stratify=y,random_state=2)

In [54]:
x_train

<16640x20137 sparse matrix of type '<class 'numpy.float64'>'
	with 16519 stored elements in Compressed Sparse Row format>

In [51]:
m=LogisticRegression()

In [52]:
m.fit(x_train,y_train)

LogisticRegression()

In [55]:
x_t_p=m.predict(x_train)
tda=accuracy_score(x_t_p,y_train)

In [56]:
tda

0.9927283653846154

In [60]:
x_p=m.predict(x_test)
a=accuracy_score(x_p,y_test)

In [61]:
a

0.5329326923076924

In [74]:
new=x_test[80]

p=m.predict(new)
if p==0:
    print("Real")
else:
    print("Fake")

Real


In [73]:
m.predict(x_test[80])

array([0], dtype=int64)