In [98]:
import pandas as pd
import numpy as np
import re
from nltk.corpus import stopwords # include stopwords which add no value to context like a, the, etc.
from nltk.stem.porter import PorterStemmer # removes the prefix and suffix of the words
from sklearn.feature_extraction.text import TfidfVectorizer # used to change words to feature vectors
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

In [99]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /home/coral/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [100]:
# printing available stopwords in english
print(stopwords.words('english'))

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

In [101]:
# loading the dataset to pandas dataframe
news_dataset = pd.read_csv('./data/train.csv',index_col='id')

In [102]:
news_dataset

Unnamed: 0_level_0,title,author,text,label
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,House Dem Aide: We Didn’t Even See Comey’s Letter Until Jason Chaffetz Tweeted It,Darrell Lucus,"House Dem Aide: We Didn’t Even See Comey’s Letter Until Jason Chaffetz Tweeted It By Darrell Lucus on October 30, 2016 Subscribe Jason Chaffetz on the stump in American Fork, Utah ( image courtesy Michael Jolley, available under a Creative Common...",1
1,"FLYNN: Hillary Clinton, Big Woman on Campus - Breitbart",Daniel J. Flynn,"Ever get the feeling your life circles the roundabout rather than heads in a straight line toward the intended destination? [Hillary Clinton remains the big woman on campus in leafy, liberal Wellesley, Massachusetts. Everywhere else votes her mos...",0
2,Why the Truth Might Get You Fired,Consortiumnews.com,"Why the Truth Might Get You Fired October 29, 2016 \nThe tension between intelligence analysts and political policymakers has always been between honest assessments and desired results, with the latter often overwhelming the former, as in the Ira...",1
3,15 Civilians Killed In Single US Airstrike Have Been Identified,Jessica Purkiss,Videos 15 Civilians Killed In Single US Airstrike Have Been Identified The rate at which civilians are being killed by American airstrikes in Afghanistan is now higher than it was in 2014 when the US was engaged in active combat operations. Pho...,1
4,Iranian woman jailed for fictional unpublished story about woman stoned to death for adultery,Howard Portnoy,"Print \nAn Iranian woman has been sentenced to six years in prison after Iran’s Revolutionary Guard searched her home and found a notebook that contained a fictional story she’d written about a woman who was stoned to death, according to the Eura...",1
...,...,...,...,...
20795,Rapper T.I.: Trump a ’Poster Child For White Supremacy’,Jerome Hudson,"Rapper T. I. unloaded on black celebrities who met with Donald Trump after the election, saying they failed to challenge the president for disrespecting and degrading black voters during the campaign. [The Atlanta — based artist told the of Th...",0
20796,"N.F.L. Playoffs: Schedule, Matchups and Odds - The New York Times",Benjamin Hoffman,"When the Green Bay Packers lost to the Washington Redskins in Week 11, dropping to Aaron Rodgers vowed to “run the table” in a march to the playoffs. With a victory over the Detroit Lions on Sunday night, the team fulfilled Rodgers’ promise. ...",0
20797,Macy’s Is Said to Receive Takeover Approach by Hudson’s Bay - The New York Times,Michael J. de la Merced and Rachel Abrams,"The Macy’s of today grew from the union of several great names in American retailing, including its namesake chain, Bloomingdale’s and Marshall Field’s. But the ambitious owner of Saks Fifth Avenue has broached the idea of taking the union even f...",0
20798,"NATO, Russia To Hold Parallel Exercises In Balkans",Alex Ansary,"NATO, Russia To Hold Parallel Exercises In Balkans 11/02/2016 \nPRESS TV \nRussia’s military and NATO forces are holding parallel military exercises in two neighboring Balkan countries. \nRussian troops will participate in war games in Serbia whi...",1


In [103]:
#  counting the number of missing values in the dataset
news_dataset.isnull().sum()

title      558
author    1957
text        39
label        0
dtype: int64

In [104]:
# replacing the null values with empty strings
news_dataset = news_dataset.fillna('null')

In [105]:
# merging the author name and news title
news_dataset['content'] = news_dataset['author'] + ' ' + news_dataset['title']

In [106]:
# we will only be focusing on author and title as this yields good accuracy score and model training will be faster
news_dataset['content']

id
0                                   Darrell Lucus House Dem Aide: We Didn’t Even See Comey’s Letter Until Jason Chaffetz Tweeted It
1                                                           Daniel J. Flynn FLYNN: Hillary Clinton, Big Woman on Campus - Breitbart
2                                                                              Consortiumnews.com Why the Truth Might Get You Fired
3                                                   Jessica Purkiss 15 Civilians Killed In Single US Airstrike Have Been Identified
4                      Howard Portnoy Iranian woman jailed for fictional unpublished story about woman stoned to death for adultery
                                                                    ...                                                            
20795                                                         Jerome Hudson Rapper T.I.: Trump a ’Poster Child For White Supremacy’
20796                                            Benjamin Hoffman N.F.L. 

### Stemming 
Stemming is the process of reducing a word to its Root word
* example: reducing actor, actress, acting --> act

In [107]:
port_stem = PorterStemmer()

In [108]:
def stemming(content):
    '''Preprocessing step of Stemming on given text value'''
    # remove all non-alphabettic values
    stemmed_content = re.sub('[^a-zA-Z]',' ',content)
    stemmed_content = stemmed_content.lower()
    stemmed_content = stemmed_content.split()
    # removing all stopwords and stemming remaining words
    stemmed_content = [port_stem.stem(word) for word in stemmed_content if not word in stopwords.words('english')]
    stemmed_content = ' '.join(stemmed_content)
    return stemmed_content

In [109]:
news_dataset['content'] = news_dataset['content'].apply(stemming)

In [110]:
pd.set_option('display.max_colwidth',250)

In [111]:
news_dataset['content']

id
0                                darrel lucu hous dem aid even see comey letter jason chaffetz tweet
1                                     daniel j flynn flynn hillari clinton big woman campu breitbart
2                                                             consortiumnew com truth might get fire
3                                           jessica purkiss civilian kill singl us airstrik identifi
4               howard portnoy iranian woman jail fiction unpublish stori woman stone death adulteri
                                                    ...                                             
20795                                         jerom hudson rapper trump poster child white supremaci
20796                               benjamin hoffman n f l playoff schedul matchup odd new york time
20797    michael j de la merc rachel abram maci said receiv takeov approach hudson bay new york time
20798                                           alex ansari nato russia hold parallel ex

In [112]:
# seprating the data and the label
X =  news_dataset['content'].values
y = news_dataset['label'].values

In [113]:
X

array(['darrel lucu hous dem aid even see comey letter jason chaffetz tweet',
       'daniel j flynn flynn hillari clinton big woman campu breitbart',
       'consortiumnew com truth might get fire', ...,
       'michael j de la merc rachel abram maci said receiv takeov approach hudson bay new york time',
       'alex ansari nato russia hold parallel exercis balkan',
       'david swanson keep f aliv'], dtype=object)

In [114]:
y

array([1, 0, 1, ..., 0, 1, 1])

In [115]:
# converting the textual data to numerical data
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(X)

In [116]:
print(X)

  (0, 15687)	0.28485063562728646
  (0, 2483)	0.3676519686797209
  (0, 7692)	0.24785219520671603
  (0, 8630)	0.29212514087043684
  (0, 2959)	0.2468450128533713
  (0, 13474)	0.2565896679337957
  (0, 4973)	0.233316966909351
  (0, 267)	0.27010124977708766
  (0, 3792)	0.2705332480845492
  (0, 7005)	0.21874169089359144
  (0, 8909)	0.3635963806326075
  (0, 3600)	0.3598939188262559
  (1, 1894)	0.15521974226349364
  (1, 2223)	0.3827320386859759
  (1, 16800)	0.30071745655510157
  (1, 1497)	0.2939891562094648
  (1, 2813)	0.19094574062359204
  (1, 6816)	0.1904660198296849
  (1, 5503)	0.7143299355715573
  (1, 3568)	0.26373768806048464
  (2, 5389)	0.3866530551182615
  (2, 5968)	0.3474613386728292
  (2, 9620)	0.49351492943649944
  (2, 15612)	0.41544962664721613
  (2, 2943)	0.3179886800654691
  :	:
  (20797, 1287)	0.3353805680413986
  (20797, 13123)	0.24825263521976057
  (20797, 12345)	0.27263457663336677
  (20797, 14968)	0.3115945315488075
  (20797, 12139)	0.24778257724396505
  (20797, 9518)	0.295420

In [117]:
# spliting training and test data
X_train,X_test, y_train,y_test = train_test_split(X,y, stratify=y, test_size = 0.2,random_state=2)

In [118]:
# Training the model: Logistic Regression
model = LogisticRegression()

In [119]:
model.fit(X_train,y_train)

In [120]:
model?

[0;31mType:[0m        LogisticRegression
[0;31mString form:[0m LogisticRegression()
[0;31mFile:[0m        ~/Downloads/Python/fake_news_prediction/venv/lib/python3.11/site-packages/sklearn/linear_model/_logistic.py
[0;31mDocstring:[0m  
Logistic Regression (aka logit, MaxEnt) classifier.

In the multiclass case, the training algorithm uses the one-vs-rest (OvR)
scheme if the 'multi_class' option is set to 'ovr', and uses the
cross-entropy loss if the 'multi_class' option is set to 'multinomial'.
(Currently the 'multinomial' option is supported only by the 'lbfgs',
'sag', 'saga' and 'newton-cg' solvers.)

This class implements regularized logistic regression using the
'liblinear' library, 'newton-cg', 'sag', 'saga' and 'lbfgs' solvers. **Note
that regularization is applied by default**. It can handle both dense
and sparse input. Use C-ordered arrays or CSR matrices containing 64-bit
floats for optimal performance; any other input format will be converted
(and copied).

The 'newto

In [121]:
f'Accuracy Score of test data: {model.score(X_test,y_test)}'

'Accuracy Score of test data: 0.984375'

In [122]:
test_dataset = pd.read_csv('./data/test.csv')

In [123]:
news_dataset.isnull().sum()

title      0
author     0
text       0
label      0
content    0
dtype: int64

In [124]:
test_dataset = test_dataset.fillna('null')

In [125]:
test_dataset['content'] = test_dataset['author'] + ' ' + test_dataset['title']

In [126]:
test_dataset['content']

0                  David Streitfeld Specter of Trump Loosens Tongues, if Not Purse Strings, in Silicon Valley - The New York Times
1                                                                     null Russian warships ready to strike terrorists near Aleppo
2                               Common Dreams #NoDAPL: Native American Leaders Vow to Stay All Winter, File Lawsuit Against Police
3                                Daniel Victor Tim Tebow Will Attempt Another Comeback, This Time in Baseball - The New York Times
4                                                                          Truth Broadcast Network Keiser Report: Meme Wars (E995)
                                                                   ...                                                            
5195                                                   Jody Rosen The Bangladeshi Traffic Jam That Never Ends - The New York Times
5196    Sheryl Gay Stolberg John Kasich Signs One Abortion Bill in Ohio but Vetoes 

In [127]:
test_dataset['content'] = test_dataset['content'].apply(stemming)

In [128]:
df1 = vectorizer.transform(test_dataset['content'].values)

In [129]:
print(df1)

  (0, 16997)	0.08864595251126647
  (0, 16089)	0.3205594987716609
  (0, 15583)	0.11032513315099028
  (0, 15369)	0.38547947461491155
  (0, 15296)	0.08697876504498342
  (0, 14606)	0.3970837204511862
  (0, 14592)	0.34467447913058513
  (0, 13851)	0.32956045112215354
  (0, 12031)	0.38547947461491155
  (0, 10306)	0.08568694729690247
  (0, 8842)	0.36912420870480717
  (0, 3623)	0.209228842739656
  (1, 16474)	0.4846399248408354
  (1, 15143)	0.34963083036265247
  (1, 14604)	0.3501384489595611
  (1, 13049)	0.300608784796976
  (1, 12301)	0.37794791352263385
  (1, 10532)	0.174575212065545
  (1, 10219)	0.3723739651765699
  (1, 347)	0.3434186232983093
  (2, 16747)	0.3016628915147435
  (2, 16355)	0.2909537422615255
  (2, 14458)	0.30637021801826614
  (2, 11599)	0.21447077225895875
  (2, 10419)	0.35982822390508096
  :	:
  (5196, 1517)	0.19940396943572683
  (5196, 41)	0.2656503884474958
  (5197, 16997)	0.11797520134546996
  (5197, 15337)	0.33413616793311524
  (5197, 15296)	0.11575641107424592
  (5197, 148

In [143]:
model.predict(df1)

array([0, 1, 1, ..., 0, 1, 0])

In [144]:
test_dataset['label'] = model.predict(df1)

In [145]:
result_df = test_dataset[['id','label']]

In [146]:
result_df.set_index('id',inplace=True)

In [148]:
result_df

Unnamed: 0_level_0,label
id,Unnamed: 1_level_1
20800,0
20801,1
20802,1
20803,0
20804,1
...,...
25995,0
25996,0
25997,0
25998,1


In [147]:
result_df.to_csv('./data/predictions.csv')