In [3]:
import pandas as pd

In [4]:
# Read the three fake-news CSV files (train split, test split, test-set label file) in one pass.
train, test, labels = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../data/fake_news/train.csv",
        "../data/fake_news/test.csv",
        "../data/fake_news/labels.csv",
    )
)

In [5]:
train.columns

Index(['id', 'title', 'author', 'text', 'label'], dtype='object')

In [6]:
train.head()

Unnamed: 0,id,title,author,text,label
0,0,House Dem Aide: We Didn’t Even See Comey’s Let...,Darrell Lucus,House Dem Aide: We Didn’t Even See Comey’s Let...,1
1,1,"FLYNN: Hillary Clinton, Big Woman on Campus - ...",Daniel J. Flynn,Ever get the feeling your life circles the rou...,0
2,2,Why the Truth Might Get You Fired,Consortiumnews.com,"Why the Truth Might Get You Fired October 29, ...",1
3,3,15 Civilians Killed In Single US Airstrike Hav...,Jessica Purkiss,Videos 15 Civilians Killed In Single US Airstr...,1
4,4,Iranian woman jailed for fictional unpublished...,Howard Portnoy,Print \nAn Iranian woman has been sentenced to...,1


In [7]:
train.shape

(20800, 5)

In [8]:
test.shape

(5200, 4)

In [9]:
labels.shape

(5200, 2)

### Data Exploration 

In [21]:
# Keep only the rows that actually have article text
# (some of these rows are not even in English).
clean_train = train.loc[train['text'].notna()]
clean_train.shape

(20761, 5)

In [30]:
test[test['text'].isna()]

Unnamed: 0,id,title,author,text
589,21389,[Vidéo] Que trouve-t-on dans « Le Gorafi de l’...,,
626,21426,Nous avons comparé les programmes d’Alain Jupp...,,
978,21778,« J’y crois encore ! » par Alain Juppé en trai...,,
2257,23057,Horoscope du 14 novembre 2016 >> Le Gorafi,,
2892,23692,Une fan demande le remboursement de son tatoua...,,
4736,25536,Révélations – François Fillon serait de droite...,,
5092,25892,Des traces de vin rouge détectées dans le Beau...,,


In [29]:
# Same filter as for the training set: drop test rows whose text is missing.
clean_test = test.loc[test['text'].notna()]
clean_test.shape

(5193, 4)

In [32]:
clean_test.head()

Unnamed: 0,id,title,author,text
0,20800,"Specter of Trump Loosens Tongues, if Not Purse...",David Streitfeld,"PALO ALTO, Calif. — After years of scorning..."
1,20801,Russian warships ready to strike terrorists ne...,,Russian warships ready to strike terrorists ne...
2,20802,#NoDAPL: Native American Leaders Vow to Stay A...,Common Dreams,Videos #NoDAPL: Native American Leaders Vow to...
3,20803,"Tim Tebow Will Attempt Another Comeback, This ...",Daniel Victor,"If at first you don’t succeed, try a different..."
4,20804,Keiser Report: Meme Wars (E995),Truth Broadcast Network,42 mins ago 1 Views 0 Comments 0 Likes 'For th...


In [33]:
labels.head()

Unnamed: 0,id,label
0,20800,0
1,20801,1
2,20802,0
3,20803,1
4,20804,1


In [35]:
# Attach the labels to the test rows that have text.
# The merge starts from `test` rather than from `clean_test`, so re-running this
# cell does not merge `labels` a second time (which would produce
# `label_x`/`label_y` columns and break every later `clean_test.label`).
# validate="one_to_one" raises pandas.errors.MergeError if an id is duplicated
# on either side, instead of silently duplicating rows.
clean_test = pd.merge(
    test[~test['text'].isna()],
    labels,
    on="id",
    validate="one_to_one",
)

In [38]:
clean_test.shape

(5193, 5)

In [39]:
clean_test.head()

Unnamed: 0,id,title,author,text,label
0,20800,"Specter of Trump Loosens Tongues, if Not Purse...",David Streitfeld,"PALO ALTO, Calif. — After years of scorning...",0
1,20801,Russian warships ready to strike terrorists ne...,,Russian warships ready to strike terrorists ne...,1
2,20802,#NoDAPL: Native American Leaders Vow to Stay A...,Common Dreams,Videos #NoDAPL: Native American Leaders Vow to...,0
3,20803,"Tim Tebow Will Attempt Another Comeback, This ...",Daniel Victor,"If at first you don’t succeed, try a different...",1
4,20804,Keiser Report: Meme Wars (E995),Truth Broadcast Network,42 mins ago 1 Views 0 Comments 0 Likes 'For th...,1


### MVP Pipeline

In [22]:
from sklearn.feature_extraction.text import CountVectorizer

# Learn the vocabulary from the training text and encode each document as token counts.
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(clean_train['text'])
X_train_counts.shape

(20761, 180445)

In [26]:
from sklearn.feature_extraction.text import TfidfTransformer

# Re-weight the raw counts with TF-IDF; the transformer is fitted on the training counts only.
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)
X_train_tfidf

<20761x180445 sparse matrix of type '<class 'numpy.float64'>'
	with 6871545 stored elements in Compressed Sparse Row format>

In [27]:
from sklearn.naive_bayes import MultinomialNB

# Fit a multinomial naive Bayes classifier on the TF-IDF features of the training set.
clf = MultinomialNB().fit(
    X_train_tfidf,
    clean_train['label'],
)

In [28]:
from sklearn import metrics

# Training metrics: score the classifier on the same data it was fitted on.
preds = clf.predict(X_train_tfidf)
print(metrics.classification_report(y_true=clean_train['label'], y_pred=preds))

              precision    recall  f1-score   support

           0       0.81      1.00      0.89     10387
           1       0.99      0.77      0.86     10374

    accuracy                           0.88     20761
   macro avg       0.90      0.88      0.88     20761
weighted avg       0.90      0.88      0.88     20761



In [31]:
# Test set: reuse the vectorizer and TF-IDF transformer fitted on the training
# data (transform only, no refitting).
X_new_counts = count_vect.transform(clean_test['text'])
X_new_tfidf = tfidf_transformer.transform(X_new_counts)

In [40]:
# Test metrics.
# NOTE(review): the recorded outputs show a large gap between train and test
# accuracy — worth confirming that labels.csv holds the true test labels.
preds = clf.predict(X_new_tfidf)
print(metrics.classification_report(y_true=clean_test['label'], y_pred=preds))

              precision    recall  f1-score   support

           0       0.52      0.76      0.62      2339
           1       0.69      0.43      0.53      2854

    accuracy                           0.58      5193
   macro avg       0.61      0.60      0.58      5193
weighted avg       0.61      0.58      0.57      5193



In [41]:
metrics.confusion_matrix(clean_test.label, preds)

array([[1788,  551],
       [1629, 1225]])

In [42]:
# Overall test accuracy, computed from the current predictions instead of from
# counts copied by hand out of the confusion matrix (which go stale as soon as
# the data or the model changes).
metrics.accuracy_score(clean_test.label, preds)

0.5802041209320239

In [43]:
from sklearn.pipeline import Pipeline

# The same count-vectorizer -> TF-IDF -> naive Bayes chain, bundled into a
# single estimator that is fitted and applied directly on raw text.
text_clf = Pipeline(steps=[
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', MultinomialNB()),
])

In [44]:
text_clf.fit(clean_train.text, clean_train.label)

In [46]:
preds = text_clf.predict(clean_test.text)

In [47]:
print(metrics.classification_report(clean_test.label, preds))

              precision    recall  f1-score   support

           0       0.52      0.76      0.62      2339
           1       0.69      0.43      0.53      2854

    accuracy                           0.58      5193
   macro avg       0.61      0.60      0.58      5193
weighted avg       0.61      0.58      0.57      5193



### SVM

In [49]:
from sklearn.linear_model import SGDClassifier

# Same vectorizer -> TF-IDF front end, with a hinge-loss, L2-penalised linear
# classifier trained by SGD as the final step (the "SVM" of this section).
# random_state is fixed for repeatable runs; tol=None is presumably there to
# switch off tolerance-based stopping so that max_iter epochs are run —
# confirm against the SGDClassifier documentation.
text_clf = Pipeline(steps=[
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', SGDClassifier(
        loss='hinge',
        penalty='l2',
        alpha=1e-3,
        random_state=42,
        max_iter=5,
        tol=None,
    )),
])

In [50]:
text_clf.fit(clean_train.text, clean_train.label)

In [51]:
preds = text_clf.predict(clean_test.text)

In [52]:
print(metrics.classification_report(clean_test.label, preds))

              precision    recall  f1-score   support

           0       0.60      0.66      0.63      2339
           1       0.70      0.63      0.66      2854

    accuracy                           0.65      5193
   macro avg       0.65      0.65      0.65      5193
weighted avg       0.65      0.65      0.65      5193



Source: scikit-learn tutorial "Working With Text Data" — https://scikit-learn.org/stable/tutorial/text_analytics/working_with_text_data.html#loading-the-20-newsgroups-dataset