In [2]:
import pandas as pd
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.feature_extraction.text import CountVectorizer

# Overview

### Goal

Broad: Train a machine learning model that can determine whether an article online is real or fake.  

Specific Objective: 

Clean our data correctly (learn methods such as Stemming and Lemmatization)

Perform Feature Vectorization (learn TF-IDF, N-gram, Sentimental analysis)

Train the models (learn fine tuning models: hyper-parameters)

Evaluation (Determine which one of the models are the best)Real world testing by deploying the models (Learn how to deploy the model)

### Dataset

train.csv: A full training dataset with the following attributes:

id: unique id for a news article

title: the title of a news article

author: author of the news articletext: the text of the article; could be incomplete

label: a label that marks the article as potentially unreliable

1: unreliable

0: reliable

Session 1:

Stuff to do 

1. Cleaning data 

In [3]:
df = pd.read_csv("train.csv")
df

Unnamed: 0,id,title,author,text,label
0,0,House Dem Aide: We Didn’t Even See Comey’s Let...,Darrell Lucus,House Dem Aide: We Didn’t Even See Comey’s Let...,1
1,1,"FLYNN: Hillary Clinton, Big Woman on Campus - ...",Daniel J. Flynn,Ever get the feeling your life circles the rou...,0
2,2,Why the Truth Might Get You Fired,Consortiumnews.com,"Why the Truth Might Get You Fired October 29, ...",1
3,3,15 Civilians Killed In Single US Airstrike Hav...,Jessica Purkiss,Videos 15 Civilians Killed In Single US Airstr...,1
4,4,Iranian woman jailed for fictional unpublished...,Howard Portnoy,Print \r\nAn Iranian woman has been sentenced ...,1
...,...,...,...,...,...
20795,20795,Rapper T.I.: Trump a ’Poster Child For White S...,Jerome Hudson,Rapper T. I. unloaded on black celebrities who...,0
20796,20796,"N.F.L. Playoffs: Schedule, Matchups and Odds -...",Benjamin Hoffman,When the Green Bay Packers lost to the Washing...,0
20797,20797,Macy’s Is Said to Receive Takeover Approach by...,Michael J. de la Merced and Rachel Abrams,The Macy’s of today grew from the union of sev...,0
20798,20798,"NATO, Russia To Hold Parallel Exercises In Bal...",Alex Ansary,"NATO, Russia To Hold Parallel Exercises In Bal...",1


In [4]:
df['author'].fillna('Unknown', inplace=True)
df['title'].fillna('Ambiguous', inplace=True)
df['text'].fillna('Ambiguous', inplace=True)
df.drop_duplicates(inplace=True)
df

Unnamed: 0,id,title,author,text,label
0,0,House Dem Aide: We Didn’t Even See Comey’s Let...,Darrell Lucus,House Dem Aide: We Didn’t Even See Comey’s Let...,1
1,1,"FLYNN: Hillary Clinton, Big Woman on Campus - ...",Daniel J. Flynn,Ever get the feeling your life circles the rou...,0
2,2,Why the Truth Might Get You Fired,Consortiumnews.com,"Why the Truth Might Get You Fired October 29, ...",1
3,3,15 Civilians Killed In Single US Airstrike Hav...,Jessica Purkiss,Videos 15 Civilians Killed In Single US Airstr...,1
4,4,Iranian woman jailed for fictional unpublished...,Howard Portnoy,Print \r\nAn Iranian woman has been sentenced ...,1
...,...,...,...,...,...
20795,20795,Rapper T.I.: Trump a ’Poster Child For White S...,Jerome Hudson,Rapper T. I. unloaded on black celebrities who...,0
20796,20796,"N.F.L. Playoffs: Schedule, Matchups and Odds -...",Benjamin Hoffman,When the Green Bay Packers lost to the Washing...,0
20797,20797,Macy’s Is Said to Receive Takeover Approach by...,Michael J. de la Merced and Rachel Abrams,The Macy’s of today grew from the union of sev...,0
20798,20798,"NATO, Russia To Hold Parallel Exercises In Bal...",Alex Ansary,"NATO, Russia To Hold Parallel Exercises In Bal...",1


In [4]:
vectorizer = CountVectorizer()
test = df['text']
X = vectorizer.fit_transform(test)
vectorizer.get_feature_names_out()
print(X.toarray())

ValueError: np.nan is an invalid document, expected byte or unicode string.

In [5]:
from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer(ngram_range=(1, 2), max_features=10)
test = df['text']
X = vectorizer.fit_transform(test)
text_bag = vectorizer.get_feature_names_out()
print(X.toarray())
print(text_bag)

[[16  4 15 ... 21 41 20]
 [15 10 15 ... 12 49 17]
 [31 17 16 ... 23 82 36]
 ...
 [16  8 11 ...  9 47 22]
 [12  0 12 ...  0 10  6]
 [47 19 11 ...  8 54 19]]
['and' 'for' 'in' 'is' 'it' 'of' 'on' 'that' 'the' 'to']


In [6]:
from sklearn.neighbors import KNeighborsClassifier
knn_x_train, knn_x_test, knn_y_train, knn_y_test = train_test_split(X, df['label'], test_size=0.2, random_state=42)
knn = KNeighborsClassifier()
knn.fit(knn_x_train, knn_y_train)

knn_y_pre = knn.predict(knn_x_test)
print(knn_y_test)
print(knn_y_pre)


14649    1
9231     1
6473     0
18736    0
12347    1
        ..
751      0
2081     1
13241    1
18786    0
14133    1
Name: label, Length: 4160, dtype: int64
[1 0 0 ... 1 0 1]


ngram model

In [10]:
vectorizer = CountVectorizer(analyzer="char", ngram_range=(2, 5), max_features=10, tokenizer=None, preprocessor= None)
test = df['text']
X = vectorizer.fit_transform(test)
text_bag = vectorizer.get_feature_names_out()
print(X.toarray())
print(text_bag)

[[ 88 132  88 ...  96 105 110]
 [ 79 114  76 ...  72  65  89]
 [150 210 151 ... 151 117 176]
 ...
 [ 87 103  61 ... 118  79  78]
 [ 30  29  12 ...  34  13  21]
 [137 132  86 ... 126  69 111]]
[' a' ' t' ' th' 'd ' 'e ' 'he' 'in' 's ' 't ' 'th']


In [None]:

X, y = load_iris(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(X, df['label'], test_size=0.2, random_state=42)
gnb = GaussianNB()
y_pred = gnb.fit(X_train, y_train).predict(X_test)
print("Number of mislabeled points out of a total %d points : %d" % (X_test.shape[0], (y_test != y_pred).sum()))

<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=29de9796-73f8-4fbc-85bc-ba74d1c9abf6' target="_blank">
 </img>
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>