In [1]:

import pandas as pd
import spacy
import random

from spacy.util import minibatch, compounding
from spacy.training import Example

from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_recall_fscore_support



In [2]:
df = pd.read_csv("labeled_news.csv")

df = df.dropna(subset=["text"])
df = df[df["language"] == "english"]

df.head()


Unnamed: 0,author,published,title,text,language,site_url,main_img_url,type,label,title_without_stopwords,text_without_stopwords,hasImage
0,Barracuda Brigade,2016-10-26T21:41:00.000+03:00,muslims busted they stole millions in govt ben...,print they should pay all the back all the mon...,english,100percentfedup.com,http://bb4sp.com/wp-content/uploads/2016/10/Fu...,bias,Real,muslims busted stole millions govt benefits,print pay back money plus interest entire fami...,1.0
1,reasoning with facts,2016-10-29T08:47:11.259+03:00,re why did attorney general loretta lynch plea...,why did attorney general loretta lynch plead t...,english,100percentfedup.com,http://bb4sp.com/wp-content/uploads/2016/10/Fu...,bias,Real,attorney general loretta lynch plead fifth,attorney general loretta lynch plead fifth bar...,1.0
2,Barracuda Brigade,2016-10-31T01:41:49.479+02:00,breaking weiner cooperating with fbi on hillar...,red state \nfox news sunday reported this mor...,english,100percentfedup.com,http://bb4sp.com/wp-content/uploads/2016/10/Fu...,bias,Real,breaking weiner cooperating fbi hillary email ...,red state fox news sunday reported morning ant...,1.0
3,Fed Up,2016-11-01T05:22:00.000+02:00,pin drop speech by father of daughter kidnappe...,email kayla mueller was a prisoner and torture...,english,100percentfedup.com,http://100percentfedup.com/wp-content/uploads/...,bias,Real,pin drop speech father daughter kidnapped kill...,email kayla mueller prisoner tortured isis cha...,1.0
4,Fed Up,2016-11-01T21:56:00.000+02:00,fantastic trumps point plan to reform healthc...,email healthcare reform to make america great ...,english,100percentfedup.com,http://100percentfedup.com/wp-content/uploads/...,bias,Real,fantastic trumps point plan reform healthcare ...,email healthcare reform make america great sin...,1.0


In [3]:
nlp_base = spacy.load("en_core_web_sm")


In [4]:
TRAIN_DATA = []

sample_size = min(2000, len(df))
texts = df["text"].sample(sample_size, random_state=42)

for text in texts:
    doc = nlp_base(text)
    entities = [
        (ent.start_char, ent.end_char, ent.label_)
        for ent in doc.ents
    ]
    if entities:
        TRAIN_DATA.append((text, {"entities": entities}))

print("NER training samples:", len(TRAIN_DATA))
print(TRAIN_DATA[0])


NER training samples: 1868
('video women on the street explain why they hate hillary by chase stephens november   \nmedia analyst and youtuber mark dice took to the streets once again to ask the public what they think of this years presidential election specifically what they think about ol crooked hillary \nin the video below titled why women hate hillary clinton dice gives women the opportunity to voice their concerns about the character and integrity of the former secretary of state and the results are brutally honest \n\n\nwhats wrong with hillary why do everybody hate her dice asks the first woman who replies because shes a criminal \nshes a liar and i hate her voice another woman tells him adding her voice drives me crazy \na few more ladies tell mark that hillary is a liar shes terrible with one woman adding she lost to obama eight years ago so why is she still running its kind of absurd to me \nthe women also had problems with hillarys constant scandals dirty money bad policies

In [5]:
train_data, test_data = train_test_split(
    TRAIN_DATA, test_size=0.2, random_state=42
)


In [6]:
nlp = spacy.blank("en")
ner = nlp.add_pipe("ner")

for _, ann in TRAIN_DATA:
    for ent in ann["entities"]:
        ner.add_label(ent[2])


In [7]:
optimizer = nlp.initialize()
n_iter = 5

for epoch in range(n_iter):
    random.shuffle(train_data)
    losses = {}

    batches = minibatch(train_data, size=compounding(4.0, 32.0, 1.001))

    for batch in batches:
        examples = []
        for text, annotations in batch:
            doc = nlp.make_doc(text)
            example = Example.from_dict(doc, annotations)
            examples.append(example)

        nlp.update(
            examples,
            drop=0.3,
            sgd=optimizer,
            losses=losses
        )

    print(f"Epoch {epoch+1}/{n_iter} | Loss: {losses['ner']:.4f}")


Epoch 1/5 | Loss: 66719.2656
Epoch 2/5 | Loss: 30224.2461
Epoch 3/5 | Loss: 25762.7695
Epoch 4/5 | Loss: 23527.9688
Epoch 5/5 | Loss: 21641.6738


In [8]:
nlp.to_disk("news_ner_model")
print("Model saved successfully")


Model saved successfully


In [9]:
def evaluate_ner(nlp, data):
    y_true, y_pred = [], []

    for text, ann in data:
        doc = nlp(text)

        true_ents = {(s, e, l) for s, e, l in ann["entities"]}
        pred_ents = {(ent.start_char, ent.end_char, ent.label_) for ent in doc.ents}

        for ent in true_ents:
            y_true.append(ent[2])
            y_pred.append(ent[2] if ent in pred_ents else "O")

        for ent in pred_ents - true_ents:
            y_true.append("O")
            y_pred.append(ent[2])

    p, r, f1, _ = precision_recall_fscore_support(
        y_true, y_pred, average="micro"
    )
    return p, r, f1


In [10]:
precision, recall, f1 = evaluate_ner(nlp, test_data)

print(f"Precision: {precision:.4f}")
print(f"Recall:    {recall:.4f}")
print(f"F1-score:  {f1:.4f}")


Precision: 0.6721
Recall:    0.6721
F1-score:  0.6721
