In [1]:
import spacy
import pandas as pd
import numpy as np
from newsplease import NewsPlease

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score
from sklearn.linear_model import RidgeClassifier
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC

In [7]:
# Load the BBC news dataset (tab-separated). The path is written relative to the
# user's home directory ("~" is expanded by pandas, the same way the GDELT export
# is loaded further down) instead of a hard-coded /home/<user> prefix, so the
# notebook is not tied to one machine account.
df = pd.read_csv("~/Downloads/bbc_news_classification/bbc-news-data.csv", sep='\t')

In [8]:
# Encode the category labels as integers: `codes` holds, for every row, the
# position of that row's label within `categories.categories`.
categories = pd.Categorical(df['category'])
df['code'] = categories.codes

In [9]:
# Map each integer code back to its category label. The codes produced by
# pd.Categorical are positions in `categories.categories`, so enumerating that
# index gives the mapping directly -- no hard-coded class count (previously
# range(5)) and no per-code scan of `df`.
code_to_cat = dict(enumerate(categories.categories))
code_to_cat


{0: 'business', 1: 'entertainment', 2: 'politics', 3: 'sport', 4: 'tech'}

In [10]:
# Preview the first rows, including the newly added integer `code` column.
df.head(3)

Unnamed: 0,category,filename,title,content,code
0,business,001.txt,Ad sales boost Time Warner profit,Quarterly profits at US media giant TimeWarne...,0
1,business,002.txt,Dollar gains on Greenspan speech,The dollar has hit its highest level against ...,0
2,business,003.txt,Yukos unit buyer faces loan claim,The owners of embattled Russian oil giant Yuk...,0


In [8]:
# Stratified 5-fold cross-validation splitter. A fixed random_state makes the
# shuffled fold assignment -- and therefore the reported accuracies --
# reproducible across kernel restarts.
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

In [9]:
def test_model(estimator, model_name, folds, data=None):
    """Cross-validate `estimator` on TF-IDF features of the article text.

    A new TfidfVectorizer is fitted on the training rows of every fold, so the
    vocabulary and IDF weights are never computed from the held-out rows.

    Parameters
    ----------
    estimator : object exposing ``fit(X, y)`` and ``predict(X)``
        ``fit`` is called again on every fold.
    model_name : str
        Label used in the printed per-fold report.
    folds : iterable of (train_idx, test_idx)
        Positional row indices, e.g. the generator returned by ``skf.split(...)``.
    data : pandas.DataFrame, optional
        Frame with 'content' (text) and 'code' (integer label) columns.
        Defaults to the notebook-level ``df``.

    Returns
    -------
    tuple of (list of float, list of float)
        Per-fold train and test accuracies, in fold order. In a notebook, end
        the calling line with ``;`` if the echoed return value is unwanted.
    """
    if data is None:
        # Reading a module-level name needs no `global` statement.
        data = df

    train_acc = []
    test_acc = []

    for fold, (train_idx, test_idx) in enumerate(folds):
        # Splitters yield positions, so select with .iloc; label-based .loc is
        # only equivalent while the frame still has its default 0..n-1 index.
        train_df = data.iloc[train_idx].reset_index(drop=True)
        test_df = data.iloc[test_idx].reset_index(drop=True)

        vectorizer = TfidfVectorizer(sublinear_tf=True, min_df=5, norm='l2', encoding='latin-1', ngram_range=(1, 5), stop_words='english')

        X_train = vectorizer.fit_transform(train_df['content'])
        X_test = vectorizer.transform(test_df['content'])

        y_train = train_df['code']
        y_test = test_df['code']

        estimator.fit(X_train, y_train)

        y_hat_train = estimator.predict(X_train)
        y_hat_test = estimator.predict(X_test)

        train_acc.append(accuracy_score(y_train, y_hat_train))
        test_acc.append(accuracy_score(y_test, y_hat_test))

        print("Fold {} - {}".format(fold, model_name))
        print(f"\tTrain: {train_acc[-1]}\tTest: {test_acc[-1]}")

    # The per-fold scores used to be collected and then dropped; hand them back
    # so callers can aggregate or compare models.
    return train_acc, test_acc




In [11]:
def train_model(estimator, vectorizer, data=None):
    """Fit `vectorizer` and `estimator` on the full dataset and report training accuracy.

    Both objects are fitted in place, so the caller can reuse them afterwards
    (for example inside a Pipeline).

    Parameters
    ----------
    estimator : object exposing ``fit(X, y)`` and ``predict(X)``
    vectorizer : object exposing ``fit_transform(texts)``
    data : pandas.DataFrame, optional
        Frame with 'content' (text) and 'code' (integer label) columns.
        Defaults to the notebook-level ``df``.

    Returns
    -------
    float
        Accuracy on the training data itself (also printed). This is not an
        estimate of generalisation performance.
    """
    if data is None:
        # Reading a module-level name needs no `global` statement.
        data = df

    X_train = vectorizer.fit_transform(data['content'])
    y_train = data['code']

    estimator.fit(X_train, y_train)
    y_hat_train = estimator.predict(X_train)

    acc = accuracy_score(y_train, y_hat_train)
    print(acc)
    return acc

In [None]:
# Cross-validate a RidgeClassifier on the folds produced by `skf`. The splitter
# is given df.index in place of a feature matrix and df.code as the labels to
# stratify on; test_model builds the actual TF-IDF features per fold.
test_model(RidgeClassifier(), "RidgeClassifier", skf.split(df.index, df.code))

Fold 0 - RidgeClassifier
	Train: 1.0
	Test: 0.9887640449438202
Fold 1 - RidgeClassifier
	Train: 1.0
	Test: 0.9910112359550561
Fold 2 - RidgeClassifier
	Train: 1.0
	Test: 0.9797752808988764
Fold 3 - RidgeClassifier
	Train: 1.0
	Test: 0.9797752808988764
Fold 4 - RidgeClassifier
	Train: 1.0
	Test: 0.9842696629213483


In [None]:
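# Left commented out; uncomment to re-run the SVC cross-validation that
# produced the fold scores recorded below.
# test_model(SVC(), "SVC", skf.split(df.index, df.code))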

Fold 0 - SVC
	Train: 1.0	Test: 0.9730337078651685
Fold 1 - SVC
	Train: 1.0	Test: 0.9932584269662922
Fold 2 - SVC
	Train: 1.0	Test: 0.9887640449438202
Fold 3 - SVC
	Train: 1.0	Test: 0.9775280898876404
Fold 4 - SVC
	Train: 1.0	Test: 0.9820224719101124


In [12]:
# Final model: same TF-IDF settings as the cross-validation runs, fitted on the
# whole BBC corpus.
vectorizer = TfidfVectorizer(
    sublinear_tf=True,
    min_df=5,
    norm='l2',
    encoding='latin-1',
    ngram_range=(1, 5),
    stop_words='english',
)
ridgeClassifier = RidgeClassifier()
train_model(ridgeClassifier, vectorizer)

# Bundle the two already-fitted steps so raw text can be passed straight to
# pipe.predict(...).
pipe = Pipeline(steps=[
    ('vectorizer', vectorizer),
    ('ridge classifier', ridgeClassifier),
])

1.0


In [13]:
# GDELT event export: tab-separated with no header row, so the columns are
# labelled with integers.
gdelt_data = pd.read_csv(
    "~/Downloads/20220411231500.export.CSV",
    sep='\t',
    header=None,
)

In [17]:
# Column 60 holds the source-article URL of each event (see the frame preview
# at the end of the notebook).
gdelt_links = gdelt_data.loc[:, 60]

In [14]:
import unidecode
import requests

In [15]:
# Helpers for querying the BERT model server (news-category classification endpoint)
def prepare_text(text):
    """Trim surrounding whitespace from `text`, then pass it through unidecode."""
    trimmed = text.strip()
    return unidecode.unidecode(trimmed)

def get_category(texts, url = 'http://127.0.0.1:8080/news-classification', timeout = 60):
    """POST `texts` to the classification service and return its predictions.

    Parameters
    ----------
    texts
        Payload sent under the 'texts' key of the JSON body (the notebook
        passes a list of strings).
    url : str
        Endpoint of the classification service.
    timeout : float or None
        Seconds to wait for the server, forwarded to ``requests.post``. Without
        a timeout a stalled server would hang the kernel indefinitely; pass
        ``None`` to restore unlimited waiting.

    Returns
    -------
    The value stored under the 'ans' key of the JSON response.

    Raises
    ------
    requests.HTTPError
        If the server answers with a 4xx/5xx status, instead of failing later
        with a less helpful JSON/KeyError.
    requests.Timeout
        If the server does not answer within `timeout` seconds.
    """
    payload = { 'texts': texts }
    res = requests.post(url, json = payload, timeout = timeout)
    res.raise_for_status()
    return res.json()['ans']

def get_tweet_sentiment(tweet):
    """Send `tweet['text']` to get_category and return the service's answer unchanged."""
    return get_category(tweet['text'])

In [14]:
# Pick one random GDELT link, download the article and classify its title.
i = np.random.randint(gdelt_links.size)
link = gdelt_links.iloc[i]  # `i` is a position, so index positionally
news = NewsPlease.from_url(link)
js = news.get_dict()
# get_dict() keeps keys whose value is None (see the `titles[0]` dump further
# down), so test the values rather than key presence before slicing the text
# or sending the title to the classifier.
if js.get('maintext') and js.get('title'):
    pred = get_category([js['title']]) # pipe.predict([js['maintext']])[0]
    js['category'] = pred #code_to_cat[pred]
    print(js['title'])
    print(js['maintext'][:100])
    print(js['category'])

Watch: North Korean nurse ‘exposes anti-government plot’ in new propaganda film
North Korea has released its first feature-length film in five years as part of propaganda efforts t
[{'classes': {'Business': 0.0009716442436911166, 'Sci/Tech': 0.0012157652527093887, 'Sports': 0.00044998913654126227, 'World': 0.9973625540733337}, 'text': 'Watch: North Korean nurse ‘exposes anti-government plot’ in new propaganda film', 'vec': [0.9973625540733337, 0.00044998913654126227, 0.0009716442436911166, 0.0012157652527093887]}]


In [21]:
# Download and parse the first 10 GDELT links (timeout=1 is forwarded to
# news-please -- presumably seconds per request; confirm in its docs).
# `titles` is first bound to the object returned by from_urls (iterated with
# .values() below) and then rebound to a list with one get_dict() result per
# article.
# NOTE(review): the recorded run of this cell ended in the TypeError shown
# below; which of the two lines raised it cannot be told from the output.
titles = NewsPlease.from_urls(gdelt_links.tolist()[:10], timeout=1)
titles = [news.get_dict() for news in titles.values()]

TypeError: strftime() missing required argument 'format' (pos 1)

In [23]:
# Inspect the first parsed article to see which fields get_dict() provides.
titles[0]

{'authors': ['Gillian Flaccus', 'Associated Press'],
 'date_download': datetime.datetime(2022, 4, 13, 14, 11, 25),
 'date_modify': None,
 'date_publish': datetime.datetime(2022, 4, 11, 22, 54, 45),
 'description': 'PORTLAND, Ore. (AP) — Farms that rely on irrigation from a depleted, federally managed...',
 'filename': 'https%3A%2F%2Fwww.lmtonline.com%2Fnews%2Farticle%2FFarms-fish-on-California-Oregon-border-to-get-17073626.php.json',
 'image_url': 'https://s.hdnux.com/photos/01/25/12/44/22325520/3/rawImage.jpg',
 'language': 'en',
 'localpath': None,
 'maintext': 'PORTLAND, Ore. (AP) — Farms that rely on irrigation from a depleted, federally managed lake on the California-Oregon border, along with a Native American tribe fighting to protect fragile salmon, will both receive extremely limited amounts of water this summer as a historic drought and record-low reservoir levels drag on in the U.S. West.\nMore than 1,000 farmers and ranchers who draw water from a 257-mile-long (407-kilometer

In [25]:
# In run order `titles` is the list of article dicts built two cells above (see
# the `titles[0]` dump), so it has no .values() and its items have no .title
# attribute -- read the 'title' key instead. Articles without an extracted title
# are skipped so only real strings are sent to the classifier.
titles = [ article['title'] for article in titles if article.get('title') ] # gdelt_links.tolist()

In [None]:
# Classify all collected titles with a single request to the model service.
get_category(titles)

In [25]:
# Display the raw GDELT frame (pandas truncates the rendering); the last
# column shown, 60, is the one holding the article URLs.
gdelt_data

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,51,52,53,54,55,56,57,58,59,60
0,1038897109,20210411,202104,2021,2021.2767,EDU,SCHOOL,,,,...,1,Jordan,JO,JO,,31.0000,36.0000,JO,20220411231500,https://auburnpub.com/news/local/education/cay...
1,1038897110,20210411,202104,2021,2021.2767,UAF,FIGHTER,,,,...,0,,,,,,,,20220411231500,https://www.rnz.co.nz/news/national/465117/fau...
2,1038897111,20210411,202104,2021,2021.2767,UAF,FIGHTER,,,,...,0,,,,,,,,20220411231500,https://www.rnz.co.nz/news/national/465117/fau...
3,1038897112,20210411,202104,2021,2021.2767,USA,KANSAS CITY,USA,,,...,2,"Kansas, United States",US,USKS,,38.5111,-96.8005,KS,20220411231500,https://www.stltoday.com/news/local/govt-and-p...
4,1038897113,20210411,202104,2021,2021.2767,USA,UNITED STATES,USA,,,...,3,"Klamath, California, United States",US,USCA,CA015,41.5265,-124.0380,277534,20220411231500,https://www.lmtonline.com/news/article/Farms-f...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1004,1038898113,20220411,202204,2022,2022.2767,idg,INDIGENOUS,,,idg,...,4,"Moose Factory, Ontario, Canada",CA,CA08,154724,51.2500,-80.6000,-569576,20220411231500,https://www.cbc.ca/news/canada/sudbury/indigen...
1005,1038898114,20220411,202204,2022,2022.2767,idg,INDIGENOUS,,,idg,...,4,"James Bay, Canada (general), Canada",CA,CA00,154724,53.0000,-80.5000,-1506381,20220411231500,https://www.cbc.ca/news/canada/sudbury/indigen...
1006,1038898115,20220411,202204,2022,2022.2767,ltn,LATINOS,,,ltn,...,2,"Texas, United States",US,USTX,,31.1060,-97.6475,TX,20220411231500,http://historynewsnetwork.org/article/182931
1007,1038898116,20220411,202204,2022,2022.2767,nai,NATIVE AMERICAN,,,nai,...,2,"New York, United States",US,USNY,,42.1497,-74.9384,NY,20220411231500,https://www.longisland.com/news/04-11-22/long-...
