In [None]:
import spacy
import pandas as pd
import numpy as np
from newsplease import NewsPlease

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score
from sklearn.linear_model import RidgeClassifier
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC

In [None]:
# Load the BBC news dataset (tab-separated file).
bbc_csv_path = "/home/felipenuti/Downloads/bbc_news_classification/bbc-news-data.csv"
df = pd.read_csv(bbc_csv_path, sep="\t")

In [None]:
# Encode every category label as an integer code and keep the code next to
# the label in the frame.
category_labels = df["category"]
categories = pd.Categorical(category_labels)
df["code"] = categories.codes

In [None]:
# Map each integer code to its category name. Code i of a Categorical refers to
# entry i of its `categories` index, so the mapping is read from there directly;
# this covers however many categories the data contains instead of assuming
# exactly five, and avoids filtering the frame once per code.
code_to_cat = dict(enumerate(categories.categories))
code_to_cat


{0: 'business', 1: 'entertainment', 2: 'politics', 3: 'sport', 4: 'tech'}

In [None]:
# Preview the first three rows of the dataset.
df.head(n=3)

Unnamed: 0,category,filename,title,content,code
0,business,001.txt,Ad sales boost Time Warner profit,Quarterly profits at US media giant TimeWarne...,0
1,business,002.txt,Dollar gains on Greenspan speech,The dollar has hit its highest level against ...,0
2,business,003.txt,Yukos unit buyer faces loan claim,The owners of embattled Russian oil giant Yuk...,0


In [None]:
# Stratified 5-fold splitter. The shuffle is seeded so that the folds, and
# therefore the accuracies reported below, are the same on every run.
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

In [None]:
def test_model(estimator, model_name, folds):
    """Cross-validate `estimator` on TF-IDF features of the article text.

    For every fold a new TfidfVectorizer is fitted on the training rows only,
    `estimator` is fitted on the resulting features, and the train and test
    accuracy of that fold are printed.

    The data is read from the module-level `df`, using its 'content' column as
    the input text and its 'code' column as the target.

    Parameters
    ----------
    estimator : object providing ``fit(X, y)`` and ``predict(X)``
        Fitted again on every fold.
    model_name : str
        Label used in the printed report.
    folds : iterable of (train_idx, test_idx)
        Pairs of positional row indices into `df`, such as the ones produced
        by ``StratifiedKFold.split``.

    Returns
    -------
    None
    """
    for fold, (train_idx, test_idx) in enumerate(folds):
        # The fold indices are row positions, not index labels, so the rows are
        # selected with iloc; this stays correct even if `df` does not carry
        # the default 0..n-1 index.
        train_df = df.iloc[train_idx].reset_index(drop=True)
        test_df = df.iloc[test_idx].reset_index(drop=True)

        # The vectorizer is fitted on the training rows only, so the test rows
        # do not influence the vocabulary or the IDF weights.
        vectorizer = TfidfVectorizer(sublinear_tf=True, min_df=5, norm='l2', encoding='latin-1', ngram_range=(1, 5), stop_words='english')

        X_train = vectorizer.fit_transform(train_df['content'])
        X_test = vectorizer.transform(test_df['content'])

        y_train = train_df['code']
        y_test = test_df['code']

        estimator.fit(X_train, y_train)

        fold_train_acc = accuracy_score(y_train, estimator.predict(X_train))
        fold_test_acc = accuracy_score(y_test, estimator.predict(X_test))

        print("Fold {} - {}".format(fold, model_name))
        print(f"\tTrain: {fold_train_acc}\tTest: {fold_test_acc}")




In [None]:
def train_model(estimator, vectorizer):
    """Fit `vectorizer` and `estimator` on the whole dataset and print the training accuracy.

    The data is read from the module-level `df`: the 'content' column is
    vectorized and the 'code' column is used as the target. Both `vectorizer`
    and `estimator` are fitted in place; nothing is returned.
    """
    labels = df['code']
    features = vectorizer.fit_transform(df['content'])

    estimator.fit(features, labels)
    predictions = estimator.predict(features)

    # Accuracy on the same rows the model was fitted on.
    print(accuracy_score(labels, predictions))

In [None]:
# Cross-validate a ridge classifier on the stratified folds.
ridge_folds = skf.split(df.index, df["code"])
test_model(RidgeClassifier(), "RidgeClassifier", ridge_folds)

Fold 0 - RidgeClassifier
	Train: 1.0
	Test: 0.9887640449438202
Fold 1 - RidgeClassifier
	Train: 1.0
	Test: 0.9910112359550561
Fold 2 - RidgeClassifier
	Train: 1.0
	Test: 0.9797752808988764
Fold 3 - RidgeClassifier
	Train: 1.0
	Test: 0.9797752808988764
Fold 4 - RidgeClassifier
	Train: 1.0
	Test: 0.9842696629213483


In [None]:
# test_model(SVC(), "SVC", skf.split(df.index, df.code))

Fold 0 - SVC
	Train: 1.0	Test: 0.9730337078651685
Fold 1 - SVC
	Train: 1.0	Test: 0.9932584269662922
Fold 2 - SVC
	Train: 1.0	Test: 0.9887640449438202
Fold 3 - SVC
	Train: 1.0	Test: 0.9775280898876404
Fold 4 - SVC
	Train: 1.0	Test: 0.9820224719101124


In [None]:
# Fit the TF-IDF vectorizer and the ridge classifier on the full dataset, then
# chain the two fitted objects in a pipeline (vectorizer first, classifier second).
ridgeClassifier = RidgeClassifier()
vectorizer = TfidfVectorizer(
    sublinear_tf=True,
    min_df=5,
    norm='l2',
    encoding='latin-1',
    ngram_range=(1, 5),
    stop_words='english',
)
train_model(ridgeClassifier, vectorizer)

pipeline_steps = [
    ('vectorizer', vectorizer),
    ('ridge classifier', ridgeClassifier),
]
pipe = Pipeline(pipeline_steps)

1.0


In [None]:
# Load a GDELT export file (tab-separated, no header row, so the columns are
# labelled with integers).
gdelt_export_path = "~/Downloads/20220411231500.export.CSV"
gdelt_data = pd.read_csv(gdelt_export_path, sep="\t", header=None)

In [None]:
# Column 60 of the export holds the article links that are fetched below.
GDELT_LINK_COLUMN = 60
gdelt_links = gdelt_data[GDELT_LINK_COLUMN]

In [None]:
import unidecode
import requests

In [None]:
# Helpers for querying the BERT model service for text classifications
def prepare_text(text):
    """Strip surrounding whitespace from `text` and pass the result through unidecode."""
    stripped = text.strip()
    return unidecode.unidecode(stripped)

def get_category(texts, url = 'http://127.0.0.1:8080/news-classification', timeout = 30):
    """POST `texts` to the classification service and return its answer.

    Parameters
    ----------
    texts
        Sent to the service as the 'texts' field of the JSON request body.
    url : str
        Endpoint of the classification service.
    timeout : float or None
        Seconds to wait for the service before giving up. Pass None to wait
        indefinitely.

    Returns
    -------
    The value stored under the 'ans' key of the JSON response.

    Raises
    ------
    requests.exceptions.Timeout
        If the service does not respond within `timeout` seconds.
    requests.exceptions.HTTPError
        If the service responds with an error status code.
    """
    payload = { 'texts': texts }
    # Without a timeout the call can block forever when the service is down
    # or stuck.
    res = requests.post(url, json = payload, timeout = timeout)
    # Report HTTP errors as such instead of failing later while decoding the
    # body or looking up 'ans'.
    res.raise_for_status()
    return res.json()['ans']

def get_tweet_sentiment(tweet):
    """Return the classification service's answer for the text of `tweet`.

    `tweet` is indexed with the key 'text'; that value is forwarded unchanged
    to `get_category`.
    """
    # NOTE(review): the text is forwarded as a bare value, while the other call
    # to get_category in this notebook passes a list of strings; confirm the
    # service accepts both forms.
    return get_category(tweet['text'])

In [None]:
# Pick a random GDELT link, fetch the article and classify its title.
i = np.random.randint(gdelt_links.size)
link = gdelt_links[i]
news = NewsPlease.from_url(link)
# NOTE(review): NewsPlease appears to return an empty result when the article
# cannot be fetched, and the article dict can carry 'maintext' with a None
# value; both cases are guarded here so the cell does not crash on them.
js = news.get_dict() if news else {}
if js.get('maintext') is not None:
    pred = get_category([js['title']]) # alternative (local model): pipe.predict([js['maintext']])[0]
    js['category'] = pred # alternative (local model): code_to_cat[pred]
    print(js['title'])
    print(js['maintext'][:100])
    print(js['category'])
else:
    print(f"No article text could be extracted from {link}")



North Carolina’s experimental school takeover program is officially ending this summer
The State Board of Education has voted to transfer a public school taken over by the state back to i
[{'classes': {'Business': 0.6974193453788757, 'Sci/Tech': 0.2717726230621338, 'Sports': 0.0036270355340093374, 'World': 0.027180952951312065}, 'text': 'North Carolina’s experimental school takeover program is officially ending this summer', 'vec': [0.027180952951312065, 0.0036270355340093374, 0.6974193453788757, 0.2717726230621338]}]


In [None]:
# Pass every GDELT link to NewsPlease in a single batch call.
all_links = gdelt_links.tolist()
titles = NewsPlease.from_urls(all_links)