In [2]:
from bs4 import BeautifulSoup
import pandas as pd
import requests
import pprint
import json
from json import JSONDecodeError

# jsoZn pretty-printer import
pp = pprint.PrettyPrinter(indent=4)

In [6]:
def extract_inshorts(categories, min_items):
    """
    Download stories from https://inshorts.com 
    
    :param categories: list of categories that you want to download 
    :param min_items: minimal amount of stories per each category
    :return: list of (story, category) touples
    """
    result = []
    for category in categories:
        labeled = []
        offset = ''
        while len(labeled) < min_items:
            downloaded = __download_for(category, offset)
            offset = downloaded['offset']
            labeled += downloaded['stories']
        result += labeled
    return result


def __download_for(category, offset):
    params = 'category=' + category + '&news_offset=' + offset
    resp = __request('POST', 'https://inshorts.com/en/ajax/more_news', params)
    obj = __load_json_safely(resp)
    stories = {'stories': __label(__parse_stories_from(obj['html']), category), 'offset': obj['min_news_id']}
    # pp.pprint(stories)
    return stories


def __label(stories, category):
    return [{"label": category, "content": story} for story in stories]


def __parse_stories_from(html):
    soup = BeautifulSoup(html, 'html.parser')
    unflattened = [i.contents for i in soup.findAll("div", {"itemprop": 'articleBody'})]
    stories_list = [item for sublist in unflattened for item in sublist]
    return stories_list


def __request(method, url, payload):
    headers = {
        'Content-Type': "application/x-www-form-urlencoded; charset=UTF-8",
        'Cache-Control': "no-cache",
        'Postman-Token': "e2290974-00cb-40d4-92f4-f9b7c8398ce8"
    }
    response = requests.request(method, url, data=payload, headers=headers)
    return response.text


def __load_json_safely(serialized):
    try:
        return json.loads(serialized)
    except JSONDecodeError:
        print('ERROR: cannot deserialize json: ' + serialized)

In [15]:
# Scrape 100 business stories and 100 sports stories. Then put them into pandas DataFrame


def shuffle(data_frame):
    return data_frame.sample(frac=1)


def inshorts_to_df(categories, amount):
    data_frame = pd.DataFrame(extract_inshorts(categories, amount))
    return shuffle(data_frame)


df = inshorts_to_df(['business', 'sports'], 100)
df.head(10)

Unnamed: 0,content,label
89,"In an open letter, European lawmakers have ask...",business
18,Bajaj Auto MD Rajiv Bajaj has described the la...,business
128,"Indian boxer MC Mary Kom's opponent, North Kor...",sports
136,During the 16th over of Australia's innings in...,sports
195,"David Arquette, actor and the ex-husband of Am...",sports
204,"Ian Chappell, the Australian cricket team's fo...",sports
140,Germany's 17-year-old car racer Sophia Floersc...,sports
40,Facebook Co-founder and CEO Mark Zuckerberg in...,business
62,Niti Aayog's former Vice Chairman Arvind Panag...,business
110,Mohammad Shami bowled 26 overs in the first in...,sports


In [19]:
df.head(10)

Unnamed: 0,content,label
89,"In an open letter, European lawmakers have ask...",business
18,Bajaj Auto MD Rajiv Bajaj has described the la...,business
128,"Indian boxer MC Mary Kom's opponent, North Kor...",sports
136,During the 16th over of Australia's innings in...,sports
195,"David Arquette, actor and the ex-husband of Am...",sports
204,"Ian Chappell, the Australian cricket team's fo...",sports
140,Germany's 17-year-old car racer Sophia Floersc...,sports
40,Facebook Co-founder and CEO Mark Zuckerberg in...,business
62,Niti Aayog's former Vice Chairman Arvind Panag...,business
110,Mohammad Shami bowled 26 overs in the first in...,sports


In [8]:
# importing tools required for text pre-processing
%matplotlib inline
import re
import nltk
import unicodedata
from functools import reduce
from nltk.stem import WordNetLemmatizer
from nltk.tokenize.toktok import ToktokTokenizer

tokenizer = ToktokTokenizer()

lemmatizer = WordNetLemmatizer()
stopword_list = nltk.corpus.stopwords.words('english')

In [9]:
def remove_accented_chars(text):
    return unicodedata.normalize('NFKD', text).encode('ascii', 'ignore').decode('utf-8', 'ignore')


def apply_on(collection, fun):
    mapped = list(map(lambda x: (fun(x), 1 if fun(x) != x else 0), collection))
    is_changed = list(map(lambda x: x[1], mapped))
    changed_count = reduce(lambda x, y: x + y, is_changed)
    collection = list(map(lambda x: x[0], mapped))
    return collection, changed_count


def remove_special_characters(text, remove_digits=True):
    pattern = r'[^a-zA-z0-9\s]' if not remove_digits else r'[^a-zA-z\s]'
    text = re.sub(pattern, '', text)
    return text


def lemmatize(text):
    text = ' '.join([lemmatizer.lemmatize(word) for word in text.split(' ')])
    return text


def remove_stopwords(text, is_lower_case=False):
    tokens = tokenizer.tokenize(text)
    tokens = [token.strip() for token in tokens]
    if is_lower_case:
        filtered_tokens = [token for token in tokens if token not in stopword_list]
    else:
        filtered_tokens = [token for token in tokens if token.lower() not in stopword_list]
    filtered_text = ' '.join(filtered_tokens)
    return filtered_text

In [10]:
# chain of text pre-processing functions to apply
applies = [
    lambda s: s.lower(),
    remove_accented_chars,
    remove_special_characters,
    lemmatize,
    remove_stopwords
]


def preprocess_text(text):
    return reduce(lambda res, fun: fun(res), applies, text)


def preprocess_series(series):
    return reduce(lambda res, fun: list(map(fun, res)), applies, series)


df.content = preprocess_series(df.content)

# pre-processing example
preprocess_text("This sentence should be preprocessed and ready to go. "
                "Dogs are able to run over 10 miles without any rest. "
                "Buying a train is not a good idea if you do not have money")

'sentence preprocessed ready go dog able run mile without rest buying train good idea money'

In [12]:
df.head(10)

Unnamed: 0,content,label
135,australia defeated india run dl raincurtailed ...,sports
25,whatsapp ha appointed abhijit bobby bose head ...,business
59,ahead rbi board meeting monday congress presid...,business
134,th australia inning first indiaaustralia ti we...,sports
184,india defeated australia run final woman world...,sports
210,south africa defeated australia run sole ti wa...,sports
172,music composer ar rahman ha released promo vid...,sports
58,apple ceo tim cook ha defended apple billiondo...,business
170,indian captain virat kohli said wa okay play w...,sports
161,alexander zverev germany defeated world number...,sports


In [47]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split


tfidf_transformer = TfidfTransformer()
count_vect = CountVectorizer()

In [51]:
def transform_tfidf(data):
    X_counts = count_vect.fit_transform(data)
    return tfidf_transformer.fit_transform(X_counts)


def transform(test, train):
    transformed = transform_tfidf(test + train)
    return transformed[:len(test)], transformed[len(test):]


def split_df(df):
    return df.content.tolist(), df.label.tolist()

In [73]:
X, y = split_df(df)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
X_test_transformed, X_train_transformed = transform(X_test, X_train)

clf = MultinomialNB()
clf.fit(X_train_transformed, y_train)
clf.score(X_test_transformed, y_test)

0.9285714285714286

In [19]:
def predict(text_list, X, y, model):
    text_transformed, X_transformed = transform(list(map(lambda text: preprocess_text(text), text_list)), X)
    model.fit(X_transformed, y)
    return model.predict(text_transformed)

In [20]:
# Let's try to predict text category (whether it's rather business or sport)

predictables = [
    "Marlena is a really nice cat. Despite her young age, she has her own company with 1234 people employed",
    "Marlena is a really successful cat. She won three olympic medals in swimming.",
    "The WTO now expects global merchandise trade volumes to expand by around 3.9 per cent in 2019, "
    "down from the central 4.4 per cent forecast it made in April",
    "Goldman Sachs has launched its online retail bank, Marcus, for UK customers, "
    "with a savings account that offers interest of 1.5 per cent.",
    "Across Europe there are around 20 players who, were they not based abroad, "
    "would be strong contenders for Argentina’s squad.",
    "swimming pool"
]

for story, category in zip(predictables, predict(predictables, X, y, MultinomialNB())):
    print("STORY: " + story)
    print("CATEGORY: " + category + "\n")

STORY: Marlena is a really nice cat. Despite her young age, she has her own company with 1234 people employed
CATEGORY: business

STORY: Marlena is a really successful cat. She won three olympic medals in swimming.
CATEGORY: business

STORY: The WTO now expects global merchandise trade volumes to expand by around 3.9 per cent in 2019, down from the central 4.4 per cent forecast it made in April
CATEGORY: business

STORY: Goldman Sachs has launched its online retail bank, Marcus, for UK customers, with a savings account that offers interest of 1.5 per cent.
CATEGORY: business

STORY: Across Europe there are around 20 players who, were they not based abroad, would be strong contenders for Argentina’s squad.
CATEGORY: sports

STORY: swimming pool
CATEGORY: business



In [88]:
# Predict many categories
multiple_categories_df = inshorts_to_df(['business', 'sports', 'politics', 'technology', 'entertainment', 'startup'], 100)

In [89]:
multiple_categories_df.head(15)

Unnamed: 0,content,label
310,Accusing the Congress of jeopardising Madhya P...,politics
526,"Tesla CEO Elon Musk, in a tweet apparently moc...",startup
600,Uber has agreed to pay $148 million settlement...,startup
73,The Unique Identification Authority of India (...,business
256,"On Mahatma Gandhi's 149th birth anniversary, C...",politics
535,Talking about foreign companies building their...,startup
402,A Japanese cafe is planning to use robotic wai...,technology
182,England's Football League Two side Notts Count...,sports
142,The Pakistan Rangers stationed at the India-Pa...,sports
266,BJP MLA Sangeet Som survived an attack on Thur...,politics


In [113]:
from sklearn.svm import SVC

In [167]:
multiple_categories_df.content = preprocess_series(multiple_categories_df.content)

X, y = split_df(multiple_categories_df)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
X_test_transformed, X_train_transformed = transform(X_test, X_train)

clf = MultinomialNB()
clf.fit(X_train_transformed, y_train)
clf.score(X_test_transformed, y_test)

0.8031496062992126

In [None]:
# Results still are quite good