In [134]:
from bs4 import BeautifulSoup
import pandas as pd
import requests
import pprint
import json
from json import JSONDecodeError

# Pretty-printer instance (4-space indent); handy for dumping downloaded JSON-like payloads while debugging.
pp = pprint.PrettyPrinter(indent=4)

In [135]:
def extract_inshorts(categories, min_items):
    """
    Download stories from https://inshorts.com

    Stories are fetched page by page for each category until at least
    ``min_items`` have been collected. Whole pages are appended, so a category
    may contribute more than ``min_items`` entries.

    :param categories: list of categories that you want to download
    :param min_items: minimal amount of stories per each category
    :return: flat list with the labeled story entries of all categories,
             in the order the categories were given
    """
    result = []
    for category in categories:
        labeled = []
        offset = ''  # empty offset asks for the first page
        while len(labeled) < min_items:
            downloaded = __download_for(category, offset)
            stories = downloaded['stories']
            if not stories:
                # An empty page means there is nothing more to fetch; without this
                # guard the loop would request the same page forever.
                print('WARNING: only ' + str(len(labeled)) + ' stories available for category: ' + str(category))
                break
            offset = downloaded['offset']
            labeled.extend(stories)
        result.extend(labeled)
    return result


def __download_for(category, offset):
    """
    Fetch one page of stories for a single category.

    :param category: category name sent as the ``category`` form field
    :param offset: pagination marker sent as ``news_offset`` ('' for the first page)
    :return: dict with 'stories' (labeled stories parsed from the response html)
             and 'offset' (the ``min_news_id`` value of the response, used as the next offset)
    :raises ValueError: when the response body is not valid JSON
    """
    from urllib.parse import urlencode

    # urlencode escapes reserved characters and accepts non-string values,
    # unlike plain string concatenation.
    params = urlencode({'category': category, 'news_offset': offset})
    resp = __request('POST', 'https://inshorts.com/en/ajax/more_news', params)
    obj = __load_json_safely(resp)
    if obj is None:
        # The JSON loader already printed the payload; fail with a clear message
        # instead of a "NoneType is not subscriptable" error.
        raise ValueError('no valid JSON received for category: ' + str(category))
    return {
        'stories': __label(__parse_stories_from(obj['html']), category),
        'offset': obj['min_news_id'],
    }


def __label(stories, category):
    """
    Attach the category to every story.

    :param stories: iterable of story contents
    :param category: label stored with each story
    :return: list of dicts with the keys "label" and "content"
    """
    labeled = []
    for story in stories:
        labeled.append({"label": category, "content": story})
    return labeled


def __parse_stories_from(html):
    """
    Extract the story texts from an html fragment.

    Every ``<div itemprop="articleBody">`` element yields exactly one plain
    ``str``: the text of all its descendants with runs of whitespace collapsed.
    Elements without any text are skipped.

    :param html: html markup to parse
    :return: list of story texts
    """
    soup = BeautifulSoup(html, 'html.parser')
    stories = []
    for body in soup.find_all("div", {"itemprop": 'articleBody'}):
        # get_text() flattens nested markup (links, <br>, ...) so one article stays
        # one story and downstream string operations never receive a Tag object.
        text = ' '.join(body.get_text(' ').split())
        if text:
            stories.append(text)
    return stories


def __request(method, url, payload, timeout=30):
    """
    Send a form-encoded HTTP request and return the response body as text.

    :param method: HTTP method name, e.g. 'POST'
    :param url: target url
    :param payload: request body passed to ``requests`` as ``data``
    :param timeout: seconds to wait for the server before giving up
    :return: response body decoded to text
    :raises requests.HTTPError: when the server answers with a 4xx/5xx status
    """
    headers = {
        'Content-Type': "application/x-www-form-urlencoded; charset=UTF-8",
        'Cache-Control': "no-cache",
        'Postman-Token': "e2290974-00cb-40d4-92f4-f9b7c8398ce8"
    }
    # Without a timeout requests waits indefinitely on an unresponsive server.
    response = requests.request(method, url, data=payload, headers=headers, timeout=timeout)
    # Surface HTTP errors here rather than handing an error page to the JSON parser.
    response.raise_for_status()
    return response.text


def __load_json_safely(serialized):
    """
    Deserialize a JSON document without raising on malformed input.

    :param serialized: JSON text
    :return: the decoded object, or None (after printing an error line) when
             the text is not valid JSON
    """
    try:
        parsed = json.loads(serialized)
    except JSONDecodeError:
        print('ERROR: cannot deserialize json: ' + serialized)
        return None
    return parsed

In [None]:
# Scrape at least 100 business stories and at least 100 sports stories from inshorts.com,
# then load them into a pandas DataFrame.

df = pd.DataFrame(extract_inshorts(['business', 'sports'], 100))

In [8]:
def shuffle(data_frame, random_state=None):
    """
    Return a copy of the frame with its rows in random order.

    :param data_frame: pandas DataFrame to shuffle (left untouched)
    :param random_state: optional seed forwarded to ``DataFrame.sample``; pass a
                         value to get the same order on every run
    :return: new DataFrame with all rows of ``data_frame``, original index labels kept
    """
    # frac=1 samples every row exactly once, i.e. a permutation of the frame.
    return data_frame.sample(frac=1, random_state=random_state)


# Shuffle the rows so that business stories are mixed with sports stories.
# NOTE(review): this rebinds `df`, so re-running the cell reshuffles the already shuffled frame.
df = shuffle(df)

In [9]:
# importing tools required for text pre-processing
%matplotlib inline
import re
import nltk
import unicodedata
from functools import reduce
from nltk.stem import WordNetLemmatizer
from nltk.tokenize.toktok import ToktokTokenizer

# Shared NLP helpers used by the pre-processing functions below.
tokenizer = ToktokTokenizer()
lemmatizer = WordNetLemmatizer()
# English stopwords; presumably requires the NLTK 'stopwords' corpus to be downloaded beforehand - TODO confirm.
stopword_list = nltk.corpus.stopwords.words('english')

In [10]:
def remove_accented_chars(text):
    """
    Replace accented characters with their plain ASCII base characters.

    The text is NFKD-normalized and then every character that cannot be
    encoded as ASCII is dropped.

    :param text: input string
    :return: ASCII-only string
    """
    decomposed = unicodedata.normalize('NFKD', text)
    ascii_bytes = decomposed.encode('ascii', 'ignore')
    return ascii_bytes.decode('utf-8', 'ignore')


def apply_on(collection, fun):
    """
    Apply ``fun`` to every item and count how many items it changed.

    :param collection: iterable of items
    :param fun: one-argument function, called exactly once per item
    :return: tuple ``(mapped_items, changed_count)`` where ``mapped_items`` is a
             list of results and ``changed_count`` is the number of items whose
             result differs from the input; ``([], 0)`` for an empty collection
    """
    mapped = []
    changed_count = 0
    for item in collection:
        new_item = fun(item)  # evaluate once; fun may be expensive or non-deterministic
        mapped.append(new_item)
        if new_item != item:
            changed_count += 1
    return mapped, changed_count


def remove_special_characters(text, remove_digits=True):
    """
    Strip every character that is not an ASCII letter or whitespace.

    :param text: input string
    :param remove_digits: when True (default) digits are removed as well,
                          when False digits are kept
    :return: cleaned string
    """
    # The range must be A-Z: "A-z" also spans the characters [ \ ] ^ _ ` that sit
    # between the upper- and lower-case letters, so they would never be removed.
    pattern = r'[^a-zA-Z0-9\s]' if not remove_digits else r'[^a-zA-Z\s]'
    return re.sub(pattern, '', text)


def lemmatize(text):
    """
    Lemmatize a text word by word.

    The text is split on single spaces, each piece is passed through the
    shared ``lemmatizer`` and the results are joined with single spaces again.

    :param text: input string
    :return: string of lemmatized words
    """
    words = text.split(' ')
    lemmas = map(lemmatizer.lemmatize, words)
    return ' '.join(lemmas)


def remove_stopwords(text, is_lower_case=False):
    """
    Drop all tokens that appear in ``stopword_list``.

    :param text: input string, tokenized with the shared ``tokenizer``
    :param is_lower_case: when True tokens are compared as they are; when
                          False (default) each token is lower-cased for the comparison
    :return: remaining tokens joined with single spaces
    """
    stripped = [piece.strip() for piece in tokenizer.tokenize(text)]
    if is_lower_case:
        kept = [piece for piece in stripped if piece not in stopword_list]
    else:
        kept = [piece for piece in stripped if piece.lower() not in stopword_list]
    return ' '.join(kept)

In [45]:
# Chain of text pre-processing functions; they are applied in list order (first to last).
applies = [
    lambda s: s.lower(),
    remove_accented_chars,
    remove_special_characters,
    lemmatize,
    remove_stopwords
]

# Run each step over the whole column in turn and overwrite df.content with the cleaned texts.
df.content = reduce(lambda res, fun: list(map(fun, res)), applies, df.content)


def preprocess_text(text):
    """
    Run a single text through every function in ``applies``, in list order.

    :param text: raw input string
    :return: result of the last pre-processing step
    """
    processed = text
    for step in applies:
        processed = step(processed)
    return processed


# Pre-processing example: the cell output shows the cleaned form of this raw text.
preprocess_text("This sentence should be preprocessed and ready to go. "
                "Dogs are able to run over 10 miles without any rest. "
                "Buying a train is not a good idea if you do not have money")

'sentence preprocessed ready go dog able run mile without rest buying train good idea money'

In [12]:
# Preview the first 10 rows of the shuffled, pre-processed frame.
df.head(10)

Unnamed: 0,content,label
85,delhi police ha arrested four people including...,business
50,indian oil bharat petroleum nayara energy mang...,business
116,team india standin captain rohit sharma took s...,sports
36,among regular employee india monthly average e...,business
117,pakistan crashed asia cup defeat bangladesh we...,sports
27,supreme court wednesday ruled linking person a...,business
101,pnb fraud accused mehul choksi moved cbi court...,business
1,india organisation chemist druggist ha declare...,business
118,talking former indian captain dhonis ability t...,sports
2,ahmedabadbased ecommerce company infibeam lost...,business


In [13]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB

# Module-level feature extractors shared by the transform helpers below:
# CountVectorizer builds the term-count matrix, TfidfTransformer re-weights it.
tfidf_transformer = TfidfTransformer()
count_vect = CountVectorizer()

In [89]:
from sklearn.model_selection import train_test_split


def transform_tfidf(data):
    """
    Turn raw texts into a tf-idf matrix.

    Both shared extractors (``count_vect`` and ``tfidf_transformer``) are
    re-fitted on ``data`` by every call.

    :param data: iterable of text documents
    :return: tf-idf weighted document-term matrix
    """
    term_counts = count_vect.fit_transform(data)
    tfidf_matrix = tfidf_transformer.fit_transform(term_counts)
    return tfidf_matrix


# Features are the pre-processed story texts, targets are the category labels.
X = df.content.tolist()
y = df.label.tolist()
# Hold out 20% of the stories for evaluation.
# NOTE(review): no random_state is passed, so the split presumably differs between runs - confirm whether that is intended.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [93]:
# Multinomial naive Bayes classifier used for the train/test evaluation in this cell.
clf = MultinomialNB()

def transform(test, train):
    """
    Vectorize a train and a test collection of texts with tf-idf weights.

    The vocabulary and the idf weights are learned from ``train`` only and are
    then applied to ``test``, so no information from the test texts leaks into
    the features. Words that occur only in ``test`` are ignored. The shared
    ``count_vect`` and ``tfidf_transformer`` are re-fitted by every call.

    :param test: iterable of texts to transform with the fitted extractors
    :param train: iterable of texts the extractors are fitted on
    :return: tuple ``(test_matrix, train_matrix)`` of tf-idf matrices that share
             the same columns
    """
    train_counts = count_vect.fit_transform(train)
    train_tfidf = tfidf_transformer.fit_transform(train_counts)
    # transform() (not fit_transform) reuses the vocabulary/idf learned from train.
    test_tfidf = tfidf_transformer.transform(count_vect.transform(test))
    return test_tfidf, train_tfidf


# Vectorize both splits (note the argument order: test first, then train).
X_test_transformed, X_train_transformed = transform(X_test, X_train)

# Train on the training split only.
clf.fit(X_train_transformed, y_train)

# Score on the held-out split; the value is shown as the cell output.
clf.score(X_test_transformed, y_test)

1.0

In [102]:
def predict(text_list):
    """
    Predict the category of each raw text.

    Every call pre-processes the texts, vectorizes them together with the full
    corpus ``X`` and trains a fresh ``MultinomialNB`` on ``X`` / ``y``.

    :param text_list: list of raw (not pre-processed) texts
    :return: predicted labels, one per input text
    """
    cleaned = [preprocess_text(text) for text in text_list]
    new_vectors, corpus_vectors = transform(cleaned, X)
    model = MultinomialNB()
    model.fit(corpus_vectors, y)
    return model.predict(new_vectors)

In [None]:
# Let's try to predict the category of a few unseen texts (business vs. sports).

# Adjacent string literals are concatenated, so some entries span two source lines.
predictables = [
    "Marlena is a really nice cat. Despite her young age, she has her own company with 1234 people employed",
    "Marlena is a really successful cat. She won three olympic medals in swimming.",
    "The WTO now expects global merchandise trade volumes to expand by around 3.9 per cent in 2019, "
    "down from the central 4.4 per cent forecast it made in April",
    "Goldman Sachs has launched its online retail bank, Marcus, for UK customers, "
    "with a savings account that offers interest of 1.5 per cent.",
    "Across Europe there are around 20 players who, were they not based abroad, "
    "would be strong contenders for Argentina’s squad.",
    "swimming pool"
]

# Print every text next to its predicted label.
for story, category in zip(predictables, predict(predictables)):
    print("STORY: " + story)
    print("CATEGORY: " + category + "\n")

STORY: Marlena is a really nice cat. Despite her young age, she has her own company with 1234 people employed
CATEGORY: business

STORY: Marlena is a really successful cat. She won three olympic medals in swimming.
CATEGORY: sports

STORY: The WTO now expects global merchandise trade volumes to expand by around 3.9 per cent in 2019, down from the central 4.4 per cent forecast it made in April
CATEGORY: business

STORY: Goldman Sachs has launched its online retail bank, Marcus, for UK customers, with a savings account that offers interest of 1.5 per cent.
CATEGORY: business

STORY: Across Europe there are around 20 players who, were they not based abroad, would be strong contenders for Argentina’s squad.
CATEGORY: sports

STORY: The Pumas have a new head coach in Mario Ledesma and cracks are already discernible in the “home-based only” policy introduced to try to keep Argentina’s better players at home.
CATEGORY: business

