In [4]:
import spacy
import nltk
from pprint import pprint
import random
from sklearn import preprocessing
import numpy as np
nltk.download("all", quiet=True)

nlp = spacy.load("en_core_web_sm")


In [5]:
import os
def load_corpus(folder):
    corpus = []
    for root, dirs, files in os.walk(folder, topdown=False):
        for name in files:
            try:
                with open(os.path.join(root, name), "r") as fp:
                    corpus.append(fp.read())
            except UnicodeDecodeError as e:
                print(e.__str__(), "for", os.path.join(root, name))
    return corpus

def load_corpuses(folder):
    sub_folders = []
    for root, dirs, files in os.walk(folder):
        if dirs:
            for dir_ in dirs:
                sub_folders.append(dir_)

    corpuses = {}
    print(sub_folders)
    for sub_folder in sub_folders:
        print(os.path.join(folder, sub_folder))
        corpuses[sub_folder] = load_corpus(os.path.join(folder, sub_folder))
    return corpuses

corpuses = load_corpuses("bbc")

['business', 'tech', 'entertainment', 'sport', 'politics']
bbc/business
bbc/tech
bbc/entertainment
bbc/sport
'utf-8' codec can't decode byte 0xa3 in position 257: invalid start byte for bbc/sport/199.txt
bbc/politics


In [6]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from nltk.corpus import stopwords

vectorizer = CountVectorizer(stop_words=stopwords.words('english'))
tfid = TfidfTransformer()

all_ = []
for corpus in corpuses:
    all_ += corpuses[corpus]

vectorizer.fit(all_)
tfid.fit(vectorizer.transform(all_))

# x = vectorizer.transform(corpuses["tech"])
# vectorizer.vocabulary_.get('Sony')

TfidfTransformer()

In [7]:
import math
x = []
y =[]
from tqdm.notebook import tqdm

entity_types = CountVectorizer(stop_words=stopwords.words('english'))
entity_types.fit(['CARDINAL', 'PERSON', 'GPE', 'MONEY', 'ORG', 'ORDINAL', 'WORK_OF_ART', 'NORP', 'PERCENT', 'DATE', 'LANGUAGE', 'FAC', 'LOC', 'TIME', 'PRODUCT', 'EVENT', 'QUANTITY', 'LAW'])
for corpus in tqdm(corpuses):
    for story in tqdm(corpuses[corpus]):
        analysed = nlp(story)
        # print(entity_types.transform([tag.label_ for tag in analysed.ents]).toarray()[0])
        # print(vectorizer.transform([story]).toarray()[0])
        # print(list(tfid.transform(vectorizer.transform([story])).toarray()[0]))
        x.append(list(vectorizer.transform([story]).toarray()[0]) + 
                 list(entity_types.transform([tag.label_ for tag in analysed.ents]).toarray()[0]) +
                 list(tfid.transform(vectorizer.transform([story])).toarray()[0])
                )
        # x.append(tfid.transform(vectorizer.transform([story])).toarray()[0])

        y.append(corpus)

import random

c = list(zip(x, y))

random.shuffle(c)

x, y = zip(*c)
# pprint(x[:3])
# pprint(y[:3])

  0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/510 [00:00<?, ?it/s]

  0%|          | 0/401 [00:00<?, ?it/s]

  0%|          | 0/386 [00:00<?, ?it/s]

  0%|          | 0/510 [00:00<?, ?it/s]

  0%|          | 0/417 [00:00<?, ?it/s]

In [8]:
from sklearn.feature_selection import chi2
from sklearn.feature_selection import SelectKBest

get_best=SelectKBest(chi2, k=500).fit(x, y)
x_chi = get_best.transform(x)

In [9]:
print(len(x), len(x_chi))

2224 2224


In [10]:
size_dataset_full=len(x_chi)
size_test=int(round(size_dataset_full*0.2,0))

list_test_indices=random.sample(range(size_dataset_full), size_test)

test_x = []
test_y = []
train_x = []
train_y = []

for i,example in enumerate(x_chi):
  if i in list_test_indices:
      test_x.append(example)
      test_y.append(y[i])
  else:
      train_x.append(example)
      train_y.append(y[i])

le = preprocessing.LabelEncoder()
le.fit(train_y + test_y)
train_y = le.transform(train_y)
test_y = le.transform(test_y)

In [11]:
print(test_y)

[0 3 4 3 2 1 0 4 3 2 4 3 0 3 1 1 2 0 4 3 0 0 1 0 3 4 4 2 1 4 2 4 0 0 0 0 0
 0 3 3 4 2 4 2 4 1 1 0 0 0 3 4 1 2 3 2 0 2 4 1 2 0 3 2 3 4 3 0 3 0 4 1 2 3
 0 0 3 2 3 2 1 1 0 3 0 1 4 2 0 3 2 3 1 1 0 4 3 2 0 0 0 2 1 1 2 4 2 0 1 0 0
 2 0 4 4 4 2 1 1 3 3 3 0 0 1 3 4 3 3 1 1 4 4 4 0 1 4 1 1 2 0 2 3 3 2 4 3 2
 0 4 3 1 3 2 3 2 3 0 0 1 3 2 0 3 1 0 2 3 2 0 2 2 1 0 1 3 4 3 3 1 0 2 0 2 1
 3 4 0 1 1 0 4 4 2 3 2 1 0 1 3 2 0 3 3 1 3 1 0 0 4 1 0 3 2 0 2 3 0 4 0 4 3
 1 0 0 0 2 3 1 1 1 3 1 1 4 3 2 3 1 2 3 3 3 0 4 3 2 2 2 4 3 0 2 2 2 0 3 0 0
 3 4 3 1 2 0 2 1 0 0 3 1 4 2 2 4 0 2 0 3 3 1 3 2 3 4 4 0 1 0 1 4 4 3 0 0 0
 3 4 3 0 2 1 0 3 2 2 2 2 3 0 2 0 3 1 1 4 4 4 3 0 2 1 0 3 1 4 3 1 0 1 4 4 3
 1 3 1 3 1 3 0 1 2 0 0 4 0 4 1 4 4 0 0 3 3 0 0 2 1 3 0 0 2 0 2 2 2 1 1 2 3
 1 2 3 1 3 3 3 3 4 0 4 4 1 2 0 1 0 3 0 4 1 4 1 0 2 2 0 0 2 4 0 4 0 0 2 3 0
 2 3 1 3 2 1 3 1 3 0 0 4 1 4 0 0 1 3 0 1 1 3 2 0 1 3 4 2 4 1 0 2 0 0 0 3 0
 4]


In [12]:
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn import svm

svm_clf=make_pipeline(StandardScaler(), svm.SVC(cache_size=10000, decision_function_shape='ovo'))

In [13]:
svm_clf.fit(train_x, train_y)

Pipeline(steps=[('standardscaler', StandardScaler()),
                ('svc', SVC(cache_size=10000, decision_function_shape='ovo'))])

In [14]:
Y_text_predictions = svm_clf.predict(test_x)

In [15]:
from sklearn.metrics import classification_report
print(classification_report(test_y, Y_text_predictions))

              precision    recall  f1-score   support

           0       0.95      0.93      0.94       114
           1       1.00      0.89      0.94        83
           2       0.99      0.93      0.96        81
           3       0.97      0.97      0.97        99
           4       0.80      0.99      0.88        68

    accuracy                           0.94       445
   macro avg       0.94      0.94      0.94       445
weighted avg       0.95      0.94      0.94       445



In [16]:
def predict(story):
    return le.inverse_transform(
        svm_clf.predict(
            get_best.transform(
                [
                    list(vectorizer.transform([story]).toarray()[0]) + 
                    list(entity_types.transform([tag.label_ for tag in nlp(story).ents]).toarray()[0]) +
                    list(tfid.transform(vectorizer.transform([story])).toarray()[0])
                ]
            )
        )
    )[0]


predict("""
Greene sets sights on world title

Maurice Greene aims to wipe out the pain of losing his Olympic 100m title in Athens by winning a fourth World Championship crown this summer.

He had to settle for bronze in Greece behind fellow American Justin Gatlin and Francis Obikwelu of Portugal. "It really hurts to look at that medal. It was my mistake. I lost because of the things I did," said Greene, who races in Birmingham on Friday. "It's never going to happen again. My goal - I'm going to win the worlds." Greene crossed the line just 0.02 seconds behind Gatlin, who won in 9.87 seconds in one of the closest and fastest sprints of all time. But Greene believes he lost the race and his title in the semi-finals. "In my semi-final race, I should have won the race but I was conserving energy. "That's when Francis Obikwelu came up and I took third because I didn't know he was there. "I believe that's what put me in lane seven in the final and, while I was in lane seven, I couldn't feel anything in the race.

"I just felt like I was running all alone. "I believe if I was in the middle of the race I would have been able to react to people that came ahead of me." Greene was also denied Olympic gold in the 4x100m men's relay when he could not catch Britain's Mark Lewis-Francis on the final leg. The Kansas star is set to go head-to-head with Lewis-Francis again at Friday's Norwich Union Grand Prix. The pair contest the 60m, the distance over which Greene currently holds the world record of 6.39 seconds. He then has another indoor meeting in France before resuming training for the outdoor season and the task of recapturing his world title in Helsinki in August. Greene believes Gatlin will again prove the biggest threat to his ambitions in Finland. But he also admits he faces more than one rival for the world crown. "There's always someone else coming. I think when I was coming up I would say there was me and Ato (Boldon) in the young crowd," Greene said. "Now you've got about five or six young guys coming up at the same time."
""")

'sport'