# Leer todos los documentos en un directorio y sus subdirectorios

In [1]:
import os

def read_all_documents(root):
    """Read every file under ``root`` (recursively) as UTF-8 text.

    Each document is labelled with the directory that contains it, expressed
    as the part of the walked directory path that follows ``root``. With a
    ``root`` that has no trailing separator, a file in ``<root>/sports`` is
    labelled ``os.sep + 'sports'`` and a file directly in ``root`` is
    labelled ``''``.

    Args:
        root: Directory to walk (``str`` or path-like).

    Returns:
        dict with two parallel lists: ``'docs'`` (the file contents) and
        ``'labels'`` (one label per document, in the same order).

    Raises:
        UnicodeDecodeError: If a file is not valid UTF-8.
    """
    root = os.fspath(root)
    labels = []
    docs = []
    for r, dirs, files in os.walk(root):
        # os.walk yields directory paths that start with `root`, so only that
        # leading prefix is removed. str.replace() would also delete any later
        # occurrence of `root` inside the path and corrupt the label.
        label = r[len(root):]
        for file in files:
            with open(os.path.join(r, file), encoding="utf-8", mode="r") as f:
                docs.append(f.read())
            labels.append(label)
    return {'docs': docs, 'labels': labels}

In [2]:
# Load the training corpus: document texts plus the folder-derived label of each one.
data = read_all_documents('training')
documents, labels = data['docs'], data['labels']

# Contar la frecuencia de las palabras en los documentos

In [3]:
import re
from collections import defaultdict

def tokens(doc):
    """Return a lazy iterator over the lower-cased ``\\w+`` runs found in ``doc``.

    The text is split first and each match is lower-cased afterwards.
    """
    words = re.findall(r"\w+", doc)
    return map(str.lower, words)

def frequency(tokens):
    """Count how many times each item occurs in the iterable ``tokens``.

    Returns:
        A ``defaultdict(int)`` mapping each distinct item to its count.
    """
    counts = defaultdict(int)
    for word in tokens:
        counts[word] = counts.get(word, 0) + 1
    return counts

def tokens_frequency(doc):
    """Tokenize ``doc`` with ``tokens`` and return the counts from ``frequency``."""
    token_stream = tokens(doc)
    return frequency(token_stream)

# Extraer las características de los documentos

## Nombres de características simbólicas

In [4]:
from sklearn.feature_extraction import DictVectorizer, FeatureHasher

# Learn one feature (column) per distinct token seen in the training documents.
vectorizer = DictVectorizer()
vectorizer.fit_transform(tokens_frequency(d) for d in documents)

# DictVectorizer.get_feature_names() was deprecated in scikit-learn 1.0 and
# removed in 1.2 in favour of get_feature_names_out(). Prefer the new method
# and fall back to the old one so the cell runs on both old and new releases.
if hasattr(vectorizer, 'get_feature_names_out'):
    feature_names = vectorizer.get_feature_names_out().tolist()
else:
    feature_names = vectorizer.get_feature_names()

feature_names

['0',
 '00',
 '000',
 '02',
 '024',
 '03',
 '042',
 '05',
 '068',
 '069',
 '072',
 '086',
 '095',
 '1',
 '10',
 '100',
 '101',
 '102',
 '106',
 '107',
 '11',
 '110',
 '113',
 '115',
 '12',
 '120',
 '127',
 '13',
 '130',
 '134',
 '135',
 '139',
 '13şc',
 '14',
 '140',
 '146',
 '148',
 '15',
 '150',
 '155',
 '157',
 '15km',
 '16',
 '160',
 '166',
 '1662',
 '16m',
 '16şc',
 '17',
 '173',
 '176',
 '1781',
 '1799',
 '18',
 '180',
 '185',
 '1859',
 '186',
 '1864',
 '1872',
 '1880',
 '189',
 '1891',
 '19',
 '194',
 '1943',
 '1945',
 '1948',
 '1957',
 '1958',
 '1961',
 '1970',
 '1978',
 '1979',
 '1980',
 '1981',
 '1982',
 '1983',
 '1984',
 '1985',
 '1986',
 '1987',
 '1991',
 '1992',
 '1993',
 '1994',
 '1995',
 '1997',
 '1998',
 '1999',
 '1a',
 '2',
 '20',
 '200',
 '2000',
 '2001',
 '2002',
 '2003',
 '2004',
 '2007',
 '2008',
 '2009',
 '201',
 '2010',
 '2011',
 '2012',
 '2013',
 '2014',
 '2015',
 '2016',
 '2017',
 '2018',
 '2019',
 '2019urrutia',
 '2020',
 '2021',
 '2026',
 '2030',
 '2040',
 '2

## Sparse matrices

In [5]:
#http://www.abc.es/economia/abci-bufetes-intentan-accionistas-bankia-vayan-juicio-201602190746_noticia.html
#hasher = FeatureHasher(n_features=2**8)
#X = hasher.transform(tokens_frequency(d) for d in documents)

# Hash the raw token stream of every document into a matrix with 2**8 columns.
token_streams = (tokens(d) for d in documents)
hasher = FeatureHasher(n_features=2**8, input_type="string")
X = hasher.transform(token_streams)

# Densify only for display purposes.
print(X.toarray())

[[-1.  0.  0. ...  0.  0. -3.]
 [ 3. -3. -1. ...  0.  0. -1.]
 [ 1. -4. -3. ...  2.  0. -4.]
 ...
 [ 7. -3. -3. ...  0. -3. -8.]
 [ 2.  0.  0. ...  0.  0.  0.]
 [ 0.  0.  0. ...  0.  0.  0.]]


# Entrenar el clasificador de texto usando k vecinos más cercanos (k-NN)
Ver http://scikit-learn.org/stable/auto_examples/text/document_classification_20newsgroups.html

In [6]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neighbors import NearestNeighbors

# Hand-picked Spanish stop words, grouped by grammatical category.
prepositions = [
    'a', 'ante', 'bajo', 'cabe', 'con', 'contra', 'de', 'desde', 'en', 'entre',
    'hacia', 'hasta', 'para', 'por', 'según', 'sin', 'so', 'sobre', 'tras',
]
prep_alike = ['durante', 'mediante', 'excepto', 'salvo', 'incluso', 'más', 'menos']
adverbs = ['no', 'si', 'sí']
articles = [
    'el', 'la', 'los', 'las', 'un', 'una', 'unos', 'unas',
    'este', 'esta', 'estos', 'estas', 'aquel', 'aquella', 'aquellos', 'aquellas',
]
aux_verbs = [
    'he', 'has', 'ha', 'hemos', 'habéis', 'han',
    'había', 'habías', 'habíamos', 'habíais', 'habían',
]

# TF-IDF vectorizer that ignores all of the words listed above.
stop_words = prepositions + prep_alike + adverbs + articles + aux_verbs
tfid = TfidfVectorizer(stop_words=stop_words)

# Fit the vocabulary on the training documents; labels come from the folder names.
X_train = tfid.fit_transform(documents)
y_train = labels

# Each prediction is decided by the 10 nearest training documents.
clf = KNeighborsClassifier(n_neighbors=10)
clf.fit(X_train, y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=10, p=2,
           weights='uniform')

# Predecir categorías para nuevos artículos

In [7]:
# Vectorize the held-out documents with the vocabulary fitted on the training set.
test = read_all_documents('test')
y_test = test['labels']
X_test = tfid.transform(test['docs'])
pred = clf.predict(X_test)

accuracy = clf.score(X_test, y_test)
print('accuracy score %0.3f' % accuracy)

accuracy score 0.913


## Prueba con artículos nuevos y frescos

In [9]:
#import eatiht.v2 as v2
import urllib.request
import requests
from bs4 import BeautifulSoup


def predict_category(url, classifier):
    """Download the page at ``url`` and return the category predicted for it.

    The page's visible text is extracted with BeautifulSoup, vectorized with
    the module-level ``tfid`` vectorizer and passed to ``classifier``.

    Args:
        url: Address of the article to classify.
        classifier: Fitted estimator exposing ``predict``; it must have been
            trained on features produced by ``tfid``.

    Returns:
        The first (and only) label returned by ``classifier.predict``.

    Raises:
        requests.RequestException: If the request fails, times out, or the
            server answers with an HTTP error status.
    """
    # Bound the wait so an unresponsive server cannot hang the notebook.
    r = requests.get(url, timeout=10)
    # Classifying the body of a 4xx/5xx error page would give a meaningless
    # label, so fail loudly instead.
    r.raise_for_status()
    html_content = r.text
    soup = BeautifulSoup(html_content, 'lxml')
    article = soup.get_text()

    X_test = tfid.transform([article])
    # Use the classifier that was passed in rather than the global `clf`.
    return classifier.predict(X_test)[0]

def show_predicted_categories(urls, classifier):
    """Print the predicted category of every article in ``urls``.

    Args:
        urls: Iterable of article addresses.
        classifier: Fitted estimator forwarded to ``predict_category``.
    """
    for url in urls:
        # Forward the classifier argument rather than the global `clf`.
        category = predict_category(url, classifier)
        # str() keeps the message printable even if the label is not a str.
        print('Categorización de artículo: ' + str(category))

# Articles fetched live from the web to try the trained classifier on.
article_urls = [
    'https://trome.pe/mundial/brasil-vs-belgica-vivo-directo-tv-online-cuartos-final-mundial-rusia-2018-88466',
    'http://www.abc.es/economia/abci-bufetes-intentan-accionistas-bankia-vayan-juicio-201602190746_noticia.html',
    'http://www.elconfidencial.com/deportes/futbol/2016-02-19/torres-atletico-cope_1154857/',
    'http://archivo.elcomercio.pe/ciencias/investigaciones/vaticano-organiza-conferencia-sobre-agujeros-negros-noticia-1990705',
]
show_predicted_categories(article_urls, clf)

Categorización de artículo: \sports
Categorización de artículo: \economy
Categorización de artículo: \sports
Categorización de artículo: \science
