In [1]:
import pandas as pd 
import re
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer, word_tokenize
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer
# from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import Normalizer
from sklearn import metrics

from sklearn.cluster import KMeans, MiniBatchKMeans

import logging
from optparse import OptionParser
import sys
from time import time

import numpy as np

In [2]:
articles = pd.read_csv('E:/OneDrive - CELSIA S.A E.S.P/Maestría/Almacenamiento/articles.csv',sep = ',', encoding= 'utf-8')
print(f"La cantidad de columnas es: {len(articles.columns)}")
print(f"Las candidatas a ser seleccionada para un procesamiento de texto serían: title, description y subject")
articles.head(5)

La cantidad de columnas es: 5
Las candidatas a ser seleccionada para un procesamiento de texto serían: title, description y subject


Unnamed: 0,identifier,title,description,subject,creator
0,http://arxiv.org/abs/0704.3504,Smooth R\'enyi Entropy of Ergodic Quantum Info...,We prove that the average smooth Renyi entro...,Quantum Physics ; Computer Science - Informati...,"Schoenmakers, Berry ; Tjoelker, Jilles ; Tuyls..."
1,http://arxiv.org/abs/0706.1402,Analyzing Design Process and Experiments on th...,"In the field of tutoring systems, investigat...",Computer Science - Computers and Society ; Com...,"Brust, Matthias R. ; Rothkugel, Steffen ;"
2,http://arxiv.org/abs/0710.0736,Colour image segmentation by the vector-valued...,We propose a new method for the numerical so...,Computer Science - Computer Vision and Pattern...,"Kay, David A ; Tomasi, Alessandro ;"
3,http://arxiv.org/abs/0803.2570,Unequal Error Protection: An Information Theor...,An information theoretic framework for unequ...,Computer Science - Information Theory ; Comput...,"Borade, Shashi ; Nakiboglu, Baris ; Zheng, Liz..."
4,http://arxiv.org/abs/0808.0084,On the hitting times of quantum versus random ...,In this paper we define new Monte Carlo type...,Quantum Physics ; Computer Science - Data Stru...,"Magniez, Frederic ; Nayak, Ashwin ; Richter, P..."


In [3]:
articles['combine_column'] = articles['title']+articles['description']
stopWords = stopwords.words('english')

In [4]:
def clean_files(texto):
    #Quitar todos los acentos
    #texto = unidecode.unidecode(texto)
    
    #Quitar todos los caracteres especiales
    texto = re.sub('[^A-Za-z0-9]+',' ',texto)
    texto =re.sub('((et al\.)|(i\.i\.d\.)|(i\.e\.)|\'|\’|\`)',' ',texto) # Eliminar abreviaciones, apostrofes y guion
    texto =re.sub('(á|à|ä)','a',texto) # Reemplazar a acentuada
    texto =re.sub('(é|è|ë)','e',texto) # Reemplazar e acentuada
    texto =re.sub('(í|ì|ï)','i',texto) # Reemplazar i acentuada
    texto =re.sub('(ó|ò|ö)','o',texto) # Reemplazar o acentuada
    texto =re.sub('(ú|ù|ü)','u',texto) # Reemplazar u acentuada
    texto =re.sub('-|\u2212|\u2012|\u2013|\u2014|\u2015',' ',texto) # Reemplazar el guión
    texto =re.sub('[^a-zA-Z]',' ',texto) # Eliminar caracteres que no sean: letra, número o vocales acentuadas
    
    #Pasar todo a minisculas
    texto = texto.lower()
    
    #Tokenizar
    tokens = texto.split()
    
    tokens2 = [w for w in tokens if (len(w)>1)&(w.isalpha())&(w not in stopWords)]
    
    #Lematización
    word_net_lemmatizar = WordNetLemmatizer()

    tokens3 = [word_net_lemmatizar.lemmatize(w, pos = "v") for w in tokens2]
   
    #Stemmer
#     ps = PorterStemmer() 
#     tokens4 = [ps.stem(w) for w in tokens3]
     
    #Se retorna el texto nuevamente en un solo string luego de ser procesado
    to_return = ' '.join(tokens3)

    return to_return

In [5]:
articles['combine_cleaned'] = articles['combine_column'].apply(clean_files)

In [6]:
articles['cuenta'] = articles['combine_cleaned'].apply(lambda x: len(x.split()))

In [7]:
total_palabras = articles['cuenta'].sum()

# Número de grupos

In [8]:
true_k = 15

In [9]:
#Es una matriz dispersa que contiene el TF-IDF

#limite de palabras el número total de palabras en max_features y 
#el min_df el número minimo de documentos en los que debe estar para ser considerada en el BoW

vectorizer = TfidfVectorizer(max_df=0.5, max_features=total_palabras,
                                 min_df=1, stop_words='english',
                                 use_idf=True)

In [10]:
X = vectorizer.fit_transform(articles['combine_cleaned'].values)

In [11]:
print(X.shape)

(980, 8081)


In [12]:
svd = TruncatedSVD(n_components=500 )
normalizer = Normalizer(copy=False)
lsa = make_pipeline(svd, normalizer)

Xlsa = lsa.fit_transform(X)


explained_variance = svd.explained_variance_ratio_.sum()
print("Explained variance of the SVD step: {}%".format(int(explained_variance * 100)))

Explained variance of the SVD step: 75%


In [16]:
print(Xlsa.shape)

(980, 500)


In [17]:
km = KMeans(n_clusters=true_k, init='k-means++', max_iter=1000, n_init=1)

print("Clustering sparse data with %s" % km)
t0 = time()
km.fit(Xlsa)
print("done in %0.3fs" % (time() - t0))
print("Silhouette Coefficient: %0.3f"
      % metrics.silhouette_score(Xlsa, km.labels_, sample_size=1000))

Clustering sparse data with KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=1000,
    n_clusters=15, n_init=1, n_jobs=None, precompute_distances='auto',
    random_state=None, tol=0.0001, verbose=0)
done in 0.127s
Silhouette Coefficient: 0.010


In [18]:
order_centroids = km.cluster_centers_.argsort()[:, ::-1]


terms = vectorizer.get_feature_names()
for i in range(true_k):
    print("Cluster %d:" % i, end='')
    for ind in order_centroids[i, :10]:
        print(' %s' % terms[ind], end='')
    print()

Cluster 0: aapso abilities abbreviate abmash actors abac accessible acp adapt affine
Cluster 1: aapso abbreviate aaronson abilities ability abelianized able academic abramsky abs
Cluster 2: aapso activities abstract accord activate accomplish academic abstraction absent absoluteness
Cluster 3: aaronson aapso abac abrego absolutely able absolute abelian abmash absoluteness
Cluster 4: aapso abelianized absoluteness abridge abbreviate accentuate actually accelerators activities accessible
Cluster 5: abboud aapso ability abmash abelian abrupt academic additive acp adjoin
Cluster 6: aapso abac abboud abbreviate abramsky abs abelianized abmash absence ability
Cluster 7: aapso acceleration abramsky accord abide acquire academics activity achievability act
Cluster 8: aapso abelian abbreviate abs abstract accentuate acausal accommodate abstractions abramsky
Cluster 9: aapso abide accept abundance absence abrupt ac access absolutely accordingly
Cluster 10: aapso abilities abac able absent absenc

In [20]:
articles['cluster'] = km.labels_

In [24]:
articles['name_file'] = articles['identifier'].apply(lambda x: x.split('/')[-1]+'.txt')

In [27]:
articles.to_csv('docByCluster.csv', index = False)