In [1]:
import os
import re
import nltk
import numpy as np
import math

from numpy.linalg import norm
from numpy import dot

from collections import Counter 

from math import factorial as fct
from scipy.special import factorial

from decimal import Decimal

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

In [2]:
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to /home/parzival/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/parzival/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /home/parzival/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [3]:
directory = "../collection/20_newsgroup"

In [4]:
def preprocess_line(text, lemmatizer=WordNetLemmatizer()):
    words = [w.lower() for w in word_tokenize(text)]
    tokens = []
    for token in words:
        tokens.extend(re.split('[^a-zA-Z]', token))
    token_list = [lemmatizer.lemmatize(token) for token in tokens if not token in stopwords.words('english')] 
    return list(filter(lambda token: len(token), token_list))

In [5]:
def preprocess_doc(doc_path):
    
    lemmatizer = WordNetLemmatizer()
    
    with open(doc_path, encoding="utf8", errors='ignore') as f:

        tokens = []
        endOfDoc=0
        isHeader=1
        
        data = list(filter( lambda line: line!="\n",f.readlines()))
        for line in data:
            text = line
            tokens.extend(preprocess_line(text,lemmatizer))
            
        return tokens

### Create Posting List

In [6]:
def create_posting(newsgroup="comp.graphics"):
    
    doc_id = -1
    doc_name = {}
    post_list = {}
    doc_len = {}
    idf = {}
    tf = {}
    
    working_dir = directory+"/"+newsgroup
    for file in os.listdir(working_dir):
        doc_id += 1
        tf[doc_id] = {}
        doc_len[doc_id] = 0
        
        filename = os.fsdecode(file)
        doc_name[doc_id] = filename
        
        file_path = os.path.join(working_dir,file)
        
        tokens = preprocess_doc(file_path)
        
        doc_len[doc_id] = len(tokens)
        
        for token in tokens:    
            if post_list.get(token) == None:
                post_list[token] = [doc_id]
                tf[doc_id][token] = 0
                idf[token] = 1
            elif tf[doc_id].get(token) == None:
                post_list[token].append(doc_id)
                tf[doc_id][token] = 0
                idf[token] += 1
            tf[doc_id][token] += 1
            
        for j in tf[doc_id].keys():
            tf[doc_id][j] /= doc_len[doc_id]
            
    for i in idf.keys():
        idf[i]=math.log10((doc_id+1)/idf[i])
    
    return {"doc_names": doc_name, "post_list": post_list, "doc_lens": doc_len, "tf":tf, "idf":idf}            

In [7]:
postings = create_posting()

### Text Clustering

In [41]:
class TextClustering:
    def __init__(self,posting_list):
        self.postings = posting_list
        self.word_ids = {}
        self.doc_vecs = self.__create_embeddings__(posting_list)
        
    def __create_embeddings__(self,posting_list):
        no_docs = len(posting_list["doc_names"].keys())
        doc_vec = np.zeros((len(posting_list["doc_names"]),len(posting_list["idf"])))
        cnt = 0
        word_ids = {}
        for w in posting_list["idf"].keys():
            word_ids[w] = cnt
            cnt += 1
        for i in range(no_docs):
            for j in posting_list["tf"][i].keys():
                doc_vec[i][word_ids[j]]=posting_list["tf"][i][j]*posting_list["idf"][j]
        for i in range(no_docs):
            if norm(doc_vec[i])>0:
                doc_vec[i]=doc_vec[i]/norm(doc_vec[i])
        
        self.word_ids = word_ids
        return doc_vec
    
    def __get_init_dist__(self,k):
        init = list(np.random.randint(0,k,len(self.postings["doc_names"])))
        cnt = Counter(init)
        keys = cnt.keys()
        keys_missing = [ x for x in range(k) if x not in keys]

        if len(keys_missing)!=0:
            c=0
            temp = {}
            for i in range(len(init)):
                if init[i] not in temp.keys():
                    temp[init[i]] = 1
                elif c<len(keys_missing):
                    init[i] = keys_missing[c]
                    c+=1
                    temp[init[i]] = 1
                else:
                    temp[init[i]] += 1  
        return init
    
    def __get_means__(self,k,cluster_x):
        means =  np.zeros((k,len(self.postings["idf"])))   
        for cl in range(len(cluster_x)):
            for d in cluster_x[cl]:
                means[cl] += self.doc_vecs[d]
            means[cl] /= len(cluster_x[cl]) 
        return means
    
    
    def __get_cor_mat__(self,means):
        k = len(means)
        cor_mat = np.zeros((k,len(self.postings["doc_names"])))
        k,docs = cor_mat.shape
        for m in range(k):
            for d in range(docs):
                cor_mat[m][d] = np.linalg.norm(self.doc_vecs[d] - means[m])
        return cor_mat
    

    def __check__(self,cluster_x,cluster_y):
        k = len(cluster_x)
        for i in range(len(cluster_x)):
            if Counter(cluster_x[i]) != Counter(cluster_y[i]):
                return False    
        return True
        
    
    def kmeans(self,k,iters=100):
        cluster_x = [[] for x in range(k)]
        cluster_y = [[] for y in range(k)]
        temp = [[] for y in range(k)]
        
        
        init = self.__get_init_dist__(k)
        
        for i in range(len(init)):
            cluster_x[init[i]].append(i)
            
        means = np.zeros((k,len(self.postings["idf"])))
        cor_mat = np.zeros((k,len(self.postings["doc_names"])))
        
        for it in range(iters):
            
            means = self.__get_means__(k,cluster_x)
            cor_mat = self.__get_cor_mat__(means)
            
            mins = np.argmin(cor_mat,axis=0)
            
            for i in range(len(mins)):
                cluster_y[mins[i]].append(i)
                
            if not self.__check__(cluster_x,cluster_y):
                cluster_x = cluster_y
                cluster_y = temp
                continue
            
            break
        
        print("Size of each cluster")
        for m in range(k):
            print(m,len(cluster_y[m]))
            print()
            for i in range(len(cluster_y[m])):
                cluster_y[m][i] = self.postings["doc_names"][cluster_y[m][i]]
            print(cluster_y[m])
            print()
        
#         print("Docs in each cluster")
#         print(cluster_y)
#         return cluster_y           

In [42]:
tc = TextClustering(postings)

In [44]:
tc.kmeans(20)

Size of each cluster
0 105

['38346', '38767', '38995', '38741', '38994', '39057', '39674', '38723', '39656', '38832', '38968', '38459', '39643', '38386', '38308', '38819', '39058', '38939', '39063', '38702', '39624', '40027', '38925', '38897', '38764', '37952', '38762', '38570', '38757', '38571', '38276', '38501', '37959', '38385', '38867', '38611', '38877', '39662', '39738', '38758', '38977', '38296', '38334', '38596', '38992', '37941', '39654', '38768', '38970', '38940', '38893', '38919', '38691', '38346', '38767', '38995', '38741', '38994', '39057', '39674', '38723', '38947', '39656', '38832', '38968', '39051', '38459', '39643', '38386', '38308', '38819', '39058', '38939', '39063', '38702', '39624', '40027', '38925', '38897', '38764', '37952', '39029', '38757', '38571', '38276', '38501', '37959', '38385', '38867', '38611', '38877', '39662', '39738', '38977', '38296', '38334', '38596', '38992', '39654', '38768', '38970', '38940', '38893', '38919', '38691']

1 78

['39653', '39042', 