# Clustering Example

In [2]:
import string, nltk
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.cluster import KMeans
import pandas as pd

# Toy corpus for the clustering demo: a list of twelve English sentences.
# Each string is treated as one document by the preprocessing, vectorizing
# and clustering steps that follow.
collection = [
   "There were a king with a large jaw and a queen with a plain face, on the throne of England; there were a king with a large jaw and a queen with a fair face, on the throne of France.",
    "In both countries it was clearer than crystal to the lords of the State preserves of loaves and fishes, that things in general were settled for ever.",
    "It was the year of Our Lord one thousand seven hundred and seventy-five.",
    "Spiritual revelations were conceded to England at that favoured period, as at this.",
    "Mrs. Southcott had recently attained her five-and-twentieth blessed birthday, of whom a prophetic private in the Life Guards had heralded the sublime appearance by announcing that arrangements were made for the swallowing up of London and Westminster.",
    "Even the Cock-lane ghost had been laid only a round dozen of years, after rapping out its messages, as the spirits of this very year last past (supernaturally deficient in originality) rapped out theirs.", 
    "Mere messages in the earthly order of events had lately come to the English Crown and People, from a congress of British subjects in America: which, strange to relate, have proved more important to the human race than any communications yet received through any of the chickens of the Cock-lane brood.",
    "France, less favoured on the whole as to matters spiritual than her sister of the shield and trident, rolled with exceeding smoothness down hill, making paper money and spending it.",
    "Under the guidance of her Christian pastors, she entertained herself, besides, with such humane achievements as sentencing a youth to have his hands cut off, his tongue torn out with pincers, and his body burned alive, because he had not kneeled down in the rain to do honour to a dirty procession of monks which passed within his view, at a distance of some fifty or sixty yards.",
    "It is likely enough that, rooted in the woods of France and Norway, there were growing trees, when that sufferer was put to death, already marked by the Woodman, Fate, to come down and be sawn into boards, to make a certain movable framework with a sack and a knife in it, terrible in history.",
    "It is likely enough that in the rough outhouses of some tillers of the heavy lands adjacent to Paris, there were sheltered from the weather that very day, rude carts, bespattered with rustic mire, snuffed about by pigs, and roosted in by poultry, which the Farmer, Death, had already set apart to be his tumbrils of the Revolution.",
    "But that Woodman and that Farmer, though they work unceasingly, work silently, and no one heard them as they went about with muffled tread: the rather, forasmuch as to entertain any suspicion that they were awake, was to be atheistical and traitorous.",
]

# Step 1 - strip punctuation. The deletion table is built once and reused for
# every document. Note that deleting hyphens fuses compound words together
# (e.g. "seventy-five" becomes "seventyfive").
punct_eraser = str.maketrans('', '', string.punctuation)
coll_nopunct = [doc.translate(punct_eraser) for doc in collection]

# Step 2 - fold every document to lower case
coll_lower = list(map(str.lower, coll_nopunct))

# Step 3 - tokenize with NLTK, then glue the tokens back together with single
# spaces so that each document is a plain string again
coll_ready = [' '.join(word_tokenize(doc)) for doc in coll_lower]

# Display the preprocessed documents, one per line, followed by blank lines
for sentence in coll_ready:
    print(sentence)
print("\n")

there were a king with a large jaw and a queen with a plain face on the throne of england there were a king with a large jaw and a queen with a fair face on the throne of france
in both countries it was clearer than crystal to the lords of the state preserves of loaves and fishes that things in general were settled for ever
it was the year of our lord one thousand seven hundred and seventyfive
spiritual revelations were conceded to england at that favoured period as at this
mrs southcott had recently attained her fiveandtwentieth blessed birthday of whom a prophetic private in the life guards had heralded the sublime appearance by announcing that arrangements were made for the swallowing up of london and westminster
even the cocklane ghost had been laid only a round dozen of years after rapping out its messages as the spirits of this very year last past supernaturally deficient in originality rapped out theirs
mere messages in the earthly order of events had lately come to the english 

In [3]:
# Fit a bag-of-words model on the preprocessed corpus and print its vocabulary.
vec_count = CountVectorizer()
X_count = vec_count.fit_transform(coll_ready)

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# get_feature_names_out() is its replacement. Fall back to the old accessor so
# the cell still runs on older installations. The names are fetched once and
# converted to a plain list so the printed form matches the original output.
if hasattr(vec_count, "get_feature_names_out"):
    vocabulary = vec_count.get_feature_names_out().tolist()
else:
    vocabulary = vec_count.get_feature_names()

print("Collection vocabulary:\n", vocabulary, "\n")
print("Collection vocabulary size: ", len(vocabulary), " terms.\n")

Collection vocabulary:
 ['about', 'achievements', 'adjacent', 'after', 'alive', 'already', 'america', 'and', 'announcing', 'any', 'apart', 'appearance', 'arrangements', 'as', 'at', 'atheistical', 'attained', 'awake', 'be', 'because', 'been', 'besides', 'bespattered', 'birthday', 'blessed', 'boards', 'body', 'both', 'british', 'brood', 'burned', 'but', 'by', 'carts', 'certain', 'chickens', 'christian', 'clearer', 'cocklane', 'come', 'communications', 'conceded', 'congress', 'countries', 'crown', 'crystal', 'cut', 'day', 'death', 'deficient', 'dirty', 'distance', 'do', 'down', 'dozen', 'earthly', 'england', 'english', 'enough', 'entertain', 'entertained', 'even', 'events', 'ever', 'exceeding', 'face', 'fair', 'farmer', 'fate', 'favoured', 'fifty', 'fishes', 'fiveandtwentieth', 'for', 'forasmuch', 'framework', 'france', 'from', 'general', 'ghost', 'growing', 'guards', 'guidance', 'had', 'hands', 'have', 'he', 'heard', 'heavy', 'her', 'heralded', 'herself', 'hill', 'his', 'history', 'honou

In [6]:
# Term-frequency (raw count) matrix. The count vectorizer was already fitted on
# this same corpus when the vocabulary was printed, so reuse that result rather
# than fitting the vectorizer a second time.
X_tf = X_count

# Re-weight the counts into a tf-idf matrix and print it in dense form
vec_tfidf = TfidfTransformer()
X_tfidf = vec_tfidf.fit_transform(X_tf)
print("Term frequency - inverse document frequency vectorizer matrix:\n", X_tfidf.toarray())
# Last expression: displayed as the cell result (documents x vocabulary terms)
X_tfidf.shape

Term frequency - inverse document frequency vectorizer matrix:
 [[0.         0.         0.         ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]
 ...
 [0.         0.         0.         ... 0.         0.         0.        ]
 [0.12812925 0.         0.14919366 ... 0.         0.         0.        ]
 [0.13215149 0.         0.         ... 0.         0.         0.        ]]


(12, 260)

In [5]:
# Cluster the tf-idf vectors with k-means and list each sentence with its cluster.
n_cl = 6  # number of clusters to form

# k-means starts from randomly chosen centroids, so without a fixed seed the
# cluster assignment changes from run to run. random_state pins it, and n_init
# is given explicitly because its default changed from 10 to 'auto' in
# scikit-learn 1.4.
model = KMeans(n_clusters=n_cl, n_init=10, random_state=42).fit(X_tfidf)
labels = model.labels_  # one cluster index per row of X_tfidf

# Pair every original (unprocessed) sentence with its cluster and show them
# grouped by cluster index
sent_cl = pd.DataFrame(list(zip(collection,labels)),columns=['sentence', 'cluster'])
print(sent_cl.sort_values(by=['cluster']))

                                             sentence  cluster
5   Even the Cock-lane ghost had been laid only a ...        0
6   Mere messages in the earthly order of events h...        0
8   Under the guidance of her Christian pastors, s...        0
1   In both countries it was clearer than crystal ...        1
9   It is likely enough that, rooted in the woods ...        1
10  It is likely enough that in the rough outhouse...        1
11  But that Woodman and that Farmer, though they ...        1
3   Spiritual revelations were conceded to England...        2
0   There were a king with a large jaw and a queen...        3
7   France, less favoured on the whole as to matte...        3
2   It was the year of Our Lord one thousand seven...        4
4   Mrs. Southcott had recently attained her five-...        5
