#### Dataset 

In [80]:
# Dataset: bbc-news-data.txt
# Location: https://mitu.co.in/dataset

#### Import necessary libraries

In [81]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

#### Read dataset

In [82]:
# Read the corpus: every line of the file becomes one element of `data`
# (trailing newlines included). The `with` block guarantees the file handle
# is closed even if reading fails.
with open('/home/mitu/bbc-text-data.csv') as file:
    data = file.readlines()

In [84]:
# Number of lines read; each line is passed to the vectorizer below as one document.
len(data)

2225

#### Feature Extraction

In [85]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [86]:
# TF-IDF vectorizer configured with scikit-learn's built-in English stop-word list.
tfidf = TfidfVectorizer(stop_words='english')

In [87]:
# Learn the vocabulary from `data` and encode every line as one TF-IDF row.
x_vect = tfidf.fit_transform(data)

In [88]:
# (number of documents, number of vocabulary terms)
x_vect.shape

(2225, 29126)

#### Build the model

In [105]:
from sklearn.cluster import AgglomerativeClustering
from scipy.cluster.hierarchy import linkage

In [115]:
# NOTE: despite the name `km`, this is agglomerative (hierarchical) clustering,
# not k-means. Only the number of clusters is set; every other option is left
# at the library default.
km = AgglomerativeClustering(n_clusters=5)

In [116]:
# Fit the model and get one cluster id per document.
# `.toarray()` expands the TF-IDF matrix to a dense array first (presumably
# because this estimator does not accept sparse input — confirm for the
# installed scikit-learn version). At the (2225, 29126) shape shown above that
# is about 65 million entries, so this step is memory-heavy.
labels = km.fit_predict(x_vect.toarray())

In [117]:
# Cluster id assigned to each document, in the same order as `data`.
labels

array([4, 0, 0, ..., 0, 0, 0])

In [118]:
# The fitted estimator keeps the assignments too; the output matches `labels`.
km.labels_

array([4, 0, 0, ..., 0, 0, 0])

In [119]:
# Sanity check: one label per input line (same count as len(data)).
len(labels)

2225

#### Extract the clusters

In [124]:
# Documents per cluster, largest cluster first, as a plain {cluster_id: count} dict.
cluster_sizes = pd.Series(labels).value_counts()
cluster_sizes.to_dict()

{0: 1121, 4: 368, 2: 365, 1: 225, 3: 146}

In [125]:
# Element-wise comparison: a boolean mask that is True where the document is in cluster 3.
labels == 3

array([False, False, False, ..., False, False, False])

In [120]:
# Wrap the raw lines in a one-column DataFrame so they can be filtered with
# boolean masks built from `labels`.
df = pd.DataFrame(data, columns=['text'])

In [121]:
# Report how many documents landed in each of the five clusters.
for cluster_id in range(5):
    print(f'{cluster_id}:', len(df[labels == cluster_id]))

0: 1121
1: 225
2: 365
3: 146
4: 368


In [98]:
from nltk import FreqDist
from nltk.corpus import stopwords

In [99]:
# Ten most frequent words in one cluster, ignoring English stop words and
# any token that is not purely alphabetic.
cluster_id = 2

# Build the stop-word set once. Calling stopwords.words('english') inside the
# comprehension would evaluate it for every token and scan a list on each
# membership test.
stop_words = set(stopwords.words('english'))

# Texts of the documents assigned to this cluster.
cluster_texts = df[labels == cluster_id]['text']

# Whitespace-tokenise every document and keep only the words of interest.
words = [
    word
    for sent in cluster_texts
    for word in sent.split()
    if word.isalpha() and word not in stop_words
]

freq = FreqDist(words)
freq.most_common(10)

[('said', 373),
 ('first', 356),
 ('england', 344),
 ('win', 308),
 ('last', 306),
 ('game', 302),
 ('world', 292),
 ('two', 275),
 ('one', 264),
 ('would', 251)]

In [100]:
# Ten most frequent words in one cluster, ignoring English stop words and
# any token that is not purely alphabetic.
cluster_id = 0

# Build the stop-word set once. Calling stopwords.words('english') inside the
# comprehension would evaluate it for every token and scan a list on each
# membership test.
stop_words = set(stopwords.words('english'))

# Texts of the documents assigned to this cluster.
cluster_texts = df[labels == cluster_id]['text']

# Whitespace-tokenise every document and keep only the words of interest.
words = [
    word
    for sent in cluster_texts
    for word in sent.split()
    if word.isalpha() and word not in stop_words
]

freq = FreqDist(words)
freq.most_common(10)

[('said', 2550),
 ('mr', 1237),
 ('would', 1202),
 ('us', 1137),
 ('also', 1015),
 ('new', 920),
 ('year', 792),
 ('one', 769),
 ('people', 728),
 ('last', 704)]

In [101]:
# Ten most frequent words in one cluster, ignoring English stop words and
# any token that is not purely alphabetic.
cluster_id = 1

# Build the stop-word set once. Calling stopwords.words('english') inside the
# comprehension would evaluate it for every token and scan a list on each
# membership test.
stop_words = set(stopwords.words('english'))

# Texts of the documents assigned to this cluster.
cluster_texts = df[labels == cluster_id]['text']

# Whitespace-tokenise every document and keep only the words of interest.
words = [
    word
    for sent in cluster_texts
    for word in sent.split()
    if word.isalpha() and word not in stop_words
]

freq = FreqDist(words)
freq.most_common(10)

[('mr', 1244),
 ('said', 851),
 ('would', 630),
 ('labour', 603),
 ('blair', 449),
 ('party', 400),
 ('election', 370),
 ('government', 330),
 ('brown', 312),
 ('people', 294)]

In [102]:
# Ten most frequent words in one cluster, ignoring English stop words and
# any token that is not purely alphabetic.
cluster_id = 3

# Build the stop-word set once. Calling stopwords.words('english') inside the
# comprehension would evaluate it for every token and scan a list on each
# membership test.
stop_words = set(stopwords.words('english'))

# Texts of the documents assigned to this cluster.
cluster_texts = df[labels == cluster_id]['text']

# Whitespace-tokenise every document and keep only the words of interest.
words = [
    word
    for sent in cluster_texts
    for word in sent.split()
    if word.isalpha() and word not in stop_words
]

freq = FreqDist(words)
freq.most_common(10)

[('film', 555),
 ('best', 385),
 ('director', 162),
 ('said', 162),
 ('also', 151),
 ('awards', 151),
 ('award', 139),
 ('year', 138),
 ('films', 130),
 ('actor', 126)]

In [103]:
# Ten most frequent words in one cluster, ignoring English stop words and
# any token that is not purely alphabetic.
cluster_id = 4

# Build the stop-word set once. Calling stopwords.words('english') inside the
# comprehension would evaluate it for every token and scan a list on each
# membership test.
stop_words = set(stopwords.words('english'))

# Texts of the documents assigned to this cluster.
cluster_texts = df[labels == cluster_id]['text']

# Whitespace-tokenise every document and keep only the words of interest.
words = [
    word
    for sent in cluster_texts
    for word in sent.split()
    if word.isalpha() and word not in stop_words
]

freq = FreqDist(words)
freq.most_common(10)

[('said', 1136),
 ('people', 828),
 ('also', 492),
 ('new', 480),
 ('mobile', 446),
 ('mr', 443),
 ('would', 429),
 ('technology', 416),
 ('could', 410),
 ('one', 403)]