# Topic Extraction

In [85]:
import os
import logging
from sklearn.feature_extraction.text import TfidfVectorizer

from time import time

logging.basicConfig(level=logging.DEBUG)

In [86]:
corpus_files = os.listdir('./state_union')

In [87]:
class CorpusDataLoader:
    
    def __init__(self):
        self.data_folder = './state_union/'
        pass
    
    def get_filenames(self):
        return sorted(os.listdir(self.data_folder))
    
    def read_all_speeches(self):
        self.corpus = []
        for i, speech in enumerate(self.get_filenames()):
            self.corpus.append(self.read_speech(speech))
            logging.info("Read {0}".format(speech))
    
    def read_speech(self, speech_filename):
        with open(self.data_folder + speech_filename, "rb") as file:
            text = file.read()
            if type(text) is not str:
                try:
                    text = text.decode("utf-8")
                except:
                    text = text.decode("iso-8859-1")
            return text

In [88]:
dataloader = CorpusDataLoader()

In [89]:
dataloader.read_all_speeches()

INFO:root:Read 1945-Truman.txt
INFO:root:Read 1946-Truman.txt
INFO:root:Read 1947-Truman.txt
INFO:root:Read 1948-Truman.txt
INFO:root:Read 1949-Truman.txt
INFO:root:Read 1950-Truman.txt
INFO:root:Read 1951-Truman.txt
INFO:root:Read 1953-Eisenhower.txt
INFO:root:Read 1954-Eisenhower.txt
INFO:root:Read 1955-Eisenhower.txt
INFO:root:Read 1956-Eisenhower.txt
INFO:root:Read 1957-Eisenhower.txt
INFO:root:Read 1958-Eisenhower.txt
INFO:root:Read 1959-Eisenhower.txt
INFO:root:Read 1960-Eisenhower.txt
INFO:root:Read 1961-Kennedy.txt
INFO:root:Read 1962-Kennedy.txt
INFO:root:Read 1963-Johnson.txt
INFO:root:Read 1963-Kennedy.txt
INFO:root:Read 1964-Johnson.txt
INFO:root:Read 1965-Johnson-1.txt
INFO:root:Read 1965-Johnson-2.txt
INFO:root:Read 1966-Johnson.txt
INFO:root:Read 1967-Johnson.txt
INFO:root:Read 1968-Johnson.txt
INFO:root:Read 1969-Johnson.txt
INFO:root:Read 1970-Nixon.txt
INFO:root:Read 1971-Nixon.txt
INFO:root:Read 1972-Nixon.txt
INFO:root:Read 1973-Nixon.txt
INFO:root:Read 1974-Nixon.t

In [100]:
vectorizer = TfidfVectorizer(max_df=0.5, min_df=2, stop_words='english',
                                 use_idf=True)

In [101]:
X = vectorizer.fit_transform(dataloader.corpus)
print("n_samples: %d, n_features: %d" % X.shape)

n_samples: 65, n_features: 7040


In [102]:
from sklearn.cluster import KMeans
km = KMeans(n_clusters=10, init='k-means++', max_iter=100, n_init=1,
                verbose=True)

In [103]:
print("Clustering sparse data with %s" % km)
t0 = time()
km.fit(X)
print("done in %0.3fs" % (time() - t0))

Clustering sparse data with KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=100,
    n_clusters=10, n_init=1, n_jobs=1, precompute_distances='auto',
    random_state=None, tol=0.0001, verbose=True)
Initialization complete
Iteration  0, inertia 78.847
Iteration  1, inertia 41.690
Converged at iteration 1: center shift 0.000000e+00 within tolerance 1.228991e-08
done in 0.096s


In [104]:
order_centroids = km.cluster_centers_.argsort()[:, ::-1]

terms = vectorizer.get_feature_names()
for i in range(10):
    print("Cluster %d:" % i, end='')
    for ind in order_centroids[i, :10]:
        print(' %s' % terms[ind], end='')
    print()

Cluster 0: parents 21st pass college ought tell don got ll idea
Cluster 1: space alliance kennedy john dream nam viet divisions communist atlantic
Cluster 2: 1947 1946 veterans management expenditures 1945 adequate industrial bargaining collective
Cluster 3: oil salt strategic gulf israel foundation region conflict canal barrels
Cluster 4: peoples recommendations expenditures organization recommend communist general atomic field agriculture
Cluster 5: xand xa 1974 seventies property 92d localities xthe sixties truly
Cluster 6: applause terrorists terror iraq afghanistan terrorist qaeda al iraqi regime
Cluster 7: vietnam recommend try 1968 south abundance consumer 1966 communist 1967
Cluster 8: 1973 messages series outline lesson 1970s xall affairs notion credibility
Cluster 9: ll space deficits recovery regulations don 1982 dreams waste bipartisan
