In [3]:
# Importing Packages

import nltk #Tokenizing
import re #Preprocessing

# Importing classes within the packages

from nltk.corpus import PlaintextCorpusReader
from nltk.corpus import stopwords #Don't consider stopwords in histogram
from nltk.tokenize import RegexpTokenizer
from nltk.stem.porter import *
from nltk.stem import WordNetLemmatizer
from nltk import pos_tag
from nltk import FreqDist
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.metrics import adjusted_rand_score

In [9]:
# We specify the directory where the transcripts are
file_directory = 'data/transcripts'
 
# We include those filenames ending with '.txt'.
filename_pattern = '.+\.txt'
my_corpus = PlaintextCorpusReader(file_directory, filename_pattern)

In [10]:
# Extract sentences

sentences = my_corpus.sents('19_08_09.txt')
sent_list=[] # A list of all sentences in the transcript

for key1 in sentences:
    s=""
    for key2 in key1:
        s=s+key2+" "
    sent_list.append(s)

In [15]:
# Clustering- taking all sentences in the list

vectorizer = TfidfVectorizer(stop_words='english')
X = vectorizer.fit_transform(sent_list)

true_k = 5 # can change this value according to how many clusters are needed
model = KMeans(n_clusters=true_k, init='k-means++', max_iter=100, n_init=1)
model.fit(X)

print("Top terms per cluster:")
order_centroids = model.cluster_centers_.argsort()[:, ::-1]
terms = vectorizer.get_feature_names()
for i in range(true_k):
    print("\nCluster %d:" % i),
    for ind in order_centroids[i, :10]:
        print(' %s' % terms[ind]),
    print

Top terms per cluster:

Cluster 0:
 constitution
 ideology
 blacks
 basis
 trying
 malays
 black
 aspirations
 nation
 say

Cluster 1:
 aspiration
 years
 failed
 evident
 ex
 exceptional
 explicitly
 extra
 fail
 false

Cluster 2:
 field
 level
 going
 reach
 playing
 centuries
 decades
 trying
 everybody
 position

Cluster 3:
 recognise
 starting
 base
 started
 foundations
 remind
 point
 earth
 thought
 fail

Cluster 4:
 endowed
 hold
 truths
 evident
 created
 pursuit
 inalienable
 happiness
 creator
 certain


In [18]:
# Extracting lemmatized words to do a different cluster analysis
# Same procedure as NLP File

# Reading the text file

data=''
with open('data/transcripts/19_08_09.txt','r') as file:
    for line in file:
        s = str(line).strip()
        data = data + s + ' '
        
data = data.lower() # Contains all the data in the transcript as a string
data_split = data.split(" ")

# Removing punctuations and only considering words

pattern = r"\w+"
reg_split = re.findall(pattern,data) # A list of all words in the transcript 

# Tokenizing all the data

tokenizer = RegexpTokenizer(r'\w+')
data_tokens = tokenizer.tokenize(data)

# Now we remove stopwords from the data

stop_words = stopwords.words('english')

clean_data = []

for w in data_tokens:
    if w not in stop_words:
        clean_data.append(w)        

# We stem the data using Porter Stemmer

stemmed_data=[]
stemmer = PorterStemmer()

for w in clean_data:
    stemmed_data.append(stemmer.stem(w))

# Attach Parts of Speech to each word belonging to the clean data

pos_data = pos_tag(clean_data)

# We simplify the Parts of Speech tags
# We want to convert NNS to n and VBD to v

data_output=[]

for w in pos_data:
    data_output.append((w[0], w[1][0].lower()))

# We use a Word Lemmatizer
lemmatizer = WordNetLemmatizer()
lemmatized_data=[]

for w in clean_data: 
    lemmatized_data.append(lemmatizer.lemmatize(w))

# We combine the POS Tags and Lemmatized data

lemma_output_data = []
lemma_words=[]

for w in data_output:
    word = w[0]
    pos_word = w[1]
    lemma = word
    if pos_word in ['a', 's', 'r', 'n', 'v']:
        lemma = lemmatizer.lemmatize(word, pos=pos_word)
        lemma_words.append(lemma)
    
    #return (word, pos_tag, lemma)
    lemma_output_data.append((word,pos_word,lemma))

In [20]:
# Clustering- on the lemmatized words

vectorizer = TfidfVectorizer(stop_words='english')
X = vectorizer.fit_transform(lemma_words)

true_k = 5
model = KMeans(n_clusters=true_k, init='k-means++', max_iter=100, n_init=1)
model.fit(X)

print("Top terms per cluster:")
order_centroids = model.cluster_centers_.argsort()[:, ::-1]
terms = vectorizer.get_feature_names()
for i in range(true_k):
    print("\nCluster %d:" % i),
    for ind in order_centroids[i, :10]:
        print(' %s' % terms[ind]),
    print

Top terms per cluster:

Cluster 0:
 malay
 constitution
 make
 treat
 aspiration
 position
 singapore
 try
 government
 read

Cluster 1:
 way
 year
 fee
 govern
 gi
 ghetto
 fund
 foundation
 forward
 flaw

Cluster 2:
 right
 year
 fee
 govern
 gi
 ghetto
 fund
 foundation
 forward
 flaw

Cluster 3:
 black
 year
 fight
 government
 govern
 gi
 ghetto
 fund
 foundation
 forward

Cluster 4:
 place
 year
 field
 govern
 gi
 ghetto
 fund
 foundation
 forward
 flaw
