In [None]:
## Topic Modelling of DL Definitions

In [44]:
import pandas as pd
from gensim import corpora, models, similarities
import bleach
from nltk.corpus import stopwords
from nltk import ConditionalFreqDist, FreqDist
from nltk.stem.snowball import SnowballStemmer
import nltk
import gexf as g
import collections
import os.path
import re

# Fix invalid display error
%matplotlib inline


In [74]:
indirname = "/home/paulus/docs/datalit/"
paperlist = []
titlelist = []
paperdict = dict()
   
for fn in os.listdir(indirname ):
    if fn.endswith(".txt"):
       
        hdl = open(indirname + fn)
        txt = hdl.read()
        paperlist.append(txt)
        
        index_of_dot = fn.index('.')
        fns = fn[:index_of_dot].replace(",","")
        print ('adding - ' + fns)
        titlelist.append(fns)
        paperdict[fns]=dict()
        paperdict[fns]["text"] = txt


adding - Carlson et al
adding - Fonticiaro Oehrli - 2016 - Why data literacy matters
adding - blog - sod - defining dl
adding - Teal et al
adding - Determining Data Information Literacy Needs- A Study of Students
adding - Erwin - 2015 - Data Literacy Real-World Learning Through Problem-Solving With Data Sets
adding - Data-Pop Alliance - 2015 - Beyond Data Literacy Reinventing Community Engagement and Empowerment in the Age of Data
adding - blog- Advancing Data Literacy
adding - Martin - 2014 - What Is Data Literacy
adding - Mandinach Gummer - 2013 - A Systemic View of Implementing Data Literacy in Educator Preparation
adding - Koltay - 2015 - Data literacy in search of a name and identity
adding - Anderson et al
adding - Federer Lu Joubert - 2016 - Data literacy training needs of biomedical researchers


In [4]:

# load nltk's English stopwords as variable called 'stopwords'
stopwords = nltk.corpus.stopwords.words('english')

# load nltk's SnowballStemmer as variabled 'stemmer'

stemmer = SnowballStemmer("english")

In [16]:
def tokenize_and_stem(text):
    # first tokenize by sentence, then by word to ensure that punctuation is caught as it's own token
    tokens = [word for sent in nltk.sent_tokenize(text) for word in nltk.word_tokenize(sent)]
    filtered_tokens = []
    # filter out any tokens not containing letters (e.g., numeric tokens, raw punctuation)
    for token in tokens:
        if re.search('[a-zA-Z]', token):
            filtered_tokens.append(token)
    stems = [stemmer.stem(t) for t in filtered_tokens]
    return stems


def tokenize_only(text):
    # first tokenize by sentence, then by word to ensure that punctuation is caught as it's own token
    tokens = [word.lower() for sent in nltk.sent_tokenize(text) for word in nltk.word_tokenize(sent)]
    filtered_tokens = []
    # filter out any tokens not containing letters (e.g., numeric tokens, raw punctuation)
    for token in tokens:
        if re.search('[a-zA-Z]', token):
            filtered_tokens.append(token)
    return filtered_tokens


In [65]:
#not super pythonic, no, not at all.
#use extend so it's a big flat list of vocab
totalvocab_stemmed = []
totalvocab_tokenized = []
for i in paperlist:
    i = i.decode('utf-8')
    allwords_stemmed = tokenize_and_stem(i) #for each item in 'synopses', tokenize/stem
    totalvocab_stemmed.extend(allwords_stemmed) #extend the 'totalvocab_stemmed' list
    
    allwords_tokenized = tokenize_only(i)
    totalvocab_tokenized.extend(allwords_tokenized)

In [67]:

vocab_frame = pd.DataFrame({'words': totalvocab_tokenized}, index = totalvocab_stemmed)
print ('there are ' + str(vocab_frame.shape[0]) + ' items in vocab_frame')

there are 69153 items in vocab_frame


In [68]:
from sklearn.feature_extraction.text import TfidfVectorizer

#define vectorizer parameters
tfidf_vectorizer = TfidfVectorizer(max_df=0.8, max_features=200000,
                                 min_df=0.2, stop_words='english',
                                 use_idf=True, tokenizer=tokenize_and_stem, ngram_range=(1,3))

%time tfidf_matrix = tfidf_vectorizer.fit_transform(paperlist) #fit the vectorizer to synopses

print(tfidf_matrix.shape)

CPU times: user 3.64 s, sys: 76.6 ms, total: 3.72 s
Wall time: 3.66 s
(13, 2227)


In [69]:
terms = tfidf_vectorizer.get_feature_names()
#print terms

In [60]:
from sklearn.cluster import KMeans

num_clusters = 4

km = KMeans(n_clusters=num_clusters)

%time km.fit(tfidf_matrix)

clusters = km.labels_.tolist()

CPU times: user 80.1 ms, sys: 0 ns, total: 80.1 ms
Wall time: 79.5 ms


In [70]:
from sklearn.externals import joblib

#uncomment the below to save your model 
#since I've already run my model I am loading from the pickle

joblib.dump(km,  'doc_cluster.pkl')

#km = joblib.load('doc_cluster.pkl')
#clusters = km.labels_.tolist()

['doc_cluster.pkl', 'doc_cluster.pkl_01.npy', 'doc_cluster.pkl_02.npy']

In [75]:
arts = { 'title': titlelist, 'text': paperlist, 'cluster': clusters }

frame = pd.DataFrame(arts, index = [clusters] , columns = ['title', 'cluster'])
frame['cluster'].value_counts() #number of films per cluster (clusters from 0 to 4)

3    7
1    3
0    2
2    1
dtype: int64

In [76]:
from __future__ import print_function

print("Top terms per cluster:")
print()
#sort cluster centers by proximity to centroid
order_centroids = km.cluster_centers_.argsort()[:, ::-1] 

for i in range(num_clusters):
    print("Cluster %d words:" % i, end='')
    
    for ind in order_centroids[i, :25]: #replace 6 with n words per cluster
        print(' %s' % vocab_frame.ix[terms[ind].split(' ')].values.tolist()[0][0].encode('utf-8', 'ignore'), end=',')
    print() #add whitespace
    print() #add whitespace
    
    print("Cluster %d titles:" % i, end='')
    print(frame.ix[i]['title'])
    #for title in frame.ix[i]['title'].value.tolist():
        #print(' %s,' % title, end='')
    print() #add whitespace
    print() #add whitespace
    
print()
print()

Top terms per cluster:

Cluster 0 words: teacher, decisions, mandinach, decisions, data-driven, et, al., data-driven, data-driven, course, state, parent, et, prepared, know, students, gummer, policy, s, west, training, privacy, performance, ethics, september,

Cluster 0 titles:0    Mandinach Gummer - 2013 - A Systemic View of I...
0                                       Anderson et al
Name: title, dtype: object


Cluster 1 words: 's, n't, social, participating, design, inclusion, stories, narrative, journals, big, contextualize, big, mentioned, able, term, humanities, individuals, revolution, journalistic, visualization, concepts, promote, aims, defined, definitions,

Cluster 1 titles:1                             blog - sod - defining dl
1    Data-Pop Alliance - 2015 - Beyond Data Literac...
1                        blog- Advancing Data Literacy
Name: title, dtype: object


Cluster 2 words: workshops, training, instructors, software, lessons, materials, participating, surveys, taught,

In [59]:
print (frame.ix[i]['title'])

0    Koltay - 2015 - Data literacy in search of a n...
0                                       Anderson et al
0    Data-Pop Alliance - 2015 - Beyond Data Literac...
0                        blog- Advancing Data Literacy
0    Mandinach Gummer - 2013 - A Systemic View of I...
0    Determining Data Information Literacy Needs- A...
0                                           Teal et al
Name: title, dtype: object
