In [1]:
import pickle
data = pickle.load(open("data/data.plk", "rb"))

In [2]:
len(data)

4351

In [3]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
from nltk.corpus import stopwords


wordnet_lemmatizer = WordNetLemmatizer()

In [4]:
def get_lemmatized_words(sentence):
    _temp = []
    tokens = nltk.word_tokenize(sentence)
    lmtzr = WordNetLemmatizer()
    
    for word, token in nltk.pos_tag(tokens):
        if word.strip().lower() not in stopwords.words('english') and word.strip().lower().isalnum() and len(word.strip().lower()) > 1:
            _temp.append(lmtzr.lemmatize(word.strip().lower(), get_wordnet_pos(token)))
    return _temp

In [5]:
def get_wordnet_pos(treebank_tag):

    if treebank_tag.startswith('J'):
        return wordnet.ADJ
    elif treebank_tag.startswith('V'):
        return wordnet.VERB
    elif treebank_tag.startswith('N'):
        return wordnet.NOUN
    elif treebank_tag.startswith('R'):
        return wordnet.ADV
    else:
        return wordnet.NOUN

In [11]:
[" ".join(get_lemmatized_words(x["název (anglicky)"])) for x in data[:2] if len(x["název (anglicky)"]) > 2 and len(x["katedra"]) > 1]

['salzella declarative language music generation',
 'application illustrative explode view 3d model']

In [12]:
from sklearn.feature_extraction.text import TfidfVectorizer
names_list = [" ".join(get_lemmatized_words(x["název (anglicky)"])) for x in data if len(x["název (anglicky)"]) > 2 and len(x["katedra"]) > 1]

In [15]:
vectorizer = TfidfVectorizer(use_idf=True, ngram_range=(1,3), min_df=1) # min_df if int then it is absolute value
X = vectorizer.fit_transform(names_list)
    

In [16]:
X[0]

<1x25169 sparse matrix of type '<class 'numpy.float64'>'
	with 12 stored elements in Compressed Sparse Row format>

In [17]:
print (X[0])

  (0, 18635)	0.309093230645
  (0, 5759)	0.309093230645
  (0, 12116)	0.200783367841
  (0, 14550)	0.229641356648
  (0, 9109)	0.217032853369
  (0, 18636)	0.309093230645
  (0, 5760)	0.309093230645
  (0, 12147)	0.309093230645
  (0, 14555)	0.309093230645
  (0, 18637)	0.309093230645
  (0, 5761)	0.309093230645
  (0, 12148)	0.309093230645


In [19]:
from sklearn.decomposition import TruncatedSVD
lsa = TruncatedSVD(n_components=20, n_iter=100)
lsa.fit(X)

TruncatedSVD(algorithm='randomized', n_components=20, n_iter=100,
       random_state=None, tol=0.0)

**LSA**

Input: X, a matrix where m is the number of documents I have, and n is the number of terms.
Process: I'm going to decompose X into three matricies called U, S, and T. When we do the decomposition, we have to pick a value k, that's how many concepts we are going to keep.
$$X \approx USV^{T}$$
U will be a m x k matrix. The rows will be documents and the columns will be 'concepts'
S will be a k x k diagnal matrix. The elements will be the amount of variation captured from each concept.
V will be a n x k (mind the transpose) matrix. The rows will be terms and the columns will be conepts.

In [21]:
lsa.components_[0].shape

(25169,)

In [22]:
terms = vectorizer.get_feature_names()
for i, comp in enumerate(lsa.components_): 
    termsInComp = zip (terms,comp)
    sortedTerms =  sorted(termsInComp, key=lambda x: x[1], reverse=True) [:10]
    print("Concept %d:" % i )
    for term in sortedTerms:
        print(term[0])
    print (" ")

Concept 0:
system
information
information system
management
management system
web
project
support
content management
content management system
 
Concept 1:
management
web
application
management system
content
project
web application
content management
content management system
project management
 
Concept 2:
management
management system
content management
content management system
content
system
project management
project
document management
document
 
Concept 3:
network
implementation
data
computer
computer network
neural
neural network
design
user
user interface
 
Concept 4:
interface
user
user interface
graphical
graphical user
graphical user interface
design
system
web interface
interface design
 
Concept 5:
project
project management
management
application
software
support
tool
information
information system
management software
 
Concept 6:
implementation
data
application
design
system
design implementation
database
algorithm
mobile
compression
 
Concept 7:
implementation
web
port