# Purpose

I am interested in exploring singular value decomposition with abstracts (as paragraphs) rather than just using all words together. I'll be comparing how this topic model performs compared to other topic models.

In [9]:
import pandas as pd
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer
from spacy.lang.en.stop_words import STOP_WORDS as stopwords

In [3]:
def display_topics(model, features, no_top_words=10):
    for topic, word_vector in enumerate(model.components_):
        total = word_vector.sum()
        largest = word_vector.argsort()[::-1] # invert sort order
        print("\nTopic %02d" % topic)
        for i in range(0, no_top_words):
            print(" %s (%2.2f)" % (features[largest[i]],
                word_vector[largest[i]]*100.0/total))

In [4]:
data = pd.read_csv("Combined_WoS_Dataset.csv",usecols=['Abstract', 'First_A_Country'])

In [23]:
clean_data=data.dropna()
len(clean_data)

49436

In [24]:
abstracts=clean_data['Abstract']

In [25]:
abstract_strings=[] ##creating a list to store string of data

for item in abstracts:  #Make everything in the column  a string of text
    string=str(item)
    abstract_strings.append(string)
    
len(abstract_strings)

49436

In [26]:
tfidf_para_vectorizer = TfidfVectorizer(stop_words=stopwords, min_df=5,
                                        max_df=0.8)

In [27]:
tfidf_para_vectors = tfidf_para_vectorizer.fit_transform(abstract_strings)

tfidf_para_vectors.shape

(49436, 33887)

In [16]:
svd_para_model = TruncatedSVD(n_components = 10, random_state=42)
W_svd_para_matrix = svd_para_model.fit_transform(tfidf_para_vectors)
H_svd_para_matrix = svd_para_model.components_

In [17]:
display_topics(svd_para_model, tfidf_para_vectorizer.get_feature_names())


Topic 00
 species (0.48)
 marine (0.26)
 conservation (0.23)
 habitat (0.21)
 population (0.21)
 data (0.20)
 management (0.19)
 areas (0.19)
 fish (0.18)
 study (0.18)

Topic 01
 nan (100.65)
 edna (0.01)
 cultivation (0.01)
 seaweed (0.01)
 peafowl (0.01)
 core (0.01)
 ao (0.01)
 op (0.01)
 sites (0.01)
 pom (0.01)

Topic 02
 conservation (-5.81)
 wildlife (-5.35)
 management (-4.00)
 human (-3.19)
 areas (-3.19)
 protected (-2.59)
 social (-2.39)
 habitat (-2.25)
 forest (-2.21)
 use (-1.96)

Topic 03
 genetic (-47.21)
 population (-45.57)
 species (-31.16)
 populations (-30.41)
 breeding (-16.12)
 individuals (-15.68)
 habitat (-15.45)
 survival (-12.59)
 females (-12.41)
 wild (-11.84)

Topic 04
 genetic (-9.80)
 species (-5.84)
 diversity (-5.41)
 gene (-3.09)
 marine (-2.77)
 genome (-2.49)
 conservation (-2.48)
 genes (-2.19)
 structure (-2.15)
 biodiversity (-2.13)

Topic 05
 species (-2.35)
 coral (-1.61)
 habitat (-1.51)
 reef (-1.28)
 reefs (-0.90)
 fish (-0.88)
 forest (-

In [18]:
svd_para_model.singular_values_

array([31.67432513, 22.93541187, 14.28499425, 13.68229479, 12.53211724,
       11.93828144, 11.81406013, 11.48699761, 11.02141734, 10.77998544])

Currently, I'm not as impressed with these categories compared to NMF. I'll try 18 as well, which was the best topic number when using words with LDA.

In [19]:
svd_para_model_18 = TruncatedSVD(n_components = 18, random_state=42)
W_svd_para_matrix_18 = svd_para_model_18.fit_transform(tfidf_para_vectors)
H_svd_para_matrix_18 = svd_para_model_18.components_

In [20]:
display_topics(svd_para_model_18, tfidf_para_vectorizer.get_feature_names())


Topic 00
 species (0.48)
 marine (0.26)
 conservation (0.23)
 habitat (0.21)
 population (0.21)
 data (0.20)
 management (0.19)
 areas (0.19)
 fish (0.18)
 study (0.18)

Topic 01
 nan (100.65)
 edna (0.01)
 cultivation (0.01)
 seaweed (0.01)
 peafowl (0.01)
 core (0.01)
 ao (0.01)
 op (0.01)
 sites (0.01)
 pom (0.01)

Topic 02
 conservation (-5.80)
 wildlife (-5.35)
 management (-4.01)
 human (-3.19)
 areas (-3.18)
 protected (-2.59)
 social (-2.40)
 habitat (-2.24)
 forest (-2.22)
 use (-1.95)

Topic 03
 genetic (-44.90)
 population (-43.40)
 species (-29.59)
 populations (-28.95)
 breeding (-15.33)
 individuals (-14.92)
 habitat (-14.68)
 survival (-11.96)
 females (-11.81)
 wild (-11.25)

Topic 04
 genetic (-9.57)
 species (-5.72)
 diversity (-5.32)
 gene (-3.00)
 marine (-2.71)
 conservation (-2.43)
 genome (-2.41)
 genes (-2.12)
 biodiversity (-2.12)
 structure (-2.10)

Topic 05
 species (-2.38)
 coral (-1.63)
 habitat (-1.53)
 reef (-1.28)
 reefs (-0.91)
 fish (-0.88)
 forest (-

In [22]:
svd_para_model_18.singular_values_

array([31.67432513, 22.93541188, 14.28505385, 13.68230681, 12.53262593,
       11.94018424, 11.81541104, 11.49339904, 11.02264869, 10.78880889,
       10.30749332, 10.01089895,  9.96859837,  9.62161365,  9.3166644 ,
        9.19825415,  8.86443729,  8.81026715])

## NMF's 21 topics

In [28]:
svd_para_model_21 = TruncatedSVD(n_components = 21, random_state=42)
W_svd_para_matrix_21 = svd_para_model_21.fit_transform(tfidf_para_vectors)
H_svd_para_matrix_21 = svd_para_model_21.components_

In [29]:
display_topics(svd_para_model_21, tfidf_para_vectorizer.get_feature_names())


Topic 00
 species (0.48)
 marine (0.26)
 conservation (0.23)
 habitat (0.21)
 population (0.21)
 data (0.20)
 management (0.19)
 areas (0.19)
 fish (0.18)
 study (0.18)

Topic 01
 conservation (-5.77)
 wildlife (-5.34)
 management (-4.00)
 human (-3.18)
 areas (-3.17)
 protected (-2.58)
 social (-2.40)
 habitat (-2.24)
 forest (-2.21)
 use (-1.95)

Topic 02
 genetic (-51.45)
 population (-49.46)
 species (-33.29)
 populations (-33.04)
 breeding (-17.59)
 individuals (-17.05)
 habitat (-16.70)
 survival (-13.68)
 females (-13.55)
 wild (-12.88)

Topic 03
 genetic (-9.86)
 species (-5.73)
 diversity (-5.43)
 gene (-3.08)
 marine (-2.78)
 genome (-2.48)
 conservation (-2.46)
 genes (-2.18)
 structure (-2.17)
 biodiversity (-2.13)

Topic 04
 species (-2.27)
 coral (-1.67)
 habitat (-1.49)
 reef (-1.32)
 reefs (-0.93)
 fish (-0.87)
 forest (-0.75)
 richness (-0.75)
 cover (-0.68)
 habitats (-0.63)

Topic 05
 wildlife (2.02)
 species (1.54)
 forest (1.21)
 human (1.12)
 microbial (1.01)
 ho

In [30]:
svd_para_model.singular_values_

array([31.67432513, 22.93541187, 14.28499425, 13.68229479, 12.53211724,
       11.93828144, 11.81406013, 11.48699761, 11.02141734, 10.77998544])