# load text data

In [1]:
import pandas as pd

In [2]:
#!pip install gdown
!gdown https://drive.google.com/uc?id=1EMzJxxoBaN_NbvF7xhoc09K82vQ6H_LX

Downloading...
From: https://drive.google.com/uc?id=1EMzJxxoBaN_NbvF7xhoc09K82vQ6H_LX
To: /content/content.xlsx
  0% 0.00/14.6k [00:00<?, ?B/s]100% 14.6k/14.6k [00:00<00:00, 39.8MB/s]


In [3]:
fp = "content.xlsx"
df = pd.read_excel(fp, header=None)
df.head()

Unnamed: 0,0,1,2
0,110,https://imagebank.sweden.se//deployedFiles/c7f...,Åre is one of Swedens largest and most popular...
1,139,https://imagebank.sweden.se//deployedFiles/fb2...,Låkktatjåkka is Swedens highest situated mount...
2,2178,https://imagebank.sweden.se//deployedFiles/7e2...,Sweden is a nation of amateur athletes and spo...
3,2574,https://imagebank.sweden.se//deployedFiles/03f...,"With four slopes and two lifts, Hammarbybacken..."
4,2575,https://imagebank.sweden.se//deployedFiles/0f8...,"With four slopes and two lifts, Hammarbybacken..."


In [4]:
df.shape

(36, 3)

In [2]:
corpus = ["Rafael Nadal Joins Roger Federer in Missing U.S. Open",
          "Rafael Nadal Is Out of the Australian Open",
          "Biden Announces Virus Measures",
          "Biden's Virus Plans Meet Reality",
          "Where Biden's Virus Plan Stands"]

In [None]:
# Download necessary NLTK data
import nltk
nltk.download('popular')
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
import string

In [4]:
# define data cleaning function
stop = set(stopwords.words('english'))
exclude = set(string.punctuation)
lemma = WordNetLemmatizer()

def clean(doc):
    stop_free = " ".join([i for i in doc.lower().split() if i not in stop])
    punc_free = ''.join(ch for ch in stop_free if ch not in exclude)
    normalized = " ".join(lemma.lemmatize(word) for word in punc_free.split())
    return normalized

# clean data stored in a new list
clean_corpus = [clean(doc).split() for doc in corpus]

In [5]:
clean_corpus

[['rafael', 'nadal', 'join', 'roger', 'federer', 'missing', 'u', 'open'],
 ['rafael', 'nadal', 'australian', 'open'],
 ['biden', 'announces', 'virus', 'measure'],
 ['bidens', 'virus', 'plan', 'meet', 'reality'],
 ['bidens', 'virus', 'plan', 'stand']]

# vector, matrix representation

In [6]:
# import numpy for matrix operation
import numpy as np
#!pip install -q scikit-learn
from sklearn.feature_extraction.text import CountVectorizer


In [7]:
# Converting text into numerical representation
cv_vectorizer = CountVectorizer(tokenizer=lambda doc: doc, lowercase=False)
# Array from Count Vectorizer
cv_arr = cv_vectorizer.fit_transform(clean_corpus)
# this is our converted text to numerical representation from the Count vectorizer
cv_arr



<5x18 sparse matrix of type '<class 'numpy.int64'>'
	with 25 stored elements in Compressed Sparse Row format>

In [9]:
cv_arr.todense()

matrix([[0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 1, 0, 1, 0, 1, 0, 1, 0],
        [0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0],
        [1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1],
        [0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1],
        [0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1]])

In [8]:
# Creating vocabulary array which will represent all the corpus
vocab_cv = cv_vectorizer.get_feature_names_out()
# get the vocb list
vocab_cv

array(['announces', 'australian', 'biden', 'bidens', 'federer', 'join',
       'measure', 'meet', 'missing', 'nadal', 'open', 'plan', 'rafael',
       'reality', 'roger', 'stand', 'u', 'virus'], dtype=object)

In [10]:
display(len(vocab_cv))

18

# tf-idf measure

In [11]:
# import vectorizers
from sklearn.feature_extraction.text import TfidfVectorizer

In [12]:
# Converting text into numerical representation
tf_idf_vectorizer = TfidfVectorizer(tokenizer=lambda doc: doc, lowercase=False)
# Array from TF-IDF Vectorizer
tf_idf_arr = tf_idf_vectorizer.fit_transform(clean_corpus)
# this is our converted text to numerical representation from the Tf-IDF vectorizer
tf_idf_arr

<5x18 sparse matrix of type '<class 'numpy.float64'>'
	with 25 stored elements in Compressed Sparse Row format>

In [13]:
tf_idf_arr.todense()

matrix([[0.        , 0.        , 0.        , 0.        , 0.37924665,
         0.37924665, 0.        , 0.        , 0.37924665, 0.30597381,
         0.30597381, 0.        , 0.30597381, 0.        , 0.37924665,
         0.        , 0.37924665, 0.        ],
        [0.        , 0.5819515 , 0.        , 0.        , 0.        ,
         0.        , 0.        , 0.        , 0.        , 0.4695148 ,
         0.4695148 , 0.        , 0.4695148 , 0.        , 0.        ,
         0.        , 0.        , 0.        ],
        [0.53849791, 0.        , 0.53849791, 0.        , 0.        ,
         0.        , 0.53849791, 0.        , 0.        , 0.        ,
         0.        , 0.        , 0.        , 0.        , 0.        ,
         0.        , 0.        , 0.36063833],
        [0.        , 0.        , 0.        , 0.41660727, 0.        ,
         0.        , 0.        , 0.51637397, 0.        , 0.        ,
         0.        , 0.41660727, 0.        , 0.51637397, 0.        ,
         0.        , 0.        , 0

In [14]:
# Creating vocabulary array which will represent all the corpus
vocab_tf_idf = tf_idf_vectorizer.get_feature_names_out()
# get the vocb list
vocab_tf_idf

array(['announces', 'australian', 'biden', 'bidens', 'federer', 'join',
       'measure', 'meet', 'missing', 'nadal', 'open', 'plan', 'rafael',
       'reality', 'roger', 'stand', 'u', 'virus'], dtype=object)

In [15]:
display(len(vocab_tf_idf))

18

# topic modelling

In [16]:
#
from sklearn.decomposition import LatentDirichletAllocation

In [17]:
# Create object for the LDA class
lda_model = LatentDirichletAllocation(n_components=2, max_iter=20, random_state=20)

# fit transform on model on our count_vectorizer : running this will return our topics
X_topics = lda_model.fit_transform(tf_idf_arr)

# .components_ gives us our topic distribution
topic_words = lda_model.components_

In [22]:
X_topics

array([[0.8529969 , 0.1470031 ],
       [0.81979113, 0.18020887],
       [0.76388102, 0.23611898],
       [0.16612779, 0.83387221],
       [0.17823699, 0.82176301]])

In [18]:
#  Define the number of Words that we want to print in every topic : n_top_words
n_top_words = 5

for i, topic_dist in enumerate(topic_words):

    # np.argsort to sorting an array or a list or the matrix acc to their values
    sorted_topic_dist = np.argsort(topic_dist)

    # Next, to view the actual words present in those indexes we can make the use of the vocab created earlier
    topic_words = np.array(vocab_tf_idf)[sorted_topic_dist]

    # so using the sorted_topic_indexes we ar extracting the words from the vocabulary
    # obtaining topics + words
    # this topic_words variable contains the Topics  as well as the respective words present in those Topics
    topic_words = topic_words[:-n_top_words:-1]
    print ("Topic", str(i+1), topic_words)

Topic 1 ['rafael' 'open' 'nadal' 'australian']
Topic 2 ['bidens' 'plan' 'virus' 'stand']


In [19]:
# To view what topics are assigned to the douments:
doc_topic = lda_model.transform(tf_idf_arr)

# iterating over ever value till the end value
for n in range(doc_topic.shape[0]):

    # argmax() gives maximum index value
    topic_doc = doc_topic[n].argmax()

    # document is n+1
    print ("Document", n+1, " -- Topic:", topic_doc)

Document 1  -- Topic: 0
Document 2  -- Topic: 0
Document 3  -- Topic: 0
Document 4  -- Topic: 1
Document 5  -- Topic: 1


In [21]:
clean_corpus

[['rafael', 'nadal', 'join', 'roger', 'federer', 'missing', 'u', 'open'],
 ['rafael', 'nadal', 'australian', 'open'],
 ['biden', 'announces', 'virus', 'measure'],
 ['bidens', 'virus', 'plan', 'meet', 'reality'],
 ['bidens', 'virus', 'plan', 'stand']]

# topic modelling with gensim and pyldavis
- [https://nils-holmberg.github.io/cca-nlp/jnb/scom-gpols-topics.html](https://nils-holmberg.github.io/cca-nlp/jnb/scom-gpols-topics.html)