In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
import nltk
from sklearn.decomposition import TruncatedSVD

# Download the NLTK "stopwords" corpus; the `stopwords` reader imported on the
# next line is what later cells use to load the English stop-word list.
nltk.download('stopwords')
from nltk.corpus import stopwords

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ASUS\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


In [2]:
# Toy corpus: four short sentences about NLP, one document per row.
# NOTE(review): "oncerned" in text3 looks like a typo for "is concerned"; it is
# left untouched here because it is part of the data the recorded outputs use.
text1 = "NLP"
text2 = "NLP stands for Natural Language Processing"
text3 = "NLP oncerned with giving computers the ability to understand text and spoken words in much the same way human beings can."
text4 = "NLP helps computers communicate with humans in their own language and scales other language-related tasks. "

# Build the frame in a single constructor call: one "sentences" column,
# rows in the order text1..text4.
data = pd.DataFrame({"sentences": [text1, text2, text3, text4]})
data.head()

Unnamed: 0,sentences
0,NLP
1,NLP stands for Natural Language Processing
2,NLP oncerned with giving computers the ability...
3,NLP helps computers communicate with humans in...


In [5]:
# Normalise the raw sentences into a new `clean_sentences` column:
#   1. replace every character that is not an ASCII letter or '#' with a space;
#   2. turn missing values into empty strings so the token filter can run;
#   3. drop tokens of two characters or fewer;
#   4. lower-case what is left.
#
# `regex=True` is passed explicitly. The pattern is a regular expression, and
# pandas only warned about the implicit default before changing it to
# `regex=False` in pandas 2.0, where the pattern would be matched literally
# and no punctuation would be removed.
data['clean_sentences'] = (
    data['sentences']
    .str.replace("[^a-zA-Z#]", " ", regex=True)
    .fillna('')
    .apply(lambda sentence: ' '.join(word for word in sentence.split() if len(word) > 2))
    .str.lower()
)

data.head()

  data['clean_sentences'] = data['sentences'].str.replace("[^a-zA-Z#]", " ")


Unnamed: 0,sentences,clean_sentences
0,NLP,nlp
1,NLP stands for Natural Language Processing,nlp stands for natural language processing
2,NLP oncerned with giving computers the ability...,nlp oncerned with giving computers the ability...
3,NLP helps computers communicate with humans in...,nlp helps computers communicate with humans th...


In [6]:
# English stop words from the NLTK corpus downloaded in the first cell; the
# next cell filters tokens with `not in stop_words`.
stop_words = stopwords.words('english')

In [8]:
# Remove stop words from each cleaned sentence and rebuild the sentence text.
#
# Membership is tested against a set so each lookup is O(1) instead of a scan
# of the whole stop-word list.
stop_word_set = set(stop_words)

# One list of surviving tokens per sentence (missing sentences become []).
tokenized = (
    data['clean_sentences']
    .fillna('')
    .apply(lambda sentence: [word for word in sentence.split() if word not in stop_word_set])
)

# Iterate over the values rather than indexing `tokenized[i]`: that lookup is
# label-based and raises (or picks the wrong row) as soon as `data` has an
# index other than 0..n-1, e.g. after filtering rows.
detokenized = [' '.join(tokens) for tokens in tokenized]

# A plain list is assigned positionally, so row order is preserved.
data['clean_sentences'] = detokenized
data.head()

Unnamed: 0,sentences,clean_sentences
0,NLP,nlp
1,NLP stands for Natural Language Processing,nlp stands natural language processing
2,NLP oncerned with giving computers the ability...,nlp oncerned giving computers ability understa...
3,NLP helps computers communicate with humans in...,nlp helps computers communicate humans languag...


In [9]:
# Turn the cleaned sentences into a TF-IDF document-term matrix
# (one row per sentence, one column per vocabulary term).
vectorizer = TfidfVectorizer(
    stop_words='english',
    smooth_idf=True,
)
X = vectorizer.fit_transform(data['clean_sentences'])

# `X` is densified with toarray() purely so the weights can be inspected here.
X.toarray()

array([[0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        1.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        ],
       [0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.39953968, 0.50676543,
        0.26445122, 0.        , 0.50676543, 0.        , 0.        ,
        0.        , 0.50676543, 0.        , 0.        , 0.        ,
        0.        , 0.        ],
       [0.3029759 , 0.3029759 , 0.        , 0.23886968, 0.3029759 ,
        0.        , 0.3029759 , 0.        , 0.        , 0.        ,
        0.15810539, 0.3029759 , 0.        , 0.        , 0.        ,
        0.3029759 , 0.        , 0.        , 0.3029759 , 0.3029759 ,
        0.3029759 , 0.3029759 ],
       [0.        , 0.        , 0.32650667, 0.25742161, 0.        ,
        0.3265066

In [10]:
# Latent Semantic Analysis: project the TF-IDF matrix onto two components.
# `random_state` is fixed so the randomized solver gives repeatable results.
svd_model = TruncatedSVD(
    n_components=2,
    algorithm='randomized',
    n_iter=100,
    random_state=122,
)

# One row per sentence, one column per component.
lsa = svd_model.fit_transform(X)

In [13]:
# Show floats with 16 decimal places (and thousands separators) from here on.
pd.set_option('display.float_format', '{:,.16f}'.format)

# Per-sentence topic scores produced by the SVD, labelled by topic.
topic_encoded_df = pd.DataFrame(lsa, columns=["topic_1", "topic_2"])

# Attach the cleaned sentence text next to its scores (aligned on the index).
topic_encoded_df["sentences"] = data['clean_sentences']

# Display with the sentence first, followed by the two topic columns.
display(topic_encoded_df.loc[:, ["sentences", "topic_1", "topic_2"]])

Unnamed: 0,sentences,topic_1,topic_2
0,nlp,0.6893541937369697,0.1246718716735474
1,nlp stands natural language processing,0.6983915622235068,-0.3729708345764374
2,nlp oncerned giving computers ability understa...,0.3796100071208374,0.8740881533757202
3,nlp helps computers communicate humans languag...,0.6366050822195641,-0.2470542700262436


In [14]:
# Vocabulary terms, in the same order as the columns of the TF-IDF matrix.
# `get_feature_names()` was deprecated in scikit-learn 1.0 and removed in 1.2;
# `get_feature_names_out()` is its replacement. The old method is kept as a
# fallback for installs that predate 1.0, and the result is converted to a
# list so `dictionary` has the same type either way.
if hasattr(vectorizer, "get_feature_names_out"):
    dictionary = list(vectorizer.get_feature_names_out())
else:
    dictionary = vectorizer.get_feature_names()

# `components_` holds one row per topic and one column per term; transposing
# gives a term-by-topic table with the terms as the row index.
encoding_matrix = pd.DataFrame(
    svd_model.components_,
    index=["topic_1", "topic_2"],
    columns=dictionary,
).T

In [15]:
# Display the term-by-topic table: one row per vocabulary term, with its
# weights in the topic_1 and topic_2 columns.
encoding_matrix

Unnamed: 0,topic_1,topic_2
ability,0.0760500032032879,0.2703105620073067
beings,0.0760500032032881,0.2703105620073068
communicate,0.1374407942294866,-0.0823349292101814
computers,0.168318596878155,0.1482021480090185
giving,0.0760500032032881,0.2703105620073068
helps,0.1374407942294866,-0.0823349292101814
human,0.0760500032032881,0.2703105620073068
humans,0.1374407942294866,-0.0823349292101814
language,0.4012266133040484,-0.2819294555418989
natural,0.2340234951993722,-0.1929219074318605
