# Clustering

In [36]:
from sklearn.cluster import KMeans

In [3]:
# Toy corpus of ten short documents used by the clustering and topic-modeling
# cells. The list is built first; the individual d1..d10 names are unpacked
# from it so both forms stay in sync.
c4 = [
    "He is a good guy, he is not bad",
    "feet wolves cooked boys girls ,!<@!",
    "He is not a good guy, he is bad",
    "I drink water in parties",
    "I grab a drink in parties",
    "Seattle weather is bad in winter",
    "Seattle Seahawks is a great football team",
    "I love Seahawks",
    "I learned a lot of Data analytics tools",
    "I am a data scientist",
]
d1, d2, d3, d4, d5, d6, d7, d8, d9, d10 = c4

In [37]:
# data normalization
import nltk
from nltk.corpus import stopwords
# Normalize every document in c4: lowercase, tokenize, keep purely alphabetic
# tokens, lemmatize them, then drop English stop words. The cleaned tokens are
# re-joined into one space-separated string per document.
# (Lemmatization is used here; nltk.stem.PorterStemmer would be the stemming
# alternative.)
lemmatizer = nltk.stem.WordNetLemmatizer()
# Load the stop-word list once, as a set: the original re-read it from the
# corpus for every single token, and set membership is O(1).
english_stopwords = set(stopwords.words('english'))
processed_c4 = []
for doc in c4:
    tokens = nltk.word_tokenize(doc.lower())
    # Punctuation and mixed tokens fail isalpha() and are discarded here.
    tokens = [lemmatizer.lemmatize(token) for token in tokens if token.isalpha()]
    # Stop words are filtered after lemmatization, same order as before.
    tokens = [token for token in tokens if token not in english_stopwords]
    processed_c4.append(" ".join(tokens))
print(processed_c4)

['good guy bad', 'foot wolf cooked boy girl', 'good guy bad', 'drink water party', 'grab drink party', 'seattle weather bad winter', 'seattle seahawks great football team', 'love seahawks', 'learned lot data analytics tool', 'data scientist']


In [38]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [14]:
# Build TF-IDF features over unigrams and bigrams, keeping only terms that
# occur in at least two documents, then show the dense document-term matrix.
vectorizer = TfidfVectorizer(
    ngram_range=(1, 2),
    min_df=2,
)
# Learn the vocabulary/IDF weights and vectorize the same corpus in one step.
X = vectorizer.fit_transform(processed_c4)
print(X.toarray())

[[0.4007734  0.         0.         0.45808861 0.45808861 0.45808861
  0.45808861 0.         0.         0.        ]
 [0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.        ]
 [0.4007734  0.         0.         0.45808861 0.45808861 0.45808861
  0.45808861 0.         0.         0.        ]
 [0.         0.         0.70710678 0.         0.         0.
  0.         0.70710678 0.         0.        ]
 [0.         0.         0.70710678 0.         0.         0.
  0.         0.70710678 0.         0.        ]
 [0.65845424 0.         0.         0.         0.         0.
  0.         0.         0.         0.75262077]
 [0.         0.         0.         0.         0.         0.
  0.         0.         0.70710678 0.70710678]
 [0.         0.         0.         0.         0.         0.
  0.         0.         1.         0.        ]
 [0.         1.         0.         0.         0.         0.
  0.         0.         0.         0.        ]
 [0.         1.      

## Similarity 

In [39]:
# NOTE(review): wildcard import; only cosine_similarity is used in the cells
# visible here -- consider importing that name explicitly.
from sklearn.metrics.pairwise import*
# Cosine similarity between the TF-IDF rows of documents 6 and 7
# (0-based row indices into X).
cosine_similarity(X[6], X[7])

array([[0.70710678]])

In [19]:
# Cluster the TF-IDF document vectors into 4 groups.
# random_state pins the centroid initialization so labels and centroids are
# reproducible across kernel restarts; n_init is set explicitly so the result
# does not depend on the library-version default.
model = KMeans(n_clusters = 4, n_init = 10, random_state = 42)
model.fit(X)
# One cluster label per document, in the same order as the rows of X.
model.labels_

array([1, 2, 1, 3, 3, 2, 2, 2, 0, 0], dtype=int32)

In [22]:
# Interpret the clusters by looking at their centroids.
# Each row is one of the 4 cluster centers, expressed in the same TF-IDF
# feature columns as X; large entries mark the terms that characterize a cluster.
model.cluster_centers_

array([[0.        , 1.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ],
       [0.4007734 , 0.        , 0.        , 0.45808861, 0.45808861,
        0.45808861, 0.45808861, 0.        , 0.        , 0.        ],
       [0.16461356, 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.4267767 , 0.36493189],
       [0.        , 0.        , 0.70710678, 0.        , 0.        ,
        0.        , 0.        , 0.70710678, 0.        , 0.        ]])

In [26]:
# Rank the feature indices of every centroid from highest to lowest weight:
# argsort along the feature axis sorts ascending, so each row is then flipped
# with a stride of -1 to obtain descending order.
ordered_centroids = model.cluster_centers_.argsort(axis=1)[:, ::-1]

ordered_centroids

array([[1, 9, 8, 7, 6, 5, 4, 3, 2, 0],
       [6, 5, 4, 3, 0, 9, 8, 7, 2, 1],
       [8, 9, 0, 7, 6, 5, 4, 3, 2, 1],
       [7, 2, 9, 8, 6, 5, 4, 3, 1, 0]])

In [1]:
# Vocabulary terms, positioned by TF-IDF column index.
# This cell needs the fitted `vectorizer` from the TF-IDF cell, so run it
# after that cell (the saved NameError came from running it on a fresh kernel).
# get_feature_names() is deprecated and removed in newer scikit-learn releases;
# prefer get_feature_names_out() and fall back only on older versions.
if hasattr(vectorizer, "get_feature_names_out"):
    # Convert the returned array to a list so `terms` keeps its original type.
    terms = list(vectorizer.get_feature_names_out())
else:
    terms = vectorizer.get_feature_names()
terms[8]

NameError: name 'vectorizer' is not defined

In [30]:
# Print the four highest-weighted terms of every cluster centroid.
# Iterating over the rows of ordered_centroids (instead of a hard-coded
# range(4)) keeps this cell correct if the number of clusters changes.
for c, ranked_term_indices in enumerate(ordered_centroids):
    print('Cluster %d:' %c)
    for i in ranked_term_indices[:4]: # first 4 entries = 4 largest centroid weights
        print(terms[i])

Cluster 0:
data
seattle
seahawks
party
Cluster 1:
guy bad
guy
good guy
good
Cluster 2:
seahawks
seattle
bad
party
Cluster 3:
party
drink
seattle
seahawks


# LDA for topic modeling 

In [40]:
from sklearn.decomposition import LatentDirichletAllocation

In [43]:
# Fit a 4-topic LDA model on the document-term matrix X.
# random_state makes the fitted topics reproducible across kernel restarts.
# NOTE(review): X holds TF-IDF weights; scikit-learn's LDA examples fit on raw
# term counts (CountVectorizer) -- confirm TF-IDF input is intended here.
lda = LatentDirichletAllocation(n_components=4, random_state=42).fit(X)

In [44]:
# The fitted model's components_ attribute stores the topic-word distribution.
# components_[i, j] can be viewed as a pseudocount of how many times word j
# was assigned to topic i (one row per topic, one column per vocabulary term).
lda.components_ # pseudocount weight of word j in topic i

array([[1.06345727, 0.25023589, 0.2502491 , 1.16543301, 1.16543301,
        1.16543301, 1.16543301, 0.2502491 , 0.25055307, 0.25029527],
       [0.25143948, 0.2507643 , 0.25081188, 0.25044671, 0.25044671,
        0.25044671, 0.25044671, 0.25081188, 0.25183149, 0.25083688],
       [0.89454113, 0.25020076, 1.66283124, 0.25012054, 0.25012054,
        0.25012054, 0.25012054, 1.66283124, 0.92590418, 1.7081386 ],
       [0.25056316, 2.24879906, 0.25032135, 0.25017695, 0.25017695,
        0.25017695, 0.25017695, 0.25032135, 1.27881804, 0.2504568 ]])

In [46]:
# For every LDA topic, list its four highest-weighted terms, strongest first.
for topic_idx, word_weights in enumerate(lda.components_):
    print('Topic %d:' % topic_idx )
    # argsort is ascending: reverse it, then keep the leading four indices.
    strongest = word_weights.argsort()[::-1][:4]
    for term_idx in strongest:
        print(terms[term_idx])

Topic 0:
guy bad
guy
good guy
good
Topic 1:
seahawks
bad
seattle
party
Topic 2:
seattle
party
drink
seahawks
Topic 3:
data
seahawks
bad
seattle


In [49]:
# Same listing as a reverse-then-slice: the slice [:-5:-1] selects the last
# four argsort entries in reverse, i.e. the four largest weights per topic.
for topic_no, weights in enumerate(lda.components_):
    print('Topic %d:' % topic_no )
    descending = weights.argsort()[::-1]
    for term_no in descending[:4]:
        print(terms[term_no])

Topic 0:
guy bad
guy
good guy
good
Topic 1:
seahawks
bad
seattle
party
Topic 2:
seattle
party
drink
seahawks
Topic 3:
data
seahawks
bad
seattle


In [47]:
# For every LDA topic, list only its three highest-weighted terms
# (the slice [:-4:-1] stops one element earlier than the four-term version).
for topic_no, weights in enumerate(lda.components_):
    print('Topic %d:' % topic_no )
    top_three = weights.argsort()[::-1][:3]
    for term_no in top_three:
        print(terms[term_no])

Topic 0:
guy bad
guy
good guy
Topic 1:
seahawks
bad
seattle
Topic 2:
seattle
party
drink
Topic 3:
data
seahawks
bad


In [48]:
# Topic distribution for documents 5, 6 and 7 (rows 5-7 of X): one output row
# per document, one column per topic.
lda.transform(X[5:8]) # weight of each topic within each document

array([[0.11078412, 0.10427414, 0.68102396, 0.10391778],
       [0.10372815, 0.10413175, 0.67214944, 0.11999066],
       [0.12512807, 0.12542264, 0.12922078, 0.62022851]])

# Sentiment Analysis

In [50]:
from nltk.sentiment.vader import SentimentIntensityAnalyzer

In [53]:
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import pandas as pd

# Download the VADER lexicon used by SentimentIntensityAnalyzer.
nltk.download('vader_lexicon')

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /Users/menghsuanlee/nltk_data...


True

In [57]:
# Example sentences for the VADER demo; each trailing comment names the
# linguistic signal (punctuation, capitalization, boosters, negation, slang,
# emoticons) that the sentence is meant to exercise.
sentences = ["They are smart, cute, and funny.",  # positive sentence example
    "They are smart, cute, and funny!", # punctuation emphasis handled correctly (sentiment intensity adjusted)
    "They are very smart, cute, and funny.",# booster words handled correctly (sentiment intensity adjusted)
    "They are VERY SMART, cute, and FUNNY.",  # emphasis for ALLCAPS handled
    "They are VERY SMART, cute, and FUNNY!!!",  # combination of signals - VADER appropriately adjusts intensity
    "They are VERY SMART, really handsome, and INCREDIBLY FUNNY!!!",  # booster words & punctuation make this close to ceiling for score
    "The book was good.",  # positive sentence
    "The book was kind of good.",  # qualified positive sentence is handled correctly (intensity adjusted)
    "The plot was good, but the characters are uncompelling and the dialog is not great.",  # mixed negation sentence
    "A really bad, horrible book.",  # negative sentence with booster words
    "At least it isn't a horrible book.",  # negated negative sentence with contraction
    ":) and :D",  # emoticons handled
    "",  # an empty string is correctly handled
    "Today sux",  # negative slang handled
    "Today sux!",  # negative slang with punctuation emphasis handled
    "Today SUX!",  # negative slang with capitalization emphasis
    "Today kinda sux! But I'll get by, lol" # mixed sentiment example with slang and contrastive conjunction "but"
            ]

In [59]:
# Score every example sentence with VADER and tabulate the results:
# one row per sentence, with the sentence text followed by the polarity
# scores returned by the analyzer.
sid = SentimentIntensityAnalyzer()
scores = [
    {'sentence': text, **sid.polarity_scores(text)}
    for text in sentences
]

pd.DataFrame(scores)

Unnamed: 0,sentence,neg,neu,pos,compound
0,"They are smart, cute, and funny.",0.0,0.259,0.741,0.8225
1,"They are smart, cute, and funny!",0.0,0.252,0.748,0.8356
2,"They are very smart, cute, and funny.",0.0,0.304,0.696,0.847
3,"They are VERY SMART, cute, and FUNNY.",0.0,0.249,0.751,0.9196
4,"They are VERY SMART, cute, and FUNNY!!!",0.0,0.236,0.764,0.9318
5,"They are VERY SMART, really handsome, and INCR...",0.0,0.294,0.706,0.9469
6,The book was good.,0.0,0.508,0.492,0.4404
7,The book was kind of good.,0.0,0.657,0.343,0.3832
8,"The plot was good, but the characters are unco...",0.327,0.579,0.094,-0.7042
9,"A really bad, horrible book.",0.791,0.209,0.0,-0.8211
