In [1]:
!pip install umap-learn
!pip install elemeta
!pip install transformers
!pip install eli5==0.11.0
!pip install jinja2==3.0.0
!pip install scikit-learn==0.21.3

Collecting umap-learn
  Downloading umap-learn-0.5.3.tar.gz (88 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m88.2/88.2 kB[0m [31m1.1 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting pynndescent>=0.5 (from umap-learn)
  Downloading pynndescent-0.5.10.tar.gz (1.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.1/1.1 MB[0m [31m13.0 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: umap-learn, pynndescent
  Building wheel for umap-learn (setup.py) ... [?25l[?25hdone
  Created wheel for umap-learn: filename=umap_learn-0.5.3-py3-none-any.whl size=82807 sha256=876f80f0894debea38661095111505c1e07baf26b659852e7137991cc08c7cff
  Stored in directory: /root/.cache/pip/wheels/a0/e8/c6/a37ea663620bd5200ea1ba0907ab3c217042c1d035ef606acc
  Building wheel for pynndescent (setup.py) ... [?25l[?25hdone
  Created wheel for py

In [2]:
import pandas as pd
import numpy as np
import umap
from umap import UMAP
import plotly
import plotly.express as px

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.metrics import confusion_matrix


In [3]:
train_df = pd.read_parquet(
    "http://storage.googleapis.com/arize-assets/phoenix/datasets/unstructured/nlp/sentiment-classification-language-drift/sentiment_classification_language_drift_training.parquet",
)
prod_df = pd.read_parquet(
    "http://storage.googleapis.com/arize-assets/phoenix/datasets/unstructured/nlp/sentiment-classification-language-drift/sentiment_classification_language_drift_production.parquet",
)

In [4]:
prod_df.head()

Unnamed: 0,prediction_ts,reviewer_age,reviewer_gender,product_category,language,text,text_vector,label,pred_label,prediction_id
0,1651388000.0,59,female,pet_products,english,"Overall good product, however both handle and ...","[-0.81321067, -0.061673447, 0.2231456, 0.63300...",neutral,neutral,24e2ba98-9b6a-4245-9239-2038d77f276d
1,1651389000.0,26,female,kitchen,english,they are great for the money and are really ea...,"[-0.0063662454, -0.33201334, 0.10614053, -0.13...",positive,positive,a094eb58-81ba-4cff-9735-2129fb5bbacb
2,1651389000.0,31,female,sports,english,Bought it for a weekend trip to some springs. ...,"[-0.19587253, -0.9300667, 0.35202777, -0.17050...",positive,positive,3c7f069e-7869-4754-b7f8-d4dbe54dd4d2
3,1651389000.0,21,female,jewelry,english,love these Los Muertos skulls i have 2 of them...,"[-0.019824374, -0.38698494, 0.28868282, -0.157...",positive,positive,94bfe44a-01c7-48f1-baa8-ec3a592d80fc
4,1651389000.0,32,male,office_product,english,Gorgeous! I didn't keep it because it was the ...,"[-0.022711936, -0.62593263, -0.10312944, -0.28...",positive,positive,fef74b70-c1f3-4fed-b2d6-ffc41b779b54


In [5]:
test_labels = train_df['label']
y_pred = train_df['pred_label']

confusion_matrix = confusion_matrix(test_labels, y_pred)
print (confusion_matrix)

[[2500  171    3]
 [ 177 2349   74]
 [   8   50 2668]]


***
# Meta features extraction using Elemeta

In [6]:
%%time

df_feedback = train_df[['text', 'pred_label', 'language']]

from elemeta.nlp.metafeature_extractors_runner import MetafeatureExtractorsRunner, SentimentPolarity, SentimentSubjectivity
from elemeta.nlp.extractors.high_level.hinted_profanity_words_count import HintedProfanityTokensCount
from elemeta.nlp.extractors.high_level.text_complexity import TextComplexity

sentiment_polarity = SentimentPolarity()
sentiment_subjectivity = SentimentSubjectivity()

metafeature_extractors_runner_senti = MetafeatureExtractorsRunner(metafeature_extractors=[sentiment_polarity,
                                                                                          sentiment_subjectivity])
df_sentiment = metafeature_extractors_runner_senti.run_on_dataframe(dataframe = df_feedback, text_column = "text")

simple_word_tokinzer = lambda text: text.split(" ")
# simple_line_tokinzer = lambda text: text.split("\n")
hinted_profanity = HintedProfanityTokensCount(simple_word_tokinzer)
df_sentiment['hinted_profanity'] = df_sentiment.apply(lambda x: hinted_profanity(x['text']), axis=1)

text_complexity = TextComplexity()
df_sentiment['text_complexity'] = df_sentiment.apply(lambda x: text_complexity(x['text']), axis=1)

df_sentiment.head()

CPU times: user 3min 26s, sys: 2.17 s, total: 3min 28s
Wall time: 4min


Unnamed: 0,text,pred_label,language,sentiment_polarity,sentiment_subjectivity,hinted_profanity,text_complexity
0,Poor quality of fabric and ridiculously tight ...,negative,english,-0.6705,0.395238,0,81.8
1,"Love these glasses, thought they'd be everyday...",positive,english,0.9421,0.716667,0,77.23
2,"These are disgusting, it tastes like you are ""...",negative,english,0.8554,0.770486,0,74.9
3,My husband has a pair of TaoTronics so I decid...,neutral,english,-0.3802,0.638889,0,71.14
4,"Threads too deep. Engages on tank, but gasket ...",negative,english,0.5901,0.533333,0,90.77


In [8]:
df_sentiment.shape

(8000, 7)

In [9]:
# Checking for presence of profanity
profane = df_sentiment[df_sentiment.hinted_profanity > 0]
profane

Unnamed: 0,text,pred_label,language,sentiment_polarity,sentiment_subjectivity,hinted_profanity,text_complexity
47,FUCKING SHITTY GAME!! I TRIED GETTING MY PROGR...,negative,english,0.4534,0.457143,7,92.12
85,I really love this dress. It's very hard to fi...,neutral,english,0.9132,0.469102,1,82.65
231,I ordered the 4 mm for my first piercing and t...,positive,english,0.9720,0.465408,1,85.08
315,"I really like these delicate flowers. However,...",neutral,english,0.5390,0.403571,1,88.74
334,"Well let me see . . . If you like hot, sizzlin...",neutral,english,0.7178,0.801852,2,87.11
...,...,...,...,...,...,...,...
7891,Parts that snap together are thin and brittle ...,neutral,english,-0.3612,0.425000,1,96.52
7901,The fit is too big and came with a smell I can...,negative,english,-0.0593,0.642857,1,93.34
7930,This hat has a good quality and is very well m...,positive,english,0.4987,0.426531,1,92.63
7973,Pot metal. Breaks easily. Looks cool.,neutral,english,0.5719,0.741667,1,73.85


In [10]:
# printing rows with profanity word count > 3 aka extremely abusive language
severe_profane = df_sentiment[df_sentiment.hinted_profanity > 3]
severe_profane = severe_profane[['text', 'pred_label', 'sentiment_polarity']]
severe_profane

Unnamed: 0,text,pred_label,sentiment_polarity
47,FUCKING SHITTY GAME!! I TRIED GETTING MY PROGR...,negative,0.4534
1561,I ordered these screw drivers to remove screw ...,negative,-0.5369


***
# Vectorization & UMAP clustering over extracted sentiment polarity

In [11]:
# Polarity converted to class

df_sen = df_sentiment.copy()

def polarity_to_sentiment(polarity):
    if polarity == 0:
        return "Neutral"
    elif polarity < 0:
      if polarity > -0.49:
        return "Severe Negative"
      else:
        return "Negative"
    elif polarity > 0:
      if polarity > 0.49:
        return "Super Positive"
      else:
        return "Positive"

df_sen['Sentiment'] = df_sen.apply(lambda x: polarity_to_sentiment(x['sentiment_polarity']), axis=1)
df_sen['Sentiment'] = pd.Categorical(df_sen.Sentiment)

df_sen.head()

Unnamed: 0,text,pred_label,language,sentiment_polarity,sentiment_subjectivity,hinted_profanity,text_complexity,Sentiment
0,Poor quality of fabric and ridiculously tight ...,negative,english,-0.6705,0.395238,0,81.8,Negative
1,"Love these glasses, thought they'd be everyday...",positive,english,0.9421,0.716667,0,77.23,Super Positive
2,"These are disgusting, it tastes like you are ""...",negative,english,0.8554,0.770486,0,74.9,Super Positive
3,My husband has a pair of TaoTronics so I decid...,neutral,english,-0.3802,0.638889,0,71.14,Severe Negative
4,"Threads too deep. Engages on tank, but gasket ...",negative,english,0.5901,0.533333,0,90.77,Super Positive


In [12]:
vec1 = CountVectorizer(min_df = 5, stop_words = 'english')
word_doc_mat = vec1.fit_transform(df_sen.text)

emb_umap1 = UMAP(metric='cosine', verbose=True).fit_transform(word_doc_mat)
emb_umap1



UMAP(angular_rp_forest=True, metric='cosine', verbose=True)
Tue Aug 29 05:19:59 2023 Construct fuzzy simplicial set
Tue Aug 29 05:19:59 2023 Finding Nearest Neighbors
Tue Aug 29 05:19:59 2023 Building RP forest with 9 trees
Tue Aug 29 05:20:14 2023 metric NN descent for 13 iterations
	 1  /  13
	 2  /  13
	 3  /  13
	 4  /  13
	 5  /  13
	 6  /  13
	 7  /  13
	 8  /  13
	Stopping threshold met -- exiting after 8 iterations
Tue Aug 29 05:20:41 2023 Finished Nearest Neighbor Search
Tue Aug 29 05:20:45 2023 Construct embedding


Epochs completed:   0%|            0/500 [00:00]

Tue Aug 29 05:21:02 2023 Finished embedding


array([[8.0159645, 2.5352688],
       [6.008429 , 1.6598227],
       [5.40122  , 4.8490863],
       ...,
       [9.877199 , 1.5499694],
       [5.4615865, 1.6468887],
       [9.444697 , 1.1013676]], dtype=float32)

In [13]:
df_clus = pd.DataFrame(data = emb_umap1, columns = ['UmapComp1', 'UmapComp2'])
df_clus['Sentiment'] = df_sen['Sentiment']
df_clus['Sentiment Polarity'] = df_sen['sentiment_polarity']

df_clus = df_clus.dropna()

plt_df = df_clus[['UmapComp1', 'UmapComp2', 'Sentiment', 'Sentiment Polarity']]

# Plotting
label_encoder = preprocessing.LabelEncoder()
plt_df['Sentiment'] = label_encoder.fit_transform(plt_df['Sentiment'])
std_scaler = StandardScaler()
cluster = std_scaler.fit_transform(plt_df.to_numpy())
km = KMeans(random_state = 42, n_init = 10, max_iter=100)
km.fit(cluster)
df_clus['label'] = km.labels_
df_clus = df_clus.round(decimals = 5)
sen_fig1 = px.scatter_3d(df_clus,
                        x = 'UmapComp1',
                        y = 'UmapComp2',
                        z = 'Sentiment',
                        color = df_clus['Sentiment Polarity'],
                        category_orders={'Sentiment':['Severe Negative', 'Negative', 'Neutral', 'Positive', 'Super Positive']},
                        height = 1200,
                        width = 1200)

sen_fig1.update_layout(dragmode='select',
                      activeselection=dict(fillcolor='yellow'))

sen_fig1.show()

***
# Vectorization & UMAP clustering over extracted sentiment subjectivity

In [14]:
df_sub = df_sentiment.copy()

# subjectivity converted to class
def subjectivity_to_class(subjectivity):
    if subjectivity < 0.4:
        return "Factual"
    elif subjectivity > 0.6:
        return "Personal"
    else:
      return "Neutral"

df_sub['Subjectivity'] = df_sub.apply(lambda x: subjectivity_to_class(x['sentiment_subjectivity']), axis=1)
df_sub['Subjectivity'] = pd.Categorical(df_sub.Subjectivity)

df_sub.head()

Unnamed: 0,text,pred_label,language,sentiment_polarity,sentiment_subjectivity,hinted_profanity,text_complexity,Subjectivity
0,Poor quality of fabric and ridiculously tight ...,negative,english,-0.6705,0.395238,0,81.8,Factual
1,"Love these glasses, thought they'd be everyday...",positive,english,0.9421,0.716667,0,77.23,Personal
2,"These are disgusting, it tastes like you are ""...",negative,english,0.8554,0.770486,0,74.9,Personal
3,My husband has a pair of TaoTronics so I decid...,neutral,english,-0.3802,0.638889,0,71.14,Personal
4,"Threads too deep. Engages on tank, but gasket ...",negative,english,0.5901,0.533333,0,90.77,Neutral


In [15]:
vec1 = CountVectorizer(min_df = 5, stop_words = 'english')
word_doc_mat = vec1.fit_transform(df_sub.text)

emb_umap = UMAP(metric='cosine', verbose=True).fit_transform(word_doc_mat)
emb_umap



UMAP(angular_rp_forest=True, metric='cosine', verbose=True)
Tue Aug 29 05:22:22 2023 Construct fuzzy simplicial set
Tue Aug 29 05:22:22 2023 Finding Nearest Neighbors
Tue Aug 29 05:22:22 2023 Building RP forest with 9 trees
Tue Aug 29 05:22:23 2023 metric NN descent for 13 iterations
	 1  /  13
	 2  /  13
	 3  /  13
	 4  /  13
	 5  /  13
	 6  /  13
	 7  /  13
	 8  /  13
	Stopping threshold met -- exiting after 8 iterations
Tue Aug 29 05:22:30 2023 Finished Nearest Neighbor Search
Tue Aug 29 05:22:30 2023 Construct embedding


Epochs completed:   0%|            0/500 [00:00]

Tue Aug 29 05:22:46 2023 Finished embedding


array([[11.752719 ,  4.3989115],
       [ 9.445154 ,  4.4349856],
       [ 9.576993 ,  1.8615578],
       ...,
       [13.306024 ,  5.7812457],
       [ 8.6132   ,  4.4251924],
       [12.706146 ,  5.9558244]], dtype=float32)

In [17]:
df_clus = pd.DataFrame(data = emb_umap, columns = ['UmapComp1', 'UmapComp2'])
df_clus['Subjectivity'] = df_sub['Subjectivity']
# Clustered over subjectivity classes but embeddings colored with sentiment polarity
df_clus['Sentiment Polarity'] = df_sub['sentiment_polarity']

df_clus = df_clus.dropna()

plt_df = df_clus[['UmapComp1', 'UmapComp2', 'Subjectivity', 'Sentiment Polarity']]

# Plotting
label_encoder = preprocessing.LabelEncoder()
plt_df['Subjectivity'] = label_encoder.fit_transform(plt_df['Subjectivity'])
std_scaler = StandardScaler()
cluster = std_scaler.fit_transform(plt_df.to_numpy())
km = KMeans(random_state = 42, n_init = 10, max_iter=100)
km.fit(cluster)
df_clus['label'] = km.labels_
df_clus = df_clus.round(decimals = 5)
sub_fig = px.scatter_3d(df_clus,
                        x = 'UmapComp1',
                        y = 'UmapComp2',
                        z = 'Subjectivity',
                        color = df_clus['Sentiment Polarity'],
                        category_orders={'Subjectivity':['Personal', 'Neutral', 'Factual']},
                        height = 1200,
                        width = 1200)

sub_fig.update_layout(dragmode='select',
                      activeselection=dict(fillcolor='yellow'))

sub_fig.show()