In [1]:
import pandas as pd

# Location of the NASA climate Facebook-comments CSV on Kaggle.
CLIMATE = '/kaggle/input/public-opinion-on-nasas-climate-posts-fb-data/climate_nasa.csv'

# Load and tidy in one chain:
#   * parse `date` as a timestamp and derive the calendar `year` from it
#   * treat a missing comment count as zero
#   * drop rows that have no comment text
df = (
    pd.read_csv(filepath_or_buffer=CLIMATE, parse_dates=['date'])
    .assign(
        year=lambda frame: frame['date'].dt.year,
        commentsCount=lambda frame: frame['commentsCount'].fillna(value=0),
    )
    .dropna(subset=['text'])
)
df.head()

Unnamed: 0,date,likesCount,profileName,commentsCount,text,year
0,2022-09-07 17:12:32+00:00,2,4dca617d86b3fdce80ba7e81fb16e048c9cd9798cdfd6d...,0.0,Neat comparison I have not heard it before.\n ...,2022
1,2022-09-08 14:51:13+00:00,0,518ab97f2d115ba5b6f03b2fba2ef2b120540c9681288b...,0.0,An excellent way to visualise the invisible! T...,2022
2,2022-09-07 17:19:41+00:00,1,d82e8e24eb633fd625b0aef9b3cb625cfb044ceb8483e1...,3.0,Does the CO2/ghg in the troposphere affect the...,2022
3,2022-09-08 00:51:30+00:00,4,37a509fa0b5177a2233c7e2d0e2b2d6916695fa9fba3f2...,0.0,excellent post! I defo feel the difference - o...,2022
4,2022-09-07 19:06:20+00:00,16,e54fbbd42a729af9d04d9a5cc1f9bbfe8081a31c219ecb...,26.0,"Yes, and carbon dioxide does not harm the Eart...",2022


In [2]:
# Print each column's dtype and non-null count, plus the frame's memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 504 entries, 0 to 521
Data columns (total 6 columns):
 #   Column         Non-Null Count  Dtype              
---  ------         --------------  -----              
 0   date           504 non-null    datetime64[ns, UTC]
 1   likesCount     504 non-null    int64              
 2   profileName    504 non-null    object             
 3   commentsCount  504 non-null    float64            
 4   text           504 non-null    object             
 5   year           504 non-null    int32              
dtypes: datetime64[ns, UTC](1), float64(1), int32(1), int64(1), object(2)
memory usage: 25.6+ KB


In [3]:
# Count the distinct values in every column.
df.nunique()

date             504
likesCount        41
profileName      468
commentsCount     40
text             503
year               4
dtype: int64

In [4]:
from plotly.express import histogram
# Distribution of comment timestamps, with a box plot drawn in the margin.
histogram(
    data_frame=df,
    x='date',
    marginal='box',
)

In [5]:
from plotly.express import scatter
# Likes against comment counts on log-log axes, coloured by year, with an OLS trendline.
# NOTE(review): missing comment counts were filled with 0, and a zero has no position
# on a log axis -- confirm how those rows affect the plot and the fitted trend.
scatter(
    data_frame=df,
    x='likesCount',
    y='commentsCount',
    color='year',
    log_x=True,
    log_y=True,
    trendline='ols',
)

Rarely do we see like counts and comment counts so close to being uncorrelated.

In [6]:
%env TOKENIZERS_PARALLELISM=false
!pip install --quiet keybert
print('pip install keybert complete.')

env: TOKENIZERS_PARALLELISM=false
pip install keybert complete.


In [7]:
from arrow import now
from keybert import KeyBERT
from sklearn.feature_extraction.text import TfidfVectorizer

# Vectorizer / model settings.
MAX_DF = 1.0
MIN_DF = 2
MODEL = 'all-MiniLM-L12-v2'
STOP_WORDS = 'english'
# Keywords are extracted from the raw `text` column; no separate cleaning step is applied here.
DOCS = df['text'].tolist()

model_start = now()
model = KeyBERT(model=MODEL)
# NOTE(review): the original author expects the model's default max sequence length (128)
# to cover almost all of each comment -- not verified here.
vectorizer = TfidfVectorizer(
    ngram_range=(1, 1),
    stop_words=STOP_WORDS,
    min_df=MIN_DF,
    max_df=MAX_DF,
)

# Embed documents and candidate words once so they can be reused for keyword extraction.
document_embeddings, word_embeddings = model.extract_embeddings(
    docs=DOCS,
    vectorizer=vectorizer,
)
print('embedding time: {}'.format(now() - model_start))
print('we have {} documents and {} words.'.format(len(document_embeddings), len(word_embeddings)))

# NOTE(review): `stop_words` and `min_df` are presumably redundant once a vectorizer is
# supplied -- confirm against the KeyBERT documentation before removing them.
keywords = model.extract_keywords(
    docs=DOCS,
    top_n=1,
    stop_words=STOP_WORDS,
    vectorizer=vectorizer,
    doc_embeddings=document_embeddings,
    word_embeddings=word_embeddings,
    min_df=MIN_DF,
)
# Elapsed time is measured from model_start, so it includes the embedding step above.
print('model time: {}'.format(now() - model_start))

# Keep only the top keyword string per document; documents with no keyword get a placeholder.
df['keyword'] = [hits[0][0] if hits else '-none-' for hits in keywords]

.gitattributes:   0%|          | 0.00/1.18k [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.6k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/573 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

data_config.json:   0%|          | 0.00/39.3k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/134M [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/352 [00:00<?, ?B/s]

train_script.py:   0%|          | 0.00/13.2k [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

embedding time: 0:00:17.111983
we have 504 documents and 1107 words.
model time: 0:00:17.445181


In [8]:
# Share of comments assigned to each keyword, showing the 15 largest.
(
    df['keyword']
    .value_counts(normalize=True)
    .head(n=15)
)

keyword
climate         0.111111
-none-          0.081349
warming         0.049603
nasa            0.039683
temperature     0.027778
co2             0.021825
weather         0.017857
emissions       0.013889
temperatures    0.011905
science         0.009921
earth           0.009921
deniers         0.009921
scientists      0.009921
carbon          0.009921
planet          0.009921
Name: proportion, dtype: float64

From the looks of things, we have a discussion of science and another of politics in a single corpus.

In [9]:
# Chart the 40 most frequent keywords, leaving out the '-none-' placeholder.
histogram(
    data_frame=(
        df.loc[df['keyword'] != '-none-', 'keyword']
        .value_counts()
        .to_frame()
        .reset_index()
        .head(n=40)
    ),
    x='keyword',
    y='count',
    marginal='box',
)

In [10]:
import pandas as pd
from umap import UMAP

# Placeholder keyword value(s) to leave out.
IGNORE = {'-none-'}

# Project the document embeddings down to two dimensions and time the whole step.
umap_start = now()
umap_model = UMAP(
    n_components=2,
    random_state=2024,  # fixed seed
    verbose=False,
    n_jobs=1,
)
df[['u0', 'u1']] = umap_model.fit_transform(X=document_embeddings)
print('UMAP time: {}'.format(now() - umap_start))

UMAP time: 0:00:07.288829


In [11]:
# Scatter of the 2-D UMAP coordinates; hover shows the comment text and its keyword.
# Both axes are hidden and their tick labels are switched off.
(
    scatter(
        data_frame=df,
        x='u0',
        y='u1',
        hover_name='text',
        hover_data=['keyword'],
    )
    .update_xaxes(showticklabels=False, visible=False)
    .update_yaxes(showticklabels=False, visible=False)
)

We have one non-English cluster; the other documents do not cluster very tightly.

In [12]:
# Take the 13 most frequent keyword values and discard the '-none-' placeholder.
# Note: this rebinds `keywords`, which previously held the KeyBERT extraction results.
keywords = [
    label
    for label in df['keyword'].value_counts().head(n=13).index.tolist()
    if label != '-none-'
]

# Same UMAP scatter as before, restricted to those keywords and coloured by keyword.
(
    scatter(
        data_frame=df[df['keyword'].isin(keywords)],
        x='u0',
        y='u1',
        hover_name='text',
        color='keyword',
        hover_data=['keyword'],
    )
    .update_xaxes(showticklabels=False, visible=False)
    .update_yaxes(showticklabels=False, visible=False)
)