In [2]:
import os
# set available gpu's
os.environ["CUDA_VISIBLE_DEVICES"] = "2"

In [3]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer

# nltk downloads
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

def preprocess(text):
    # lowercase
    text = text.lower()
    # remove punctuation
    text = re.sub(r'[^\w\s]', '', text)
    # remove numerical values
    text = re.sub(r'\d+', '', text)
    # tokenize
    tokens = nltk.word_tokenize(text)
    # remove stopwords and lemmatize
    tokens = [lemmatizer.lemmatize(word) for word in tokens if word not in stop_words]
    # Join tokens back into a single string
    return ' '.join(tokens)

# Sample corpus
corpus = [
    "The quick brown fox jumps over the lazy dog.",
    "Natural Language Processing is fascinating!",
    "123 Dogs and cats live in harmony."
]

# Preprocess the corpus
preprocessed_corpus = [preprocess(doc) for doc in corpus]

# Initialize and apply TF-IDF
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(preprocessed_corpus)

# Display the TF-IDF matrix
print(X.toarray())

# Display the feature names (terms)
print(vectorizer.get_feature_names_out())


[nltk_data] Downloading package punkt to
[nltk_data]     /cfs/home/u023967/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /cfs/home/u023967/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /cfs/home/u023967/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


[[0.42339448 0.         0.32200242 0.         0.42339448 0.
  0.42339448 0.         0.42339448 0.         0.         0.
  0.42339448]
 [0.         0.         0.         0.5        0.         0.
  0.         0.5        0.         0.         0.5        0.5
  0.        ]
 [0.         0.52863461 0.40204024 0.         0.         0.52863461
  0.         0.         0.         0.52863461 0.         0.
  0.        ]]
['brown' 'cat' 'dog' 'fascinating' 'fox' 'harmony' 'jump' 'language'
 'lazy' 'live' 'natural' 'processing' 'quick']


In [4]:
type(X)

scipy.sparse._csr.csr_matrix

In [5]:
# convert data instance to string to be embeded
def text_semeval(example: dict) -> str:

    prim = ''.join(example['primary_evidence'])
    if 'secondary_evidence' in example.keys():
        sec = ''.join(example['secondary_evidence'])
        text = prim + sec + example['statement']
    else:
        text = prim + example['statement']
    
    return text

In [6]:
from evo_functions import extract_SemEval_data

  from .autonotebook import tqdm as notebook_tqdm


🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.


In [7]:
data = extract_SemEval_data(use_data_sorted_by_dq=True)
text_constructor = text_semeval

texts=[]
for example in data:
    text = text_constructor(example)
    #print(text)
    texts.append(text)

Used data with already retrieved examples from DATASETS/DATA_QUALITY/SemEval_data_quality.json


In [30]:
corpus = texts

# Preprocess the corpus
preprocessed_corpus = [preprocess(doc) for doc in corpus]

# Initialize and apply TF-IDF
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(preprocessed_corpus)

# Display the TF-IDF matrix
print(X.toarray())

# Display the feature names (terms)
print(vectorizer.get_feature_names_out())

[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]
['__' 'ab' 'abc' ... 'μg' 'μl' 'μmoll']


In [9]:
type(X.toarray())

numpy.ndarray

In [10]:
X.toarray().shape

(1900, 7752)

In [11]:
X.toarray()[0].tolist()

[0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.04322565322795553,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.060611115666128416,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.06457347252756775,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.

# HDBSCAN

In [31]:
X = X.toarray()

In [38]:
from evo_functions import embed_texts

In [39]:
X = embed_texts(texts)

In [32]:
from sklearn.cluster import HDBSCAN
from sklearn.datasets import load_digits

In [13]:
X, _ = load_digits(return_X_y=True)


In [14]:
type(X)

numpy.ndarray

In [15]:
X.shape

(1797, 64)

In [33]:
from sklearn.metrics import pairwise_distances

In [34]:
X = X

In [40]:
# Compute cosine distance matrix
cosine_dist_matrix = pairwise_distances(X, metric='cosine')

# Fit HDBSCAN using precomputed distances
hdb = HDBSCAN(metric='precomputed')
hdb.fit(cosine_dist_matrix)

# Get cluster labels
labels = hdb.labels_
print(labels)

[18 18 18 ... 18 25 18]


In [43]:
from collections import Counter

In [44]:
Counter(labels)

Counter({-1: 776,
         18: 449,
         11: 438,
         27: 29,
         16: 14,
         23: 13,
         13: 11,
         15: 10,
         2: 10,
         28: 9,
         19: 9,
         25: 8,
         26: 8,
         21: 8,
         0: 8,
         10: 7,
         3: 7,
         5: 6,
         9: 6,
         8: 6,
         14: 6,
         24: 6,
         22: 6,
         12: 6,
         30: 6,
         20: 6,
         7: 6,
         1: 6,
         4: 5,
         29: 5,
         17: 5,
         6: 5})

In [1]:
rr

NameError: name 'rr' is not defined

In [21]:
hdb.probabilities_

array([0.63294794, 0.        , 0.        , 0.86847457, 0.        ,
       0.84532843, 1.        , 0.        , 0.        , 0.        ,
       0.5004701 , 1.        , 0.        , 1.        , 0.67681859,
       0.89134792, 0.48797805, 1.        , 0.        , 0.        ,
       0.62737943, 1.        , 0.66941469, 0.91888854, 0.        ,
       0.81186032, 1.        , 0.48702382, 0.        , 0.        ,
       0.45351162, 0.        , 1.        , 0.81056519, 0.80546511,
       0.90343044, 0.57580794, 0.        , 0.        , 0.86416955,
       0.8584657 , 0.98431419, 0.77977061, 0.62405083, 0.7852729 ,
       0.        , 0.        , 0.73453158, 0.53196819, 0.50432568,
       0.        , 0.        , 0.68697123, 0.        , 0.        ,
       0.82407327, 0.77977061, 0.        , 1.        , 0.        ,
       1.        , 0.97914269, 1.        , 1.        , 0.        ,
       1.        , 1.        , 0.70536946, 0.        , 0.        ,
       0.6294732 , 0.78481086, 0.62947425, 1.        , 0.76759

In [25]:
hdb.n_features_in_

500

In [24]:
hdb.n_clusters_

AttributeError: 'HDBSCAN' object has no attribute 'n_clusters_'