In [400]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.manifold import TSNE
import tensorflow_hub as hub
import plotly.express as px
import pandas as pd
import warnings
import hdbscan
import spacy
import re
warnings.filterwarnings('ignore') 

# Łączenie plików

In [401]:
# CSV files to merge; paths are relative to the notebook's working directory.
file_paths = [
    "../annotations/texts/all/rev_bartek_whole.csv",
    "../annotations/texts/all/rev_eryk_whole.csv",
    "../annotations/texts/all/rev_darek_whole.csv",
    "../annotations/texts/all/rev_patryk_whole.csv",
    "../annotations/texts/final_texts.csv",
]

# Read every file into its own frame, in the order listed above.
dfs = list(map(pd.read_csv, file_paths))

# Stack the frames vertically and renumber the rows 0..n-1.
combined_df = pd.concat(dfs, axis=0, ignore_index=True)

In [402]:
# Preview the first five rows of the merged frame.
combined_df.head(n=5)

Unnamed: 0,id,text,label,Comments
0,322337,Można się pozgłaszać na cwiczeniach z analizy ...,Neutral,
1,322338,"Polecam wziąć sobie do serca wszystkie rady "" ...",Neutral,
2,322339,"Był em na jej zajęciach w 100 % , starał em si...",Neutral,
3,322340,Jesli sluchasz go na zajeciach i zrobisz w dom...,Neutral,
4,322341,Bardzo luzacki prowadzący .,Positive,


In [403]:
# Pull out the review text and its label, replacing missing values with ''
# so that the regex-based preprocessing below never receives NaN.
texts_df, labels = (
    combined_df[column].fillna('') for column in ('text', 'label')
)

# Preprocessing

In [404]:
# Polish spaCy pipeline used for tokenisation, stop-word flags and lemmas.
nlp = spacy.load("pl_core_news_sm")

def preprocess_text(text):
    """Normalise a single text: strip digits/punctuation, lemmatise, drop stop words.

    Steps:
      1. remove every run of digits,
      2. lower-case and collapse each run of non-word characters into one space,
      3. run the spaCy pipeline ``nlp`` and keep the lemma of every token that
         is neither a stop word nor whitespace.

    Tokens whose lemma is empty (or only whitespace) are skipped; joining them
    used to leave leading and doubled spaces in the result.

    Args:
        text: Raw text (``str``).

    Returns:
        The kept lemmas joined by single spaces; may be an empty string.
    """
    text = re.sub(r'\d+', '', text)  # remove digits
    # Replace punctuation / other non-word runs with one space; strip so the
    # tokenizer does not produce a leading or trailing whitespace token.
    text = re.sub(r'\W+', ' ', text.lower()).strip()

    lemmas = []
    for token in nlp(text):
        if token.is_stop or token.is_space:
            continue
        lemma = token.lemma_.strip()
        if lemma:  # skip tokens for which the lemmatiser returned nothing
            lemmas.append(lemma)

    return ' '.join(lemmas)

In [405]:
# Run the preprocessing function over every text, element by element.
texts = texts_df.map(preprocess_text)

In [406]:
# Preview the first five preprocessed texts.
texts.head(n=5)

0      pozgłaszać cwiczenie analiza dostać trochę plus
1                           polecać wziąć serce rada j
2     zajęcia starać  mogła em nadgonić grupa zdawa...
3    sluchać zajeci zrobić dom zadanek powinienes p...
4                                    luzacki prowadzić
Name: text, dtype: object

# Generowanie osadzeń

In [407]:
# TF-IDF representation of the preprocessed texts, vocabulary capped at 1000 terms.
vectorizer = TfidfVectorizer(max_features=1000)
tfidf_matrix = vectorizer.fit_transform(texts)

# Dense copy with one named column per vocabulary term, for inspection.
tfidf_df = pd.DataFrame(
    data=tfidf_matrix.toarray(),
    columns=vectorizer.get_feature_names_out(),
)

In [408]:
# Preview the first five rows of the TF-IDF table.
tfidf_df.head(n=5)

Unnamed: 0,absolutnie,absolutny,aktualizacja,aktywność,akurat,alarm,algebro,algebry,analityczny,analiza,...,śmyć,średnio,środek,świat,świetnie,źle,żaden,żart,żarówka,życzenie
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.452387,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [409]:
# TF Hub handle of the Universal Sentence Encoder, version 4.
# NOTE(review): presumably this checkpoint targets English text; confirm it is
# suitable for the Polish, lemmatised input that is fed to it here.
model_url = "https://tfhub.dev/google/universal-sentence-encoder/4"
embed = hub.load(model_url)

def generate_embeddings(texts):
    """Encode ``texts`` with the loaded encoder and return a NumPy array.

    Args:
        texts: Sequence of strings handed to ``embed`` in a single call.

    Returns:
        The result of ``embed(texts)`` converted with ``.numpy()``; presumably
        one row per input text (TODO confirm shape against the model docs).
    """
    encoded = embed(texts)
    return encoded.numpy()

# Sentence embeddings of the preprocessed texts, plus a DataFrame view of them.
embeddings = generate_embeddings(list(texts))
embeddings_df = pd.DataFrame(data=embeddings)

In [410]:
# Preview the first five embedding vectors.
embeddings_df.head(n=5)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,502,503,504,505,506,507,508,509,510,511
0,-0.022776,-0.034864,0.039673,-0.020465,-0.047076,0.034509,0.050902,-0.047817,-0.034556,0.035569,...,-0.006126,-0.048756,0.027739,0.012175,-0.043583,-0.041169,0.007851,0.030071,-0.058633,-0.066715
1,0.064708,-0.059596,0.050324,-0.00695,0.048893,0.033492,-0.002916,-0.021219,0.003312,-0.032547,...,-0.04379,-0.082692,0.077276,0.016575,-0.026107,-0.05758,0.076561,-0.022957,0.001226,-0.073997
2,0.054768,-0.055792,-0.05257,-0.028063,-0.006749,-0.06596,-0.042303,0.010674,-0.09997,0.057676,...,0.001045,0.018698,0.004154,0.045637,0.056234,-0.037836,-0.000852,0.081776,0.015328,-0.091352
3,0.05952,-0.024957,-0.004416,0.028853,-0.00995,0.042348,0.054503,0.00853,0.005767,0.007712,...,-0.022986,-0.019342,0.085032,-0.08075,0.015194,-0.042662,0.029853,0.008092,-0.022786,-0.11064
4,-0.006367,-0.01222,0.065035,0.039759,0.044266,0.023789,-0.003222,-0.009262,-0.039947,0.003795,...,0.005564,-0.159424,0.011605,-0.003149,-0.001586,-0.041268,-0.034278,-0.019856,0.038922,-0.026927


# t-SNE

In [411]:
# 2-D t-SNE projection of the TF-IDF vectors.
# The iteration count is left at the library default (1000, the value that was
# previously passed explicitly): the `n_iter` keyword was renamed to `max_iter`
# in scikit-learn 1.5 and later removed, so omitting it keeps this cell working
# on both older and newer releases with identical results.
tsne = TSNE(n_components=2, random_state=42, perplexity=12)
tsne_results_tdidf = tsne.fit_transform(tfidf_matrix.toarray())

tsne_df_tdidf = pd.DataFrame(tsne_results_tdidf, columns=['Dim1', 'Dim2'])
# Attach text and label by position (row i of the projection is text i) rather
# than by index alignment, which would silently insert NaN on an index mismatch.
tsne_df_tdidf['Text'] = texts.to_numpy()
tsne_df_tdidf['Label'] = labels.to_numpy()

# Fixed colour per label so plots are comparable with each other.
color_map = {
    "Neutral": "dodgerblue",
    "Positive": "green",
    "Negative": "red"
}

fig = px.scatter(
    tsne_df_tdidf, x='Dim1', y='Dim2',
    color='Label', hover_data=['Text', 'Label'],
    title='Wizualizacja t-SNE dla osadzeń tekstów z TF-IDF',
    color_discrete_map=color_map
)

fig.update_traces(marker=dict(size=10))
fig.update_layout(
    title_x=0.5,  # centre the title
    title_font=dict(size=20)
)

fig.show()

In [412]:
# 2-D t-SNE projection of the Universal Sentence Encoder embeddings.
# The iteration count is left at the library default (1000, the value that was
# previously passed explicitly): the `n_iter` keyword was renamed to `max_iter`
# in scikit-learn 1.5 and later removed, so omitting it keeps this cell working
# on both older and newer releases with identical results.
tsne = TSNE(n_components=2, random_state=42, perplexity=12)
tsne_results_use = tsne.fit_transform(embeddings)

tsne_df_use = pd.DataFrame(tsne_results_use, columns=['Dim1', 'Dim2'])
# Attach text and label by position (row i of the projection is text i) rather
# than by index alignment, which would silently insert NaN on an index mismatch.
tsne_df_use['Text'] = texts.to_numpy()
tsne_df_use['Label'] = labels.to_numpy()

# Fixed colour per label so plots are comparable with each other.
color_map = {
    "Neutral": "dodgerblue",
    "Positive": "green",
    "Negative": "red"
}

fig = px.scatter(
    tsne_df_use, x='Dim1', y='Dim2',
    color='Label', hover_data=['Text', 'Label'],
    title='Wizualizacja t-SNE dla osadzeń tekstów z Universal Sentence Encoder',
    color_discrete_map=color_map
)

fig.update_traces(marker=dict(size=10))
fig.update_layout(
    title_x=0.5,  # centre the title
    title_font=dict(size=20)
)

fig.show()

# Klasteryzacja

In [413]:
# Density-based clustering of the (dense) TF-IDF vectors.
clusterer_hdbscan = hdbscan.HDBSCAN(min_cluster_size=5, min_samples=50)
clusters_hdbscan_tfidf = clusterer_hdbscan.fit_predict(tfidf_matrix.toarray())

# Store the labels as strings so plotly treats them as discrete categories
# instead of a continuous colour scale.
tsne_df_tdidf['HDBSCAN_Clusters'] = [str(label) for label in clusters_hdbscan_tfidf]

fig_hdbscan = px.scatter(
    tsne_df_tdidf, x='Dim1', y='Dim2',
    color='HDBSCAN_Clusters', hover_data=['Text', 'HDBSCAN_Clusters'],
    title='Klasteryzacja z wykorzystaniem HDBSCAN dla osadzeń tekstów z TF-IDF'
)

fig_hdbscan.update_traces(marker=dict(size=10))
fig_hdbscan.update_layout(
    title_x=0.5,  # centre the title
    title_font=dict(size=20)
)

# HDBSCAN assigns the label -1 to noise points (samples belonging to no
# cluster), so show that trace as "Szum" (noise) rather than as a cluster.
fig_hdbscan.for_each_trace(
    lambda t: t.update(
        name='Szum' if t.name == '-1' else f'Klaster {t.name}',
        legendgroup=t.name,
    )
)

fig_hdbscan.show()

In [414]:
# Density-based clustering of the Universal Sentence Encoder embeddings.
clusterer_hdbscan = hdbscan.HDBSCAN(min_cluster_size=2, min_samples=5)
clusters_hdbscan_use = clusterer_hdbscan.fit_predict(embeddings)

# Store the labels as strings so plotly treats them as discrete categories
# instead of a continuous colour scale.
tsne_df_use['HDBSCAN_Clusters'] = [str(label) for label in clusters_hdbscan_use]

fig_hdbscan = px.scatter(
    tsne_df_use, x='Dim1', y='Dim2',
    color='HDBSCAN_Clusters', hover_data=['Text', 'HDBSCAN_Clusters'],
    title='Klasteryzacja z wykorzystaniem HDBSCAN dla osadzeń tekstów z Universal Sentence Encoder',
)

fig_hdbscan.update_traces(marker=dict(size=10))
fig_hdbscan.update_layout(
    title_x=0.5,  # centre the title
    title_font=dict(size=20)
)

# HDBSCAN assigns the label -1 to noise points (samples belonging to no
# cluster), so show that trace as "Szum" (noise) rather than as a cluster.
fig_hdbscan.for_each_trace(
    lambda t: t.update(
        name='Szum' if t.name == '-1' else f'Klaster {t.name}',
        legendgroup=t.name,
    )
)

fig_hdbscan.show()