In [None]:
pip install nltk scipy scikit-learn pandas matplotlib gensim pyspark

In [1]:
import matplotlib.pyplot as plt
import pandas as pd
import nltk
import gensim

In [2]:
from pyspark.sql import SparkSession
from pyspark.sql import Row
from pyspark.sql import functions as f
from pyspark.sql import Window
from pyspark.sql.types import StringType, DoubleType, IntegerType

spark = SparkSession.builder \
                    .appName('PK_BigData_Project') \
                    .config('spark.jars.packages', 'com.amazonaws:aws-java-sdk-bundle:1.12.170') \
                    .config('spark.jars.packages', 'org.apache.hadoop:hadoop-aws:3.3.3') \
                    .config("spark.driver.memory", "12g") \
                    .config("spark.executor.memory", "12g") \
                    .master('local[3]') \
                    .getOrCreate()

In [39]:
filePath = "s3a://pkruk-big-data/processed/"
tags_df = spark.read.parquet(filePath + 'Tags.parquet')
tags_15 = tags_df.select('tag_name').distinct()
tags_unique = list(map(lambda word: word.replace('-', '_'), [data[0] for data in tags_15.select('tag_name').collect()]))

In [None]:
import boto3

s3 = boto3.client('s3')

def read_from_s3():
    s3.download_file("pkruk-big-data", "articles/articles.txt", "articles.txt")

    with open("articles.txt") as f:
        articles = f.read().splitlines()

    return articles

## 15. Badanie podobieństw między tagami

### Klasteryzacja słów za pomocą k-means

In [23]:
def group_by_second(lst):
    return {k: [x for x, y in lst if y == k] for k in {y for x, y in lst}}

In [24]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans

vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(tags_unique)

kmeans = KMeans(n_clusters=10)
kmeans.fit(X)

labels = kmeans.labels_

word_and_group = list(zip(tags_unique, labels))

In [25]:
result = group_by_second(word_and_group)

In [26]:
for key in result.keys():
    print(result[key])
    print()

['star formation', 'neutron star', 'carbon star', 'eclipsing star systems', 'star evolution', 'star catalogues', 'star systems', 'star cluster', 'binary star', 'star maps', 'variable star', 'star', 'multiple star systems', 'star gazing', 'runaway star']

['horizon', 'event horizon', 'event horizon telescope', 'cosmological horizon', 'very high energetic event']

['rogue planet', 'standards', 'active galaxy', 'drizzle', 'm31', 'map', 'transits other than sun', 'brightness', 'big rip', 'planetary science', 'nova', 'image processing', 'orbital resonance', 'frame of reference', 'infrared', 'lunar eclipse', 'great red spot', 'spectral type', 'luminosity', 'light curve', 'detectors', 'iau', 'muon', 'magellanic cloud', 'standard candle', 'vera c rubin observatory', 'asteroid belt', 'orbit', 'serendipity', 'spectral', 'callisto', 'albedo', 'kuiper belt', 'neptune', 'differentiation', 'infinite', 'escape velocity', 'notation', 'gamma ray bursts', 'doppler effect', 'element', 'sun rays', 'edding

### Model semantyczny: Można użyć modelu semantycznego, takiego jak word2vec lub GloVe, aby porównać semantyczne relacje między słowami i zobaczyć, czy są blisko siebie.

In [30]:
def get_similar_words(word):
    similar_words = model.wv.most_similar(positive=[word])
    filtered_words = [word[0] for word in similar_words if word[1] > 0.6]
    return filtered_words

In [31]:
import gensim
from gensim.models import Word2Vec

astronomy_articles = [text]

model = Word2Vec(word_list, vector_size=100, window=5, min_count=1, workers=4)

In [32]:
for word in tags_unique:
    similar_words = get_similar_words(word)
    print("Słowo:", word)
    print("Podobne słowa:", similar_words)

KeyError: "Key 'rogue planet' not present in vocabulary"

### Słownik synonimów: Można użyć gotowego słownika synonimów, aby zobaczyć, czy dane słowa są zdefiniowane jako synonimy. Po prostu sprawdzam synonimy dla tego słowa, niekoniecznie podobieńśtwa z lista tagów

In [None]:
import nltk
nltk.download('wordnet')
from nltk.corpus import wordnet

In [33]:
def get_synonyms(word):
    synonyms = []
    for syn in wordnet.synsets(word):
        for lemma in syn.lemmas():
            synonyms.append(lemma.name())
    return set(synonyms)

In [34]:
for word in tags_unique:
    synonyms = get_synonyms(word)
    if len(synonyms) != 0:
        print("Słowo:", word)
        print("Synonimy:", synonyms)
        print()

Słowo: standards
Synonimy: {'monetary_standard', 'measure', 'touchstone', 'criterion', 'banner', 'standard'}

Słowo: drizzle
Synonimy: {'drizzle', 'mizzle', 'moisten'}

Słowo: map
Synonimy: {'map', 'function', 'single-valued_function', 'mapping', 'map_out', 'mathematical_function', 'represent'}

Słowo: interstellar
Synonimy: {'interstellar'}

Słowo: brightness
Synonimy: {'light', 'luminance', 'brightness', 'smartness', 'brightness_level', 'luminosity', 'luminousness', 'cleverness'}

Słowo: nova
Synonimy: {'nova'}

Słowo: infrared
Synonimy: {'infrared_emission', 'infrared_frequency', 'infrared_radiation', 'infrared', 'infrared_light'}

Słowo: space
Synonimy: {'distance', 'blank', 'quad', 'space', 'blank_space', 'place', 'outer_space', 'infinite'}

Słowo: luminosity
Synonimy: {'light', 'luminance', 'brightness', 'brightness_level', 'luminosity', 'luminousness'}

Słowo: detectors
Synonimy: {'sensing_element', 'demodulator', 'detector', 'sensor'}

Słowo: muon
Synonimy: {'mu-meson', 'muon',