In [1]:
import findspark

# findspark has to run before the first `import pyspark`: it locates the Spark
# installation and exposes it (SPARK_HOME / sys.path) so the Python bindings
# that get imported are the ones matching the Spark runtime being launched.
findspark.init()  # It would crash without this

from pyspark import SparkContext  # noqa: E402  (must follow findspark.init())
from pyspark.sql import SparkSession  # noqa: E402

from src.minhash_pyspark import PySparkMinHashLSH  # noqa: E402

# Build the session through getOrCreate() instead of constructing a
# SparkContext directly: a second SparkContext(...) call in the same kernel
# raises an error, which made this cell fail whenever it was re-run.
# The variable is a SparkSession despite its (historical) name.
sqlContext = (
    SparkSession.builder
    .master("local[*]")
    .appName("MinHasher")
    .getOrCreate()
)
# The context owned by the session; same master/appName as before.
sc = sqlContext.sparkContext

# Path of the corpus file handed to PySparkMinHashLSH.read_from_txt in the next cell.
text_file = "WebOfScience-5736.txt"

# Query strings for the nearest-neighbour search. key_full is one complete
# abstract; key_short, key_long, key_middle and key_end are fragments of it.
# key_short: the opening of key_full (first sentence plus part of the second).
key_short = "Phytoplasmas are insect-vectored bacteria that cause disease in a wide range of plant species. The increasing availability of molecular DNA analyses, expertise and additional methods in recent years has led to a proliferation of discoveries of phytoplasma-plant"
# key_long: a longer prefix of key_full, cut off partway through its last sentence.
key_long = r"Phytoplasmas are insect-vectored bacteria that cause disease in a wide range of plant species. The increasing availability of molecular DNA analyses, expertise and additional methods in recent years has led to a proliferation of discoveries of phytoplasma-plant host associations and in the numbers of taxonomic groupings for phytoplasmas. The widespread use of common names based on the diseases with which they are associated, as well as separate phenetic and taxonomic systems for classifying phytoplasmas based on variation at the 16S rRNA-encoding gene, complicates interpretation of the literature. We explore this issue and related trends through a focus on Australian pathosystems, providing the first comprehensive compilation of information for this continent, covering the phytoplasmas, host plants, vectors and diseases. Of the 33 16Sr groups reported internationally, only groups I, II, III, X, XI and XII have been recorded in Australia and this highlights the need for ongoing biosecurity measures to prevent the introduction of additional pathogen groups. Many of the phytoplasmas reported in Australia have not been sufficiently well studied to assign them to 16Sr groups so it is likely that unrecognized groups and sub-groups are present. Wide host plant ranges are apparent among well studied phytoplasmas, with multiple crop and non-crop species infected by some. Disease management is further complicated by the fact that putative vectors have been identified for few phytoplasmas, especially in Australia. Despite rapid progress in recent years using molecular approaches"
# key_full: the complete abstract (single-quoted because the text contains double quotes).
key_full = 'Phytoplasmas are insect-vectored bacteria that cause disease in a wide range of plant species. The increasing availability of molecular DNA analyses, expertise and additional methods in recent years has led to a proliferation of discoveries of phytoplasma-plant host associations and in the numbers of taxonomic groupings for phytoplasmas. The widespread use of common names based on the diseases with which they are associated, as well as separate phenetic and taxonomic systems for classifying phytoplasmas based on variation at the 16S rRNA-encoding gene, complicates interpretation of the literature. We explore this issue and related trends through a focus on Australian pathosystems, providing the first comprehensive compilation of information for this continent, covering the phytoplasmas, host plants, vectors and diseases. Of the 33 16Sr groups reported internationally, only groups I, II, III, X, XI and XII have been recorded in Australia and this highlights the need for ongoing biosecurity measures to prevent the introduction of additional pathogen groups. Many of the phytoplasmas reported in Australia have not been sufficiently well studied to assign them to 16Sr groups so it is likely that unrecognized groups and sub-groups are present. Wide host plant ranges are apparent among well studied phytoplasmas, with multiple crop and non-crop species infected by some. Disease management is further complicated by the fact that putative vectors have been identified for few phytoplasmas, especially in Australia. Despite rapid progress in recent years using molecular approaches, phytoplasmas remain the least well studied group of plant pathogens, making them a "crouching tiger" disease threat.'
# key_middle: a fragment taken from the interior of key_full (starts and ends mid-sentence).
key_middle = r"pathosystems, providing the first comprehensive compilation of information for this continent, covering the phytoplasmas, host plants, vectors and diseases. Of the 33 16Sr groups reported internationally, only groups I, II, III, X, XI and XII have been recorded in Australia and this highlights the need for ongoing biosecurity measures to prevent the introduction of additional pathogen groups. Many of the phytoplasmas reported in Australia have not been sufficiently well studied to assign them to 16Sr groups so it is likely that unrecognized groups and sub-groups are present. Wide host plant ranges"
# key_end: the final sentence of key_full.
key_end = r'Despite rapid progress in recent years using molecular approaches, phytoplasmas remain the least well studied group of plant pathogens, making them a "crouching tiger" disease threat.'

# key_final: text unrelated to key_full (Z-transform neural networks).
# NOTE(review): presumably taken from another document in the corpus - confirm.
key_final = r"This paper focuses on a new kind of artificial neural networks - the Z-transform artificial neural networks (ZTANNs). It is proposed to use the Z-transform instead of ordinary weights and a linear activation function of an artificial neuron."
# key_out: a pangram followed by lorem-ipsum filler.
# NOTE(review): presumably a query that should not match any corpus document - confirm.
key_out = "The quick brown fox jumped over the lazy dog. Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua."


In [2]:
# Load the corpus and build the MinHash/LSH structures.
wos_spark = PySparkMinHashLSH.read_from_txt(text_file, sc, sqlContext)
wos_spark.run()

# The pipeline caches minhash_df and lsh_df (see the log output below). Free
# them in a `finally` so a failing query or show() does not leave the cached
# DataFrames occupying memory in the long-lived notebook kernel.
try:
    # Ask for the 10 approximate nearest neighbours of the short query text.
    result = wos_spark.approxNearestNeighbors(key_short, 10)
    result.show()
finally:
    wos_spark.free_dfs()

2024-08-30 08:04:00,870 - Reading the file
2024-08-30 08:04:04,609 - Load success. Received [5736] WebOfScience-5736.txt. Took 1.82310 s
2024-08-30 08:04:04,714 - Building shing_dict


Memory usage - shing_dict: 0.00390/2 GB [3.99399 MB]


2024-08-30 08:04:32,576 - Shing_dict[len=464294] build is done. Took 27.86253 s.
2024-08-30 08:04:33,287 - Precomputing minhashes
2024-08-30 08:04:39,858 - Precomputed 28393 * 100 minhashes. Took 7.28060 s
2024-08-30 08:04:41,289 - Caching minhash_df and lsh_df
2024-08-30 08:04:41,369 - Caching done
2024-08-30 08:04:41,370 - LSH Actions Completed.
2024-08-30 08:06:01,449 - Found 4164 candicate documents
2024-08-30 08:06:01,534 - Collecting 10 results to driver
2024-08-30 08:06:06,405 - Collecting took 4.8691 s
2024-08-30 08:06:06,939 - Took 85.56931 s
2024-08-30 08:06:21,112 - Clearing minhash_df and lsh_df
2024-08-30 08:06:21,125 - Clearing done


+----------+--------------------+--------------------+
|        id|                text|             jaccard|
+----------+--------------------+--------------------+
|         0|Phytoplasmas are ...| 0.14000000059604645|
|8589934709|Purpose: The aim ...| 0.03999999910593033|
|       860|Grid voltage feed...| 0.03999999910593033|
|8589936024|Avian paramyxovir...| 0.03999999910593033|
|      1826|Background: The o...| 0.03999999910593033|
|8589936713|Silicon Photo-Mul...| 0.03999999910593033|
|       832|In recent years, ...|0.029999999329447746|
|8589935645|Development of bi...|0.029999999329447746|
|       313|One-step aldol co...|0.029999999329447746|
|8589936165|Podcasts are digi...|0.029999999329447746|
+----------+--------------------+--------------------+

