In [1]:
from kpe import graph_based_methods as gr
from kpe import statistical_based_mthods as stats
from kpe import neural_networks_based_methods as nn

In [2]:
# Sample document 1 ("doc1" in the printouts below): a short encyclopedic paragraph about deep learning.
deeplearning = """Deep learning (also known as deep structured learning) is part of a broader family of machine
    learning methods based on artificial neural networks with representation learning. Learning can be supervised,
    semi-supervised or unsupervised. Deep-learning architectures such as deep neural networks, deep belief networks, 
    deep reinforcement learning, recurrent neural networks and convolutional neural networks have been applied to 
    fields including computer vision, speech recognition, natural language processing, machine translation, bioinformatics, 
    drug design, medical image analysis, climate science, material inspection and board game programs, where they have
    produced results comparable to and in some cases surpassing human expert performance. Artificial neural networks
    (ANNs) were inspired by information processing and distributed communication nodes in biological systems. ANNs have 
    various differences from biological brains. Specifically, artificial neural networks tend to be static and symbolic,
    while the biological brain of most living organisms is dynamic (plastic) and analogue
    """ 

# Sample document 2 ("doc2" in the printouts below): a short paragraph about natural language processing.
nlp = """Natural language processing (NLP) is a subfield of linguistics, computer science, and artificial intelligence 
    concerned with the interactions between computers and human language, in particular how to program computers to process
    and analyze large amounts of natural language data. The goal is a computer capable of "understanding" the contents of 
    documents, including the contextual nuances of the language within them. The technology can then accurately extract 
    information and insights contained in the documents as well as categorize and organize the documents themselves."""


# Sample document 3 ("doc3" in the printouts below): a longer paragraph about the history of Iran.
# It still contains bracketed citation markers such as [1] and a missing space after a full stop
# ("world.Iran"), so it is a noisier input than the other two samples.
iran = """The history of Iran is intertwined with the history of a larger region known as Greater Iran, comprising the 
    area from Anatolia in the west to the borders of Ancient India and the Syr Darya in the east, and from the Caucasus 
    and the Eurasian Steppe in the north to the Persian Gulf and the Gulf of Oman in the south. Central to this area is 
    Iran, commonly known until the mid-20th century as Persia in the Western world.Iran is home to one of the world's 
    oldest continuous major civilizations, with historical and urban settlements dating back to 4000 BC.[1] The south-western
    and western part of the Iranian plateau participated in the traditional ancient Near East with Elam (3200–539 BC), 
    from the Bronze Age, and later with various other peoples, such as the Kassites, Mannaeans, and Gutians. Georg Wilhelm 
    Friedrich Hegel calls the Persians the "first Historical People".[2] The Medes unified Iran as a nation and empire 
    in 625 BC.[3] The Achaemenid Empire (550–330 BC), founded by Cyrus the Great, was the first true global superpower
    state[4] and it ruled from the Balkans to North Africa and also Central Asia, spanning three continents, from their
    seat of power in Persis (Persepolis). It was the largest empire yet seen and the first world empire.[5] The Achaemenid 
    Empire was the only civilization in all of history to connect over 40% of the global population, accounting for
    approximately 49.4 million of the world's 112.4 million people in around 480 BC.[6] They were succeeded by the 
    Seleucid, Parthian, and Sasanian Empires, who successively governed Iran for almost 1,000 years and made Iran once 
    again a leading power in the world. Persia's arch-rival was the Roman Empire and its successor, the Byzantine Empire."""


# The first two documents are quite clean, but the last one is noisy and longer than the others,
# so extracting keywords from it is a bit more challenging.

<h2>1. Graph-based methods</h2>

In [3]:
# One extractor instance per graph-based method; each is reused for all three sample documents.
text_rank = gr.TextRank()
# NOTE(review): this instance is labelled "PositionalRank" in the printouts below, but the class
# instantiated here is SingleRank — confirm which method is intended.
pos_rank = gr.SingleRank()
frake = gr.Frake()

In [4]:
# Preprocess the deep-learning sample once and feed the same cleaned text to every graph-based extractor.
doc1_cleaned = gr.TextRank.preprocess(deeplearning)

# Top-5 keyphrases per method; each result is iterated as a phrase -> score mapping below.
kws_1_1 = text_rank.extract(doc1_cleaned, top=5)
kws_2_1 = pos_rank.extract(doc1_cleaned, top=5)
kws_3_1 = frake.extract(doc1_cleaned, top=5)

# Report each method's keyphrases with scores rounded to four decimals.
for method, kws in (("TextRank", kws_1_1), ("PositionalRank", kws_2_1), ("Frake", kws_3_1)):
    print(f"-Keywords extracted from doc1 with [{method}]: ")
    for phrase, score in kws.items():
        print(f"\t{phrase:25}: {round(score, 4)}")
    print()

-Keywords extracted from doc1 with [TextRank]: 
	deep learning            : 5.0539
	deep neural              : 4.5581
	learning                 : 3.125
	biological               : 2.7
	neural                   : 2.6292

-Keywords extracted from doc1 with [PositionalRank]: 
	deep learning            : 4.9874
	neural networks          : 4.8683
	deep neural              : 4.7718
	learning methods         : 4.2127
	machine learning         : 4.071

-Keywords extracted from doc1 with [Frake]: 
	deep learning machine neural networks: 47.98
	artificial networks biological: 23.43
	artificial neural biological: 20.86
	learning machine         : 16.76
	networks                 : 13.51



NetworkX version 3.0.
  M = google_matrix(


In [5]:
# Preprocess the NLP sample once and feed the same cleaned text to every graph-based extractor.
doc2_cleaned = gr.TextRank.preprocess(nlp)

# Top-5 keyphrases per method; each result is iterated as a phrase -> score mapping below.
kws_1_2 = text_rank.extract(doc2_cleaned, top=5)
kws_2_2 = pos_rank.extract(doc2_cleaned, top=5)
kws_3_2 = frake.extract(doc2_cleaned, top=5)

# Report each method's keyphrases with scores rounded to four decimals.
for method, kws in (("TextRank", kws_1_2), ("PositionalRank", kws_2_2), ("Frake", kws_3_2)):
    print(f"-Keywords extracted from doc2 with [{method}]: ")
    for phrase, score in kws.items():
        print(f"\t{phrase:25}: {round(score, 4)}")
    print()

-Keywords extracted from doc2 with [TextRank]: 
	language processing      : 4.3375
	language                 : 3.125
	computer                 : 2.7
	large                    : 1.85
	intelligence             : 1.85

-Keywords extracted from doc2 with [PositionalRank]: 
	language processing      : 4.2237
	natural language         : 3.4799
	human language           : 3.4268
	language data            : 3.4268
	linguistics computer     : 3.1333

-Keywords extracted from doc2 with [Frake]: 
	language computer        : 15.02
	language                 : 10.74
	documents                : 4.87
	computers                : 4.61
	computer                 : 4.28



NetworkX version 3.0.
  M = google_matrix(


In [6]:
# Preprocess the Iran-history sample once and feed the same cleaned text to every graph-based extractor.
doc3_cleaned = gr.TextRank.preprocess(iran)

# Top-5 keyphrases per method; each result is iterated as a phrase -> score mapping below.
kws_1_3 = text_rank.extract(doc3_cleaned, top=5)
kws_2_3 = pos_rank.extract(doc3_cleaned, top=5)
kws_3_3 = frake.extract(doc3_cleaned, top=5)

# Report each method's keyphrases with scores rounded to four decimals.
for method, kws in (("TextRank", kws_1_3), ("PositionalRank", kws_2_3), ("Frake", kws_3_3)):
    print(f"-Keywords extracted from doc3 with [{method}]: ")
    for phrase, score in kws.items():
        print(f"\t{phrase:25}: {round(score, 4)}")
    print()

-Keywords extracted from doc3 with [TextRank]: 
	empire                   : 3.125
	settlements              : 1.85
	major                    : 1.85
	asia                     : 1.85
	western                  : 1.85

-Keywords extracted from doc3 with [PositionalRank]: 
	world empire             : 3.7583
	achaemenid empire        : 3.625
	roman empire             : 3.475
	byzantine empire         : 3.375
	empire                   : 3.125

-Keywords extracted from doc3 with [Frake]: 
	achaemenid empire bc global: 27.87
	iran bc                  : 22.49
	iran known area          : 18.47
	empire                   : 12.81
	iran                     : 12.51



NetworkX version 3.0.
  M = google_matrix(


<h2>2. Statistics-based methods</h2>

In [7]:
# TF-IDF is constructed from the documents under ./sample_docs/ (relative to the notebook's working directory).
tfidf = stats.TFIDF(path_to_docs="./sample_docs/")
# Yake takes the number of keyphrases at construction time rather than per extract() call.
yake = stats.Yake(top=5)

reading documents ...


100%|██████████████████████████████████████████████████████████████████████████████████| 30/30 [00:16<00:00,  1.79it/s]


In [8]:
# The statistical extractors are given the raw deep-learning text (no graph-style preprocessing step).
kws_4_1 = tfidf.extract(deeplearning, top=5)
kws_5_1 = yake.extract(deeplearning)

# Report each method's keyphrases with scores rounded to four decimals.
for method, kws in (("TF-IDF", kws_4_1), ("Yake", kws_5_1)):
    print(f"-Keywords extracted from doc1 with [{method}]: ")
    for phrase, score in kws.items():
        print(f"\t{phrase:25}: {round(score, 4)}")
    print()

-Keywords extracted from doc1 with [TF-IDF]: 
	neural networks deep     : 1.0935
	deep neural networks     : 1.0935
	supervised deep learning : 1.0242
	learning learning        : 0.9601
	learning recurrent neural: 0.9558

-Keywords extracted from doc1 with [Yake]: 
	artificial neural networks: 0.0003
	specifically artificial neural: 0.0006
	recurrent neural networks: 0.0008
	convolutional neural networks: 0.0008
	neural networks tend     : 0.0008



In [9]:
# The statistical extractors are given the raw NLP text (no graph-style preprocessing step).
# The TF-IDF result is stored in kws_4_2 (method 4, document 2) so that it no longer overwrites
# kws_4_1, which holds the TF-IDF keyphrases of doc1 from the previous cell.
kws_4_2 = tfidf.extract(nlp, top=5)
kws_5_2 = yake.extract(nlp)

# Report each method's keyphrases with scores rounded to four decimals.
for kws, method in zip([kws_4_2, kws_5_2], ["TF-IDF", "Yake"]):
    print(f"-Keywords extracted from doc2 with [{method}]: ")
    for k, s in kws.items():
        print(f"\t{k:25}: {round(s, 4)}")
    print()

-Keywords extracted from doc2 with [TF-IDF]: 
	natural language processing: 0.7559
	natural language         : 0.642
	language processing      : 0.5695
	human language           : 0.558
	documents                : 0.5137

-Keywords extracted from doc2 with [Yake]: 
	natural language data    : 0.0006
	artificial intelligence concerned: 0.0007
	analyze large amounts    : 0.0007
	accurately extract information: 0.0007
	natural language processing: 0.0008



In [10]:
# The statistical extractors are given the raw Iran-history text (no graph-style preprocessing step).
kws_4_3 = tfidf.extract(iran, top=5)
kws_5_3 = yake.extract(iran)

# Report each method's keyphrases with scores rounded to four decimals.
for method, kws in (("TF-IDF", kws_4_3), ("Yake", kws_5_3)):
    print(f"-Keywords extracted from doc3 with [{method}]: ")
    for phrase, score in kws.items():
        print(f"\t{phrase:25}: {round(score, 4)}")
    print()

-Keywords extracted from doc3 with [TF-IDF]: 
	western world            : 0.558
	south western            : 0.463
	million people           : 0.342
	historical people        : 0.342
	south west               : 0.3248

-Keywords extracted from doc3 with [Yake]: 
	greater iran comprising  : 0.0002
	medes unified iran       : 0.0002
	successively governed iran: 0.0002
	worlds oldest continuous : 0.0002
	true global superpower   : 0.0003



  tri_ = pd.Series(tri_).sort_values(ascending=False).iloc[:2*2]


<h2>3. Transformer-based method</h2>

In [11]:
# Sentence-transformer keyphrase extractor built with its default settings; reused for all three documents.
transformer = nn.KPESentenceTransformer()

loading model: distilbert-base-nli-mean-tokens


In [12]:
# Extract keyphrases from the raw deep-learning text with the transformer-based extractor.
kws_6_1 = transformer.extract(deeplearning)

# The result is iterated as a plain sequence of phrases, so only the phrases are printed (no scores).
for method, kws in (("Transformer", kws_6_1),):
    print(f"-Keywords extracted from doc1 with [{method}]: ")
    for phrase in kws:
        print(f"\t{phrase}")
    print()

-Keywords extracted from doc1 with [Transformer]: 
	deep learning
	machine translation
	learning architectures
	deep neural
	bioinformatics drug



In [13]:
# Extract keyphrases from the raw NLP text (document 2) with the transformer-based extractor.
kws_6_2 = transformer.extract(nlp)

# The result is iterated as a plain sequence of phrases, so only the phrases are printed (no scores).
# The header now says "doc2": it previously said "doc1", a copy-paste slip that mislabelled the output.
for kws, method in zip([kws_6_2], ["Transformer"]):
    print(f"-Keywords extracted from doc2 with [{method}]: ")
    for k in kws:
        print(f"\t{k}")
    print()

-Keywords extracted from doc1 with [Transformer]: 
	computer capable
	information insights
	subfield linguistics
	linguistics
	computer science



In [14]:
# Extract keyphrases from the raw Iran-history text (document 3) with the transformer-based extractor.
kws_6_3 = transformer.extract(iran)

# The result is iterated as a plain sequence of phrases, so only the phrases are printed (no scores).
# The header now says "doc3": it previously said "doc1", a copy-paste slip that mislabelled the output.
for kws, method in zip([kws_6_3], ["Transformer"]):
    print(f"-Keywords extracted from doc3 with [{method}]: ")
    for k in kws:
        print(f"\t{k}")
    print()

-Keywords extracted from doc1 with [Transformer]: 
	western iranian
	world iran
	largest empire
	civilizations historical
	north persian

