In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
import nltk 
import pandas as pd
from nltk.corpus import stopwords 
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.stem import PorterStemmer

In [4]:
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [5]:
path = '/content/drive/MyDrive'

df = pd.read_csv(path + '/data_cleaned.csv')
df['Text_Summary'] = ''
df['Cleaned_Text_Summary'] = ''

In [6]:
df.head()

Unnamed: 0,Date,Title,Abstract,Keywords,URL,Text,Year,Volume#,Issue#,Month,Title Length,Abstract Length,Text Length,Number of Keywords,Text_Cleaned,Abstract_Cleaned,Keywords_Cleaned,Text_Summary,Cleaned_Text_Summary
0,8 January 2021,A Systematic Literature Review on English and ...,Due to the enormous growth of information and ...,"English Bangla Comparison, Latent Dirichlet Al...",https://thescipub.com/pdf/jcssp.2021.1.18.pdf,Because of the rapid development of Informatio...,2021,17,1,January,67,2773,48221,6,rapid development information technology eg. ...,due enormous growth information technology di...,"english bangla comparison, latent dirichlet al...",,
1,21 January 2021,DAD: A Detailed Arabic Dataset for Online Text...,This paper presents a novel Arabic dataset tha...,"Arabic Dataset, Arabic Benchmark, Arabic Recog...",https://thescipub.com/pdf/jcssp.2021.19.32.pdf,"In the literature, many papers that focus on A...",2021,17,1,January,96,2553,37984,9,literature many paper focus arabic text recog...,paper present novel arabic dataset consider ch...,"arab dataset, arab benchmark, arab recognition...",,
2,20 January 2021,Collision Avoidance Modelling in Airline Traff...,An Air Traffic Controller (ATC) system aims to...,"Air Traffic Control, Collision Avoidance, Conf...",https://thescipub.com/pdf/jcssp.2021.33.43.pdf,Collision avoidance on air traffic becomes ver...,2021,17,1,January,109,3375,30346,4,collision avoidance air traffic become importa...,air traffic controller atc system aim manage...,"air traffic control, collis avoidance, conflic...",,
3,20 January 2021,Fine-Tuned MobileNet Classifier for Classifica...,"This paper proposed an accurate, fast and reli...","Strawberry, Cherry Fruit, Accuracy, MobileNet,...",https://thescipub.com/pdf/jcssp.2021.44.54.pdf,"In recent years, farmers in India eventually l...",2021,17,1,January,87,3283,29159,5,recent year farmer india eventually lose yiel...,paper propose accurate fast reliable strawber...,"strawberry, cherri fruit, accuracy, mobilenet,...",,
4,21 January 2021,A Content Filtering from Spam Posts on Social ...,The system for filtering spam posts on social ...,"Content Filtering, Spam Detection, Multimodal ...",https://thescipub.com/pdf/jcssp.2021.55.66.pdf,Spam is the use of electronic devices to trans...,2021,17,1,January,86,2745,30537,5,spam use electronic device transmit non relev...,system filtering spam post social medium prefe...,"content filtering, spam detection, multimod da...",,


In [7]:
text = df.loc[2].at['Text']
print(text)

Collision avoidance on air traffic becomes very important to be investigated as the Air Traffic Control (ATC) system aims to increase the safety of the airplane passengers. The Collision Avoidance (CA) has been studied in many researches on Detection and Resolution (CDR). The general solution for CA uses three types of manoeuvres on an airplane, namely speed, altitude and angle of direction of flight. The aim of the CDR is to create a standard procedure to help the airplane controller and pilot when action to prevent conflicts is not successful. For example, when two or more airplanes violate the specified safety criteria to maintain the minimum horizontal distance between airplanes by 5 Nm or the minimum vertical distance by 1,000 ft. This criterion is referred to as the Protected Zone (PZ). The proposed solution must be able to maintain a predetermined flight schedule.  One solution to this problem is to consider speed and altitude manoeuvres. The model that uses this solution is cal

In [8]:
def _create_frequency_table(text_string) -> dict:

    stopWords = set(stopwords.words("english"))
    words = word_tokenize(text_string)
    ps = PorterStemmer()

    freqTable = dict()
    for word in words:
        word = ps.stem(word)
        if word in stopWords:
            continue
        if word in freqTable:
            freqTable[word] += 1
        else:
            freqTable[word] = 1

    return freqTable

In [9]:
def _score_sentences(sentences, freqTable) -> dict:
    sentenceValue = dict()

    for sentence in sentences:
        word_count_in_sentence = (len(word_tokenize(sentence)))
        for wordValue in freqTable:
            if wordValue in sentence.lower():
                if sentence[:10] in sentenceValue:
                    sentenceValue[sentence[:10]] += freqTable[wordValue]
                else:
                    sentenceValue[sentence[:10]] = freqTable[wordValue]

        sentenceValue[sentence[:10]] = sentenceValue[sentence[:10]] // word_count_in_sentence

    return sentenceValue

In [10]:

def _find_average_score(sentenceValue) -> int:
    sumValues = 0
    for entry in sentenceValue:
        sumValues += sentenceValue[entry]

    # Average value of a sentence from original text
    average = int(sumValues / len(sentenceValue))

    return average

In [11]:
def _generate_summary(sentences, sentenceValue, threshold):
    sentence_count = 0
    summary = ''

    for sentence in sentences:
        if sentence[:10] in sentenceValue and sentenceValue[sentence[:10]] > (threshold):
            summary += " " + sentence
            sentence_count += 1

    return summary

In [12]:
def summarize(text): 
    # 1 Create the word frequency table
    freq_table = _create_frequency_table(text)

    '''
    We already have a sentence tokenizer, so we just need 
    to run the sent_tokenize() method to create the array of sentences.
    '''

    # 2 Tokenize the sentences
    sentences = sent_tokenize(text)

    # 3 score the sentences
    sentence_scores = _score_sentences(sentences, freq_table)

    # 4 Find the threshold
    threshold = _find_average_score(sentence_scores)

    # 5 Generate the summary
    summary = _generate_summary(sentences, sentence_scores, 1.5 * threshold)

    print(summary)

In [13]:
print(summarize(text))

 Pallottino et al. (2002)  Yudhi Purwananto et al. Alonso-Ayuso et al. Alonso-Ayuso et al. (2011) created groups of airplanes based on altitude levels with 1,000 ft distance between the groups. Alonso-Ayuso et al. CPLEX is also a Mixed-Integer Programming (MIP) solver. Yudhi Purwananto et al. The safe distance is measured in NM (Pallottino et al., 2002). 1a. 2a). 1a) is avoided. 1b). 1c). 1d). 2b). 3a). 3a). (a) (b)  Fig. 4a). 4b). 1d).
None


In [None]:
abstract = df.loc[2].at['Text']

In [None]:
for index, row in df.iterrows(): 
    
    text = row['Text']
    summary = summarize(text)
    df.loc[index, 'Text_Summary'] = summary
    
    text = row['Text_Cleaned']
    summary = summarize(text)
    df.loc[index, 'Cleaned_Text_Summary'] = summary

 That is why both are fair for all topic models (Cheng et al., 2014). Md. 1. 2. From Fig. 4. Latent Dirichlet Allocation (LDA) is the most used method for topic modeling. Fig. Modified LDA 2009  Ramage et al. (2009a; 2009b)  -  2014  Hu et al. (2015)  -  2018 Gao et al. (2018; Alkhodair et al., 2018)  -  2019  Shi et al. (2019)  Hasan et al. (2019) LSA  2017  -  Chowdhury et al. (2018)  -  2019  Uteuov (2019)  - BTM 2014  Cheng et al. (2014) -  2016  Pang et al. (2016)  -  2019  Li et al. (2015) Naïve Bayes 2013  Arora et al. (2016) -  2017  Li et al. (2017)  - PDMM, GPUPDMM 2017  Li et al. (2017)  - Topic Mapping  2019  Shi et al. (2020) HDP  2018  Shovkun et al. (2018)  -  2019  Shi et al. (2019; Bertalan and Ruiz, 2019)  - Fuzzy approach  2018  Karami et al. (2018)  -  2019  Rashid et al. (2016)  - SATM  2015  Quan et al. (2015)  - PTM & SPTM  2016  Zuo et al. (2016)  - PDM  2017  Jiang et al. (2017)  - D ETM  2019  Dieng et al. (2018)  - JST  2015  Lin et al. (2015)  - VSM  2018  -

KeyError: ignored

In [None]:
df.head()

Unnamed: 0,Date,Title,Abstract,Keywords,URL,Text,Year,Volume#,Issue#,Month,Title Length,Abstract Length,Text Length,Number of Keywords,Text_Cleaned,Abstract_Cleaned,Keywords_Cleaned,Text_Summary,Cleaned_Text_Summary
0,8 January 2021,A Systematic Literature Review on English and ...,Due to the enormous growth of information and ...,"English Bangla Comparison, Latent Dirichlet Al...",https://thescipub.com/pdf/jcssp.2021.1.18.pdf,Because of the rapid development of Informatio...,2021,17,1,January,67,2773,48221,6,rapid development information technology eg. ...,due enormous growth information technology di...,"english bangla comparison, latent dirichlet al...",,
1,21 January 2021,DAD: A Detailed Arabic Dataset for Online Text...,This paper presents a novel Arabic dataset tha...,"Arabic Dataset, Arabic Benchmark, Arabic Recog...",https://thescipub.com/pdf/jcssp.2021.19.32.pdf,"In the literature, many papers that focus on A...",2021,17,1,January,96,2553,37984,9,literature many paper focus arabic text recog...,paper present novel arabic dataset consider ch...,"arab dataset, arab benchmark, arab recognition...",,
2,20 January 2021,Collision Avoidance Modelling in Airline Traff...,An Air Traffic Controller (ATC) system aims to...,"Air Traffic Control, Collision Avoidance, Conf...",https://thescipub.com/pdf/jcssp.2021.33.43.pdf,Collision avoidance on air traffic becomes ver...,2021,17,1,January,109,3375,30346,4,collision avoidance air traffic become importa...,air traffic controller atc system aim manage...,"air traffic control, collis avoidance, conflic...",,
3,20 January 2021,Fine-Tuned MobileNet Classifier for Classifica...,"This paper proposed an accurate, fast and reli...","Strawberry, Cherry Fruit, Accuracy, MobileNet,...",https://thescipub.com/pdf/jcssp.2021.44.54.pdf,"In recent years, farmers in India eventually l...",2021,17,1,January,87,3283,29159,5,recent year farmer india eventually lose yiel...,paper propose accurate fast reliable strawber...,"strawberry, cherri fruit, accuracy, mobilenet,...",,
4,21 January 2021,A Content Filtering from Spam Posts on Social ...,The system for filtering spam posts on social ...,"Content Filtering, Spam Detection, Multimodal ...",https://thescipub.com/pdf/jcssp.2021.55.66.pdf,Spam is the use of electronic devices to trans...,2021,17,1,January,86,2745,30537,5,spam use electronic device transmit non relev...,system filtering spam post social medium prefe...,"content filtering, spam detection, multimod da...",,


In [None]:
df.to_csv('data_with_summaries.csv', index = False)

In [None]:
text = df.loc[0].at['Text']
summary = df.loc[0].at['Text_Summary']
print(summary)

None


In [None]:
df.at['0', 'Text_Summary'] = '0'
df.head()

Unnamed: 0,Date,Title,Abstract,Keywords,URL,Text,Year,Volume#,Issue#,Month,Title Length,Abstract Length,Text Length,Number of Keywords,Text_Cleaned,Abstract_Cleaned,Keywords_Cleaned,Text_Summary,Cleaned_Text_Summary
0,8 January 2021,A Systematic Literature Review on English and ...,Due to the enormous growth of information and ...,"English Bangla Comparison, Latent Dirichlet Al...",https://thescipub.com/pdf/jcssp.2021.1.18.pdf,Because of the rapid development of Informatio...,2021.0,17.0,1.0,January,67.0,2773.0,48221.0,6.0,rapid development information technology eg. ...,due enormous growth information technology di...,"english bangla comparison, latent dirichlet al...",,
1,21 January 2021,DAD: A Detailed Arabic Dataset for Online Text...,This paper presents a novel Arabic dataset tha...,"Arabic Dataset, Arabic Benchmark, Arabic Recog...",https://thescipub.com/pdf/jcssp.2021.19.32.pdf,"In the literature, many papers that focus on A...",2021.0,17.0,1.0,January,96.0,2553.0,37984.0,9.0,literature many paper focus arabic text recog...,paper present novel arabic dataset consider ch...,"arab dataset, arab benchmark, arab recognition...",,
2,20 January 2021,Collision Avoidance Modelling in Airline Traff...,An Air Traffic Controller (ATC) system aims to...,"Air Traffic Control, Collision Avoidance, Conf...",https://thescipub.com/pdf/jcssp.2021.33.43.pdf,Collision avoidance on air traffic becomes ver...,2021.0,17.0,1.0,January,109.0,3375.0,30346.0,4.0,collision avoidance air traffic become importa...,air traffic controller atc system aim manage...,"air traffic control, collis avoidance, conflic...",,
3,20 January 2021,Fine-Tuned MobileNet Classifier for Classifica...,"This paper proposed an accurate, fast and reli...","Strawberry, Cherry Fruit, Accuracy, MobileNet,...",https://thescipub.com/pdf/jcssp.2021.44.54.pdf,"In recent years, farmers in India eventually l...",2021.0,17.0,1.0,January,87.0,3283.0,29159.0,5.0,recent year farmer india eventually lose yiel...,paper propose accurate fast reliable strawber...,"strawberry, cherri fruit, accuracy, mobilenet,...",,
4,21 January 2021,A Content Filtering from Spam Posts on Social ...,The system for filtering spam posts on social ...,"Content Filtering, Spam Detection, Multimodal ...",https://thescipub.com/pdf/jcssp.2021.55.66.pdf,Spam is the use of electronic devices to trans...,2021.0,17.0,1.0,January,86.0,2745.0,30537.0,5.0,spam use electronic device transmit non relev...,system filtering spam post social medium prefe...,"content filtering, spam detection, multimod da...",,


In [None]:
abstract = df.loc[2].at['Abstract']

In [None]:
print(abstract)

An Air Traffic Controller (ATC) system aims to manage airline traffic to prevent collision of the airplane, called the Collision Avoidance (CA). The study on CA, called Conflict Detection and Resolution (CDR), becomes more critical as the airline traffic has grown each year significantly. Previous studies used optimization algorithms for CDR and did not involve the presence of cumulonimbus clouds. Many such clouds can be found in tropical regions like in Indonesia. Therefore, involving such clouds in the CDR optimization algorithms will be significant in Indonesia. We developed a CDR-based CA modelling that involves the Cumulonimbus (CB) clouds by considering three airplane maneuvers, i.e., Velocity, angle Turn and Altitude level Change (VTAC). Our optimization algorithm is developed based on a Mixed-Integer Programming (MIP) solver due to its efficiency. This proposed algorithm requires two input data, namely the initial airplane and cloud states input and the flight parameter such as

Google T5 Model 

In [14]:
!pip install transformers

Collecting transformers
  Downloading transformers-4.12.5-py3-none-any.whl (3.1 MB)
[K     |████████████████████████████████| 3.1 MB 37.0 MB/s 
[?25hCollecting tokenizers<0.11,>=0.10.1
  Downloading tokenizers-0.10.3-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (3.3 MB)
[K     |████████████████████████████████| 3.3 MB 58.6 MB/s 
[?25hCollecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 53.6 MB/s 
Collecting sacremoses
  Downloading sacremoses-0.0.46-py3-none-any.whl (895 kB)
[K     |████████████████████████████████| 895 kB 43.0 MB/s 
[?25hCollecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.1.2-py3-none-any.whl (59 kB)
[K     |████████████████████████████████| 59 kB 7.6 MB/s 
Installing collected packages: pyyaml, tokenizers, sacremoses, huggingface-hub, transfo

In [15]:
import torch 
from transformers import AutoTokenizer, AutoModelWithLMHead

In [16]:
tokenizer = AutoTokenizer.from_pretrained('t5-base')
model = AutoModelWithLMHead.from_pretrained('t5-base', return_dict = True)

Downloading:   0%|          | 0.00/1.17k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/773k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.32M [00:00<?, ?B/s]



Downloading:   0%|          | 0.00/850M [00:00<?, ?B/s]

In [17]:
text = df.loc[2].at['Text']
print(text)

Collision avoidance on air traffic becomes very important to be investigated as the Air Traffic Control (ATC) system aims to increase the safety of the airplane passengers. The Collision Avoidance (CA) has been studied in many researches on Detection and Resolution (CDR). The general solution for CA uses three types of manoeuvres on an airplane, namely speed, altitude and angle of direction of flight. The aim of the CDR is to create a standard procedure to help the airplane controller and pilot when action to prevent conflicts is not successful. For example, when two or more airplanes violate the specified safety criteria to maintain the minimum horizontal distance between airplanes by 5 Nm or the minimum vertical distance by 1,000 ft. This criterion is referred to as the Protected Zone (PZ). The proposed solution must be able to maintain a predetermined flight schedule.  One solution to this problem is to consider speed and altitude manoeuvres. The model that uses this solution is cal

In [18]:
input = tokenizer.encode("summarize: " + text, return_tensors='pt', max_length=1024, truncation=True)

In [19]:
output = model.generate(input, max_length=250, min_length=25, length_penalty=5, num_beams=2)

In [20]:
tokenizer.decode(output[0])

'<pad> air traffic control system aims to increase the safety of the airplane passengers. the aim of the CDR is to create a standard procedure to help the airplane controller. the proposed solution is called the Velocity and Altitude Change (VAC) model. it uses a mixed-integer linear optimization (MILO) approach to avoid conflicts.</s>'

In [21]:
import torch 
from transformers import AutoTokenizer, AutoModelWithLMHead

tokenizer = AutoTokenizer.from_pretrained('t5-base')
model = AutoModelWithLMHead.from_pretrained('t5-base', return_dict = True)

def summarize_t5base(text, model, tokenizer):

  #tokenizer = AutoTokenizer.from_pretrained('t5-base')
  #model = AutoModelWithLMHead.from_pretrained('t5-base', return_dict = True)
  input = tokenizer.encode("summarize: " + text, return_tensors='pt', max_length=1024, truncation=True)
  output = model.generate(input, max_length=250, min_length=25, length_penalty=5, num_beams=2)
  final_output = tokenizer.decode(output[0])

  return final_output




In [22]:
text = df.loc[2].at['Text']
output = summarize_t5base(text, model, tokenizer)
print(output)

<pad> air traffic control system aims to increase the safety of the airplane passengers. the aim of the CDR is to create a standard procedure to help the airplane controller. the proposed solution is called the Velocity and Altitude Change (VAC) model. it uses a mixed-integer linear optimization (MILO) approach to avoid conflicts.</s>
