# **Load Papers from Scopus**

In [None]:
# Mount Google Drive (Colab only) so the input CSV files are reachable from this runtime.
from google.colab import drive
drive.mount("/content/drive")

# Work inside the Data folder; the relative file names used by later cells resolve against it.
%cd /content/drive/MyDrive/Data
%ls

# Requirement: scopus.csv (exported Scopus search results) and languages.csv (Joshi low-resource language classes) must be present in the Google Drive Data folder.

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
/content/drive/MyDrive/Data
abstract-review.csv  dataset.csv          languages.csv        languages-included.csv  papers.csv
acl-anthology.tsv    dataset-updated.csv  languages-final.csv  methodology.csv         scopus.csv


In [None]:
import pandas as pd

# Read the exported Scopus results and keep only papers published after 2014 (the past 10 years).
scopus_df = pd.read_csv('scopus.csv').loc[lambda papers: papers['year'] > 2014]
print('scopus papers count:', len(scopus_df))
scopus_df

scopus papers count: 964


Unnamed: 0,database,authors,title,year,abstract,author keywords,index keywords
0,scopus,"Behrooznia, Soheila (59342262200); Ansari, Ebr...",Enhancing Turkish Word Segmentation: A Focus o...,2024,This study addresses a challenge in morphologi...,,Translation (languages); 'current; Its efficie...
1,scopus,"Abdel-Salam, Reem (57222078275)",rematchka at ArabicNLU2024: Evaluating Large L...,2024,Natural Language Understanding (NLU) plays a v...,,Computational linguistics; Natural language pr...
2,scopus,"Jia, Yangji (56272028100); La, Maoji (59249059...",Multifeature BiLSTM-CRF model for Tibetan pers...,2024,Person name is an important component of Tibet...,BiLSTM-CRF model; component; named entity; tib...,Character recognition; Image processing; Knowl...
3,scopus,"Meyer, Francois (57207620101); Buys, Jan (5690...",A Systematic Analysis of Subwords and Cross-Li...,2024,Multilingual modelling can improve machine tra...,,Computational linguistics; Translation (langua...
4,scopus,"Yamin, Muh (57520111900); Sarno, Riyanarto (53...",Enhancing machine translation: syntax and sema...,2024,This research aimed at constructing an effecti...,hybrid MT; Machine translation; RBMT; Semantic...,
...,...,...,...,...,...,...,...
994,scopus,"Kartbayev, Amandyk (56875410200)",SMT: A case study of Kazakh-English word align...,2015,"In this paper, we present results from a set o...",Kazakh morphology; Machine translation; Word a...,Computational linguistics; Finite state transd...
995,scopus,"Brychcín, Tomáš (36633857400); Konopík, Milosl...",Latent semantics in language models,2015,This paper investigates three different source...,COALS; HAL; HPS; Language models; Latent Diric...,Coal; Computational linguistics; Computer aide...
996,scopus,"Ranta, Aarne (6603776726); Tian, Yan (55480203...","Chinese in the grammatical framework: Grammar,...",2015,Grammatical Framework (GF) is a grammar formal...,,Application programs; Computational linguistic...
997,scopus,"Kartbayev, Amandyk (56875410200)",Refining Kazakh word alignment using simulatio...,2015,Word alignment play an important role in the t...,Kazakh morphology; Machine translation; Optimi...,Computational linguistics; Computer aided lang...


# **Get Papers from ACL Anthology**

In [None]:
pip install acl-anthology # running this cell will cause the session to restart



In [None]:
# Load the ACL Anthology metadata through the acl-anthology package installed above.
# NOTE(review): from_repo() presumably fetches the Anthology data repository on first use — confirm in the acl-anthology docs.
from acl_anthology import Anthology
anthology = Anthology.from_repo()

Output()

In [None]:
import csv

# Morphology-related stems a paper must mention (substring match on lowercased text).
# NOTE(review): the later "Morphology" cell also searches for 'grammar-based'; confirm whether it belongs here too.
search_words = ['morpholog', 'morphem', 'inflect', 'grammar based', 'fusional', 'agglutinat', 'polysynthetic']


def _is_relevant(text, search_words):
  """Return True when the lowercased text mentions 'translation', 'language' and at least one search stem."""
  return ('translation' in text and 'language' in text
          and any(word in text for word in search_words))


# One row per matching paper, header first. Rows are kept as lists (not a hand-built
# string) so that csv.writer can escape the fields when the file is written.
acl_papers = [['database', 'authors', 'title', 'year', 'abstract']]

for paper in anthology.papers():
  title = str(paper.title).lower()
  abstract = str(paper.abstract).lower()
  # A paper qualifies when either its title or its abstract satisfies the whole condition on its own.
  if _is_relevant(title, search_words) or _is_relevant(abstract, search_words):
    authors = [f"{author.name.last}, {author.name.first}" for author in paper.authors]
    acl_papers.append(['acl-anthology', str(authors), str(paper.title), str(paper.parent.year), str(paper.abstract)])

# csv.writer quotes any field containing a tab, newline or double quote, so such
# titles/abstracts no longer shift columns or split rows when the file is read back.
# newline='' lets the writer control line endings; '\n' matches the previous file format.
with open('acl-anthology.tsv', 'w', encoding='utf-8', newline='') as f:
  csv.writer(f, delimiter='\t', lineterminator='\n').writerows(acl_papers)

In [None]:
import pandas as pd

# Reload the saved ACL Anthology matches and keep only papers published after 2014 (the past 10 years).
acl_df = pd.read_csv('acl-anthology.tsv', sep='\t').loc[lambda papers: papers['year'] > 2014]
print('acl-anthology papers count:', len(acl_df))
acl_df

acl-anthology papers count: 264


Unnamed: 0,database,authors,title,year,abstract
54,acl-anthology,"['Popovic, Maja', 'Arcan, Mihael']",Identifying main obstacles for statistical mac...,2015,
55,acl-anthology,"['Burlot, Franck', 'Yvon, François']",Morphology-aware alignments for translation to...,2015,
56,acl-anthology,"['Hewitt, John', 'Post, Matt', 'Yarowsky, David']",Automatic Construction of Morphologically Moti...,2016,Statistical Machine Translation (SMT) of highl...
57,acl-anthology,"['Burlot, Franck', 'Knyazeva, Elena', 'Lavergn...",Two-Step MT: Predicting Target Morphology,2016,This paper describes a two-step machine transl...
58,acl-anthology,"['Burlot, Franck', 'Labeau, Matthieu', 'Knyaze...",LIMSI@IWSLT’16: MT Track,2016,This paper describes LIMSI’s submission to the...
...,...,...,...,...,...
354,acl-anthology,"['Durgar El-Kahlout, İlknur', 'Bektaş, Emre', ...",Translating Between Morphologically Rich Langu...,2019,This paper introduces the work on building a m...
355,acl-anthology,"['Liu, Zihan', 'Xu, Yan', 'Winata, Genta Indra...",Incorporating Word and Subword Units in Unsupe...,2019,This paper describes CAiRE’s submission to the...
356,acl-anthology,"['Sánchez-Cartagena, Víctor M.', 'Pérez-Ortiz,...",The Universitat d’Alacant Submissions to the E...,2019,This paper describes the two submissions of Un...
357,acl-anthology,"['Toral, Antonio', 'Edman, Lukas', 'Yeshmagamb...",Neural Machine Translation for English–Kazakh ...,2019,This paper presents the systems submitted by t...


# **Combine Results and Remove Duplicates**

In [None]:
pip install Unidecode



In [None]:
from unidecode import unidecode


def _normalize_cell(value):
  """Return the cell as a lowercased, unidecode-transliterated string with apostrophes removed."""
  return unidecode(str(value).lower()).replace("'", '')


# Stack both sources (Scopus first, so its row wins when a title occurs in both) and
# normalize every cell so that titles differing only in case/accents/apostrophes compare equal.
# Note that every column, including 'year', holds strings after this step.
duplicates_df = pd.concat([scopus_df, acl_df]).fillna('')
duplicates_df = duplicates_df.map(_normalize_cell)

# .copy() makes combined_df an independent frame; without it the column assignments in
# later cells raise pandas' SettingWithCopyWarning.
combined_df = duplicates_df.drop_duplicates(subset=['title'], keep='first', ignore_index=True).copy()

print('scopus paper count:', len(scopus_df))
print('acl-anthology paper count:', len(acl_df))
print('duplicates count:', len(duplicates_df) - len(combined_df))
print('total paper count:', len(combined_df))
combined_df

scopus paper count: 964
acl-anthology paper count: 264
duplicates count: 157
total paper count: 1071


Unnamed: 0,database,authors,title,year,abstract,author keywords,index keywords
0,scopus,"behrooznia, soheila (59342262200); ansari, ebr...",enhancing turkish word segmentation: a focus o...,2024,this study addresses a challenge in morphologi...,,translation (languages); current; its efficien...
1,scopus,"abdel-salam, reem (57222078275)",rematchka at arabicnlu2024: evaluating large l...,2024,natural language understanding (nlu) plays a v...,,computational linguistics; natural language pr...
2,scopus,"jia, yangji (56272028100); la, maoji (59249059...",multifeature bilstm-crf model for tibetan pers...,2024,person name is an important component of tibet...,bilstm-crf model; component; named entity; tib...,character recognition; image processing; knowl...
3,scopus,"meyer, francois (57207620101); buys, jan (5690...",a systematic analysis of subwords and cross-li...,2024,multilingual modelling can improve machine tra...,,computational linguistics; translation (langua...
4,scopus,"yamin, muh (57520111900); sarno, riyanarto (53...",enhancing machine translation: syntax and sema...,2024,this research aimed at constructing an effecti...,hybrid mt; machine translation; rbmt; semantic...,
...,...,...,...,...,...,...,...
1066,acl-anthology,"[lohar, pintu, popovic, maja, way, andy]",building english-to-serbian machine translatio...,2019,this paper reports the results of the first ex...,,
1067,acl-anthology,"[moryossef, amit, aharoni, roee, goldberg, yoav]",filling gender & number gaps in neural machine...,2019,when translating from a language that does not...,,
1068,acl-anthology,"[habash, nizar, bouamor, houda, chung, christine]",automatic gender identification and reinflecti...,2019,the impressive progress in many natural langua...,,
1069,acl-anthology,"[aiken, brad, kelly, jared, palmer, alexis, po...",sigmorphon 2019 task 2 system description pape...,2019,this paper presents the unt hilt+ling system f...,,


# **Identify "Low Resource" Search Terms**

In [None]:
# Phrases searched for (as substrings) to flag a paper as "low resource".
search_words = [
    'low-resource',
    'low resource',
    'limited resource',
    'less resource',
    'resource scarce',
    'scarcity',
    'endangered',
]

def find_search_words(row, search_words):
  """Return the search words that occur in the row's text columns, joined by ', '.

  The title, abstract, author keywords and index keywords are scanned in that
  order with a case-insensitive substring test. Each word is reported once, in
  the order it was first found.
  """
  matched = {}  # dict keys keep first-seen order and ignore repeats
  for field in ('title', 'abstract', 'author keywords', 'index keywords'):
    for term in search_words:
      if term.lower() in row[field].lower():
        matched.setdefault(term)
  return ', '.join(matched)

# Tag every paper with the "low resource" phrases found in its title, abstract or keywords.
combined_df['low resource'] = combined_df.apply(find_search_words, axis=1, search_words=search_words)
print('papers identified as "low resource":', combined_df['low resource'].str.len().ne(0).sum())
combined_df

papers identified as "low resource": 256


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  combined_df['low resource'] = combined_df.apply(lambda row: find_search_words(row, search_words), axis=1)


Unnamed: 0,database,authors,title,year,abstract,author keywords,index keywords,low resource
0,scopus,"behrooznia, soheila (59342262200); ansari, ebr...",enhancing turkish word segmentation: a focus o...,2024,this study addresses a challenge in morphologi...,,translation (languages); current; its efficien...,
1,scopus,"abdel-salam, reem (57222078275)",rematchka at arabicnlu2024: evaluating large l...,2024,natural language understanding (nlu) plays a v...,,computational linguistics; natural language pr...,
2,scopus,"jia, yangji (56272028100); la, maoji (59249059...",multifeature bilstm-crf model for tibetan pers...,2024,person name is an important component of tibet...,bilstm-crf model; component; named entity; tib...,character recognition; image processing; knowl...,
3,scopus,"meyer, francois (57207620101); buys, jan (5690...",a systematic analysis of subwords and cross-li...,2024,multilingual modelling can improve machine tra...,,computational linguistics; translation (langua...,"low-resource, low resource"
4,scopus,"yamin, muh (57520111900); sarno, riyanarto (53...",enhancing machine translation: syntax and sema...,2024,this research aimed at constructing an effecti...,hybrid mt; machine translation; rbmt; semantic...,,
...,...,...,...,...,...,...,...,...
1066,acl-anthology,"[lohar, pintu, popovic, maja, way, andy]",building english-to-serbian machine translatio...,2019,this paper reports the results of the first ex...,,,low-resource
1067,acl-anthology,"[moryossef, amit, aharoni, roee, goldberg, yoav]",filling gender & number gaps in neural machine...,2019,when translating from a language that does not...,,,
1068,acl-anthology,"[habash, nizar, bouamor, houda, chung, christine]",automatic gender identification and reinflecti...,2019,the impressive progress in many natural langua...,,,
1069,acl-anthology,"[aiken, brad, kelly, jared, palmer, alexis, po...",sigmorphon 2019 task 2 system description pape...,2019,this paper presents the unt hilt+ling system f...,,,


# **Identify "Morphology" Search Terms**

In [None]:
# Stems and phrases searched for (as substrings) to record which morphology terms a paper uses.
search_words = [
    'morpholog',
    'morphem',
    'inflect',
    'grammar based',
    'grammar-based',
    'fusional',
    'agglutinat',
    'polysynthetic',
]

# Same helper as in the "Low Resource" section; it is defined again here so this cell runs on its own.
def find_search_words(row, search_words):
  """Return the search words that occur in the row's text columns, joined by ', '.

  The title, abstract, author keywords and index keywords are scanned in that
  order with a case-insensitive substring test. Each word is reported once, in
  the order it was first found.
  """
  matched = {}  # dict keys keep first-seen order and ignore repeats
  for field in ('title', 'abstract', 'author keywords', 'index keywords'):
    for term in search_words:
      if term.lower() in row[field].lower():
        matched.setdefault(term)
  return ', '.join(matched)

# Record which morphology terms each paper mentions in its title, abstract or keywords.
combined_df['morphology'] = combined_df.apply(find_search_words, axis=1, search_words=search_words)
combined_df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  combined_df['morphology'] = combined_df.apply(lambda row: find_search_words(row, search_words), axis=1)


Unnamed: 0,database,authors,title,year,abstract,author keywords,index keywords,low resource,morphology
0,scopus,"behrooznia, soheila (59342262200); ansari, ebr...",enhancing turkish word segmentation: a focus o...,2024,this study addresses a challenge in morphologi...,,translation (languages); current; its efficien...,,"morphem, morpholog"
1,scopus,"abdel-salam, reem (57222078275)",rematchka at arabicnlu2024: evaluating large l...,2024,natural language understanding (nlu) plays a v...,,computational linguistics; natural language pr...,,morpholog
2,scopus,"jia, yangji (56272028100); la, maoji (59249059...",multifeature bilstm-crf model for tibetan pers...,2024,person name is an important component of tibet...,bilstm-crf model; component; named entity; tib...,character recognition; image processing; knowl...,,morpholog
3,scopus,"meyer, francois (57207620101); buys, jan (5690...",a systematic analysis of subwords and cross-li...,2024,multilingual modelling can improve machine tra...,,computational linguistics; translation (langua...,"low-resource, low resource",morpholog
4,scopus,"yamin, muh (57520111900); sarno, riyanarto (53...",enhancing machine translation: syntax and sema...,2024,this research aimed at constructing an effecti...,hybrid mt; machine translation; rbmt; semantic...,,,morpholog
...,...,...,...,...,...,...,...,...,...
1066,acl-anthology,"[lohar, pintu, popovic, maja, way, andy]",building english-to-serbian machine translatio...,2019,this paper reports the results of the first ex...,,,low-resource,morpholog
1067,acl-anthology,"[moryossef, amit, aharoni, roee, goldberg, yoav]",filling gender & number gaps in neural machine...,2019,when translating from a language that does not...,,,,morpholog
1068,acl-anthology,"[habash, nizar, bouamor, houda, chung, christine]",automatic gender identification and reinflecti...,2019,the impressive progress in many natural langua...,,,,"inflect, morpholog"
1069,acl-anthology,"[aiken, brad, kelly, jared, palmer, alexis, po...",sigmorphon 2019 task 2 system description pape...,2019,this paper presents the unt hilt+ling system f...,,,,"morpholog, inflect"


# **Identify Low Resource Language Classes**

In [None]:
# languages.csv is read without a header row: first column is the language name, second its class.
classes_df = pd.read_csv('languages.csv', names=['language', 'class'])
# Normalize the names: unidecode transliteration, apostrophes removed, hyphens turned into spaces.
classes_df['language'] = [
    unidecode(name).replace("'", '').replace('-', ' ')
    for name in classes_df['language']
]

print('\nlanguage counts per class:')
for lr_class in range(6):
  # One line per class 0-5: the number of languages assigned to it.
  print((classes_df['class'] == lr_class).sum())
print(classes_df)


language counts per class:
2191
226
19
28
18
7
         language  class
0           kasim      0
1          mapoyo      0
2         yamdena      0
3       rikbaktsa      0
4     belorussian      0
...           ...    ...
2484       german      5
2485     japanese      5
2486       french      5
2487       arabic      5
2488     mandarin      5

[2489 rows x 2 columns]


In [None]:
import string

# Language name -> class, as loaded from languages.csv.
classes = dict(zip(classes_df['language'], classes_df['class']))

# Deletion table for ASCII punctuation; built once because it is the same for every abstract.
_PUNCTUATION = str.maketrans('', '', string.punctuation)


def _language_patterns(classes):
  """Pair each language's space-padded search pattern with its 'name (class)' label."""
  return [(' ' + ' '.join(language.lower().split()) + ' ', f'{language} ({lr_class})')
          for language, lr_class in classes.items()]


def _detect_languages(abstract, patterns):
  """Return 'name (class), ...' for every language that occurs as whole word(s) in the abstract."""
  # Hyphens must become spaces BEFORE punctuation is deleted: string.punctuation
  # contains '-', so doing it afterwards glued "english-kazakh" into "englishkazakh"
  # and neither language was found.
  words = abstract.lower().replace('-', ' ').translate(_PUNCTUATION).split()
  # Single spaces between words plus padding at both ends give every word a space on each side.
  padded = ' ' + ' '.join(words) + ' '
  return ', '.join(label for pattern, label in patterns if pattern in padded)


_patterns = _language_patterns(classes)

# (row position, detected languages) for every paper; the position is the merge key used later.
languages = [(index, _detect_languages(abstract, _patterns))
             for index, abstract in enumerate(combined_df['abstract'].tolist())]

languages_df = pd.DataFrame(languages, columns=['index', 'languages'])
languages_df

Unnamed: 0,index,languages
0,0,turkish (4)
1,1,arabic (5)
2,2,tibetan (1)
3,3,
4,4,"indonesian (3), english (5)"
...,...,...
1066,1066,"serbian (4), english (5)"
1067,1067,"hebrew (3), english (5)"
1068,1068,"english (5), arabic (5)"
1069,1069,


In [None]:
# Expose the row label as a column so it can act as the join key, attach the detected
# languages with a left join, then drop the helper key from the result.
combined_df['index'] = combined_df.index
combined_languages_df = combined_df.merge(languages_df, on='index', how='left').drop(columns='index')
combined_languages_df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  combined_df['index'] = combined_df.index


Unnamed: 0,database,authors,title,year,abstract,author keywords,index keywords,low resource,morphology,languages
0,scopus,"behrooznia, soheila (59342262200); ansari, ebr...",enhancing turkish word segmentation: a focus o...,2024,this study addresses a challenge in morphologi...,,translation (languages); current; its efficien...,,"morphem, morpholog",turkish (4)
1,scopus,"abdel-salam, reem (57222078275)",rematchka at arabicnlu2024: evaluating large l...,2024,natural language understanding (nlu) plays a v...,,computational linguistics; natural language pr...,,morpholog,arabic (5)
2,scopus,"jia, yangji (56272028100); la, maoji (59249059...",multifeature bilstm-crf model for tibetan pers...,2024,person name is an important component of tibet...,bilstm-crf model; component; named entity; tib...,character recognition; image processing; knowl...,,morpholog,tibetan (1)
3,scopus,"meyer, francois (57207620101); buys, jan (5690...",a systematic analysis of subwords and cross-li...,2024,multilingual modelling can improve machine tra...,,computational linguistics; translation (langua...,"low-resource, low resource",morpholog,
4,scopus,"yamin, muh (57520111900); sarno, riyanarto (53...",enhancing machine translation: syntax and sema...,2024,this research aimed at constructing an effecti...,hybrid mt; machine translation; rbmt; semantic...,,,morpholog,"indonesian (3), english (5)"
...,...,...,...,...,...,...,...,...,...,...
1066,acl-anthology,"[lohar, pintu, popovic, maja, way, andy]",building english-to-serbian machine translatio...,2019,this paper reports the results of the first ex...,,,low-resource,morpholog,"serbian (4), english (5)"
1067,acl-anthology,"[moryossef, amit, aharoni, roee, goldberg, yoav]",filling gender & number gaps in neural machine...,2019,when translating from a language that does not...,,,,morpholog,"hebrew (3), english (5)"
1068,acl-anthology,"[habash, nizar, bouamor, houda, chung, christine]",automatic gender identification and reinflecti...,2019,the impressive progress in many natural langua...,,,,"inflect, morpholog","english (5), arabic (5)"
1069,acl-anthology,"[aiken, brad, kelly, jared, palmer, alexis, po...",sigmorphon 2019 task 2 system description pape...,2019,this paper presents the unt hilt+ling system f...,,,,"morpholog, inflect",


# **Keep Only "Low Resource" or Class 0-2 Language Papers**

In [None]:
# Keep a paper when its abstract names a class 0-2 language OR it matched a "low resource" search term.
# The class marker is matched as the literal text "(0)", "(1)" or "(2)": the parentheses are escaped,
# because an unescaped "(0)" is a regex capture group that matches any "0" and makes pandas warn.
mentions_low_class = combined_languages_df['languages'].str.contains(r'\([0-2]\)', regex=True, na=False)
# Built as its own mask so the comparison is not swallowed by the higher-precedence `|`.
has_low_resource_term = combined_languages_df['low resource'].str.len() != 0

filtered_df = combined_languages_df[mentions_low_class | has_low_resource_term]
# NOTE(review): a later cell reads 'dataset-updated.csv' (hyphen), presumably a manually reviewed copy — confirm the two names are intentional.
filtered_df.to_csv('dataset_updated.csv', index=False)
print('dataset papers count:', len(filtered_df))
filtered_df

dataset papers count: 417


  filtered_df = combined_languages_df[combined_languages_df['languages'].str.contains("(0)") | combined_languages_df['languages'].str.contains("(1)") | combined_languages_df['languages'].str.contains("(2)") | combined_languages_df['low resource'].str.len() != 0]


Unnamed: 0,database,authors,title,year,abstract,author keywords,index keywords,low resource,morphology,languages
2,scopus,"jia, yangji (56272028100); la, maoji (59249059...",multifeature bilstm-crf model for tibetan pers...,2024,person name is an important component of tibet...,bilstm-crf model; component; named entity; tib...,character recognition; image processing; knowl...,,morpholog,tibetan (1)
3,scopus,"meyer, francois (57207620101); buys, jan (5690...",a systematic analysis of subwords and cross-li...,2024,multilingual modelling can improve machine tra...,,computational linguistics; translation (langua...,"low-resource, low resource",morpholog,
5,scopus,"akhmetov, iskander (57219837533); aubakirov, s...",machine learning methods for kazakh morphology...,2024,"kazakh is an agglutinative language, where the...",agglutinative languages; deep learning; kazakh...,computational linguistics; computer aided lang...,low-resource,"morpholog, morphem, agglutinat",kazakh (3)
8,scopus,"nouzri, sana (35772848100); el fatimi, meryem ...",beyond chatbots: enhancing luxembourgish langu...,2025,the intersection of artificial intelligence (a...,bpmn; language learning; llms; mas; personaliz...,adversarial machine learning; computer aided l...,low-resource,,luxembourgish (1)
10,scopus,"lucas, agustin (58672141000); baladon, alexis ...",grammar-based data augmentation for low-resour...,2024,one of the main problems low-resource language...,,computational linguistics; computer aided lang...,"low-resource, low resource",grammar-based,"guarani (1), spanish (5)"
...,...,...,...,...,...,...,...,...,...,...
1061,acl-anthology,"[ortega, john, pillaipakkamnatt, krishnan]",using morphemes from agglutinative languages l...,2018,,,,low-resource,"morphem, agglutinat",
1063,acl-anthology,"[teferra abate, solomon, melese, michael, yifi...",parallel corpora for bi-directional statistica...,2018,"in this paper, we describe the development of ...",,,,morpholog,"wolaytta (0), amharic (2)"
1064,acl-anthology,"[micher, jeffrey]",using the nunavut hansard data for experiments...,2018,inuktitut is a polysynthetic language spoken i...,,,,"morpholog, morphem, polysynthetic","inuktitut (1), english (5)"
1065,acl-anthology,"[mager, manuel, mager, elisabeth, medina-urrea...",lost in translation: analysis of information l...,2018,machine translation from polysynthetic to fusi...,,,low-resource,"fusional, polysynthetic, morphem","nahuatl (1), spanish (5)"


In [None]:
# Titles (lowercased) and language annotations of the previous dataset version,
# with the language column renamed so it can sit next to the new 'languages' column.
old_dataset_df = (
    pd.read_csv('dataset.csv')[['title', 'languages']]
    .fillna('')
    .rename(columns={'languages': 'old_languages'})
    .assign(title=lambda papers: papers['title'].map(str.lower))
)
old_dataset_df

# Disabled draft of the follow-up step (left join of old_languages onto filtered_df by title):
# old_new_df = pd.merge(filtered_df, old_dataset_df, left_on=['title'], right_on= ['title'], how='left').drop('index', axis=1)
# combined_languages_df


Unnamed: 0,title,old_languages
0,multifeature bilstm-crf model for tibetan pers...,tibetan (1)
1,a systematic analysis of subwords and cross-li...,
2,machine learning methods for kazakh morphology...,kazakh (3)
3,beyond chatbots: enhancing luxembourgish langu...,luxembourgish (1)
4,grammar-based data augmentation for low-resour...,"guarani (1), spanish (5)"
...,...,...
506,parallel corpora for bi-directional statistica...,"wolaytta (0), amharic (2)"
507,using the nunavut hansard data for experiments...,inuktitut (1)
508,lost in translation: analysis of information l...,"nahuatl (1), spanish (5)"
509,building english-to-serbian machine translatio...,serbian (4)


In [None]:
# NOTE(review): presumably language names that collide with ordinary words or acronyms — confirm.
# The variable must not be called `list`: that shadows the builtin for the rest of the
# kernel session and breaks every later `list(...)` call (e.g. when re-running the classes cell).
ambiguous_names = ['interlingua', 'turkic', 'wu', 'lua', 'aka', 'apache', 'chang', 'ga', 'mono', 'sama', 'mali', 'dla', 'gan', 'bare', 'broken']
print(sorted(name.capitalize() for name in ambiguous_names))

['Aka', 'Apache', 'Bare', 'Broken', 'Chang', 'Dla', 'Ga', 'Gan', 'Interlingua', 'Lua', 'Mali', 'Mono', 'Sama', 'Turkic', 'Wu']


In [None]:
# Load the reviewed dataset and blank out missing cells so the text columns can be searched as strings.
dataset_updated_df = pd.read_csv('dataset-updated.csv')
dataset_updated_df = dataset_updated_df.fillna('')
dataset_updated_df

Unnamed: 0,database,authors,title,year,abstract,author keywords,index keywords,low resource,morphology,languages,...,languages old,status,reviewer 1,reviewer 1 verdict,reviewer 1 reason,reviewer 1 notes,reviewer 2,reviewer 2 verdict,reviewer 2 reason,reviewer 2 notes
0,scopus,"jia, yangji (56272028100); la, maoji (59249059...",multifeature bilstm-crf model for tibetan pers...,2024,person name is an important component of tibet...,bilstm-crf model; component; named entity; tib...,character recognition; image processing; knowl...,,morpholog,tibetan (1),...,tibetan (1),FALSE,Rusty,FALSE,Not Translation,,Murilo,FALSE,,
1,scopus,"meyer, francois (57207620101); buys, jan (5690...",a systematic analysis of subwords and cross-li...,2024,multilingual modelling can improve machine tra...,,computational linguistics; translation (langua...,"low-resource, low resource",morpholog,,...,,TRUE,Agustin,TRUE,,,Ahmad,TRUE,,
2,scopus,"akhmetov, iskander (57219837533); aubakirov, s...",machine learning methods for kazakh morphology...,2024,"kazakh is an agglutinative language, where the...",agglutinative languages; deep learning; kazakh...,computational linguistics; computer aided lang...,low-resource,"morpholog, morphem, agglutinat",kazakh (3),...,kazakh (3),FALSE,Rusty,FALSE,Not Low-Resource,,Murilo,FALSE,Not Translation,it mentions machine translation as one of seve...
3,scopus,"nouzri, sana (35772848100); el fatimi, meryem ...",beyond chatbots: enhancing luxembourgish langu...,2025,the intersection of artificial intelligence (a...,bpmn; language learning; llms; mas; personaliz...,adversarial machine learning; computer aided l...,low-resource,,luxembourgish (1),...,luxembourgish (1),FALSE,Agustin,FALSE,Not Translation,,Ahmad,FALSE,,
4,scopus,"lucas, agustin (58672141000); baladon, alexis ...",grammar-based data augmentation for low-resour...,2024,one of the main problems low resource language...,,computational linguistics; computer aided lang...,"low-resource, low resource",grammar-based,"guarani (1), spanish (5)",...,"guarani (1), spanish (5)",TRUE,Agustin,TRUE,,,Ahmad,TRUE,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
426,acl-anthology,"[ortega, john, pillaipakkamnatt, krishnan]",using morphemes from agglutinative languages l...,2018,,,,low-resource,"morphem, agglutinat",,...,,TRUE,Rusty,TRUE,,,Murilo,TRUE,,
427,acl-anthology,"[teferra abate, solomon, melese, michael, yifi...",parallel corpora for bi-directional statistica...,2018,"in this paper, we describe the development of ...",,,,morpholog,"wolaytta (0), oromo (1), amharic (2), tigrigna...",...,"wolaytta (0), amharic (2)",FALSE,Rusty,FALSE,Duplicate,"tigrigna = tigrinya (2), afaan oromo = oromo (...",Murilo,FALSE,Duplicate,duplicate from previous abstract
428,acl-anthology,"[micher, jeffrey]",using the nunavut hansard data for experiments...,2018,inuktitut is a polysynthetic language spoken i...,,,,"morpholog, morphem, polysynthetic","inuktitut (1), english (5)",...,inuktitut (1),TRUE,Rusty,TRUE,,,Murilo,TRUE,,
429,acl-anthology,"[mager, manuel, mager, elisabeth, medina-urrea...",lost in translation: analysis of information l...,2018,machine translation from polysynthetic to fusi...,,,low-resource,"fusional, polysynthetic, morphem","nahuatl (1), spanish (5), wixarika (0), yorem ...",...,"nahuatl (1), spanish (5)",TRUE,Rusty,TRUE,,,Murilo,TRUE,,


In [None]:
def search(title_abstract_keywords, words, category, categories):
  """Add category to categories when any of words occurs in the given text.

  Args:
    title_abstract_keywords: text to search; callers pass it already lowercased.
    words: substrings to look for (lowercased before the test).
    category: label added to categories on a match.
    categories: set of labels collected so far; updated in place.

  Returns:
    The same categories set, for call chaining.
  """
  if any(word.lower() in title_abstract_keywords for word in words):
    categories.add(category)
  return categories


# Methodology label -> substrings whose presence assigns that label.
_METHODOLOGY_TERMS = [
    ('morphological analyzer', ['analyze', 'analyser']),
    ('subword modeling/segmentation', ['subword', 'sub-word', 'sub word', 'segment']),
    ('formal grammar', ['grammar']),
    ('corpus creation', ['corpus', 'new dataset']),
    ('data augmentation', ['augment']),
    ('sign language', ['sign language']),
]


def find_search_words(row):
  """Return the methodology labels matched by the row's title, abstract or keywords.

  Note: this one-argument function replaces the two-argument helper of the same
  name defined in earlier cells once this cell has run.

  Labels found in any of the four text columns are collected into one set and
  returned as a ', '-joined string in alphabetical order ('' when nothing matches).
  """
  # Created once, outside the column loop: re-creating it per column discarded the
  # matches from title/abstract/author keywords and kept only those of the last column.
  categories = set()
  for column in ['title', 'abstract', 'author keywords', 'index keywords']:
    title_abstract_keywords = row[column].lower()
    for category, words in _METHODOLOGY_TERMS:
      search(title_abstract_keywords, words, category, categories)
  # Sorted so the output does not depend on set iteration order.
  return ', '.join(sorted(categories))

# Label every paper of the reviewed dataset with the methodology categories its text mentions.
dataset_updated_df['methodology'] = dataset_updated_df.apply(find_search_words, axis=1)
dataset_updated_df

Unnamed: 0,database,authors,title,year,abstract,author keywords,index keywords,low resource,morphology,languages,...,status,reviewer 1,reviewer 1 verdict,reviewer 1 reason,reviewer 1 notes,reviewer 2,reviewer 2 verdict,reviewer 2 reason,reviewer 2 notes,methodology
0,scopus,"jia, yangji (56272028100); la, maoji (59249059...",multifeature bilstm-crf model for tibetan pers...,2024,person name is an important component of tibet...,bilstm-crf model; component; named entity; tib...,character recognition; image processing; knowl...,,morpholog,tibetan (1),...,FALSE,Rusty,FALSE,Not Translation,,Murilo,FALSE,,,
1,scopus,"meyer, francois (57207620101); buys, jan (5690...",a systematic analysis of subwords and cross-li...,2024,multilingual modelling can improve machine tra...,,computational linguistics; translation (langua...,"low-resource, low resource",morpholog,,...,TRUE,Agustin,TRUE,,,Ahmad,TRUE,,,subword modeling/segmentation
2,scopus,"akhmetov, iskander (57219837533); aubakirov, s...",machine learning methods for kazakh morphology...,2024,"kazakh is an agglutinative language, where the...",agglutinative languages; deep learning; kazakh...,computational linguistics; computer aided lang...,low-resource,"morpholog, morphem, agglutinat",kazakh (3),...,FALSE,Rusty,FALSE,Not Low-Resource,,Murilo,FALSE,Not Translation,it mentions machine translation as one of seve...,
3,scopus,"nouzri, sana (35772848100); el fatimi, meryem ...",beyond chatbots: enhancing luxembourgish langu...,2025,the intersection of artificial intelligence (a...,bpmn; language learning; llms; mas; personaliz...,adversarial machine learning; computer aided l...,low-resource,,luxembourgish (1),...,FALSE,Agustin,FALSE,Not Translation,,Ahmad,FALSE,,,data augmentation
4,scopus,"lucas, agustin (58672141000); baladon, alexis ...",grammar-based data augmentation for low-resour...,2024,one of the main problems low resource language...,,computational linguistics; computer aided lang...,"low-resource, low resource",grammar-based,"guarani (1), spanish (5)",...,TRUE,Agustin,TRUE,,,Ahmad,TRUE,,,data augmentation
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
426,acl-anthology,"[ortega, john, pillaipakkamnatt, krishnan]",using morphemes from agglutinative languages l...,2018,,,,low-resource,"morphem, agglutinat",,...,TRUE,Rusty,TRUE,,,Murilo,TRUE,,,
427,acl-anthology,"[teferra abate, solomon, melese, michael, yifi...",parallel corpora for bi-directional statistica...,2018,"in this paper, we describe the development of ...",,,,morpholog,"wolaytta (0), oromo (1), amharic (2), tigrigna...",...,FALSE,Rusty,FALSE,Duplicate,"tigrigna = tigrinya (2), afaan oromo = oromo (...",Murilo,FALSE,Duplicate,duplicate from previous abstract,
428,acl-anthology,"[micher, jeffrey]",using the nunavut hansard data for experiments...,2018,inuktitut is a polysynthetic language spoken i...,,,,"morpholog, morphem, polysynthetic","inuktitut (1), english (5)",...,TRUE,Rusty,TRUE,,,Murilo,TRUE,,,
429,acl-anthology,"[mager, manuel, mager, elisabeth, medina-urrea...",lost in translation: analysis of information l...,2018,machine translation from polysynthetic to fusi...,,,low-resource,"fusional, polysynthetic, morphem","nahuatl (1), spanish (5), wixarika (0), yorem ...",...,TRUE,Rusty,TRUE,,,Murilo,TRUE,,,


In [None]:
# Number of papers that received at least one methodology label.
dataset_updated_df.loc[dataset_updated_df['methodology'] != '', 'methodology'].count()

np.int64(111)

In [None]:
# Export title + methodology labels, ordered alphabetically by title.
methodology_df = dataset_updated_df.loc[:, ['title', 'methodology']].sort_values('title')
methodology_df.to_csv('methodology.csv', index=False)