In [12]:
# !pip install nltk
import nltk
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /home/mateo1/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /home/mateo1/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [60]:
import pandas as pd

from nltk import word_tokenize, sent_tokenize
from nltk.corpus import stopwords

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer


# Data

In [111]:
data = pd.read_csv("data/ACL_data.csv")
data.head()

Unnamed: 0,YEAR,TITLE,ABSTRACT,AUTHOR,PUBLISHER,BOOKTITLE,CATEGORY
0,2022,A Systematic Survey of Text Worlds as Embodied...,Text Worlds are virtual environments for embod...,"Jansen, Peter",Association for Computational Linguistics,Proceedings of the 3rd Wordplay: When Language...,INPROCEEDINGS
1,2022,A Minimal Computational Improviser Based on Or...,A prototype system for playing a minimal impro...,"Montfort, Nick and",Association for Computational Linguistics,Proceedings of the 3rd Wordplay: When Language...,INPROCEEDINGS
2,2022,Craft an Iron Sword: Dynamically Generating In...,Non-Player Characters (NPCs) significantly enh...,"Volum, Ryan and",Association for Computational Linguistics,Proceedings of the 3rd Wordplay: When Language...,INPROCEEDINGS
3,2022,A Sequence Modelling Approach to Question Answ...,Interactive Question Answering (IQA) requires ...,"Furman, Gregory and",Association for Computational Linguistics,Proceedings of the 3rd Wordplay: When Language...,INPROCEEDINGS
4,2022,Automatic Exploration of Textual Environments ...,The purpose of this extended abstract is to di...,"Teodorescu, Laetitia and",Association for Computational Linguistics,Proceedings of the 3rd Wordplay: When Language...,INPROCEEDINGS


In [3]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32352 entries, 0 to 32351
Data columns (total 6 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   YEAR       32352 non-null  int64 
 1   TITLE      32352 non-null  object
 2   ABSTRACT   32246 non-null  object
 3   PUBLISHER  32352 non-null  object
 4   BOOKTITLE  32352 non-null  object
 5   CATEGORY   32352 non-null  object
dtypes: int64(1), object(5)
memory usage: 1.5+ MB


In [4]:
data.YEAR.value_counts()

2020    6574
2021    5774
2019    4315
2018    3363
2022    3054
2017    2495
2016    1778
2010     832
2014     819
2012     762
2008     735
2006     622
2004     201
2011     117
2015     107
2009     104
2002      88
2007      88
2005      88
2000      79
2003      64
2001      61
1998      51
1989      47
1993      43
1995      33
1997      31
1991      27
Name: YEAR, dtype: int64

In [12]:
data.PUBLISHER.value_counts()

Association for Computational Linguistics                                                 21004
European Language Resources Association (ELRA)                                             4147
ATALA                                                                                      1375
European Language Resources Association                                                    1104
The COLING 2016 Organizing Committee                                                        739
International Committee on Computational Linguistics                                        654
INCOMA Ltd.                                                                                 573
Association for Machine Translation in the Americas                                         384
International Committee for Computational Linguistics                                       307
Asian Federation of Natural Language Processing                                             253
Chinese Information Processing Society o

In [13]:
data.CATEGORY.value_counts()

INPROCEEDINGS    32352
Name: CATEGORY, dtype: int64

In [56]:
data.ABSTRACT[:3].values


array(['Text Worlds are virtual environments for embodied agents that, unlike 2D or 3D environments, are rendered exclusively using textual descriptions. These environments offer an alternative to higher-fidelity 3D environments due to their low barrier to entry, providing the ability to study semantics, compositional inference, and other high-level tasks with rich action spaces while controlling for perceptual input. This systematic survey outlines recent developments in tooling, environments, and agent modeling for Text Worlds, while examining recent trends in knowledge graphs, common sense reasoning, transfer learning of Text World performance to higher-fidelity environments, as well as near-term development targets that, once achieved, make Text Worlds an attractive general research paradigm for natural language processing.',
       "A prototype system for playing a minimal improvisational game with one or more human or computer players is discussed. The game, Chain Reaction, has p

## Cleaning and tokenize

In [94]:
# En este caso particular no hace falta limpiar los textos
def clean_text(text):
    if isinstance(text, pd.Series):
        return text.str.strip().str.lower().values
    elif isinstance(text, str):
        return text.strip().lower()

def tokenizer(text):
    #return [w for w in word_tokenize(text) if w.isalpha()] # si solo nos interesan palabras
    return word_tokenize(str(text))
    # if isinstance(text, str):
    #     return word_tokenize(text)
    # else:
    #     return text.apply(word_tokenize)



In [95]:
clean_text(data.ABSTRACT[0])

'text worlds are virtual environments for embodied agents that, unlike 2d or 3d environments, are rendered exclusively using textual descriptions. these environments offer an alternative to higher-fidelity 3d environments due to their low barrier to entry, providing the ability to study semantics, compositional inference, and other high-level tasks with rich action spaces while controlling for perceptual input. this systematic survey outlines recent developments in tooling, environments, and agent modeling for text worlds, while examining recent trends in knowledge graphs, common sense reasoning, transfer learning of text world performance to higher-fidelity environments, as well as near-term development targets that, once achieved, make text worlds an attractive general research paradigm for natural language processing.'

In [96]:
# word_tokenize(clean_text(data.ABSTRACT))
tokenizer(clean_text(data.ABSTRACT))[0:10]

['[',
 "'text",
 'worlds',
 'are',
 'virtual',
 'environments',
 'for',
 'embodied',
 'agents',
 'that']

In [19]:
print( type( clean_text( data.ABSTRACT[0]) ))
print( type(clean_text( data.ABSTRACT[:1]) ) )
print( type(clean_text( data.ABSTRACT) ) )

# tokenizer(clean_text(data.ABSTRACT[:1] ))#[:10]

<class 'str'>
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>


## Normalizacion

In [104]:
from nltk.stem import SnowballStemmer
stemmer = SnowballStemmer("english")


[(w,stemmer.stem(w)) for w in tokenizer(clean_text(data.ABSTRACT)) if w.isalpha()][:10]

[('worlds', 'world'),
 ('are', 'are'),
 ('virtual', 'virtual'),
 ('environments', 'environ'),
 ('for', 'for'),
 ('embodied', 'embodi'),
 ('agents', 'agent'),
 ('that', 'that'),
 ('unlike', 'unlik'),
 ('or', 'or')]

In [108]:
stemmer.stem(clean_text(data.ABSTRACT[0]))

'text worlds are virtual environments for embodied agents that, unlike 2d or 3d environments, are rendered exclusively using textual descriptions. these environments offer an alternative to higher-fidelity 3d environments due to their low barrier to entry, providing the ability to study semantics, compositional inference, and other high-level tasks with rich action spaces while controlling for perceptual input. this systematic survey outlines recent developments in tooling, environments, and agent modeling for text worlds, while examining recent trends in knowledge graphs, common sense reasoning, transfer learning of text world performance to higher-fidelity environments, as well as near-term development targets that, once achieved, make text worlds an attractive general research paradigm for natural language processing.'

In [109]:
clean_text(data.ABSTRACT[0])

'text worlds are virtual environments for embodied agents that, unlike 2d or 3d environments, are rendered exclusively using textual descriptions. these environments offer an alternative to higher-fidelity 3d environments due to their low barrier to entry, providing the ability to study semantics, compositional inference, and other high-level tasks with rich action spaces while controlling for perceptual input. this systematic survey outlines recent developments in tooling, environments, and agent modeling for text worlds, while examining recent trends in knowledge graphs, common sense reasoning, transfer learning of text world performance to higher-fidelity environments, as well as near-term development targets that, once achieved, make text worlds an attractive general research paradigm for natural language processing.'

## Stoplist

In [47]:
stoplist = stopwords.words("english")
stoplist[:10]

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're"]

In [51]:
# tokenizo el stoplist con el mismo tokenizar que voy a usar en el corpus
stoplist_tokenized = []
for w in stoplist:
    stoplist_tokenized = stoplist_tokenized + tokenizer(w)
    
# Armo una lista sin repeticiones
stoplist_tokenized = list(set(stoplist_tokenized))

stoplist_tokenized[:10]

['doing',
 'but',
 'so',
 'until',
 'yours',
 'from',
 'again',
 'mustn',
 'will',
 'against']

In [69]:
[(w,stemmer.stem(w)) for w in clean_text(data.ABSTRACT.iloc[0]) if w.isalpha()][:10]

[('T', 't'),
 ('e', 'e'),
 ('x', 'x'),
 ('t', 't'),
 ('W', 'w'),
 ('o', 'o'),
 ('r', 'r'),
 ('l', 'l'),
 ('d', 'd'),
 ('s', 's')]

## Extraigo features

In [61]:
count_vect = CountVectorizer(preprocessor=clean_text,
                             tokenizer=tokenizer,
                             min_df=5,
                             stop_words=stoplist_tokenized)

data_clean = count_vect.fit_transform(data.ABSTRACT.values.astype('U')) # cuenta frecuencia de tokens y define el diccionario
# X_test = count_vect.transform(X_test_text) # cuenta frecuencia de tokens existentes en el diccionario
data_clean

<32352x19342 sparse matrix of type '<class 'numpy.int64'>'
	with 2334801 stored elements in Compressed Sparse Row format>

Consultar sobre diferencia entre fit_transform y transform

In [63]:
print("tamaño de la matriz:",32352*19342)
print("porcentaje de elementos distintos de cero: %",round(100*2334801/(32352*19342),2))

tamaño de la matriz: 625752384
porcentaje de elementos distintos de cero: % 0.37


In [64]:
count_vect.get_feature_names()[:20]



['!',
 '#',
 '$',
 '%',
 '&',
 "'",
 "''",
 '(',
 ')',
 '*',
 '+',
 '+0.8',
 '+0.9',
 '+1',
 '+1.0',
 '+1.2',
 '+1.3',
 '+1.7',
 '+2.1',
 '+5']