In [17]:
import json
from os.path import join, dirname, abspath

import numpy as np
import pandas  as pd
import spacy
from spacy.lang.pt.stop_words import STOP_WORDS
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from scipy.sparse import csr_matrix

In [3]:
def json_to_pd(r_path):
    """Load a train/test JSON corpus into two pandas DataFrames.

    The file must hold a mapping with the keys ``'train'`` and ``'test'``,
    each one a list of records (dicts) that provide at least the fields
    ``'id'``, ``'txt'`` and ``'risco'``.  Any other field (for instance
    ``'date'``) is ignored, because it is not part of the returned frames.

    Fields are looked up by name, so records may list their keys in any
    order, and an empty split yields an empty frame with the same columns.

    Args:
        r_path: Path of the JSON file to read.

    Returns:
        tuple: ``(json_df_train, json_df_test)``.  Each frame is indexed by
        the record ``'id'`` and has the columns ``'txt'`` (text), ``'risco'``
        (converted with ``pd.to_numeric``) and ``'split'`` (``'train'`` or
        ``'test'``).

    Raises:
        KeyError: If a split or one of the required record fields is missing.
        ValueError: If a ``'risco'`` value cannot be parsed as a number.
    """
    # Context manager: the handle is closed even when parsing fails.
    with open(r_path, 'r') as json_file:
        dic = json.load(json_file)

    json_df_train = _records_to_frame(dic['train'], 'train')
    json_df_test = _records_to_frame(dic['test'], 'test')
    return json_df_train, json_df_test


def _records_to_frame(records, split_name):
    """Build the (txt, risco, split) frame of one split, indexed by record id."""
    frame = pd.DataFrame(
        {
            # Texts are forced to ``str`` (missing values stay ``None``) so the
            # downstream NLP step always receives strings.
            'txt': [None if rec['txt'] is None else str(rec['txt'])
                    for rec in records],
            'risco': [rec['risco'] for rec in records],
            'split': [split_name] * len(records),
        },
        index=[rec['id'] for rec in records],
        columns=['txt', 'risco', 'split'],
    )
    frame['risco'] = pd.to_numeric(frame['risco'])
    return frame
# Input: path to the file and the file name
# Output: HDF/DataFrame with frequency vectors, risco and split

In [4]:
def text_to_tfidf_vectors(filename, path_to_folder= '/media/training/crossvalR/'):
    """Turn the texts of a train/test JSON corpus into TF-IDF matrices.

    The file ``<path_to_folder>/<filename>.json`` is loaded with
    ``json_to_pd``.  Every text is stripped of HTML tags, analysed with the
    spaCy model ``pt_core_news_sm`` and reduced to the normalized form of
    the tokens that

    * are alphabetic and are not stop words,
    * do not belong to a ``PER`` or ``ORG`` entity, and
    * are tagged as proper noun, noun, adverb, adjective or verb.

    Accents are then removed, and a ``TfidfVectorizer`` fitted on the train
    split only is applied to both splits.

    Args:
        filename: Name of the JSON file, without the ``.json`` extension.
        path_to_folder: Folder holding the file (a trailing separator is
            optional).  When empty or ``None``, ``resources/data/`` next to
            this module is used, or under the current working directory when
            ``__file__`` is not defined (as in a notebook).

    Returns:
        tuple: ``(frequency_vectors_train, frequency_vectors_test)``, the
        sparse TF-IDF matrices of the two splits.  No record is dropped, so
        rows follow the order of the records in the JSON file.
    """
    if not path_to_folder:
        try:
            base_dir = abspath(dirname(__file__))
        except NameError:
            # ``__file__`` does not exist inside a notebook kernel.
            base_dir = abspath('.')
        path_to_folder = join(base_dir, 'resources/data/')

    VECTOR_MODEL_NAME = "pt_core_news_sm"
    NLP_SPACY = spacy.load(VECTOR_MODEL_NAME)

    # join(folder, '') appends a separator only when the folder lacks one, so
    # both '/data' and '/data/' resolve to the same file.
    path_to_file = join(path_to_folder, '') + filename + ".json"

    data_df_train, data_df_test = json_to_pd(path_to_file)

    # Create the 'sentencizer' component and register it ahead of the parser.
    # NOTE(review): this looks like the spaCy v2 calling convention; under
    # spaCy v3 add_pipe expects the component name instead -- confirm the
    # targeted version.
    sentencizer = NLP_SPACY.create_pipe('sentencizer')
    try:
        NLP_SPACY.add_pipe(sentencizer, before='parser')
    except ValueError:
        print("Pipe already present.")

    # Stop words: spaCy's and NLTK's Portuguese lists plus a few corpus
    # specific terms.  Built once, it does not depend on the split.
    stopwords_set = set(STOP_WORDS).union(
        stopwords.words('portuguese'),
        ['anos', 'ano', 'dia', 'dias', 'nº', 'n°'])

    processed_train = _preprocess_texts(
        data_df_train['txt'], NLP_SPACY, stopwords_set)
    processed_test = _preprocess_texts(
        data_df_test['txt'], NLP_SPACY, stopwords_set)

    # Best parameters reported by a grid search (CV score=0.535):
    # {'tfidf__norm': 'l2', 'tfidf__smooth_idf': False,
    #  'tfidf__sublinear_tf': False, 'tfidf__use_idf': True,
    #  'vect__max_df': 0.2, 'vect__max_features': None,
    #  'vect__min_df': 0.0006, 'vect__ngram_range': (1, 3)}
    # Only norm, use_idf and sublinear_tf are set explicitly below.
    tfidf_transformer = TfidfVectorizer(
        norm='l2', use_idf=True, sublinear_tf=False)

    # Fit on the train split only, then project the test split onto the same
    # vocabulary and IDF weights.
    frequency_vectors_train = tfidf_transformer.fit_transform(processed_train)
    frequency_vectors_test = tfidf_transformer.transform(processed_test)

    return frequency_vectors_train, frequency_vectors_test


def _preprocess_texts(texts, nlp, stopwords_set):
    """Clean a Series of raw texts into the strings fed to the vectorizer."""
    # regex=True is spelled out: recent pandas versions treat the pattern as
    # a literal string by default, which would leave the HTML tags in place.
    without_html = texts.str.replace(r'<.*?>', '', regex=True)
    # Entries that are not strings come out of ``.str`` as NaN; make sure the
    # NLP pipeline only ever receives strings.
    as_strings = without_html.fillna('').astype(str)

    cleaned = pd.Series(
        [" ".join(_kept_token_norms(nlp(text), stopwords_set))
         for text in as_strings],
        index=texts.index, dtype=object)

    # Collapse leftover runs of whitespace.
    cleaned = cleaned.str.replace(r'\s\s+', " ", regex=True).str.strip()
    # Remove accents and symbols: decompose, then drop every non-ASCII byte.
    return cleaned.str.normalize('NFKD').str.encode(
        'ascii', errors='ignore').str.decode('utf-8')


def _kept_token_norms(doc, stopwords_set):
    """Return the normalized forms of the tokens of ``doc`` that are kept."""
    # Person and organisation names may mislead the model into associating
    # classes with entities unrelated to the categories.
    entity_unwanted_types = {'PER', 'ORG'}
    allowed_pos_set = {"PROPN", "NOUN", "ADV", "ADJ", "VERB"}
    return [
        token.norm_ for token in doc
        if token.is_alpha
        and token.norm_ not in stopwords_set
        and token.ent_type_ not in entity_unwanted_types
        and str(token.pos_) in allowed_pos_set
    ]


In [20]:
def save_sparse_csr(filename, array):
    """Store the components of a CSR matrix in a NumPy ``.npz`` archive.

    The archive holds four entries taken from ``array``: ``data``,
    ``indices``, ``indptr`` and ``shape``.

    Args:
        filename: Target passed to ``np.savez``.
        array: Matrix exposing ``data``, ``indices``, ``indptr`` and
            ``shape`` attributes.
    """
    component_names = ('data', 'indices', 'indptr', 'shape')
    components = {name: getattr(array, name) for name in component_names}
    np.savez(filename, **components)


In [6]:
# Vectorize the 'dic_raw_0_0' corpus; ``tfidf`` is the (train, test) pair of
# TF-IDF matrices returned by text_to_tfidf_vectors.
tfidf = text_to_tfidf_vectors(filename='dic_raw_0_0')

CPU times: user 6min 33s, sys: 1.3 s, total: 6min 35s
Wall time: 6min 35s


In [21]:
# Persist both TF-IDF matrices as .npz archives; the targets are relative, so
# they are resolved against the current working directory.
path_to_file = ""
save_sparse_csr(filename="tfidf_vectors_0_0_train", array=tfidf[0])
save_sparse_csr(filename="tfidf_vectors_0_0_test", array=tfidf[1])