# Clean and feature extraction v3

## Clean text, extract stylometric features and create a new dataset

In [1]:
import pandas as pd

In [2]:
df = pd.read_csv('../data/corpus_spanish.csv')

In [3]:
df.head()

Unnamed: 0,Id,Category,Topic,Source,Headline,Text,Link
0,641,True,Entertainment,Caras,Sofía Castro y Alejandro Peña Pretelini: una i...,Sofía Castro y Alejandro Peña Pretelini: una i...,https://www.caras.com.mx/sofia-castro-alejandr...
1,6,True,Education,Heraldo,Un paso más cerca de hacer los exámenes 'online',Un paso más cerca de hacer los exámenes 'onlin...,https://www.heraldo.es/noticias/suplementos/he...
2,141,True,Science,HUFFPOST,Esto es lo que los científicos realmente piens...,Esto es lo que los científicos realmente piens...,https://www.huffingtonpost.com/entry/scientist...
3,394,True,Politics,El financiero,Inicia impresión de boletas para elección pres...,Inicia impresión de boletas para elección pres...,http://www.elfinanciero.com.mx/elecciones-2018...
4,139,True,Sport,FIFA,A *NUMBER* día del Mundial,A *NUMBER* día del Mundial\nFIFA.com sigue la ...,https://es.fifa.com/worldcup/news/a-1-dia-del-...


In [4]:
df.shape

(971, 7)

In [5]:
df.dtypes

Id           int64
Category    object
Topic       object
Source      object
Headline    object
Text        object
Link        object
dtype: object

In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 971 entries, 0 to 970
Data columns (total 7 columns):
Id          971 non-null int64
Category    971 non-null object
Topic       971 non-null object
Source      971 non-null object
Headline    971 non-null object
Text        971 non-null object
Link        971 non-null object
dtypes: int64(1), object(6)
memory usage: 53.2+ KB


## We are using `spacy`: The NLP *Ruby on Rails* 

[spacy](http://www.spacy.io/) is a library of natural language processing, robust, fast, easy to install and to use. It can be used with other NLP and Deep Learning Libraries.

With its pre-trained models in spanish language, we can operate the typical NLP jobs: Sentences segmentation, tokenization, POS tag, etc...

We are going to use the `es_core_news_lg` pre-trained model to make pos tagging:

In [7]:
import spacy

In [8]:
# we load the pre trained model in spanish language

nlp = spacy.load('es_core_news_lg')

In [19]:
text = df['Text'].iloc[1]
print(text)

Un paso más cerca de hacer los exámenes 'online'
Cerca de *NUMBER* universitarios de seis universidades europeas participan este cuatrimestre en la última fase de pruebas de un programa informático que permitirá certificar la identidad y autoría de los estudiantes cuando realizan actividades 'online' como exámenes, trabajos u otras pruebas.
El proyecto se encuentra en la tercera y última fase de pruebas piloto, en la que se pone a prueba la versión "final" del 'software', que incluye herramientas de reconocimiento facial, de voz y otras que capturan patrones de escritura, detectan plagio y analizan el lenguaje y el estilo de redacción, según sus impulsores.
En dicha fase hay una primera prueba en la que están participando *NUMBER* estudiantes y en la que la Universitat Oberta de Catalunya (UOC) participa con *NUMBER* estudiantes, *NUMBER* profesores y *NUMBER* docentes; y una segunda prueba en la que participarán entre *NUMBER* y *NUMBER* estudiantes. "El balance de las dos pruebas [an

## Clean text for spanish

In [21]:
def text_clean(text):
    
    text = text.replace(r"http\S+", "")
    text = text.replace(r"http", "")
    text = text.replace(r"@\S+", "")
    text = text.replace(r"(?<!\n)\n(?!\n)", " ")
    text = text.lower()
    
    # text processing
    doc = nlp(text)
    
    return doc

In [22]:
doc = text_clean(text)

### We can easily iterate over the sentences list and scroll through the tokens to access their morpho-syntactic information:

In [31]:
import nltk

list_tokens = []
list_pos = []
list_tag = []

for sentence in doc.sents:
    for token in sentence:
        list_tokens.append(token.text)
        list_pos.append(token.pos_)
        list_tag.append(token.tag_)

nltk.Counter(list_tag)

Counter({'DET__Definite=Ind|Gender=Masc|Number=Sing|PronType=Art': 4,
         'NOUN__Gender=Masc|Number=Sing': 21,
         'ADV__Degree=Cmp': 1,
         'ADV': 5,
         'ADP__AdpType=Prep': 56,
         'VERB__VerbForm=Inf': 2,
         'DET__Definite=Def|Gender=Masc|Number=Plur|PronType=Art': 5,
         'NOUN__Gender=Masc|Number=Plur': 13,
         'SYM': 9,
         'PROPN': 81,
         '_SP': 6,
         'PUNCT': 2,
         'NOUN': 11,
         'NUM__NumForm=Digit|NumType=Card': 8,
         'ADJ__Gender=Masc|Number=Plur': 2,
         'NUM__Number=Plur|NumType=Card': 3,
         'NOUN__Gender=Fem|Number=Plur': 14,
         'ADJ__Gender=Fem|Number=Plur': 2,
         'VERB__Mood=Ind|Number=Plur|Person=3|Tense=Pres|VerbForm=Fin': 7,
         'DET__Gender=Masc|Number=Sing|PronType=Dem': 1,
         'DET__Definite=Def|Gender=Fem|Number=Sing|PronType=Art': 23,
         'ADJ__Gender=Fem|Number=Sing|NumType=Ord': 8,
         'NOUN__Gender=Fem|Number=Sing': 19,
         'ADJ__Gender=

In [34]:
import nltk
from nltk import FreqDist
from nltk.corpus import stopwords  
from nltk import word_tokenize, sent_tokenize  
from string import punctuation

from lexical_diversity import lex_div as ld

text = text.replace(r"http\S+", "")
text = text.replace(r"http", "")
text = text.replace(r"@\S+", "")
text = text.replace(r"(?<!\n)\n(?!\n)", " ")
text = text.lower()

doc = nlp(text)

list_tokens = []
list_tag = []
list_pos = []
n_sents = 0

for sentence in doc.sents:
    n_sents += 1
    for token in sentence:
        list_tokens.append(token.text)
        list_tags.append(token.pos_)
        
n_tag = nltk.Counter(list_tag)
n_pos = nltk.Counter(list_pos)
fdist = FreqDist(list_tokens)
        
# complexity features
n_words = len(list_tokens)
avg_word_sentences = (float(n_words) / n_sents)
word_size = sum(len(word) for word in list_tokens) / n_words
unique_words = (len(fdist.hapaxes()) / n_words) * 100
ttr = ld.ttr(list_tokens) * 100
mltd = ld.mtld(list_tokens)

# lexical features
n_quotes = n_tag['PUNCT__PunctType=Quot']
quotes_ratio = (n_quotes / n_words) * 100
propn_ratio = (n_pos['PROPN'] / n_words) * 100 
noun_ratio = (n_pos['NOUN'] / n_words) * 100 
adp_ratio = (n_pos['ADP'] / n_words) * 100
det_ratio = (n_pos['DET'] / n_words) * 100
punct_ratio = (n_pos['PUNCT'] / n_words) * 100 
pron_ratio = (n_pos['PRON'] / n_words) * 100
verb_ratio = (n_pos['VERB'] / n_words) * 100
adv_ratio = (n_pos['ADV'] / n_words) * 100
sym_ratio = (n_tag['SYM'] / n_words) * 100

print(n_words, n_sents, avg_word_sentences, word_size, unique_words, ttr, mltd, n_quotes, quotes_ratio, propn_ratio, noun_ratio, adp_ratio, det_ratio, punct_ratio, 
      pron_ratio, verb_ratio, adv_ratio, sym_ratio)

486 9 54.0 4.255144032921811 32.71604938271605 44.23868312757202 41.28313642062075 0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0


## Apply it to the full corpus with iterrows()

In [37]:
%%time

import itertools
import pandas as pd
import numpy as np

import nltk
import spacy
from nltk import FreqDist
from sklearn.preprocessing import LabelEncoder
from lexical_diversity import lex_div as ld
nlp = spacy.load('es_core_news_lg')

df = pd.read_csv('../data/corpus_spanish.csv')

labelencoder = LabelEncoder()
df['Label'] = labelencoder.fit_transform(df['Category'])

# empty lists and df
df_features = pd.DataFrame()
list_text = []
list_nsentences = []
list_nwords = []
list_words_sent = []
list_word_size = []
list_unique_words = []
list_ttr = []
list_mltd = []
list_nquotes = []
list_quotes_ratio = []
list_propn_ratio = [] 
list_noun_ratio = []
list_adp_ratio = []
list_det_ratio = []
list_punct_ratio = []
list_pron_ratio = []
list_verb_ratio = []
list_adv_ratio = []
list_sym_ratio = []

# df iteration
for n, row in df.iterrows():
    ## text content##   
    text = df['Text'].iloc[n]  
    text = text.replace(r"http\S+", "")
    text = text.replace(r"http", "")
    text = text.replace(r"@\S+", "")
    text = text.replace(r"(?<!\n)\n(?!\n)", " ")
    text = text.lower()
    doc = nlp(text)

    list_tokens = []
    list_pos = []
    list_tag = []
    n_sents = 0

    for sentence in doc.sents:
        n_sents += 1
        for token in sentence:
            list_tokens.append(token.text)
            list_pos.append(token.pos_)
            list_tag.append(token.tag_)
            
    n_pos = nltk.Counter(list_pos)
    n_tag = nltk.Counter(list_tag)
    fdist = FreqDist(list_tokens)

    # complexity features
    n_words = len(list_tokens)
    avg_word_sentences = (float(n_words) / n_sents)
    word_size = sum(len(word) for word in list_tokens) / n_words
    unique_words = (len(fdist.hapaxes()) / n_words) * 100
    ttr = ld.ttr(list_tokens) * 100
    mltd = ld.mtld(list_tokens)

    # lexical features
    n_quotes = n_tag['PUNCT__PunctType=Quot']
    quotes_ratio = (n_quotes / n_words) * 100
    propn_ratio = (n_pos['PROPN'] / n_words) * 100 
    noun_ratio = (n_pos['NOUN'] / n_words) * 100 
    adp_ratio = (n_pos['ADP'] / n_words) * 100
    det_ratio = (n_pos['DET'] / n_words) * 100
    punct_ratio = (n_pos['PUNCT'] / n_words) * 100 
    pron_ratio = (n_pos['PRON'] / n_words) * 100
    verb_ratio = (n_pos['VERB'] / n_words) * 100
    adv_ratio = (n_pos['ADV'] / n_words) * 100
    sym_ratio = (n_tag['SYM'] / n_words) * 100
    
    # appending on lists
    list_text.append(text)
    list_nsentences.append(n_sents)
    list_nwords.append(n_words)
    list_words_sent.append(avg_word_sentences)
    list_word_size.append(word_size)
    list_unique_words.append(unique_words)
    list_ttr.append(ttr)
    list_mltd.append(mltd)
    list_nquotes.append(n_quotes)
    list_quotes_ratio.append(quotes_ratio)
    list_propn_ratio.append(propn_ratio)
    list_noun_ratio.append(noun_ratio)
    list_adp_ratio.append(adp_ratio)
    list_det_ratio.append(det_ratio)
    list_punct_ratio.append(punct_ratio)
    list_pron_ratio.append(pron_ratio)
    list_verb_ratio.append(verb_ratio)
    list_adv_ratio.append(adv_ratio)
    list_sym_ratio.append(sym_ratio)
    
# dataframe
df_features['text'] = list_text
df_features['headline'] = list_headline
df_features['n_sents'] = list_nsentences
df_features['n_words'] = list_nwords
df_features['avg_words_sents'] = list_words_sent
df_features['word_size'] = list_word_size
df_features['unique_words'] = list_unique_words
df_features['ttr'] = list_ttr
df_features['mltd'] = list_mltd
df_features['n_quotes'] = list_nquotes
df_features['quotes_ratio'] = list_quotes_ratio
df_features['propn_ratio'] = list_propn_ratio
df_features['noun_ratio'] = list_noun_ratio
df_features['adp_ratio'] = list_adp_ratio
df_features['det_ratio'] = list_det_ratio
df_features['punct_ratio'] = list_punct_ratio
df_features['pron_ratio'] = list_pron_ratio
df_features['verb_ratio'] = list_verb_ratio
df_features['adv_ratio'] = list_adv_ratio
df_features['sym_ratio'] = list_sym_ratio
df_features['label'] = df['Label']

df_features.to_csv('../data/spanish_corpus_features_v3.csv', encoding = 'utf-8', index = False)

NameError: name 'list_headline' is not defined

In [36]:
df_features.head(10)