In [2]:
import spacy
from spacy.lang.nl.stop_words import STOP_WORDS
from string import punctuation
import re
import pandas as pd
from sklearn.preprocessing import OneHotEncoder
from collections import Counter

In [3]:
nlp = spacy.load('nl_core_news_lg')

In [11]:
def containsNumbers(sentence):
    return len(re.findall('\d', sentence.text))

def avgWordLen(sentence):
    if len(sentence.text.split()) == 0:
        return 0
    return sum(len(word) for word in sentence.text.split()) / len(sentence.text.split())

def isQuote(sentence):
    return len(re.findall('\"\w*\"', sentence.text))

def encodeLabels(ents):
    labels = pd.DataFrame()
    for ent in ents:
        labels.append([ent.label_, ent.text])
    enc = OneHotEncoder(handle_unknown='ignore')
    if len(labels) != 0:
        enc.fit(labels)
        return pd.DataFrame(enc.transform(labels), colums=enc.get_feature_names())
    return []

def getPos(token):
    return token.pos

def getWord(token):
    return token.text

def getParameters(sentences, expectedOutcome):
    params = [dict() for x in range(len(sentences))]
    for idx, sentence in enumerate(sentences):
        params[idx]['length'] = len(sentence)
        params[idx]['containsNumbers'] = containsNumbers(sentence)
        params[idx]['avgWordLen'] = avgWordLen(sentence)
        #params[idx]['originalSentence'] = sentence.text
        #params[idx]['labels'] = encodeLabels(sentence.ents)
        #params[idx]['isQuote'] = isQuote(sentence)
        params[idx]['locationInText'] = idx/len(sentences)
        if expectedOutcome == '0':
            params[idx]['isExpected'] = None
        else:
            if sentence.text in expectedOutcome: 
                params[idx]['isExpected'] = True
            else:
                params[idx]['isExpected'] = False
        
        posTags = map(getPos,sentence)
        counter = Counter(list(posTags))
        for c in counter.most_common():
            params[idx]['pos' + str(c[0])] = c[1]

        #wordFreq = map(getWord,sentence)
        #print(list(wordFreq))
        #counter = Counter(list(wordFreq))
        #for c in counter.most_common():
        #    params[idx]['word' + str(c[0])] = c[1]

    return params


In [12]:
dataset = pd.read_csv('TestData/DataSetArticles.csv')
dataset = dataset.dropna()

params = pd.DataFrame()

for i, article in dataset.iterrows():
    expected = article.EXPECTED_OUTCOME
    #print(i, article)
    docx = nlp(article.INTRO)
    sentence_list = [ sentence for sentence in docx.sents ]
    df = pd.DataFrame(data=getParameters(sentence_list, expected))
    params = params.append(df, ignore_index=True)

    docx = nlp(article.TEXT)
    sentence_list = [ sentence for sentence in docx.sents ]
    df = pd.DataFrame(data=getParameters(sentence_list, expected))
    params = params.append(df, ignore_index=True)


print(params.describe())
params.to_csv('out.csv')

             length  containsNumbers    avgWordLen  isQuote  locationInText  \
count  12643.000000     12643.000000  12643.000000  12643.0    12643.000000   
mean      13.183896         0.392787      4.732445      0.0        0.492486   
std        9.182670         1.431045      2.005446      0.0        0.289200   
min        1.000000         0.000000      0.000000      0.0        0.000000   
25%        6.000000         0.000000      4.250000      0.0        0.243590   
50%       12.000000         0.000000      5.000000      0.0        0.493827   
75%       19.000000         0.000000      5.687500      0.0        0.742915   
max       73.000000        22.000000     22.000000      0.0        0.996516   

             pos92        pos90         pos97        pos86        pos87  ...  \
count  9417.000000  7987.000000  11512.000000  6673.000000  5621.000000  ...   
mean      2.830944     2.111681      1.746265     1.771767     1.330368  ...   
std       1.775436     1.280984      1.098084   

In [6]:
print(params.labels)

AttributeError: 'DataFrame' object has no attribute 'labels'