In [15]:
import pandas as pd
import numpy as np
import math
import nltk
import re
import pickle
from discourseMarkers import rule
from  nltk.parse.corenlp import CoreNLPParser
from nltk import word_tokenize
from nltk import pos_tag
from nltk.tokenize import sent_tokenize
from tqdm import tqdm
from resources import lemmatizer, premise_conclusion_markers, english_dictionary, wordnet

# Create and register a new `tqdm` instance with `pandas`
tqdm.pandas()

  from pandas import Panel


In [2]:
data_raw = pd.read_csv("data/data.csv").drop(columns=['URLs'])

In [3]:
data = data_raw

In [4]:
def invert_quotes_fullstop(sentence):
    '''
    Substitute the expression '."' with '".' -> done to help sentence tokenization
    '''
    return sentence.replace('.”', '”.')

In [5]:
def split_sentences(article):
    sentences = []
    if type(article) == str:
        for period in article.split("\n"):
            for sentence in sent_tokenize(period):
                sentences.append(sentence)
    return sentences

In [6]:
def count_future_verbs(sentence):        
    text = word_tokenize(sentence)
    tagged = pos_tag(text)
    return len([word for word in tagged if word[1] in ["VBC", "VBF"]])

In [7]:
def count_premise_conclusion_markers(sentence):        
    counter = 0
    for marker in premise_conclusion_markers:
        if sentence.find(marker) != -1:
            counter += 1
    return counter

In [8]:
def get_wordnet_pos(treebank_tag):
    '''
    Helper function to convert a POS tag from the Penn Treebank to a WordNet format
    :param treebank_tag: Penn Treebank POS tag, string
    :returns: WordNet POS tag
    '''

    if treebank_tag.startswith("J"):
        return wordnet.ADJ
    elif treebank_tag.startswith("V"):
        return wordnet.VERB
    elif treebank_tag.startswith("N"):
        return wordnet.NOUN
    elif treebank_tag.startswith("R"):
        return wordnet.ADV
    else:
        return wordnet.NOUN

In [9]:
def lemmatize_words(words):
    '''
    Function to lemmatize word tokens using the WordNet lemmatizer
    :param words: list of word tokens in the tweet
    :returns: list of lemmatized word tokens
    '''

    lemmatized_words = []
    # Get the Penn Treebank POS tags for the word tokens
    word_pos_tags = pos_tag(words)
    for word, word_pos_tag in word_pos_tags:
        # Get the WordNet POS tag
        word_pos_tag = get_wordnet_pos(word_pos_tag)
        # Use the WordNet POS tag to lemmatize the word into the correct word form
        lemmatized_words.append(lemmatizer.lemmatize(word, word_pos_tag))
    return lemmatized_words

In [10]:
def get_word_tokens(sentence):
    '''
    Function to retrieve only tokens corresponding to real English words
    :param sentence: string corresponding to an English sentence
    :returns: parsed list of tokens with only word tokens
    '''
    tokens = word_tokenize(sentence)
    return [token for token in tokens if re.match(r"^[A-Za-z\']+$", token)]

In [11]:
def parse_tree(sentence):
    ''' Create the parse tree for the given sentence '''
    parser = CoreNLPParser()
    return next(parser.raw_parse(sentence))

In [12]:
def extract_linguisic_features_article(article):
    
    features = dict()
    
    if type(article) != str:
        features['number_of_future_verbs'] = float('nan')
        features['number_of_premise_conclusion_markers'] = float('nan')
        features['number_of_sentences'] = float('nan')
        features['average_sentence_lenght'] = float('nan')
        features['number_of_words'] = float('nan')
        features['average_word_lenght'] = float('nan')
        features['dictionary_percentage'] = float('nan')
        return features
    
    clean_article = invert_quotes_fullstop(article)
    
    sentences = split_sentences(clean_article)
    words = get_word_tokens(clean_article)
    
    # Compute the number of verbs at future tense
    features['number_of_future_verbs'] = count_future_verbs(clean_article)
    
    # Compute the number of premise and conclusion markers
    features['number_of_premise_conclusion_markers'] = count_premise_conclusion_markers(clean_article)
    
    # Compute the total number of sentences
    features['number_of_sentences'] = len(sentences)
    
    # Compute the average lenght of article's sentences
    features['average_sentence_lenght'] = np.mean([len(s.split()) for s in sentences])
    
    # Compute the number of words in the article
    num_words = len(words)
    features['number_of_words'] = num_words
    
    # Compute the average lenght of the words in the article
    features['average_word_lenght'] = np.mean([len(w) for w in words])
    
    # Compute the proportion of words belonging to the dictionary
    dictionary_words = [word for word in lemmatize_words(words) if word in english_dictionary]
    num_dictionary_words = len(dictionary_words)
    dictionary_percentage = num_dictionary_words / num_words if num_words > 0 else 0
    features['dictionary_percentage'] = dictionary_percentage
    
    return features

In [13]:
# Extract features from each article and put them in a dataframe
features_df = data.progress_apply(lambda x: pd.Series(extract_linguisic_features_article(x.Body)), axis=1)

100%|██████████| 4009/4009 [03:52<00:00, 17.24it/s]


In [22]:
features_df

Unnamed: 0,number_of_future_verbs,number_of_premise_conclusion_markers,number_of_sentences,average_sentence_lenght,number_of_words,average_word_lenght,dictionary_percentage
0,0.0,4.0,56.0,18.946429,1081.0,4.719704,0.823312
1,0.0,3.0,21.0,26.285714,538.0,4.657993,0.797398
2,0.0,2.0,54.0,20.111111,1070.0,4.824299,0.844860
3,0.0,3.0,3.0,32.333333,90.0,5.033333,0.877778
4,0.0,2.0,1.0,37.000000,41.0,4.243902,0.658537
...,...,...,...,...,...,...,...
4004,0.0,3.0,24.0,13.583333,322.0,4.298137,0.826087
4005,0.0,2.0,7.0,8.142857,52.0,3.903846,0.711538
4006,,,,,,,
4007,0.0,2.0,13.0,24.307692,311.0,5.032154,0.893891


In [26]:
features_df.to_csv("data/features.csv", index=False)

In [None]:
# Do not run, this is the code to store the discourse markers

discourse_markers = []

for i in tqdm(range(len(data_raw))):
    article_dms = []
    article = data_raw.loc[i].Body
    if type(article) == str:
        article = invert_quotes_fullstop(article)
        for s in split_sentences(article):
            try:
                clean_sentence = clean(s)
                if len(clean_sentence) > 450 or len(clean_sentence)==0:
                    continue
                if i==3477:
                    tree = parse_tree(clean_sentence[:330])
                else:
                    tree = parse_tree(clean_sentence)
                sentence_dms = rule(tree.pformat())
                if sentence_dms != 0:
                    article_dms += sentence_dms
            except IndexError:
                continue
    discourse_markers.append(article_dms)

with open('discourseMarkers.data'.format(40), 'wb') as filehandle:
    # store the data as binary data stream
    pickle.dump(discourse_markers, filehandle)

In [20]:
# Load the discourse markers list
with open('discourseMarkers.data', 'rb') as filehandle:
    # read the data as binary data stream
    discourse_markers = pickle.load(filehandle)

In [21]:
len(discourse_markers)

4009