*** Method for filtering lexicon L2 ***

In [74]:
import pandas as pd
import pyprind
import os

def extract_lexicon(doc, sent_emotion, file_length, only_accepted=True):
    """Extract words annotated with a given sentiment/emotion from a lexicon file.

    Every non-blank line of the input is expected to hold three tab-separated
    fields: ``word<TAB>sentiment_or_emotion<TAB>flag``, where ``flag`` is an
    integer (1 means the word carries that sentiment/emotion).

    Parameters
    ----------
    doc : str
        Path to the input file.

    sent_emotion : str, {'positive', 'negative', 'anger', 'fear', 'anticipation', 'trust', 'surprise', 'sadness', 'joy', 'disgust'}
        Label to select; only lines whose second field equals it are considered.

    file_length : int
        Number of lines in the input file. Used only to size the progress bar.

    only_accepted : bool, default True
        If truthy, the output contains only the words whose flag is 1.
        If falsy, the output contains every word listed for ``sent_emotion``
        together with its raw flag (kept as the string read from the file).

    Returns
    -------
    df : DataFrame
        Column ``'word'`` when ``only_accepted`` is truthy, otherwise columns
        ``'word'`` and ``'annotation'``. When nothing matches, an empty frame
        with those columns is returned.

    Raises
    ------
    IndexError
        If a non-blank line has fewer than three tab-separated fields.
    ValueError
        If ``only_accepted`` is truthy and a matching line's flag is not an
        integer.
    """
    pbar = pyprind.ProgBar(file_length)
    columns = ['word'] if only_accepted else ['word', 'annotation']

    # Rows are gathered in a plain list and turned into a frame once at the
    # end. Growing a DataFrame with .append() inside the loop copies the whole
    # frame on every call, and the method no longer exists in pandas >= 2.0.
    rows = []
    with open(doc, 'r') as infile:
        for line in infile:
            pbar.update()
            stripped = line.rstrip()
            if not stripped:
                # Blank line (e.g. a trailing newline at end of file): nothing to parse.
                continue
            fields = stripped.split('\t')
            if fields[1] != sent_emotion:
                continue
            # Branch on truthiness so that row width always agrees with `columns`.
            if not only_accepted:
                rows.append([fields[0], fields[2]])
            elif int(fields[2]) == 1:
                rows.append([fields[0]])

    # Passing `columns` to the constructor keeps the schema even when `rows`
    # is empty (assigning df.columns on an empty frame would raise).
    return pd.DataFrame(rows, columns=columns)


*** 1. Extracting words from lexicon that are annotated as 'positive' ***

In [75]:
# Keep only the words flagged as 'positive' and save them as a one-column CSV.
df = extract_lexicon(
    './NRC-emotion-lexicon-wordlevel-alphabetized-v0.92_no_intro.txt',
    'positive',
    file_length=141820,
    only_accepted=True,
)
df.to_csv('./l2_lexicon_positive_words.pos', encoding='utf-8', index=False)

0%                          100%
[##############################] | ETA: 00:00:00
Total time elapsed: 00:00:02


In [73]:
# Reload the saved positive-word list and preview its first ten rows.
# NOTE(review): this cell's execution count (73) precedes the cell that defines
# extract_lexicon (74), so the saved output appears to be stale - re-run to refresh.
df = pd.read_csv('./l2_lexicon_positive_words.pos', encoding='utf-8')
df.head(n=10)

Unnamed: 0,0
0,abba
1,ability
2,abovementioned
3,absolute
4,absolution
5,absorbed
6,abundance
7,abundant
8,academic
9,academy


*** 2. Extracting words from lexicon that are annotated as 'negative' ***

In [76]:
# Keep only the words flagged as 'negative' and save them as a one-column CSV.
df = extract_lexicon(
    './NRC-emotion-lexicon-wordlevel-alphabetized-v0.92_no_intro.txt',
    'negative',
    file_length=141820,
    only_accepted=True,
)
df.to_csv('./l2_lexicon_negative_words.neg', encoding='utf-8', index=False)

0%                          100%
[##############################] | ETA: 00:00:00
Total time elapsed: 00:00:03


In [77]:
# Reload the saved negative-word list and preview its first ten rows.
df = pd.read_csv('./l2_lexicon_negative_words.neg', encoding='utf-8')
df.head(n=10)

Unnamed: 0,word
0,abandon
1,abandoned
2,abandonment
3,abduction
4,aberrant
5,aberration
6,abhor
7,abhorrent
8,abject
9,abnormal
