In [1]:
import polars as pl
from elfen.extractor import Extractor
import pandas as pd
from pathlib import Path
import glob
import nltk

In [2]:
# Collect every file in "dataset/" whose name matches "sub*_2_*" and merge
# them into one CSV.
# Path.glob yields matches in an arbitrary, filesystem-dependent order, so the
# matches are sorted to keep the row order of the merged file reproducible.
dataset = sorted(Path("dataset").glob("sub*_2_*"))

# Fail early with a clear message instead of letting pd.concat raise
# "No objects to concatenate" when nothing matched.
if not dataset:
    raise FileNotFoundError(
        "No files matching 'sub*_2_*' were found in the 'dataset' directory"
    )

# Read each matching file and stack them row-wise with a fresh 0..n-1 index.
dataframes = [pd.read_csv(f) for f in dataset]
merged_df = pd.concat(dataframes, ignore_index=True)

# Persist the merged table without the row index so it is not read back as an
# extra column.
merged_df.to_csv("merged_data_2.csv", index=False)

In [3]:
# Load the merged CSV (our 200 texts) and rename the "short_text" column to
# "text", which is the column name the Extractor can identify.
df = pl.read_csv("merged_data_2.csv").rename({"short_text": "text"})

# Build the extractor on the renamed frame; the feature groups themselves are
# requested in a later cell.
extractor = Extractor(data=df)

In [None]:
# Feature groups to extract, listed in the order they are handed to the
# extractor.
feature_groups = [
    "surface",
    "pos",
    "lexical_richness",
    "readability",
    "information",
    "entities",
    "semantic",
    "emotion",
    "psycholinguistic",
    "morphological",
    "dependency",
]

# Extract all of the intended feature groups in one call; progress is logged
# as "Extracting <feature>..." lines while it runs.
extractor.extract_feature_group(feature_group=feature_groups)

# Leave the preview as the cell's last expression so the notebook renders it
# as a table instead of the plain-text dump produced by print().
extractor.data.head()

Extracting n_hedges...
Extracting sentiment_score...
Extracting n_positive_sentiment...
Extracting n_negative_sentiment...
Extracting avg_valence...
Extracting avg_arousal...
Extracting avg_dominance...
Extracting n_low_valence...
Extracting n_high_valence...
Extracting n_low_arousal...
Extracting n_high_arousal...
Extracting n_low_dominance...
Extracting n_high_dominance...
Extracting avg_emotion_intensity...
Extracting n_low_intensity...
Extracting n_high_intensity...
Extracting avg_concreteness...
Extracting n_high_concreteness...
Extracting n_low_concreteness...
Extracting avg_sd_concreteness...
Extracting n_controversial_concreteness...
Extracting avg_aoa...
Extracting n_high_aoa...
Extracting n_low_aoa...
Extracting avg_sd_aoa...
Extracting n_controversial_aoa...
Extracting avg_prevalence...
Extracting n_high_prevalence...
Extracting n_low_prevalence...
Extracting avg_socialness...
Extracting n_high_socialness...
Extracting n_low_socialness...
Extracting avg_sd_socialness...
Extr

In [12]:
# Normalize the extracted feature values. The return value is not captured;
# the following cells read the normalized values from extractor.data.
# The argument "all" presumably selects every feature column — confirm against
# the elfen documentation.
# NOTE(review): in the preview shown after this step several columns (e.g.
# n_conj and the n_Aux_Case_* family) contain no values at all. Possibly these
# are constant columns that cannot be scaled — confirm, and consider dropping
# them before modelling.
extractor.normalize("all")

In [13]:
# Preview the first rows to check what the extracted feature dataset looks like.
extractor.data.head()

text,id,label,data,is_attention_check,expected_answer,nlp,raw_sequence_length,n_tokens,n_lemmas,n_sentences,n_types,n_characters,avg_word_length,n_long_words,tokens_per_sentence,n_lexical_tokens,pos_variability,n_adj,n_adp,n_adv,n_aux,n_conj,n_cconj,n_det,n_intj,n_noun,n_num,n_part,n_pron,n_propn,n_punct,n_sconj,n_sym,n_verb,n_x,lemma_token_ratio,…,n_Aux_Case_Add,n_Aux_Case_Ade,n_Aux_Case_All,n_Aux_Case_Del,n_Aux_Case_Ela,n_Aux_Case_Ill,n_Aux_Case_Ine,n_Aux_Case_Lat,n_Aux_Case_Loc,n_Aux_Case_Per,n_Aux_Case_Sbe,n_Aux_Case_Sub,n_Aux_Case_Sup,n_Aux_Case_Ter,n_Aux_Abbr_Yes,n_Aux_Foreign_Yes,n_PUNCT_PunctType_Brck,n_PUNCT_PunctType_Comm,n_PUNCT_PunctType_Dash,n_PUNCT_PunctType_Excl,n_PUNCT_PunctType_Peri,n_PUNCT_PunctType_Qest,n_PUNCT_PunctType_Quot,n_PUNCT_PunctType_Semi,n_PUNCT_PunctType_Symb,n_PUNCT_PunctSide_Fin,n_PUNCT_PunctSide_Ini,n_CONJ_ConjType_Cmp,n_CONJ_ConjType_Oper,n_CCONJ_ConjType_Cmp,n_CCONJ_ConjType_Oper,n_SCONJ_ConjType_Cmp,n_SCONJ_ConjType_Oper,n_ADP_AdpType_Prep,n_ADP_AdpType_Post,n_ADP_AdpType_Circ,n_ADP_AdpType_Voc
str,str,str,str,bool,str,object,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,…,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64
"""Slowly and gradually, all face…","""5febe40a7f4bf6f477af654d3eea9a…","""male""","""PAN13-EN""",False,,"Slowly and gradually, all facets of their lives can be accomplished by way of the Net. Meeting possible dates and dating can now be accomplished in the virtual globe.Nowadays online dating internet sites have enjoyed a large next since its inceptions a number of decades back. Folks are quick view...",0.396155,-0.147564,0.171612,-0.11652,0.099136,0.493215,0.530212,1.033346,-0.270764,0.66064,-0.0207,0.980786,0.014631,0.886848,1.12092,,0.365078,-0.06815,-0.318585,0.275603,-0.522536,-1.192911,-1.157632,-0.061745,-0.430398,-0.188054,-0.178871,-0.533701,-0.186568,0.57855,…,,,,,,,,,,,,,,,,,-0.291241,-0.257004,-0.311954,,0.332719,,-0.224721,,,-0.338405,-0.363098,,,0.365078,,,,,,,
"""What a wonderful day we had. W…","""4168492""","""1""","""BLOG""",False,,"What a wonderful day we had. We ventured to Six Flags over Georgia. It wasn't quite as I remember it when I was 20 years old...then again that was 16 years ago. I can remember when I went as a kid, it was huge and exciting and I wanted to do everything and there was never enough time. Now that I'...",0.396155,0.953659,0.494392,1.880972,0.405741,0.264779,0.333549,-0.845465,-0.797467,-0.050682,-0.376877,0.505829,-0.821436,1.333623,0.670751,,1.14184,-1.144203,-0.318585,-1.579419,2.743316,0.182203,2.313928,0.499577,0.255496,2.319329,-0.178871,0.630738,-0.186568,-0.873563,…,,,,,,,,,,,,,,,,,-0.291241,-0.257004,-0.311954,,0.332719,,-0.224721,,,-0.338405,-0.363098,,,1.14184,,,,,,,
"""This text will discuss the key…","""28c118646abaf08b33535fad28820d…","""male""","""PAN13-EN""",False,,This text will discuss the key variations between the two that will help you resolve which may be better for you. Perceive that my opinion is based on my private experience and the analysis that I've conducted.Liquid Diets (And Why One Of These Technically Is not One): I imagine that individuals ...,0.396155,-0.000734,0.279205,-0.782351,0.303539,0.451681,0.490717,0.563644,0.457516,-1.651156,0.184481,0.030872,-0.403403,-1.347026,1.571088,,0.365078,-0.426834,-0.318585,-1.314416,2.743316,-0.505354,0.978713,1.060899,-0.430398,1.483535,-0.178871,-0.533701,-0.186568,0.466752,…,,,,,,,,,,,,,,,,,1.129448,-0.841106,-0.311954,,-0.240934,,-0.224721,,,-0.338405,1.498944,,,0.365078,,,,,,,
"""It was a day of sightseeing fo…","""2036_3""","""Female""","""PASTEL""",False,,It was a day of sightseeing for us!. We saw several different exhibits. This was cool because the ferris wheel was in the background. I loved watching the children on the merry-go-round. We even saw characters interacting!,-0.937181,-0.808298,-0.79673,0.54931,-0.922882,-0.856637,-0.516734,-0.375762,-0.898906,-0.939834,-0.05235,0.505829,-0.403403,-0.900251,-0.229586,,-1.188446,-0.06815,-0.318585,-0.519406,-0.522536,-1.192911,-0.089459,-0.623068,0.598443,-0.188054,-0.178871,-0.921848,-0.186568,0.040493,…,,,,,,,,,,,,,,,,,-0.291241,-0.841106,4.144533,,1.480025,,-0.224721,,,-0.338405,-0.363098,,,-1.188446,,,,,,,
"""it behoves on us all to make a…","""8b794cc8b9857b8704d9dc4eb5d086…","""female""","""PAN13-EN""",False,,"it behoves on us all to make a drastic and prompt actions to make change a reality, in this part of the world called 'Nigeria' chauvinism is killing us",-2.150859,-1.909521,-1.549885,-2.114013,-1.740496,-2.164956,-2.430972,-0.610614,2.187181,-2.184647,2.675509,-0.919041,-0.821436,-1.347026,-1.129923,,-0.411684,-0.426834,-0.318585,-1.314416,-0.522536,0.86976,-0.623545,-0.061745,-1.459239,-1.023848,-0.178871,-0.921848,-0.186568,1.497156,…,,,,,,,,,,,,,,,,,-0.291241,-0.257004,-0.311954,,-1.961894,,1.179786,,,-0.338405,1.498944,,,-0.411684,,,,,,,


In [14]:
# Show every column name together with its dtype.
extractor.data.schema


Schema([('text', String),
        ('id', String),
        ('label', String),
        ('data', String),
        ('is_attention_check', Boolean),
        ('expected_answer', String),
        ('nlp', Object),
        ('raw_sequence_length', Float64),
        ('n_tokens', Float64),
        ('n_lemmas', Float64),
        ('n_sentences', Float64),
        ('n_types', Float64),
        ('n_characters', Float64),
        ('avg_word_length', Float64),
        ('n_long_words', Float64),
        ('tokens_per_sentence', Float64),
        ('n_lexical_tokens', Float64),
        ('pos_variability', Float64),
        ('n_adj', Float64),
        ('n_adp', Float64),
        ('n_adv', Float64),
        ('n_aux', Float64),
        ('n_conj', Float64),
        ('n_cconj', Float64),
        ('n_det', Float64),
        ('n_intj', Float64),
        ('n_noun', Float64),
        ('n_num', Float64),
        ('n_part', Float64),
        ('n_pron', Float64),
        ('n_propn', Float64),
        ('n_punct', Float6

In [15]:
# Exporting the data to CSV directly from this package runs into an issue, so
# as a workaround the frame is converted to a pandas DataFrame first.
# NOTE(review): the cause is presumably the Object-typed "nlp" column listed in
# the schema — confirm.
df_pandas  = extractor.data.to_pandas()



In [16]:
# Save the extracted features to a new CSV file; index=False leaves the pandas
# row index out of the file.
df_pandas.to_csv("full_features.csv", index=False)

In [17]:
# Display the pandas copy of the feature table (the notebook truncates the
# middle rows and columns in its rendering).
df_pandas

Unnamed: 0,text,id,label,data,is_attention_check,expected_answer,nlp,raw_sequence_length,n_tokens,n_lemmas,...,n_CONJ_ConjType_Cmp,n_CONJ_ConjType_Oper,n_CCONJ_ConjType_Cmp,n_CCONJ_ConjType_Oper,n_SCONJ_ConjType_Cmp,n_SCONJ_ConjType_Oper,n_ADP_AdpType_Prep,n_ADP_AdpType_Post,n_ADP_AdpType_Circ,n_ADP_AdpType_Voc
0,"Slowly and gradually, all facets of their live...",5febe40a7f4bf6f477af654d3eea9ab3,male,PAN13-EN,False,,"(Slowly, and, gradually, ,, all, facets, of, t...",0.396155,-0.147564,0.171612,...,,,0.365078,,,,,,,
1,What a wonderful day we had. We ventured to Si...,4168492,1,BLOG,False,,"(What, a, wonderful, day, we, had, ., We, vent...",0.396155,0.953659,0.494392,...,,,1.141840,,,,,,,
2,This text will discuss the key variations betw...,28c118646abaf08b33535fad28820db0,male,PAN13-EN,False,,"(This, text, will, discuss, the, key, variatio...",0.396155,-0.000734,0.279205,...,,,0.365078,,,,,,,
3,It was a day of sightseeing for us!. We saw se...,2036_3,Female,PASTEL,False,,"(It, was, a, day, of, sightseeing, for, us, !,...",-0.937181,-0.808298,-0.796730,...,,,-1.188446,,,,,,,
4,it behoves on us all to make a drastic and pro...,8b794cc8b9857b8704d9dc4eb5d0863f,female,PAN13-EN,False,,"(it, behoves, on, us, all, to, make, a, drasti...",-2.150859,-1.909521,-1.549885,...,,,-0.411684,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
195,technology in schools is for everyone The A...,682d9b4fc0ab9c306a99f4b8af476023,female,PAN13-EN,False,,"( , technology, in, schools, is, for, everyon...",0.396155,0.513170,-0.258762,...,,,-0.411684,,,,,,,
196,I dunno. Sometimes I just think I'm doing too ...,3441444,0,BLOG,False,,"(I, dunno, ., Sometimes, I, just, think, I, 'm...",0.396155,1.027074,1.247547,...,,,1.141840,,,,,,,
197,"As intelligent as I am, I overlooked a key set...",3287062,0,BLOG,False,,"(As, intelligent, as, I, am, ,, I, overlooked,...",0.396155,0.439755,0.386799,...,,,-1.188446,,,,,,,
198,"Todae, Sundae nite.... again mus think of tml....",4256898,1,BLOG,False,,"(Todae, ,, Sundae, nite, ...., again, mus, thi...",0.396155,0.880244,0.279205,...,,,1.918602,,,,,,,
