In [1]:
import polars as pl
from elfen.extractor import Extractor
import pandas as pd
from pathlib import Path
import glob
import nltk

In [2]:
# Merge every file under ./dataset whose name matches "sub*_2_*" into one CSV.
# Path.glob yields matches in arbitrary, filesystem-dependent order, so they are
# sorted here to keep the row order of the merged file reproducible across runs.
# Sorting also materialises the matches into a list, so `dataset` can be
# inspected again later instead of being an exhausted generator.
dataset = sorted(Path("dataset").glob("sub*_2_*"))
if not dataset:
    # Fail with an actionable message rather than pd.concat's generic
    # "No objects to concatenate" error.
    raise FileNotFoundError('no files matching "sub*_2_*" found in "dataset"')
dataframes = [pd.read_csv(f) for f in dataset]
# ignore_index=True gives the merged frame a fresh 0..n-1 index.
merged_df = pd.concat(dataframes, ignore_index=True)
merged_df.to_csv("merged_data_2.csv", index=False)

In [None]:
# Input table for feature extraction. "pi.csv" is read here; the 200-text
# "merged_data_2.csv" file can be loaded instead by swapping the path.
# The "short_text" column is renamed to "text" because, per the original
# author's note, that is the column name the Extractor can identify.
df = pl.read_csv("pi.csv").rename({"short_text": "text"})
extractor = Extractor(data=df)  # build the extractor on the renamed frame

In [51]:
# Feature groups to extract, in the order they are requested.
feature_groups = [
    "surface",
    "pos",
    "lexical_richness",
    "readability",
    "information",
    "entities",
    "semantic",
    "emotion",
    "psycholinguistic",
    "morphological",
    "dependency",
]
# Run the extraction for all selected groups in one call, then print a preview
# of the resulting frame.
extractor.extract_feature_group(feature_group=feature_groups)
print(extractor.data.head())

Extracting n_hedges...
Extracting sentiment_score...
Extracting n_positive_sentiment...
Extracting n_negative_sentiment...
Extracting avg_valence...
Extracting avg_arousal...
Extracting avg_dominance...
Extracting n_low_valence...
Extracting n_high_valence...
Extracting n_low_arousal...
Extracting n_high_arousal...
Extracting n_low_dominance...
Extracting n_high_dominance...
Extracting avg_emotion_intensity...
Extracting n_low_intensity...
Extracting n_high_intensity...
Extracting avg_concreteness...
Extracting n_high_concreteness...
Extracting n_low_concreteness...
Extracting avg_sd_concreteness...
Extracting n_controversial_concreteness...
Extracting avg_aoa...
Extracting n_high_aoa...
Extracting n_low_aoa...
Extracting avg_sd_aoa...
Extracting n_controversial_aoa...
Extracting avg_prevalence...
Extracting n_high_prevalence...
Extracting n_low_prevalence...
Extracting avg_socialness...
Extracting n_high_socialness...
Extracting n_low_socialness...
Extracting avg_sd_socialness...
Extr

In [52]:
# Normalize the feature values; "all" is passed as the feature selector.
# NOTE(review): the preview printed after this step shows null entries in some
# columns (e.g. n_conj, n_dependency_root) — possibly zero-variance columns;
# TODO confirm and decide whether to drop them before modelling.
extractor.normalize("all")

In [53]:
# Check what the extracted feature dataset looks like (first rows only).
extractor.data.head()

P_i,text_id,text,nlp,raw_sequence_length,n_tokens,n_lemmas,n_sentences,n_types,n_characters,avg_word_length,n_long_words,tokens_per_sentence,n_lexical_tokens,pos_variability,n_adj,n_adp,n_adv,n_aux,n_conj,n_cconj,n_det,n_intj,n_noun,n_num,n_part,n_pron,n_propn,n_punct,n_sconj,n_sym,n_verb,n_x,lemma_token_ratio,ttr,cttr,rttr,…,n_dependency_aux,n_dependency_auxpass,n_dependency_case,n_dependency_cc,n_dependency_ccomp,n_dependency_compound,n_dependency_conj,n_dependency_csubj,n_dependency_csubjpass,n_dependency_dative,n_dependency_dep,n_dependency_det,n_dependency_dobj,n_dependency_expl,n_dependency_intj,n_dependency_mark,n_dependency_meta,n_dependency_neg,n_dependency_nounmod,n_dependency_npmod,n_dependency_nsubj,n_dependency_nsubjpass,n_dependency_nummod,n_dependency_oprd,n_dependency_parataxis,n_dependency_pcomp,n_dependency_pobj,n_dependency_poss,n_dependency_preconj,n_dependency_predet,n_dependency_prep,n_dependency_prt,n_dependency_punct,n_dependency_quantmod,n_dependency_relcl,n_dependency_root,n_dependency_xcomp
f64,str,str,object,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,…,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64
0.222222,"""10003""","""There was a huge freeway where…",There was a huge freeway where many people were driving. One person was sick so he got out of his car to stretch his legs. All the cars were stopped and people started taking pictures of him. Others sat down with him to take pictures. People were so excited they decided to start laying down in the middle of the freeway.,0.300987,0.166018,-0.234108,0.380745,-0.044785,0.274428,0.104979,0.957452,-0.398648,0.598077,-0.168572,-0.009391,0.934358,-0.480598,0.022381,,-0.458053,-0.113024,-0.370884,0.413134,0.396478,0.715383,-0.03303,-0.594849,-0.525314,-0.230392,-0.16118,1.213488,-0.173742,-1.001726,-0.70504,-0.241174,-0.241174,…,0.451306,0.978484,-0.332713,-0.466514,-0.162926,-0.851235,-0.452639,-0.264378,-0.062684,-0.268598,-0.302806,-0.41034,-0.135661,1.509859,-0.355691,0.116557,-0.079384,-0.599932,,,0.230517,1.161775,0.674549,-0.219646,-0.202706,-0.55426,0.198326,0.447862,-0.133899,3.037195,0.410858,2.39507,-0.539926,-0.20767,0.243013,,1.976872
0.288889,"""10003_1""","""We were driving down a huge fr…","We were driving down a huge freeway. We had to stop sometimes because we felt sick or had to stretch our legs. We stopped for a photo op, too. My sister made sure to sit in the photo op. My other sister decided to lay in the freeway.",-0.738432,-0.456549,-0.861559,0.380745,-0.869342,-0.811753,-0.937244,-0.610755,-0.67397,-0.047603,0.298794,-0.009391,-0.511841,-0.480598,-1.119028,,-0.458053,-0.431488,-0.370884,-0.543499,-0.498229,1.335869,-0.03303,-0.594849,-0.255787,-0.230392,-0.16118,0.915807,-0.173742,-0.904275,-1.084261,-1.198998,-1.198998,…,0.888054,-0.556983,-0.332713,-0.466514,-0.838474,0.129849,-0.452639,-0.264378,-0.062684,-0.268598,-0.302806,-0.41034,-0.626348,-0.382102,-0.355691,0.116557,-0.079384,-0.599932,,,-0.103471,-0.51671,-0.442399,-0.219646,-0.202706,-0.55426,-0.620672,1.140012,-0.133899,-0.258485,-0.732075,0.896315,-0.276673,-0.20767,-0.764602,,2.927112
0.222222,"""10092""","""The parade was full of a lot o…",The parade was full of a lot of people holding on to these floats. Some displayed their silly hats. In the parade the people held on tight to their sticks. The parade also put on a nice show for the spectators. This was definitely a hat goofy event.,-0.549447,-0.508429,-1.04083,0.380745,-0.951798,-0.580035,-0.187653,-0.414729,-0.696913,-0.435011,-0.32201,-0.444809,1.295907,-0.063043,-0.738559,,-1.172448,1.160833,-0.370884,0.413134,-0.498229,-1.146073,-0.689332,-0.259259,-0.525314,-0.964767,-0.16118,-1.16796,-0.173742,-1.203615,-1.162967,-1.313651,-1.313651,…,-1.295685,-0.556983,-0.332713,-1.196337,-0.838474,0.129849,-1.093876,-0.264378,-0.062684,-0.268598,-0.302806,1.222058,-0.626348,-0.382102,-0.355691,-0.784114,-0.079384,-0.599932,,,-0.437459,-0.51671,-0.442399,-0.219646,-0.202706,-0.55426,0.607825,0.447862,-0.133899,-0.258485,0.791836,2.39507,-0.539926,-0.20767,-0.764602,,-0.873848
0.177778,"""10232_0""","""It was just a normal day and t…","It was just a normal day and the neighbors were driving home. They loved the street which they called home for the past few years. The houses were beautiful, safe, and elegant. Their neighborhood had changed significantly over the last twenty years as each house become bright and lively with color. That is until the neighbors passed the new white house which had just been built, undoubtedly by a disturbed soul seeking to disrupt the calm life of the neighborhood.",2.025478,1.307389,1.200066,0.380745,1.356962,2.113694,1.033693,1.741556,0.106108,1.760301,-0.869622,3.473949,-0.150291,1.189623,1.163789,,0.970736,2.116226,-0.370884,0.413134,0.396478,-0.525588,-0.03303,0.07633,0.283268,0.503983,-0.16118,0.320445,-0.173742,-0.694755,-0.468028,1.081072,1.081072,…,0.451306,0.978484,-0.332713,0.993131,-0.838474,-0.360693,1.471075,-0.264378,-0.062684,-0.268598,-0.302806,2.201498,0.355027,-0.382102,-0.355691,1.017229,-0.079384,-0.599932,,,0.898494,1.161775,0.674549,-0.219646,-0.202706,-0.55426,0.198326,-0.244288,-0.133899,-0.258485,0.029881,-0.602441,0.249832,-0.20767,1.250628,,0.076392
0.377778,"""1023c3a657f870432e8142b3bf704e""","""Earning a residual money is po…",Earning a residual money is powerful and a great digital recruiter can do that by doing work with corporations placing contractors (those who are ready to get the job done on a brief-expression basis for an hourly amount as a substitute of a wage).A organization is nothing at all with out its sta...,0.052944,-0.300907,0.393343,-1.937822,0.367494,0.086156,0.914753,0.369374,4.625968,0.081533,0.08233,1.296861,0.934358,-0.480598,0.022381,,-0.458053,0.523905,-0.370884,0.173976,-0.498229,-0.525588,-0.470564,-0.259259,-1.064369,-0.964767,-0.16118,-0.572598,-0.173742,1.555598,1.769871,1.130382,1.130382,…,-0.42219,-0.556983,-0.332713,-0.466514,-0.838474,0.129849,-0.452639,2.800008,-0.062684,-0.268598,-0.302806,0.569099,0.845714,-0.382102,-0.355691,-0.784114,-0.079384,-0.599932,,,-0.771447,-0.51671,-0.442399,-0.219646,-0.202706,4.236803,0.198326,-0.244288,-0.133899,-0.258485,0.791836,-0.602441,-1.066431,-0.20767,0.243013,,0.076392


In [54]:
# Inspect the column names and dtypes of the extracted feature frame.
extractor.data.schema


Schema([('P_i', Float64),
        ('text_id', String),
        ('text', String),
        ('nlp', Object),
        ('raw_sequence_length', Float64),
        ('n_tokens', Float64),
        ('n_lemmas', Float64),
        ('n_sentences', Float64),
        ('n_types', Float64),
        ('n_characters', Float64),
        ('avg_word_length', Float64),
        ('n_long_words', Float64),
        ('tokens_per_sentence', Float64),
        ('n_lexical_tokens', Float64),
        ('pos_variability', Float64),
        ('n_adj', Float64),
        ('n_adp', Float64),
        ('n_adv', Float64),
        ('n_aux', Float64),
        ('n_conj', Float64),
        ('n_cconj', Float64),
        ('n_det', Float64),
        ('n_intj', Float64),
        ('n_noun', Float64),
        ('n_num', Float64),
        ('n_part', Float64),
        ('n_pron', Float64),
        ('n_propn', Float64),
        ('n_punct', Float64),
        ('n_sconj', Float64),
        ('n_sym', Float64),
        ('n_verb', Float64),
        (

In [55]:
# Exporting the extractor's data to CSV directly was found to fail, so as a
# workaround the frame is converted to a pandas DataFrame first.
df_pandas  = extractor.data.to_pandas()



In [56]:
# Save the extracted features to a new CSV file (the pandas index is not written).
# NOTE(review): df_pandas still contains the object-typed "nlp" column, which
# presumably gets stringified on export — confirm it is wanted in the file.
df_pandas.to_csv("full_features.csv", index=False)

In [57]:
# Display the exported pandas frame as a final visual check.
df_pandas

Unnamed: 0,P_i,text_id,text,nlp,raw_sequence_length,n_tokens,n_lemmas,n_sentences,n_types,n_characters,...,n_dependency_poss,n_dependency_preconj,n_dependency_predet,n_dependency_prep,n_dependency_prt,n_dependency_punct,n_dependency_quantmod,n_dependency_relcl,n_dependency_root,n_dependency_xcomp
0,0.222222,10003,There was a huge freeway where many people wer...,"(There, was, a, huge, freeway, where, many, pe...",0.300987,0.166018,-0.234108,0.380745,-0.044785,0.274428,...,0.447862,-0.133899,3.037195,0.410858,2.395070,-0.539926,-0.20767,0.243013,,1.976872
1,0.288889,10003_1,We were driving down a huge freeway. We had to...,"(We, were, driving, down, a, huge, freeway, .,...",-0.738432,-0.456549,-0.861559,0.380745,-0.869342,-0.811753,...,1.140012,-0.133899,-0.258485,-0.732075,0.896315,-0.276673,-0.20767,-0.764602,,2.927112
2,0.222222,10092,The parade was full of a lot of people holding...,"(The, parade, was, full, of, a, lot, of, peopl...",-0.549447,-0.508429,-1.040830,0.380745,-0.951798,-0.580035,...,0.447862,-0.133899,-0.258485,0.791836,2.395070,-0.539926,-0.20767,-0.764602,,-0.873848
3,0.177778,10232_0,It was just a normal day and the neighbors wer...,"(It, was, just, a, normal, day, and, the, neig...",2.025478,1.307389,1.200066,0.380745,1.356962,2.113694,...,-0.244288,-0.133899,-0.258485,0.029881,-0.602441,0.249832,-0.20767,1.250628,,0.076392
4,0.377778,1023c3a657f870432e8142b3bf704e,Earning a residual money is powerful and a gre...,"(Earning, a, residual, money, is, powerful, an...",0.052944,-0.300907,0.393343,-1.937822,0.367494,0.086156,...,-0.244288,-0.133899,-0.258485,0.791836,-0.602441,-1.066431,-0.20767,0.243013,,0.076392
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
505,0.377778,fc98fd96a1b45709f54713eddb7060b6,"This truly is an excellent question, why is a ...","(This, truly, is, an, excellent, question, ,, ...",0.052944,-0.352788,-0.054836,-0.778538,-0.044785,0.158568,...,-0.244288,-0.133899,3.037195,0.410858,-0.602441,-0.013421,-0.20767,-0.764602,,-0.873848
506,0.222222,fcfed6ffa020a9c6783a7b232b8028cc,"If you are looking for Vuitton Damier, where...","(If, you, are, looking, for, , Vuitton, Dami...",0.052944,-0.041504,0.662251,-0.778538,0.532405,0.086156,...,-0.936439,-0.133899,-0.258485,-0.351097,-0.602441,0.249832,-0.20767,1.250628,,-0.873848
507,0.177778,fdf4388adbb23c962bb7733144f0710a,Should you be looking for a formidable volleyb...,"(Should, you, be, looking, for, a, formidable,...",0.052944,0.010376,-0.323744,-0.778538,-0.292152,0.013744,...,-0.936439,-0.133899,-0.258485,1.172814,-0.602441,-0.539926,-0.20767,-0.764602,,0.076392
508,0.266667,fe345e018455de3756fa7601508c1bf8,"Along with the terminology, the mechanics of ...","(Along, with, the, terminology, ,, the, , mec...",0.052944,-0.560310,-0.503015,-0.778538,-0.621975,0.259945,...,-0.244288,-0.133899,-0.258485,0.410858,0.896315,0.513084,-0.20767,-0.764602,,-0.873848
