In [1]:
import pandas as pd
import pdpipe as pdp

In [2]:
# Toy corpus: eight records, each pairing an age with a free-text quote.
# Built column-wise; the column order (Age, Quote) and the default 0..7
# index are the same as for a row-wise construction.
df = pd.DataFrame({
    'Age': [23, 23, 25, 44, 72, 50, 80, 80],
    'Quote': [
        'Living life to its fullest',
        'the pen is mightier than the sword',
        'all for one and one for all',
        'every life is precious',
        'all of you get off my porch',
        'boy do I love dogs and cats',
        'I gots the dollarz',
        'me likey them moniez',
    ],
})

In [3]:
# Display the whole toy frame (8 rows) to eyeball the raw input.
df

Unnamed: 0,Age,Quote
0,23,Living life to its fullest
1,23,the pen is mightier than the sword
2,25,all for one and one for all
3,44,every life is precious
4,72,all of you get off my porch
5,50,boy do I love dogs and cats
6,80,I gots the dollarz
7,80,me likey them moniez


In [4]:
# Text-processing stages for the 'Quote' column, in the order they will run.
# NOTE(review): stop-word removal runs *after* stemming here, so it sees the
# stemmed tokens — confirm that ordering is intended.
pipeline_stages = [
    # Word-tokenize the raw quote strings.
    pdp.TokenizeWords('Quote'),
    # Stem the tokens with the Snowball English stemmer.
    pdp.SnowballStem('EnglishStemmer', columns=['Quote']),
    # Drop English stop words from the token lists.
    pdp.RemoveStopwords('English', 'Quote'),
    # Turn the token lists into tf-idf feature columns.
    # NOTE(review): pdpipe labels this stage "Count-vectorizing column Quote"
    # in the pipeline repr and verbose log, despite it being the tf-idf stage.
    pdp.TfidfVectorizeTokenLists('Quote'),
]

In [5]:
# Chain the stages into a single pdpipe pipeline object.
pipeline = pdp.PdPipeline(pipeline_stages)

In [6]:
# Show the pipeline's repr: a numbered listing of its stages.
pipeline

A pdpipe pipeline:
[ 0]  Tokenize Quote
[ 1]  Stem tokens in Quote
[ 2]  Remove stopwords from Quote
[ 3]  Count-vectorizing column Quote.

In [7]:
# First application of the pipeline to the toy frame; verbose=True prints one
# progress line per stage.
# NOTE(review): presumably this first call also fits the vectorizer's
# vocabulary — confirm against the pdpipe documentation.
res_df = pipeline(df, verbose=True)
res_df

- Tokenizing Quote...
- Stemming tokens in Quote...
- Removing stopwords from Quote...
- Count-vectorizing column Quote...


Unnamed: 0,Age,Quote_0,Quote_1,Quote_2,Quote_3,Quote_4,Quote_5,Quote_6,Quote_7,Quote_8,Quote_9,Quote_10,Quote_11,Quote_12,Quote_13,Quote_14,Quote_15,Quote_16,Quote_17,Quote_18
0,23,0.0,0.0,0.0,0.0,0.0,0.608313,0.0,0.0,0.509814,0.0,0.608313,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,23,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.57735,0.0,0.0,0.57735,0.0,0.0,0.57735
2,25,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
3,44,0.0,0.0,0.0,0.0,0.608313,0.0,0.0,0.0,0.509814,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.608313,0.0
4,72,0.0,0.0,0.0,0.0,0.0,0.0,0.707107,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.707107,0.0,0.0
5,50,0.5,0.5,0.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,80,0.0,0.0,0.0,0.707107,0.0,0.0,0.0,0.707107,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,80,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.707107,0.0,0.0,0.0,0.707107,0.0,0.0,0.0,0.0,0.0


In [8]:
# Inspect the column dtypes of the transformed frame (in the recorded run the
# generated Quote_* columns are sparse float64 and Age stays int64).
res_df.dtypes

Age                        int64
Quote_0     Sparse[float64, 0.0]
Quote_1     Sparse[float64, 0.0]
Quote_2     Sparse[float64, 0.0]
Quote_3     Sparse[float64, 0.0]
Quote_4     Sparse[float64, 0.0]
Quote_5     Sparse[float64, 0.0]
Quote_6     Sparse[float64, 0.0]
Quote_7     Sparse[float64, 0.0]
Quote_8     Sparse[float64, 0.0]
Quote_9     Sparse[float64, 0.0]
Quote_10    Sparse[float64, 0.0]
Quote_11    Sparse[float64, 0.0]
Quote_12    Sparse[float64, 0.0]
Quote_13    Sparse[float64, 0.0]
Quote_14    Sparse[float64, 0.0]
Quote_15    Sparse[float64, 0.0]
Quote_16    Sparse[float64, 0.0]
Quote_17    Sparse[float64, 0.0]
Quote_18    Sparse[float64, 0.0]
dtype: object

In [9]:
# Two unseen records with the same (Age, Quote) layout as the first frame;
# the first quote deliberately contains the made-up token 'newword'.
df2 = pd.DataFrame({
    'Age': [45, 16],
    'Quote': [
        'love life and cats newword pen',
        'the pen is more precious than dollarz',
    ],
})

In [10]:
# Apply the same pipeline object to the unseen quotes in df2.
# NOTE(review): the recorded result has the same 19 Quote_* columns as res_df,
# so the vocabulary learned on df is presumably reused (and 'newword' gets no
# column) — confirm that a second call transforms without refitting.
res_df2 = pipeline(df2, verbose=True)
res_df2

- Tokenizing Quote...
- Stemming tokens in Quote...
- Removing stopwords from Quote...
- Count-vectorizing column Quote...


Unnamed: 0,Age,Quote_0,Quote_1,Quote_2,Quote_3,Quote_4,Quote_5,Quote_6,Quote_7,Quote_8,Quote_9,Quote_10,Quote_11,Quote_12,Quote_13,Quote_14,Quote_15,Quote_16,Quote_17,Quote_18
0,45,0.0,0.519708,0.0,0.0,0.0,0.0,0.0,0.0,0.435556,0.0,0.0,0.519708,0.0,0.0,0.0,0.519708,0.0,0.0,0.0
1,16,0.0,0.0,0.0,0.57735,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.57735,0.0,0.57735,0.0


In [11]:
# Frame whose 'Quote' cells already hold token lists (of unequal length), so
# it can be fed straight to a token-list vectorizer without tokenizing.
df3 = pd.DataFrame({
    'Age': [23, 80],
    'Quote': [
        ['live', 'full', 'cats', 'mango'],
        ['hovercraft', 'full', 'eels'],
    ],
})

In [12]:
# Stand-alone tf-idf stage for the 'Quote' column, used outside any pipeline.
tf = pdp.TfidfVectorizeTokenLists('Quote')

In [13]:
# Apply the stage to df3 and display the result (not stored).
tf(df3)

Unnamed: 0,Age,Quote_0,Quote_1,Quote_2,Quote_3,Quote_4,Quote_5
0,23,0.534046,0.0,0.379978,0.0,0.534046,0.534046
1,80,0.0,0.631667,0.449436,0.631667,0.0,0.0


In [14]:
# Print each row's token list, one row per line.
# NOTE: the loop variable `row` stays bound to the last row of df3 after the
# loop finishes, and later cells read it — they only work if this cell ran.
for i, row in df3.iterrows():
    print(row['Quote'])

['live', 'full', 'cats', 'mango']
['hovercraft', 'full', 'eels']


In [15]:
# Inspect `row`, the variable left over from the iterrows() loop over df3
# (i.e. the last row). Hidden state: requires that loop cell to have run.
row

Age                            80
Quote    [hovercraft, full, eels]
Name: 1, dtype: object

In [16]:
# Read the first field of the leftover `row` Series by position (the 'Age'
# value). `row` is indexed by the labels 'Age'/'Quote', so a bare integer key
# (`row[0]`) relies on the positional fallback that pandas has deprecated;
# `.iloc` states the positional intent explicitly.
row.iloc[0]

80

In [17]:
# Call the same stage on df3 again, this time keeping the result.
# NOTE(review): `tf` was already applied to df3 in an earlier cell; presumably
# this second call reuses what was fitted then — confirm.
res_df3 = tf(df3)

In [18]:
# Display the stored tf-idf result for df3.
res_df3

Unnamed: 0,Age,Quote_0,Quote_1,Quote_2,Quote_3,Quote_4,Quote_5
0,23,0.534046,0.0,0.379978,0.0,0.534046,0.534046
1,80,0.0,0.631667,0.449436,0.631667,0.0,0.0


In [19]:
# Count, per row of res_df3, how many cells are strictly positive.
# NOTE(review): the comparison covers every column, so the positive 'Age'
# value is counted alongside the Quote_* weights — confirm that is intended.
(res_df3 > 0).T.sum().values

array([5., 4.])

In [20]:
# Minimal two-row frame with a 'tokens' column of token lists.
# Note: `df4` is rebound to a different frame by the following cell.
df4 = pd.DataFrame({
    'Age': [23, 23],
    'tokens': [
        ['hovercraft', 'eels'],
        ['eels', 'urethra'],
    ],
})
df4

Unnamed: 0,Age,tokens
0,23,"[hovercraft, eels]"
1,23,"[eels, urethra]"


In [21]:
# Row-wise records: (age, token list). The frame gets the explicit row labels
# 1 and 2 instead of the default 0-based index.
data = [
    [2, ['hovercraft', 'eels']],
    [5, ['eels', 'urethra']],
]
df4 = pd.DataFrame(data=data, index=[1, 2], columns=['Age', 'tokens'])
df4

Unnamed: 0,Age,tokens
1,2,"[hovercraft, eels]"
2,5,"[eels, urethra]"


In [22]:
# Fresh tf-idf stage targeting the 'tokens' column of df4.
tf2 = pdp.TfidfVectorizeTokenLists('tokens')

In [23]:
# Apply the stage to df4 and display the result (in the recorded output the
# row labels 1 and 2 are kept and 'tokens' is replaced by tokens_* columns).
tf2(df4)

Unnamed: 0,Age,tokens_0,tokens_1,tokens_2
1,2,0.579739,0.814802,0.0
2,5,0.579739,0.0,0.814802
