In [30]:
import numpy as np
import pandas as pd

from sklearn.feature_extraction.text import CountVectorizer

In [2]:

# Toy data: repeated names (useful for grouping) and one missing age.
example_df = pd.DataFrame(
    {
        "Person": ["John", "Myla", "Lewis", "John", "Myla"],
        "Age": [24.0, 26.0, np.nan, 33.0, 26.0],
        "Single": [False, True, True, False, False],
    }
)
example_df

Unnamed: 0,Person,Age,Single
0,John,24.0,False
1,Myla,26.0,True
2,Lewis,,True
3,John,33.0,False
4,Myla,26.0,False


## <center> Counting values in a group </center>

 Group the data according to one column 
 
 Count the number of non-missing values in each column per group: NaN is treated as a missing value and is not counted

In [3]:
# Per person: how many non-null entries each remaining column has.
example_df.groupby(by="Person").count()

Unnamed: 0_level_0,Age,Single
Person,Unnamed: 1_level_1,Unnamed: 2_level_1
John,2,2
Lewis,0,1
Myla,2,2


Count number of rows in each group (irrespective of whether there are values in the columns or not)

In [4]:
# size() returns the number of rows in every group, regardless of whether
# the other columns hold values or NaN.
example_df.groupby('Person').size()

Person
John     2
Lewis    1
Myla     2
dtype: int64

In [5]:
# Shortcut without groupby: tally the column directly
# (result is ordered by frequency rather than by name).
example_df["Person"].value_counts()

John     2
Myla     2
Lewis    1
Name: Person, dtype: int64

In [6]:
import nltk
# If tokenizing below raises an error about missing data,
# uncomment the next line and run it once:
# nltk.download('punkt')

## <center> Counting words in a sentence </center>


In [1]:
# Sample sentences used by the word-counting examples.
gvn_str = (
    "hello this is Python, welcome all. "
    "Hello again. Repeating words: hello , Python"
)
gvn_str2 = "second string for the data frame"
gvn_str3 = "this is the third string for the test"

### Count using Counter from collections

In [8]:
# word_tokenize works much like str.split(), but it separates punctuation
# marks into tokens of their own instead of leaving them glued to words
from nltk.tokenize import word_tokenize
word_tokenize(gvn_str)

['hello',
 'this',
 'is',
 'Python',
 ',',
 'welcome',
 'all',
 '.',
 'Hello',
 'again',
 '.',
 'Repeating',
 'words',
 ':',
 'hello',
 ',',
 'Python']

In [9]:
# Plain whitespace split for comparison: punctuation stays attached to the words.
gvn_str.split(sep=None)

['hello',
 'this',
 'is',
 'Python,',
 'welcome',
 'all.',
 'Hello',
 'again.',
 'Repeating',
 'words:',
 'hello',
 ',',
 'Python']

In [10]:
# Once the text is a list of words, Counter from collections tallies
# how many times each distinct word occurs.

from collections import Counter 
Counter(gvn_str.split())

Counter({'hello': 2,
         'this': 1,
         'is': 1,
         'Python,': 1,
         'welcome': 1,
         'all.': 1,
         'Hello': 1,
         'again.': 1,
         'Repeating': 1,
         'words:': 1,
         ',': 1,
         'Python': 1})

In [11]:
# Same tally on the nltk tokens: punctuation marks become separate entries,
# so both occurrences of 'Python' are counted under one key.
Counter(word_tokenize(gvn_str))

Counter({'hello': 2,
         'this': 1,
         'is': 1,
         'Python': 2,
         ',': 2,
         'welcome': 1,
         'all': 1,
         '.': 2,
         'Hello': 1,
         'again': 1,
         'Repeating': 1,
         'words': 1,
         ':': 1})

In [12]:
# Minimal check: three identical words collapse into one key with count 3.
Counter('hello hello hello'.split())

Counter({'hello': 3})

### Count using CountVectorizer from sklearn from a single string

##### Get matrix

In [20]:
# CountVectorizer is already imported in the first cell.
vectorizer = CountVectorizer()

# fit_transform is given an iterable of documents here, so the single string is wrapped in a list
matrix = vectorizer.fit_transform([gvn_str])
matrix

<1x9 sparse matrix of type '<class 'numpy.int64'>'
	with 9 stored elements in Compressed Sparse Row format>

##### Visualize the matrix as a data frame

In [21]:
# One row per document, one column per vocabulary term.
# get_feature_names_out() is used because get_feature_names() was deprecated
# in scikit-learn 1.0 and removed in 1.2; later cells of this notebook
# already call the new name.
counts_df = pd.DataFrame(
    matrix.toarray(),
    columns=vectorizer.get_feature_names_out(),
)
counts_df



Unnamed: 0,again,all,hello,is,python,repeating,this,welcome,words
0,1,1,3,1,2,1,1,1,1


##### Visualize the matrix as a dict

_**option 1**: convert to df and then to dict_

In [27]:
# First step towards a dict: take the single row of the counts frame as a
# Series indexed by feature name.
# get_feature_names_out() replaces get_feature_names(), which was removed
# in scikit-learn 1.2.
pd.DataFrame(
    matrix.toarray(),
    columns=vectorizer.get_feature_names_out(),
).iloc[0, :]



again        1
all          1
hello        3
is           1
python       2
repeating    1
this         1
welcome      1
words        1
Name: 0, dtype: int64

In [26]:
# Series.to_dict() turns that row into {feature name: count}.
# get_feature_names_out() replaces get_feature_names(), which was removed
# in scikit-learn 1.2.
pd.DataFrame(
    matrix.toarray(),
    columns=vectorizer.get_feature_names_out(),
).iloc[0, :].to_dict()



{'again': 1,
 'all': 1,
 'hello': 3,
 'is': 1,
 'python': 2,
 'repeating': 1,
 'this': 1,
 'welcome': 1,
 'words': 1}

_**option 2**: convert directly to dict_

In [28]:
# To build the dict directly from the matrix we need two flat sequences:
# the feature names (vectorizer.get_feature_names_out()) and the counts.
# matrix.toarray() is 2-D (one row per document), so the counts of our
# single document are its first row: matrix.toarray()[0]
matrix.toarray()

array([[1, 1, 3, 1, 2, 1, 1, 1, 1]])

In [29]:
# Zipping the 2-D array itself walks over its rows, not over the individual
# counts, so each tuple holds a whole row -- not what we need.
list(zip(matrix.toarray(), matrix.toarray()))

[(array([1, 1, 3, 1, 2, 1, 1, 1, 1]), array([1, 1, 3, 1, 2, 1, 1, 1, 1]))]

In [32]:
# Pair each feature name with the matching entry of the first (and only) row
# of the dense matrix, then let dict() consume the pairs.
dict(zip(vectorizer.get_feature_names_out(), matrix.toarray()[0]))

{'again': 1,
 'all': 1,
 'hello': 3,
 'is': 1,
 'python': 2,
 'repeating': 1,
 'this': 1,
 'welcome': 1,
 'words': 1}

### Count using CountVectorizer from sklearn from a column of a data frame

In [27]:
# define data frame: one row per text.
# Built column-wise so every column keeps its own dtype; building it row-wise
# and transposing would turn all columns (including `counter`) into object dtype.
str_df = pd.DataFrame(
    {
        'text': [gvn_str, gvn_str2, gvn_str3],
        'id': ['t1', 't2', 't3'],
        'counter': [1, 2, 3],
    },
    index=['Text1', 'Text2', 'Text3'],
)
str_df

Unnamed: 0,text,id,counter
Text1,"hello this is Python, welcome all. Hello again...",t1,1
Text2,second string for the data frame,t2,2
Text3,this is the third string for the test,t3,3


In [28]:
vectorizer2_df = CountVectorizer()

# The column is passed directly as the collection of documents (one per row),
# so it is not wrapped in a list here.
# .astype('U') casts the values to unicode strings to avoid type errors
matrix2_df = vectorizer2_df.fit_transform(str_df['text'].astype('U'))
matrix2_df

<3x17 sparse matrix of type '<class 'numpy.int64'>'
	with 22 stored elements in Compressed Sparse Row format>

In [29]:
# Dense view of the counts: one row per text, one column per vocabulary term.
counts2_df = pd.DataFrame(
    data=matrix2_df.toarray(),
    columns=vectorizer2_df.get_feature_names_out(),
)
counts2_df



Unnamed: 0,again,all,data,for,frame,hello,is,python,repeating,second,string,test,the,third,this,welcome,words
0,1,1,0,0,0,3,1,2,1,0,0,0,0,0,1,1,1
1,0,0,1,1,1,0,0,0,0,1,1,0,1,0,0,0,0
2,0,0,0,1,0,0,1,0,0,0,1,1,2,1,1,0,0


# <center> Sort data frame based on a row </center>

In [34]:
# Reorder the columns by the counts found in the row labelled 0.
counts2_df.sort_values(by=0, axis="columns")

Unnamed: 0,data,for,frame,third,the,test,string,second,again,this,repeating,is,all,welcome,words,python,hello
0,0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,2,3
1,1,1,1,0,1,0,1,1,0,0,0,0,0,0,0,0,0
2,0,1,0,1,2,1,1,0,0,1,0,1,0,0,0,0,0


In [36]:
# Reorder the columns by the counts found in the row labelled 1.
counts2_df.sort_values(by=1, axis="columns")

Unnamed: 0,again,this,third,test,welcome,python,repeating,hello,all,is,words,frame,second,string,for,the,data
0,1,1,0,0,1,2,1,3,1,1,1,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,0,1,1,1,1,1,1
2,0,1,1,1,0,0,0,0,0,1,0,0,0,1,1,2,0


In [37]:
# Reorder the columns by the counts found in the row labelled 2.
counts2_df.sort_values(by=2, axis="columns")

Unnamed: 0,again,second,welcome,python,hello,repeating,data,all,frame,words,is,for,string,test,third,this,the
0,1,0,1,2,3,1,0,1,0,1,1,0,0,0,0,1,0
1,0,1,0,0,0,0,1,0,1,0,0,1,1,0,0,0,1
2,0,0,0,0,0,0,0,0,0,0,1,1,1,1,1,1,2


<br>

__comparing performance with other options__

In [39]:
# Baseline timing: the sort_values call used above to order columns by row 0.
%timeit counts2_df.sort_values(by=0, axis=1)

154 µs ± 2.08 µs per loop (mean ± std. dev. of 7 runs, 10,000 loops each)


In [42]:
# Alternative: np.argsort returns the positions that would sort the row with index 0;
# iloc then reorders the columns using those positions.
%timeit counts2_df.iloc[:, np.argsort(counts2_df.loc[0])]

270 µs ± 7.35 µs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)


In [1]:
free -h

NameError: name 'free' is not defined