In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer, TfidfTransformer

https://kavita-ganesan.com/tfidftransformer-tfidfvectorizer-usage-differences/#.Y5R1cOzMLVY




In natural language processing (NLP), the term frequency-inverse document frequency (TF-IDF) score is a measure of the importance of a word or phrase to the meaning of a document (or group of documents) in a collection. The TF-IDF score is calculated by combining the term frequency (TF) and the inverse document frequency (IDF) of the word or phrase.

The term frequency (TF) of a word or phrase is the number of times it appears in a document. This value reflects how important the word is to the meaning of the document.

The inverse document frequency (IDF) of a word or phrase is a measure of how frequently it appears in the entire collection of documents. This value reflects how common the word is across the collection of documents. Words that appear frequently in the collection are given a lower IDF score, while words that appear infrequently are given a higher IDF score.

The TF-IDF score is calculated by multiplying the TF and IDF values for a word or phrase. This score reflects both the importance of the word to the meaning of the document and its rarity across the collection of documents. Words and phrases with high TF-IDF scores are considered to be more important and relevant to the meaning of the document than those with low TF-IDF scores.

In summary, the difference between the IDF and the TF-IDF score is that the IDF is a measure of the rarity of a word or phrase across a collection of documents, while the TF-IDF score is a measure of the importance of the word or phrase to the meaning of a specific document in the collection. The TF-IDF score combines both of these factors to give a more complete picture of the significance of a word or phrase to the meaning of a document.

In [2]:
# Load the text data into a DataFrame
data = pd.read_csv('../../data/p_content.csv')


# settings that you use for count vectorizer will go here 
tfidf_vectorizer=TfidfVectorizer(use_idf=True) 
# just send in all your docs here 
tfidf_vectorizer_vectors=tfidf_vectorizer.fit_transform(data['content'])



# get the first vector out (for the first document) 
first_vector_tfidfvectorizer=tfidf_vectorizer_vectors[0] 
# place tf-idf values in a pandas data frame 
df = pd.DataFrame(first_vector_tfidfvectorizer.T.todense(), index=tfidf_vectorizer.get_feature_names(), columns=["tfidf"]) 
df.sort_values(by=["tfidf"],ascending=False)



Unnamed: 0,tfidf
stoff,0.392697
kategorie,0.240359
schützen,0.208823
biologisch,0.196349
umgang,0.160012
...,...
fernunterricht,0.000000
fernsehsender,0.000000
fernandez,0.000000
ferienstimmung,0.000000


The lower the IDF value of a word, the less unique it is to any particular document

In [3]:
#instantiate CountVectorizer() 
cv=CountVectorizer() 
# this steps generates word counts for the words in your docs 
word_count_vector=cv.fit_transform(data['content'])
tfidf_transformer=TfidfTransformer(smooth_idf=True,use_idf=True) 
tfidf_transformer.fit(word_count_vector)

# print idf values 
df_idf = pd.DataFrame(tfidf_transformer.idf_, index=cv.get_feature_names_out(),columns=["idf_weights"]) 
# sort ascending 
df_idf.sort_values(by=['idf_weights'])


Unnamed: 0,idf_weights
maskenpflicht,1.009662
corona,1.101096
werden,1.201740
mehr,1.249942
geben,1.287682
...,...
großaufgebot,4.951244
großarl,4.951244
grosslieferung,4.951244
großversammlungen,4.951244


build feature frame

In [4]:
# Convert the matrix of TF-IDF values to a DataFrame
tfidf_df = pd.DataFrame(tfidf_vectorizer_vectors.toarray(), columns=tfidf_vectorizer.get_feature_names_out())

newcol = ['tfidf_'+n for n in tfidf_df.columns]
#multicol1 = pd.MultiIndex.from_tuples(newcol)

tfidf_df.columns = newcol

tfidf_df.head()

Unnamed: 0,tfidf_0000,tfidf_006121,tfidf_006721,tfidf_024,tfidf_033,tfidf_06,tfidf_079,tfidf_0800,tfidf_082,tfidf_0920,...,tfidf_überwiegen,tfidf_überwinden,tfidf_überzeugen,tfidf_üblich,tfidf_übrig,tfidf_übrigens,tfidf_übriges,tfidf_übung,tfidf_üge,tfidf_üssel
0,0.0,0.0,0.0,0.0,0.0,0.06009,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.044654,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.043459,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.060782,0.0,0.0


In [24]:
df_merged = data[['ID_GodotObject']].merge(tfidf_df, left_index=True, right_index=True)
df_merged = df_merged.T.reset_index(drop=False)
df_merged.columns = df_merged.iloc[0]
df_merged.head()

Unnamed: 0,ID_GodotObject,2000115059032.0,2000116305030.0,2000116325081.0,2000116346340.0,2000116371728.0,2000116569218.0,2000116643454.0,2000116717900.0,2000116807446.0,...,2000129581597.0,2000129584367.0,2000129604031.0,2000129703306.0,2000129831411.0,2000130275144.0,2000130436224.0,2000130505680.0,2000130732406.0,2000130816685.0
0,ID_GodotObject,2000115000000.0,2000116000000.0,2000116000000.0,2000116000000.0,2000116000000.0,2000117000000.0,2000117000000.0,2000117000000.0,2000117000000.0,...,2000130000000.0,2000130000000.0,2000130000000.0,2000130000000.0,2000130000000.0,2000130000000.0,2000130000000.0,2000131000000.0,2000131000000.0,2000131000000.0
1,tfidf_0000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,tfidf_006121,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,tfidf_006721,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,tfidf_024,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


https://www.kaggle.com/code/pierremegret/gensim-word2vec-tutorial/notebook

In [25]:
df_merged.to_csv('../../data/feature/tfidf_content.csv', encoding='utf-8', index=False)

In [7]:
# = df_merged.set_index(['ID_GodotObject']).T.sort_values(by=[2000115059032], ascending=False)
#df_merged_sorted.to_csv('../../data/feature/tfidf_content.csv', encoding='utf-8', index=False)
#df_merged_sorted.head()

In [26]:
count_vect_df = pd.DataFrame(word_count_vector.todense(), columns=cv.get_feature_names_out())
count_vect_df = data[['ID_GodotObject']].merge(count_vect_df, left_index=True, right_index=True)
count_vect_df = count_vect_df.T.reset_index(drop=False)
count_vect_df.columns = count_vect_df.iloc[0]
count_vect_df.to_csv('../../data/feature/wordcount_content.csv', encoding='utf-8', index=False)
count_vect_df.head()

Unnamed: 0,ID_GodotObject,2000115059032,2000116305030,2000116325081,2000116346340,2000116371728,2000116569218,2000116643454,2000116717900,2000116807446,...,2000129581597,2000129584367,2000129604031,2000129703306,2000129831411,2000130275144,2000130436224,2000130505680,2000130732406,2000130816685
0,ID_GodotObject,2000115059032,2000116305030,2000116325081,2000116346340,2000116371728,2000116569218,2000116643454,2000116717900,2000116807446,...,2000129581597,2000129584367,2000129604031,2000129703306,2000129831411,2000130275144,2000130436224,2000130505680,2000130732406,2000130816685
1,0000,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,006121,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,006721,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,024,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
