In [1]:
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd
import re
import numpy as np
import joblib

# Clickbait dataset: loaded into a DataFrame whose 'headline' column is
# cleaned and vectorized further down in this cell.
df = pd.read_csv('clickbait_data.csv')  # relative path: resolved against the current working directory

# Convert text to lower case and delete symbols
def clean_text(text):
    """Lower-case a headline and strip every character that is not a word character or whitespace.

    Args:
        text: The raw headline. Normally a ``str``; missing values (``None``,
            NaN, ``pd.NA``) and other scalars are tolerated.

    Returns:
        str: The lower-cased text with punctuation/symbols removed. Missing
        values yield ``''``; other non-string scalars are converted with
        ``str()`` before cleaning.

    Note:
        ``\\w`` matches letters (including non-ASCII ones), digits and the
        underscore, so underscores are kept. Whitespace is left untouched.
    """
    if not isinstance(text, str):
        # pd.read_csv represents empty cells as NaN (a float), which has no
        # .lower(); map missing values to an empty document instead of crashing.
        if text is None or pd.isna(text):
            return ''
        text = str(text)
    return re.sub(r'[^\w\s]', '', text.lower())

# Normalise every headline; the vectorizer below is fitted on this cleaned column.
df['cleaned_headline'] = df['headline'].apply(clean_text)

# TF-IDF representation
tfidf_vectorizer = TfidfVectorizer(stop_words='english')  # use the built-in English stop-word list
tfidf_matrix = tfidf_vectorizer.fit_transform(df['cleaned_headline'])  # fit on the cleaned article headlines

# Fetch the vocabulary once and reuse it for both the printout and the dump below.
feature_names = tfidf_vectorizer.get_feature_names_out()

print("TF-IDF matrix size :", tfidf_matrix.shape)
print("TF-IDF vocabulary:", feature_names)

# Save the matrix, the fitted vectorizer and the vocabulary
joblib.dump(tfidf_matrix, 'tfidf_matrix.pkl')
joblib.dump(tfidf_vectorizer, 'tfidf_vectorizer.pkl')
joblib.dump(feature_names, 'feature_names.pkl')


TF-IDF matrix size : (32000, 24458)
TF-IDF vocabulary: ['00' '000' '00s' ... 'ürümqi' 'śrī' 'šibenik']


['feature_names.pkl']