In [2]:
import pandas as pd

# Load the sampled dataset from the CSV file.
df = pd.read_csv('/content/sampled_data.csv')

# Preview the first rows, then list the column names.
print(df.head())
print("Colonnes du dataset :", df.columns, sep="\n")


      Category                                            Content
0  Middle East  ""Sadness, anger in Palestine: Gaza mourns 13 ...
1  Middle East  Sudan’s doctors battle to keep people alive,""...
2  Middle East  How has the world reacted to Syria rejoining t...
3  Middle East  Police question two Tunisian journalists after...
4  Middle East  United Nations backs Sudan envoy as army seeks...
Colonnes du dataset :
Index(['Category', 'Content'], dtype='object')


In [5]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.preprocessing import LabelEncoder
import re
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')

# Fonction de prétraitement du texte
def preprocess_text(text, stop_words=None):
    """Normalize a raw document for bag-of-words vectorization.

    The text is lower-cased, every character that is not an ASCII letter or
    whitespace is removed, and stop words are dropped. The remaining tokens
    are joined with single spaces.

    Args:
        text: Raw document. Anything that is not a ``str`` (e.g. ``None`` or a
            NaN coming from a missing CSV cell) is treated as an empty document.
        stop_words: Optional container of lower-case words to remove. When
            omitted, the NLTK English stop-word list is used.

    Returns:
        The cleaned text as a single space-separated string ('' for missing input).
    """
    # Missing values have no text to clean; avoid AttributeError on .lower().
    if not isinstance(text, str):
        return ''

    if stop_words is None:
        # Build the NLTK English stop-word set once and reuse it on later
        # calls instead of reloading the list for every document.
        stop_words = getattr(preprocess_text, '_english_stop_words', None)
        if stop_words is None:
            stop_words = frozenset(stopwords.words('english'))
            preprocess_text._english_stop_words = stop_words

    # Lower-case, then strip special characters and digits.
    letters_only = re.sub(r'[^a-zA-Z\s]', '', text.lower())

    # str.split() already splits on runs of whitespace, so no separate
    # whitespace-collapsing pass is needed before filtering the tokens.
    return ' '.join(word for word in letters_only.split() if word not in stop_words)

# Charger et prétraiter les données
def prepare_data(df):
    """Add the cleaned text and the integer-encoded label to a dataset.

    Args:
        df: DataFrame with a 'Content' column (raw text) and a 'Category'
            column (class label).

    Returns:
        A ``(prepared_df, label_encoder)`` tuple. ``prepared_df`` is a new
        DataFrame holding the input columns plus 'processed_content' and
        'encoded_category'; ``label_encoder`` is the fitted ``LabelEncoder``
        used to produce 'encoded_category'.
    """
    le = LabelEncoder()

    # assign() returns a new frame, so the caller's DataFrame is not mutated
    # and re-running the cell always starts from the same input.
    prepared = df.assign(
        processed_content=df['Content'].apply(preprocess_text),
        encoded_category=le.fit_transform(df['Category']),
    )

    return prepared, le

# Créer le pipeline de classification
def create_baseline_model(df, le):
    """Train and evaluate a TF-IDF + logistic-regression baseline.

    The data is split 80/20 (stratified on the encoded label, seed 42), a
    ``TfidfVectorizer`` is fitted on the training texts only, and a
    ``LogisticRegression`` is trained on the resulting features. A
    classification report for the test split is printed.

    Args:
        df: DataFrame with 'processed_content' and 'encoded_category' columns.
        le: Fitted LabelEncoder; assumed to be the encoder that produced
            'encoded_category', so codes are positions in ``le.classes_``.

    Returns:
        Tuple ``(model, tfidf, feature_importance, X_train_tfidf,
        X_test_tfidf, y_train, y_test)`` where ``feature_importance`` is a
        DataFrame of model coefficients (rows = categories, columns = terms).
    """
    # Split the data.
    X_train, X_test, y_train, y_test = train_test_split(
        df['processed_content'],
        df['encoded_category'],
        test_size=0.2,
        random_state=42,
        stratify=df['encoded_category']
    )

    # Build the TF-IDF vectorizer and fit it on the training texts only.
    tfidf = TfidfVectorizer(
        max_features=5000,  # Cap the vocabulary size
        min_df=5,          # Ignore terms appearing in fewer than 5 documents
        max_df=0.95,       # Ignore terms appearing in more than 95% of documents
        ngram_range=(1, 2) # Use unigrams and bigrams
    )

    X_train_tfidf = tfidf.fit_transform(X_train)
    X_test_tfidf = tfidf.transform(X_test)

    # `multi_class` is deprecated since scikit-learn 1.5; the default already
    # fits a multinomial model when there are three or more classes, so the
    # argument is simply omitted.
    model = LogisticRegression(
        max_iter=1000,
        random_state=42
    )

    model.fit(X_train_tfidf, y_train)

    # Evaluate on the held-out split.
    y_pred = model.predict(X_test_tfidf)

    # Passing `labels` keeps `target_names` aligned with the encoded classes
    # even if one of them is absent from the test split.
    print("Classification Report:")
    print(classification_report(
        y_test,
        y_pred,
        labels=np.arange(len(le.classes_)),
        target_names=le.classes_
    ))

    # One coefficient row per class the model was trained on; decode the row
    # labels from model.classes_ so each row carries its own category name.
    feature_importance = pd.DataFrame(
        model.coef_,
        columns=tfidf.get_feature_names_out(),
        index=le.inverse_transform(model.classes_)
    )

    return model, tfidf, feature_importance, X_train_tfidf, X_test_tfidf, y_train, y_test

# Fonction pour obtenir les features les plus importantes
def get_top_features(feature_importance, n_features=10):
    """Return the highest-scoring features of every category.

    Args:
        feature_importance: DataFrame with one row per category and one
            column per feature.
        n_features: Number of features to keep per category.

    Returns:
        Dict mapping each category (row label) to a Series holding its
        ``n_features`` largest values, sorted in descending order.
    """
    return {
        category: feature_importance.loc[category].nlargest(n_features)
        for category in feature_importance.index
    }

# Exécution principale
if __name__ == "__main__":
    # Clean the text and encode the labels.
    df, label_encoder = prepare_data(df)

    # Train and evaluate the baseline, keeping every artifact it returns.
    (
        model,
        tfidf,
        feature_importance,
        X_train_tfidf,
        X_test_tfidf,
        y_train,
        y_test,
    ) = create_baseline_model(df, label_encoder)

    # Collect the top features of each class.
    top_features = get_top_features(feature_importance)

    # Print each category name followed by its top features.
    print("\nTop features par catégorie:")
    for category, features in top_features.items():
        print(f"\n{category}:", features, sep="\n")

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


Classification Report:
                      precision    recall  f1-score   support

              Africa       0.72      0.65      0.68       100
                Asia       0.82      0.78      0.80       100
      Climate Crisis       0.76      0.86      0.81       100
         Coronavirus       0.69      0.76      0.72       100
             Economy       0.59      0.61      0.60       100
              Europe       0.73      0.59      0.65       100
         Middle East       0.90      0.82      0.86       100
             Opinion       0.98      1.00      0.99       100
Science & Technology       0.65      0.64      0.65       100
              Sports       0.77      0.90      0.83       100

            accuracy                           0.76      1000
           macro avg       0.76      0.76      0.76      1000
        weighted avg       0.76      0.76      0.76      1000


Top features par catégorie:

Africa:
african      3.983930
rwanda       2.343691
nigeria      2.276642
af

In [10]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.preprocessing import LabelEncoder
import re
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')

def preprocess_text(text, stop_words=None):
    """Normalize a raw document for bag-of-words vectorization.

    The text is lower-cased, every character that is not an ASCII letter or
    whitespace is removed, and stop words are dropped. The remaining tokens
    are joined with single spaces.

    Args:
        text: Raw document. Anything that is not a ``str`` (e.g. ``None`` or a
            NaN coming from a missing CSV cell) is treated as an empty document.
        stop_words: Optional container of lower-case words to remove. When
            omitted, the NLTK English stop-word list is used.

    Returns:
        The cleaned text as a single space-separated string ('' for missing input).
    """
    # Missing values have no text to clean; avoid AttributeError on .lower().
    if not isinstance(text, str):
        return ''

    if stop_words is None:
        # Build the NLTK English stop-word set once and reuse it on later
        # calls instead of reloading the list for every document.
        stop_words = getattr(preprocess_text, '_english_stop_words', None)
        if stop_words is None:
            stop_words = frozenset(stopwords.words('english'))
            preprocess_text._english_stop_words = stop_words

    # Lower-case, then strip special characters and digits.
    letters_only = re.sub(r'[^a-zA-Z\s]', '', text.lower())

    # str.split() already splits on runs of whitespace, so no separate
    # whitespace-collapsing pass is needed before filtering the tokens.
    return ' '.join(word for word in letters_only.split() if word not in stop_words)

def prepare_data(df):
    """Add the cleaned text and the integer-encoded label to a dataset.

    Args:
        df: DataFrame with a 'Content' column (raw text) and a 'Category'
            column (class label).

    Returns:
        A ``(prepared_df, label_encoder)`` tuple. ``prepared_df`` is a new
        DataFrame holding the input columns plus 'processed_content' and
        'encoded_category'; ``label_encoder`` is the fitted ``LabelEncoder``
        used to produce 'encoded_category'.
    """
    le = LabelEncoder()

    # assign() returns a new frame, so the caller's DataFrame is not mutated
    # and re-running the cell always starts from the same input.
    prepared = df.assign(
        processed_content=df['Content'].apply(preprocess_text),
        encoded_category=le.fit_transform(df['Category']),
    )

    return prepared, le

def get_tfidf_importance(X_tfidf, feature_names, category_docs):
    """
    Compute the mean TF-IDF weight of every feature over one category's documents.

    Args:
        X_tfidf: Document-term matrix (rows = documents, columns = features);
            a SciPy sparse matrix or a dense array.
        feature_names: One name per column of ``X_tfidf``.
        category_docs: Row selector for the category's documents, typically a
            boolean mask (NumPy array or pandas Series) with one entry per row.

    Returns:
        A pandas Series named 'tfidf_importance', indexed by 'feature', with
        the column-wise mean over the selected rows. All values are 0.0 when
        the selector matches no document.
    """
    # Use the selector positionally: a pandas Series is reduced to its raw
    # values so its index labels play no part in the row selection.
    row_selector = np.asarray(category_docs)
    category_tfidf = X_tfidf[row_selector]

    if category_tfidf.shape[0] == 0:
        # The mean of an empty selection is undefined; report zero weight
        # for every feature instead.
        tfidf_means = np.zeros(X_tfidf.shape[1])
    else:
        # Sparse means come back as a 1 x n matrix; flatten to a 1-D vector.
        tfidf_means = np.asarray(category_tfidf.mean(axis=0)).flatten()

    return pd.Series(
        tfidf_means,
        index=pd.Index(feature_names, name='feature'),
        name='tfidf_importance',
    )

def create_baseline_model(df, le):
    """Train the TF-IDF + logistic-regression baseline and collect importances.

    The data is split 80/20 (stratified on the encoded label, seed 42), a
    ``TfidfVectorizer`` is fitted on the training texts only, and a
    ``LogisticRegression`` is trained on the resulting features. A
    classification report for the test split is printed.

    Args:
        df: DataFrame with 'processed_content' and 'encoded_category' columns.
        le: Fitted LabelEncoder; assumed to be the encoder that produced
            'encoded_category', so codes are positions in ``le.classes_``.

    Returns:
        Tuple ``(model, tfidf, model_importance, tfidf_importance_by_category,
        X_train_tfidf, X_test_tfidf, y_train, y_test)``. ``model_importance``
        is a DataFrame of model coefficients (rows = categories, columns =
        terms); ``tfidf_importance_by_category`` maps each category to the
        Series returned by ``get_tfidf_importance`` for its training documents.
    """
    # Split the data.
    X_train, X_test, y_train, y_test = train_test_split(
        df['processed_content'],
        df['encoded_category'],
        test_size=0.2,
        random_state=42,
        stratify=df['encoded_category']
    )

    # Build the TF-IDF vectorizer and fit it on the training texts only.
    tfidf = TfidfVectorizer(
        max_features=5000,
        min_df=5,
        max_df=0.95,
        ngram_range=(1, 2)
    )

    X_train_tfidf = tfidf.fit_transform(X_train)
    X_test_tfidf = tfidf.transform(X_test)

    # `multi_class` is deprecated since scikit-learn 1.5; the default already
    # fits a multinomial model when there are three or more classes, so the
    # argument is simply omitted.
    model = LogisticRegression(
        max_iter=1000,
        random_state=42
    )

    model.fit(X_train_tfidf, y_train)

    # Evaluate on the held-out split.
    y_pred = model.predict(X_test_tfidf)

    # Passing `labels` keeps `target_names` aligned with the encoded classes
    # even if one of them is absent from the test split.
    print("Classification Report:")
    print(classification_report(
        y_test,
        y_pred,
        labels=np.arange(len(le.classes_)),
        target_names=le.classes_
    ))

    feature_names = tfidf.get_feature_names_out()

    # One coefficient row per class the model was trained on; decode the row
    # labels from model.classes_ so each row carries its own category name.
    model_importance = pd.DataFrame(
        model.coef_,
        columns=feature_names,
        index=le.inverse_transform(model.classes_)
    )

    # Mean TF-IDF weight of every term within each category's training
    # documents. A label's code is its position in le.classes_, and the mask
    # is a plain NumPy array so that rows are selected by position.
    y_train_codes = np.asarray(y_train)
    tfidf_importance_by_category = {
        category: get_tfidf_importance(X_train_tfidf, feature_names, y_train_codes == code)
        for code, category in enumerate(le.classes_)
    }

    return model, tfidf, model_importance, tfidf_importance_by_category, X_train_tfidf, X_test_tfidf, y_train, y_test

def compare_feature_importance(model_importance, tfidf_importance_by_category, n_features=10):
    """Compare model coefficients with mean TF-IDF weights, per category.

    For every category the result lists the ``n_features`` features with the
    largest model coefficient, followed by the features that are in the
    TF-IDF top ``n_features`` but not in the model's top ``n_features``.

    Args:
        model_importance: DataFrame of model coefficients (rows = categories,
            columns = features).
        tfidf_importance_by_category: Mapping from category to a Series of
            TF-IDF importances indexed by feature.
        n_features: Size of each top list.

    Returns:
        Dict mapping each category to a DataFrame indexed by feature with
        columns 'Model_Importance', 'TFIDF_Importance' and 'TFIDF_Rank'.
        'TFIDF_Rank' is always the 1-based rank of the feature among ALL
        features of the category when sorted by descending TF-IDF importance.
    """
    comparisons = {}

    for category in model_importance.index:
        model_scores = model_importance.loc[category]
        tfidf_scores = tfidf_importance_by_category[category]

        # Global TF-IDF rank of every feature. A stable sort keeps ties in
        # their original order so ranks are deterministic.
        ordered = tfidf_scores.sort_values(ascending=False, kind='mergesort')
        tfidf_rank = pd.Series(range(1, len(ordered) + 1), index=ordered.index)

        # Top features according to the model and according to TF-IDF.
        model_top = model_scores.nlargest(n_features)
        tfidf_top = tfidf_scores.nlargest(n_features)

        # Model-top features first, then the TF-IDF-only ones.
        tfidf_only = tfidf_top.index[~tfidf_top.index.isin(model_top.index)]
        features = model_top.index.append(tfidf_only)

        # Every row, including the TF-IDF-only ones, takes its rank from the
        # same global ranking, so ranks are comparable across rows.
        comparisons[category] = pd.DataFrame(
            {
                'Model_Importance': model_scores.loc[features].to_numpy(),
                'TFIDF_Importance': tfidf_scores.loc[features].to_numpy(),
                'TFIDF_Rank': tfidf_rank.loc[features].to_numpy(),
            },
            index=features,
        )

    return comparisons

if __name__ == "__main__":
    # Clean the text and encode the labels.
    df, label_encoder = prepare_data(df)

    # Train and evaluate the baseline, keeping every artifact it returns.
    (
        model,
        tfidf,
        model_importance,
        tfidf_importance_by_category,
        X_train_tfidf,
        X_test_tfidf,
        y_train,
        y_test,
    ) = create_baseline_model(df, label_encoder)

    # Build the per-category comparison tables.
    comparisons = compare_feature_importance(model_importance, tfidf_importance_by_category)

    # Print one report per category.
    print("\nComparaison de l'importance des features par catégorie:")
    for category, comparison in comparisons.items():
        print(f"\n{'-'*50}")
        print(f"Catégorie: {category}")
        print(f"{'-'*50}")
        print("\nTop features selon le modèle avec leur importance TF-IDF:")

        # Round both importance columns on a copy so the stored table keeps
        # its full precision.
        formatted_comparison = comparison.assign(
            Model_Importance=comparison['Model_Importance'].round(4),
            TFIDF_Importance=comparison['TFIDF_Importance'].round(4),
        )
        print(formatted_comparison.sort_values('Model_Importance', ascending=False))

        # Overlap between the model's top 10 and the TF-IDF top 10.
        model_top = set(model_importance.loc[category].nlargest(10).index)
        tfidf_top = set(tfidf_importance_by_category[category].nlargest(10).index)
        overlap = model_top & tfidf_top

        print(f"\nStatistiques de chevauchement:")
        print(f"Nombre de features communes dans le top 10: {len(overlap)}")
        print(f"Features communes: {list(overlap)}")

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Classification Report:
                      precision    recall  f1-score   support

              Africa       0.72      0.65      0.68       100
                Asia       0.82      0.78      0.80       100
      Climate Crisis       0.76      0.86      0.81       100
         Coronavirus       0.69      0.76      0.72       100
             Economy       0.59      0.61      0.60       100
              Europe       0.73      0.59      0.65       100
         Middle East       0.90      0.82      0.86       100
             Opinion       0.98      1.00      0.99       100
Science & Technology       0.65      0.64      0.65       100
              Sports       0.77      0.90      0.83       100

            accuracy                           0.76      1000
           macro avg       0.76      0.76      0.76      1000
        weighted avg       0.76      0.76      0.76      1000


Comparaison de l'importance des features par catégorie:

------------------------------------------------