In [None]:
import pandas as pd
df = pd.read_json("/Users/cameliamazouz/Documents/M2/machine_learning/multinli_1.0/multinli_1.0_train.jsonl",lines=True)

In [20]:
df.columns

Index(['annotator_labels', 'genre', 'gold_label', 'pairID', 'promptID',
       'sentence1', 'sentence1_binary_parse', 'sentence1_parse', 'sentence2',
       'sentence2_binary_parse', 'sentence2_parse'],
      dtype='object')

#### lemmatisation 


In [21]:
import nltk
nltk.download('punkt_tab')      
nltk.download('wordnet')    
nltk.download('omw-1.4') 
nltk.download('averaged_perceptron_tagger_eng')
nltk.download('stopwords')

[nltk_data] Downloading package punkt_tab to
[nltk_data]     /Users/cameliamazouz/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/cameliamazouz/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /Users/cameliamazouz/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /Users/cameliamazouz/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger_eng is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/cameliamazouz/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [22]:
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords


lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))
def tokenize_and_lemmatize(text):
    tokens = word_tokenize(text)
    lemma=[]
    for t in tokens:
        if not(t.lower() in stop_words):
            lemma.append(lemmatizer.lemmatize(t.lower()))
    return ' '.join(lemma)

df["lemma1"] = df["sentence1"].apply(tokenize_and_lemmatize)
df["lemma2"] = df["sentence2"].apply(tokenize_and_lemmatize)

import re

def common_words_count(s1, s2):
    w1 = set(re.findall(r"\w+", s1.lower()))
    w2 = set(re.findall(r"\w+", s2.lower()))
    return len(w1 & w2)

df['same_lemma'] = df.apply(
    lambda row: common_words_count(row['lemma1'], row['lemma2']),
    axis=1
)

# Ratio de mots communs (normalisé)
def common_words_ratio(s1, s2):
    w1 = set(re.findall(r"\w+", s1.lower()))
    w2 = set(re.findall(r"\w+", s2.lower()))
    if len(w1) == 0 or len(w2) == 0:
        return 0
    return len(w1 & w2) / min(len(w1), len(w2))

df['same_lemma_ratio'] = df.apply(
    lambda row: common_words_ratio(row['lemma1'], row['lemma2']),
    axis=1
)

#### Vecteurs

In [23]:
label_map = {"entailment": 0, "neutral": 1, "contradiction": 2}
df["label_id"] = df["gold_label"].map(label_map)

In [24]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer

# Nettoyage rapide (les NaNs font planter le vectorizer)
df["sentence1"] = df["sentence1"].fillna("")
df["sentence2"] = df["sentence2"].fillna("")

# 2. Création du Vectorizer
# On peut ajouter stop_words='english' pour virer les "the", "is", "a"...
vectorizer = CountVectorizer(stop_words='english', max_features=1000) 

# 3. Apprentissage du vocabulaire sur TOUT le texte (S1 + S2)
# C'est crucial pour que la colonne 42 corresponde au mot "apple" dans les deux vecteurs
all_text = pd.concat([df["sentence1"], df["sentence2"]])
vectorizer.fit(all_text)

# 4. Transformation en vecteurs de fréquence
# X1 = Fréquence des mots dans sentence1
# X2 = Fréquence des mots dans sentence2 (en utilisant le même dictionnaire que S1)
X1 = vectorizer.transform(df["sentence1"])
X2 = vectorizer.transform(df["sentence2"])


##### Similarite textuelle

In [25]:
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.spatial.distance import cosine
import numpy as np

# Similarité cosinus entre les phrases
def cosine_similarity_feature(s1, s2, vectorizer):
    v1 = vectorizer.transform([s1]).toarray()[0]
    v2 = vectorizer.transform([s2]).toarray()[0]
    return 1 - cosine(v1, v2) if np.sum(v1) > 0 and np.sum(v2) > 0 else 0

df['cosine_sim'] = df.apply(
    lambda row: cosine_similarity_feature(row['sentence1'], row['sentence2'], vectorizer),
    axis=1
)

# Ratio de longueur (au lieu de différence absolue)
df['length_ratio'] = df['sentence1'].str.len() / (df['sentence2'].str.len() + 1)

# Jaccard similarity sur les mots
def jaccard_similarity(s1, s2):
    w1 = set(re.findall(r"\w+", s1.lower()))
    w2 = set(re.findall(r"\w+", s2.lower()))
    if len(w1 | w2) == 0:
        return 0
    return len(w1 & w2) / len(w1 | w2)

df['jaccard'] = df.apply(
    lambda row: jaccard_similarity(row['sentence1'], row['sentence2']),
    axis=1
)

##### TF-IDF

In [26]:
# Remplacer CountVectorizer par TfidfVectorizer
tfidf = TfidfVectorizer(stop_words='english', max_features=2000, ngram_range=(1, 2))
all_text = pd.concat([df["sentence1"], df["sentence2"]])
tfidf.fit(all_text)

X1_tfidf = tfidf.transform(df["sentence1"])
X2_tfidf = tfidf.transform(df["sentence2"])

In [27]:
import pandas as pd

# On choisit l'index que tu voulais (1)
idx = 1

# 1. Récupérer le dictionnaire (la liste des mots dans l'ordre des colonnes)
vocabulaire = vectorizer.get_feature_names_out()

# 2. Récupérer les comptes pour la ligne choisie
# .flatten() permet d'aplatir le tableau (convertir [[0,1]] en [0,1])
compte_s1 = X1[idx].toarray().flatten()
compte_s2 = X2[idx].toarray().flatten()

# 3. Créer un tableau propre pour l'affichage
df_visu = pd.DataFrame({
    'Mot': vocabulaire,
    'S1 (Freq)': compte_s1,
    'S2 (Freq)': compte_s2
})

# 4. FILTRAGE : On n'affiche que les mots présents dans au moins l'une des phrases
# (Sinon on va afficher 990 lignes de zéros)
mask = (df_visu['S1 (Freq)'] > 0) | (df_visu['S2 (Freq)'] > 0)
df_resultat = df_visu[mask].sort_values(by='S1 (Freq)', ascending=False)

# --- AFFICHAGE ---
# print(f"--- Analyse de l'index {idx} ---\n")
# print(f"Phrase 1 : \"{df['sentence1'].iloc[idx]}\"")
# print(f"Phrase 2 : \"{df['sentence2'].iloc[idx]}\"\n")
# print("Mots comptés (intersection avec le vocabulaire connu) :")
# print(df_resultat.to_string(index=False))

##### Features lexi

In [28]:
def overlap_ratio(s1, s2):
    w1 = set(s1.split())
    w2 = set(s2.split())
    return len(w1 & w2) / min(len(w1), len(w2)) if min(len(w1), len(w2)) > 0 else 0
df['overlap_ratio'] = df.apply(
    lambda row: overlap_ratio(row['lemma1'], row['lemma2']),
    axis=1)

def common_ratio(s1, s2):
    w1 = set(s1.split())
    w2 = set(s2.split())
    common = len(w1 & w2)
    return common / ((len(w1) + len(w2)) / 2) if (len(w1) + len(w2)) / 2 > 0 else 0
df['common_ratio'] = df.apply(
    lambda row: common_ratio(row['lemma1'], row['lemma2']),
    axis=1)
neg_words = {'no','not','never','none','nothing','nobody','without'}

def neg_count(s):
    return sum(1 for w in s.split() if w in neg_words)

df['neg1'] = df['lemma1'].apply(neg_count)
df['neg2'] = df['lemma2'].apply(neg_count)
df['neg_diff'] = abs(df['neg1'] - df['neg2'])


##### Logistic Regression

In [29]:
# from scipy.sparse import hstack
# from sklearn.model_selection import train_test_split
# from sklearn.linear_model import LogisticRegression
# from sklearn.metrics import accuracy_score, classification_report
# import re

# # Créer les features dif et same
# df["dif"] = (
#     df['sentence1'].str.len() -
#     df['sentence2'].str.len()
# ).abs()

# def common_words_count(s1, s2):
#     w1 = set(re.findall(r"\w+", s1.lower()))
#     w2 = set(re.findall(r"\w+", s2.lower()))
#     return len(w1 & w2)

# df['same'] = df.apply(
#     lambda row: common_words_count(row['sentence1'], row['sentence2']),
#     axis=1
# )


# # Nouvelles features à ajouter
# new_features = ['cosine_sim', 'length_ratio', 'jaccard', 
#                  'same_lemma_ratio']

# X_combined = hstack([
#     X1_tfidf, 
#     X2_tfidf, 
#     df[['dif'] + new_features].values
# ])

# y = df["label_id"]

# print(f"Split des données (10% test sur {len(df)} lignes)...")
# X_train, X_test, y_train, y_test = train_test_split(
#     X_combined, y, test_size=0.10, random_state=42
# )

# # 6. ENTRAÎNEMENT (Régression Logistique)
# print("Entraînement du modèle...")
# model = LogisticRegression(max_iter=1000)
# model.fit(X_train, y_train)

# # 7. ÉVALUATION
# predictions = model.predict(X_test)
# acc = accuracy_score(y_test, predictions)

# print(f"\n--- RÉSULTATS ---")
# print(f"Accuracy (Précision globale) : {acc:.4f} ({acc*100:.2f}%)")

# # Détail par classe
# target_names = ["Entailment (0)", "Neutral (1)", "Contradiction (2)"]
# print("\nDétail par classe :")
# print(classification_report(y_test, predictions, target_names=target_names))

##### SVM

In [None]:
from scipy.sparse import hstack
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report
import re

# Créer les features dif et same
df["dif"] = (
    df['sentence1'].str.len() -
    df['sentence2'].str.len()
).abs()

def common_words_count(s1, s2):
    w1 = set(re.findall(r"\w+", s1.lower()))
    w2 = set(re.findall(r"\w+", s2.lower()))
    return len(w1 & w2)

df['same'] = df.apply(
    lambda row: common_words_count(row['sentence1'], row['sentence2']),
    axis=1
)

new_features = ['cosine_sim', 'length_ratio', 'jaccard', 
                 'same_lemma_ratio','neg1','neg2','neg_diff']

X_combined = hstack([
    X1_tfidf, 
    X2_tfidf, 
    df[['dif'] + new_features].values
])

y = df["label_id"]

print(f"Split des données (10% test sur {len(df)} lignes)...")
X_train, X_test, y_train, y_test = train_test_split(
    X_combined, y, test_size=0.10, random_state=42
)

# 6. ENTRAÎNEMENT (SVM)
print("Entraînement du modèle SVM...")
model = SVC(kernel='linear', C=1.0, random_state=42)
model.fit(X_train, y_train)

# 7. ÉVALUATION
predictions = model.predict(X_test)
acc = accuracy_score(y_test, predictions)

print(f"\n--- RÉSULTATS ---")
print(f"Accuracy (Précision globale) : {acc:.4f} ({acc*100:.2f}%)")

# Détail par classe
target_names = ["Entailment (0)", "Neutral (1)", "Contradiction (2)"]
print("\nDétail par classe :")
print(classification_report(y_test, predictions, target_names=target_names))

Split des données (10% test sur 1000 lignes)...
Entraînement du modèle SVM...

--- RÉSULTATS ---
Accuracy (Précision globale) : 0.4200 (42.00%)

Détail par classe :
                   precision    recall  f1-score   support

   Entailment (0)       0.47      0.55      0.51        31
      Neutral (1)       0.39      0.31      0.35        29
Contradiction (2)       0.39      0.40      0.40        40

         accuracy                           0.42       100
        macro avg       0.42      0.42      0.42       100
     weighted avg       0.42      0.42      0.42       100

