In [3]:
import pandas as pd

pd.set_option('display.max_colwidth', None)

df = pd.read_json('multinli_1.0_train.jsonl', lines=True)


In [9]:
import pandas as pd

df2 = df[df['annotator_labels'].apply(
    lambda x: any(lbl in x for lbl in ['entailment', 'contradiction', 'neutral'])
)].copy()

def map_class(labels):
    if 'contradiction' in labels:
        return 1
    elif 'entailment' in labels:
        return 0
    elif 'neutral' in labels:
        return 2

df2['class'] = df2['annotator_labels'].apply(map_class)

df2['dif'] = (
    df2['sentence1'].str.len() -
    df2['sentence2'].str.len()
).abs()


In [10]:
import re

def common_words_count(s1, s2):
    w1 = set(re.findall(r"\w+", s1.lower()))
    w2 = set(re.findall(r"\w+", s2.lower()))
    return len(w1 & w2)

df2['same'] = df2.apply(
    lambda row: common_words_count(row['sentence1'], row['sentence2']),
    axis=1
)


# XOR

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import TensorDataset, DataLoader
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.sparse import hstack
y = df2['class']
vectorizer = TfidfVectorizer(stop_words='english')

# Fit sur les deux colonnes réunies (pour un vocabulaire commun)
vectorizer.fit(pd.concat([df2['sentence1'], df2['sentence2']]))

X1 = vectorizer.transform(df2['sentence1'])
X2 = vectorizer.transform(df2['sentence2'])

# Concaténation des vecteurs
X = hstack([X1, X2])

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)
# Convertir en tenseurs PyTorch
X_train_tensor = torch.tensor(X_train.toarray(), dtype=torch.float32)  # si TF-IDF sparse
X_test_tensor = torch.tensor(X_test.toarray(), dtype=torch.float32)

# Transformer y en indices numériques si ce n'est pas déjà le cas
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
y_train_tensor = torch.tensor(le.fit_transform(y_train), dtype=torch.long)
y_test_tensor = torch.tensor(le.transform(y_test), dtype=torch.long)

# Créer DataLoader pour batch training
train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
test_dataset = TensorDataset(X_test_tensor, y_test_tensor)

train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=32)


In [None]:
df2.columns

In [None]:
X = df2[["same", "same_lemma", "dif"]]
y = df2['class']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)
X_train_tensor = torch.tensor(X_train.values, dtype=torch.float32)
X_test_tensor  = torch.tensor(X_test.values, dtype=torch.float32)
le = LabelEncoder()
y_train_tensor = torch.tensor(le.fit_transform(y_train), dtype=torch.long)
y_test_tensor = torch.tensor(le.transform(y_test), dtype=torch.long)

# Créer DataLoader pour batch training
train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
test_dataset = TensorDataset(X_test_tensor, y_test_tensor)

train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=32)


In [None]:
class SimpleNN(nn.Module):
    def __init__(self, input_dim, hidden_dim=128, output_dim=3):  # 3 classes
        super(SimpleNN, self).__init__()
        self.fc1 = nn.Linear(input_dim, hidden_dim)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(hidden_dim, output_dim)
        # softmax est incluse dans CrossEntropyLoss, donc pas besoin ici

    def forward(self, x):
        x = self.fc1(x)
        x = self.relu(x)
        x = self.fc2(x)
        return x

input_dim = X_train_tensor.shape[1]
model = SimpleNN(input_dim)


In [None]:
criterion = nn.CrossEntropyLoss()  # multiclass
optimizer = optim.Adam(model.parameters(), lr=0.001)
epochs = 10

for epoch in range(epochs):
    model.train()
    total_loss = 0
    for X_batch, y_batch in train_loader:
        optimizer.zero_grad()
        outputs = model(X_batch)
        loss = criterion(outputs, y_batch)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    print(f"Epoch {epoch+1}/{epochs}, Loss: {total_loss/len(train_loader):.4f}")


In [None]:
model.eval()
correct = 0
total = 0

with torch.no_grad():
    for X_batch, y_batch in test_loader:
        outputs = model(X_batch)
        _, predicted = torch.max(outputs, 1)
        total += y_batch.size(0)
        correct += (predicted == y_batch).sum().item()

print("Test Accuracy:", correct / total)


# Cours 2

In [32]:
import pandas as pd

pd.set_option('display.max_colwidth', None)

df = pd.read_json('multinli_1.0_train.jsonl', lines=True)


In [33]:
import nltk
nltk.download('punkt_tab')
nltk.download('wordnet')
nltk.download('omw-1.4')
nltk.download('averaged_perceptron_tagger_eng')
nltk.download('stopwords')
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords


lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))
def tokenize_and_lemmatize(text):
    tokens = word_tokenize(text)
    lemma=[]
    for t in tokens:
        if not(t.lower() in stop_words):
            lemma.append(lemmatizer.lemmatize(t.lower()))
    return ' '.join(lemma)

df["lemma1"] = df["sentence1"].apply(tokenize_and_lemmatize)
df["lemma2"] = df["sentence2"].apply(tokenize_and_lemmatize)

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger_eng is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [34]:
def jaccard(s1, s2):
    w1 = set(s1.split())
    w2 = set(s2.split())
    return len(w1 & w2) / len(w1 | w2) if len(w1 | w2) > 0 else 0
df['jaccard'] = df.apply(
    lambda row: jaccard(row['lemma1'], row['lemma2']),
    axis=1)

def overlap_ratio(s1, s2):
    w1 = set(s1.split())
    w2 = set(s2.split())
    return len(w1 & w2) / min(len(w1), len(w2)) if min(len(w1), len(w2)) > 0 else 0
df['overlap_ratio'] = df.apply(
    lambda row: overlap_ratio(row['lemma1'], row['lemma2']),
    axis=1)

def common_ratio(s1, s2):
    w1 = set(s1.split())
    w2 = set(s2.split())
    common = len(w1 & w2)
    return common / ((len(w1) + len(w2)) / 2) if (len(w1) + len(w2)) / 2 > 0 else 0
df['common_ratio'] = df.apply(
    lambda row: common_ratio(row['lemma1'], row['lemma2']),
    axis=1)

df['len_diff'] = abs(
    df['lemma1'].str.split().str.len() -
    df['lemma2'].str.split().str.len()
)
df['len_ratio'] = (
    df['lemma1'].str.split().str.len() /
    df['lemma2'].str.split().str.len()
).replace([float('nan')], 0).replace([float('inf')], 0)


neg_words = {'no','not','never','none','nothing','nobody','without'}

def neg_count(s):
    return sum(1 for w in s.split() if w in neg_words)

df['neg1'] = df['lemma1'].apply(neg_count)
df['neg2'] = df['lemma2'].apply(neg_count)
df['neg_diff'] = abs(df['neg1'] - df['neg2'])


In [35]:
def map_class(labels):
    if 'contradiction' in labels:
        return 1
    elif 'entailment' in labels:
        return 0
    elif 'neutral' in labels:
        return 2

df['class'] = df['annotator_labels'].apply(map_class)

In [36]:
df.columns

Index(['annotator_labels', 'genre', 'gold_label', 'pairID', 'promptID',
       'sentence1', 'sentence1_binary_parse', 'sentence1_parse', 'sentence2',
       'sentence2_binary_parse', 'sentence2_parse', 'lemma1', 'lemma2',
       'jaccard', 'overlap_ratio', 'common_ratio', 'len_diff', 'len_ratio',
       'neg1', 'neg2', 'neg_diff', 'class'],
      dtype='object')

In [37]:
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.sparse import hstack

tfidf = TfidfVectorizer(
    ngram_range=(1,2),
    min_df=5,
    max_df=0.9,
    stop_words='english'
)

X_tfidf_1 = tfidf.fit_transform(df['lemma1'])
X_tfidf_2 = tfidf.transform(df['lemma2'])

In [38]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression


num_features = [
    'jaccard',
    'overlap_ratio',
    'common_ratio',
    'len_diff',
    'len_ratio',
    'neg1',
    'neg2',
    'neg_diff'
]

X_num = df[num_features].values


X = hstack([ X_tfidf_1, X_tfidf_2,X_num])
y = df['class']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)
clf = LogisticRegression(max_iter=1000).fit(X_train, y_train)
clf.score(X_test, y_test)


KeyboardInterrupt: 

In [39]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression


num_features = [
    'jaccard',
    'overlap_ratio',
    'common_ratio',
    'len_diff',
    'len_ratio',
    'neg1',
    'neg2',
    'neg_diff'
]

X = df[num_features]
y = df['class']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)
clf = LogisticRegression(max_iter=1000).fit(X_train, y_train)
clf.score(X_test, y_test)


0.48030353186829977

In [None]:
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
import matplotlib.pyplot as plt

y_pred = clf.predict(X_test)

cm = confusion_matrix(y_test, y_pred)
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=clf.classes_)
disp.plot(cmap='Blues')
plt.show()


In [None]:
from sklearn.metrics import classification_report

print(classification_report(y_test, y_pred))


In [None]:
from sklearn.svm import LinearSVC

svm = LinearSVC(
    C=1.0,
    class_weight='balanced',
    max_iter=5000
)

svm.fit(X_train, y_train)
svm.score(X_test, y_test)
