In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for folder, _subfolders, names in os.walk('/kaggle/input'):
    file_paths = [os.path.join(folder, name) for name in names]
    if file_paths:
        print("\n".join(file_paths))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/dataset1/data.csv


In [2]:
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset, random_split
from sentence_transformers import SentenceTransformer
from sklearn.model_selection import train_test_split

In [4]:
# Select the compute device: the CUDA GPU when one is available, otherwise the CPU
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Load the dataset and keep only the text and label columns
df = pd.read_csv("/kaggle/input/dataset1/data.csv").loc[:, ['text', 'label']]

In [5]:
# Load the pretrained SBERT encoder and place it on the selected device
sbert_model = SentenceTransformer("all-MiniLM-L6-v2")
sbert_model = sbert_model.to(device)

# Encode every text into an embedding tensor, and build the matching
# int64 label tensor directly on the same device
embeddings = sbert_model.encode(df['text'].tolist(), convert_to_tensor=True).to(device)
labels = torch.tensor(df['label'].to_numpy(), dtype=torch.long, device=device)


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.7k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling%2Fconfig.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Batches:   0%|          | 0/1404 [00:00<?, ?it/s]

In [6]:
# Split the data: 80% for training, 20% held out for testing.
# The fixed random_state keeps the split reproducible across runs.
(train_embeddings, test_embeddings,
 train_labels, test_labels) = train_test_split(
    embeddings,
    labels,
    test_size=0.2,
    random_state=42,
)

In [7]:
# Wrap the train/test tensors in datasets and batch them with DataLoaders.
# Only the training loader shuffles its samples; the test loader keeps their order.
train_dataset = TensorDataset(train_embeddings, train_labels)
test_dataset = TensorDataset(test_embeddings, test_labels)
train_loader = DataLoader(dataset=train_dataset, batch_size=32, shuffle=True)
test_loader = DataLoader(dataset=test_dataset, batch_size=32, shuffle=False)

In [8]:
# Classification head applied on top of precomputed SBERT embeddings
class SBERTClassifier(nn.Module):
    """Single linear layer that maps an embedding vector to class logits.

    Args:
        input_dim: Size of the last dimension of the input (default 384).
        num_classes: Number of output logits per sample (default 2).
    """

    def __init__(self, input_dim=384, num_classes=2):
        super().__init__()
        # Classification layer: one logit per class
        self.fc = nn.Linear(in_features=input_dim, out_features=num_classes)

    def forward(self, x):
        """Return the raw (unnormalised) class logits for ``x``."""
        logits = self.fc(x)
        return logits


In [9]:
# Build the classifier on the selected device, plus its loss function and optimizer
model = SBERTClassifier()
model = model.to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(params=model.parameters(), lr=1e-3)

def train_model(model, train_loader, criterion, optimizer, epochs=10):
    """Train ``model`` for ``epochs`` passes over ``train_loader``.

    Every batch is moved to the device that holds the model's parameters, so
    the function does not depend on a notebook-level ``device`` variable.

    Args:
        model: ``nn.Module`` to optimise; it is switched to training mode.
        train_loader: Iterable yielding ``(inputs, labels)`` tensor batches.
        criterion: Loss callable invoked as ``criterion(outputs, labels)``.
        optimizer: Optimizer updating the parameters of ``model``.
        epochs: Number of full passes over ``train_loader``.

    Returns:
        list[float]: Mean per-batch loss of each epoch, in epoch order.

    Raises:
        ValueError: If ``train_loader`` yields no batches (the average loss
            would otherwise be a division by zero).
    """
    # Follow the model's own device; a parameter-free model leaves batches untouched.
    first_param = next(model.parameters(), None)
    target_device = first_param.device if first_param is not None else None

    model.train()
    epoch_losses = []
    for epoch in range(epochs):
        total_loss = 0.0
        # Count batches while iterating so loaders without __len__ also work.
        num_batches = 0
        for texts, labels in train_loader:
            if target_device is not None:
                texts, labels = texts.to(target_device), labels.to(target_device)
            optimizer.zero_grad()
            outputs = model(texts)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
            num_batches += 1
        if num_batches == 0:
            raise ValueError("train_loader yielded no batches; cannot compute an average loss")
        mean_loss = total_loss / num_batches
        epoch_losses.append(mean_loss)
        print(f"Epoch {epoch+1}, Loss: {mean_loss:.4f}")
    return epoch_losses


In [10]:
# Train the model (10 epochs, the same value as the function's default)
train_model(model, train_loader, criterion, optimizer, epochs=10)

Epoch 1, Loss: 0.4050
Epoch 2, Loss: 0.2579
Epoch 3, Loss: 0.2124
Epoch 4, Loss: 0.1870
Epoch 5, Loss: 0.1704
Epoch 6, Loss: 0.1586
Epoch 7, Loss: 0.1497
Epoch 8, Loss: 0.1429
Epoch 9, Loss: 0.1375
Epoch 10, Loss: 0.1328


In [11]:
def evaluate_model(model, test_loader):
    """Compute, print and return the classification accuracy of ``model``.

    Every batch is moved to the device that holds the model's parameters, so
    the function does not depend on a notebook-level ``device`` variable.

    Args:
        model: ``nn.Module`` producing one row of class logits per sample;
            it is switched to evaluation mode.
        test_loader: Iterable yielding ``(inputs, labels)`` tensor batches.

    Returns:
        float: Accuracy as a percentage in the range 0-100.

    Raises:
        ValueError: If ``test_loader`` yields no samples (the accuracy would
            otherwise be a division by zero).
    """
    # Follow the model's own device; a parameter-free model leaves batches untouched.
    first_param = next(model.parameters(), None)
    target_device = first_param.device if first_param is not None else None

    model.eval()
    correct = 0
    total = 0
    with torch.no_grad():
        for texts, labels in test_loader:
            if target_device is not None:
                texts, labels = texts.to(target_device), labels.to(target_device)
            # The predicted class is the index of the largest logit.
            predicted = model(texts).argmax(dim=1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()
    if total == 0:
        raise ValueError("test_loader yielded no samples; accuracy is undefined")
    accuracy = 100 * correct / total
    print(f'Accuracy: {accuracy:.2f}%')
    return accuracy

# Evaluate the model on the held-out test split
evaluate_model(model=model, test_loader=test_loader)

Accuracy: 95.32%


In [13]:
# Hard-coded evaluation samples: a list of (text, label) tuples where the label
# is the integer 0 or 1. The list holds 29 entries and is passed to test_model below.
# NOTE(review): which class each label value denotes is not shown here — confirm
# against the training dataset.
test_samples = [
    ("The government’s new plan to reduce carbon emissions focuses on expanding renewable energy sources, such as solar and wind. The goal is to achieve net-zero emissions by 2050, with the immediate implementation of stricter emissions standards for industries and transportation. Experts are optimistic that these measures will significantly reduce the carbon footprint, but some are skeptical about the feasibility of achieving such ambitious targets.", 1),
    
    ("A recent study published in a leading medical journal has revealed that excessive screen time is causing a surge in eye problems among children. The study, which followed 5,000 children over the course of three years, found a direct correlation between time spent on electronic devices and the development of nearsightedness. This has raised concerns among parents and healthcare professionals alike, with calls for schools to limit screen usage during lessons.", 1),
    
    ("The stock market has been extremely volatile in recent months, with investors reacting to a range of global events, including political unrest, economic uncertainty, and the ongoing COVID-19 pandemic. Despite some analysts predicting a recovery in the near future, others believe that the market could continue to fluctuate for months or even years. Some experts are advising caution, warning that it may take longer for the market to stabilize.", 1),
    
    ("A new law passed in the state of California mandates that all new buildings must meet stringent energy efficiency standards. The law, which went into effect last year, requires that all new homes, schools, and commercial buildings be equipped with energy-saving technology, such as solar panels and high-efficiency appliances. Supporters of the law say it will significantly reduce energy consumption, while critics argue that it will raise construction costs.", 1),
    
    ("In an interview last week, the CEO of a major tech company claimed that artificial intelligence will soon be capable of performing most human tasks. The CEO stated that AI could revolutionize industries ranging from healthcare to finance by automating processes and making more accurate predictions. However, many critics are concerned that this technological shift could lead to widespread job losses and a growing divide between skilled and unskilled workers.", 0),
    
    ("A recent investigation revealed that several major corporations have been using tax loopholes to avoid paying billions of dollars in taxes. The investigation, which was conducted by a group of investigative journalists, uncovered evidence that companies in the tech, retail, and energy sectors have been funneling profits to offshore accounts in order to reduce their tax liability. Lawmakers have called for increased scrutiny and tighter regulations to prevent such practices from continuing.", 0),
    
    ("In a shocking move, the president announced that the country will withdraw from the Paris Climate Agreement, citing the economic burden of complying with the treaty. The decision has been met with widespread criticism from environmental groups and international leaders, who argue that pulling out of the agreement will exacerbate global warming and damage the country’s reputation on the world stage. Supporters of the decision, however, argue that it will help protect jobs in the fossil fuel industry.", 0),
    
    ("A new report suggests that the global economy is on the brink of a major recession, driven by a combination of factors, including trade wars, rising debt levels, and declining consumer confidence. The report warns that without immediate intervention, many countries could face years of economic stagnation, with significant impacts on employment, public services, and social stability. Governments are being urged to take swift action to mitigate the effects of the downturn.", 1),
    
    ("Researchers at a leading university have developed a new drug that could potentially revolutionize the treatment of Alzheimer’s disease. The drug, which has shown promising results in early clinical trials, works by targeting the underlying causes of the disease, rather than simply alleviating symptoms. While the breakthrough has been hailed as a major scientific achievement, experts caution that further testing is needed before it can be widely available to patients.", 1),
    
    ("In recent months, there has been an alarming rise in cyberattacks targeting government agencies and large corporations. Experts attribute this increase to the growing sophistication of hacking groups, who are using advanced techniques to breach security systems and steal sensitive information. The attacks have caused significant financial and reputational damage to the affected organizations, prompting calls for stronger cybersecurity measures at both the national and corporate levels.", 0),
    
    # Additional examples (19 more entries follow)
    ("New research shows that plant-based diets can significantly improve cardiovascular health. A study conducted over five years found that individuals who switched to a plant-based diet had lower cholesterol levels, improved blood pressure, and reduced the risk of heart disease. While some critics argue that more research is needed to confirm these results, health experts are encouraging people to adopt more plant-based foods in their diet.", 1),
    
    ("An undercover investigation has revealed that some fast-food chains are using unhealthy cooking oils that are linked to a higher risk of heart disease and diabetes. The oils, which contain high levels of trans fats, have been shown to raise bad cholesterol levels and increase the likelihood of developing chronic health conditions. Activists are calling for stricter regulations on food labeling to ensure that consumers are aware of the risks.", 0),
    
    ("A new social media platform has gained massive popularity, particularly among younger generations, by offering a unique blend of entertainment and social networking. Users can share short videos, photos, and live streams, while also interacting with friends and celebrities. Despite its success, critics warn that the platform could contribute to mental health issues, including anxiety and depression, by promoting unrealistic beauty standards and fostering online bullying.", 0),
    
    ("Scientists have made a major breakthrough in renewable energy technology, developing a new type of solar panel that is far more efficient than existing models. The new panels are capable of converting sunlight into electricity with an efficiency rate of over 40%, which is more than double the efficiency of current technology. If successfully scaled, this innovation could make solar power a more viable alternative to fossil fuels in the coming decades.", 1),
    
    ("A report published by the United Nations highlights the devastating effects of deforestation in the Amazon rainforest, which is being cleared at an alarming rate. The destruction of this vital ecosystem not only threatens countless species of plants and animals but also contributes significantly to global warming by releasing vast amounts of stored carbon dioxide into the atmosphere. Environmentalists are calling for stronger protections and more sustainable practices to preserve the forest.", 1),
    
    ("The World Health Organization has issued a new set of guidelines recommending that people limit their intake of processed meats to reduce the risk of cancer. Studies have shown a strong link between the consumption of processed meats and an increased risk of colorectal cancer, prompting health experts to urge individuals to avoid foods such as sausages, hot dogs, and deli meats. While some argue that the guidelines are too restrictive, others support the recommendations for healthier eating habits.", 1),
    
    ("A recent exposé revealed that a popular online retailer has been exploiting workers in its warehouses, forcing them to work long hours under harsh conditions for low wages. Employees have reported being subjected to unreasonable performance quotas, unsafe working conditions, and limited access to breaks. The retailer has denied the allegations, but labor unions are pushing for reforms to protect workers' rights.", 0),
    
    ("A new documentary highlights the growing problem of plastic pollution in the world’s oceans, which is causing severe harm to marine life. The film shows the devastating effects of plastic waste on sea creatures, including turtles, whales, and fish, and calls for immediate action to reduce plastic consumption and improve waste management practices. Environmentalists hope the film will raise awareness and inspire people to take action against plastic pollution.", 1),
    
    ("The latest economic report suggests that the U.S. job market is showing signs of slowing down, with a significant decrease in job openings and a rise in unemployment claims. Analysts believe that this could be the result of ongoing trade disputes, rising tariffs, and a slowing global economy. While some experts argue that the situation could improve, others are concerned about a prolonged period of stagnation in the labor market.", 0),
      
    ("Scientists have discovered a new exoplanet that has striking similarities to Earth. The planet, located in a distant star system, has an atmosphere rich in oxygen and liquid water on its surface. Astronomers believe this could be a breakthrough in the search for extraterrestrial life, but caution that further research is needed to confirm the planet’s habitability.", 1),
    
    ("A recent leak suggests that a major tech company has been secretly collecting user data without consent. Internal documents reveal that millions of users had their personal information stored on private servers, raising concerns about privacy and security. The company has yet to issue an official statement regarding the allegations.", 0),
    
    ("The national soccer team has advanced to the finals of the world championship after a stunning victory in the semi-finals. Fans across the country are celebrating the achievement, with experts praising the team’s strategy and performance. The final match is expected to draw record-breaking viewership as the nation rallies behind its players.", 1),
    
    ("A controversial study claims that drinking coffee can extend human lifespan by up to 10 years. The study, conducted by an independent research group, analyzed the health records of over 100,000 individuals. However, many scientists have criticized the methodology, arguing that other lifestyle factors were not properly accounted for.", 0),
    
    ("New regulations introduced by the government will require social media platforms to take stronger action against misinformation. The law, set to take effect next year, mandates that platforms remove false information within 24 hours or face heavy fines. Critics argue that this could lead to censorship, while supporters say it is necessary to combat the spread of fake news.", 1),
    
    ("A major pharmaceutical company has been accused of falsifying clinical trial data to push an ineffective drug onto the market. Whistleblowers claim that key safety concerns were hidden from regulators, leading to thousands of prescriptions being written based on misleading information. Authorities have launched an investigation into the matter.", 0),
    
    ("The latest advancements in artificial intelligence have allowed robots to perform complex surgical procedures with near-perfect accuracy. In a groundbreaking experiment, a robotic system successfully completed a heart surgery without human intervention. Experts believe this technology could revolutionize the medical field and significantly improve patient outcomes.", 1),
    
    ("A popular conspiracy theorist has claimed that the moon is actually an artificial structure built by aliens. According to the theory, NASA has been hiding the truth for decades to prevent public panic. Scientists have dismissed the claim as baseless, pointing out the extensive evidence supporting the natural formation of the moon.", 0),
    
    ("Following years of decline, endangered panda populations have increased significantly thanks to conservation efforts. Wildlife organizations credit habitat protection programs and breeding initiatives for the success. This marks a major victory for conservationists, who hope similar methods can be applied to other endangered species.", 1),
    
    ("An online hoax has been circulating, claiming that a well-known celebrity has passed away. Despite the false reports, the celebrity’s representatives have confirmed that they are alive and well. Experts warn that such hoaxes can cause unnecessary panic and stress, urging people to verify news from reliable sources before sharing.", 0)
]



def test_model(model, test_samples, sbert_model, device):
    model.eval()
    correct = 0
    total = len(test_samples)

    texts, true_labels = zip(*test_samples)
    
    # Encode the texts with SBERT and move them to the appropriate device
    embeddings = sbert_model.encode(list(texts), convert_to_tensor=True).to(device)
    true_labels = torch.tensor(true_labels, dtype=torch.long, device=device)

    with torch.no_grad():
        outputs = model(embeddings)
        _, predicted_labels = torch.max(outputs, 1)

    # Calculate accuracy
    correct = (predicted_labels == true_labels).sum().item()
    accuracy = 100 * correct / total

    print(f"Test Accuracy: {accuracy:.2f}%")
    for i, (text, true_label) in enumerate(test_samples):
        print(f"Sample {i+1}:")
        print(f"Predicted: {predicted_labels[i].item()} | Actual: {true_label}")
        print("-" * 50)

# Score the trained classifier on the hard-coded samples
test_model(model=model, test_samples=test_samples, sbert_model=sbert_model, device=device)


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Test Accuracy: 37.93%
Sample 1:
Predicted: 0 | Actual: 1
--------------------------------------------------
Sample 2:
Predicted: 1 | Actual: 1
--------------------------------------------------
Sample 3:
Predicted: 1 | Actual: 1
--------------------------------------------------
Sample 4:
Predicted: 1 | Actual: 1
--------------------------------------------------
Sample 5:
Predicted: 0 | Actual: 0
--------------------------------------------------
Sample 6:
Predicted: 1 | Actual: 0
--------------------------------------------------
Sample 7:
Predicted: 1 | Actual: 0
--------------------------------------------------
Sample 8:
Predicted: 0 | Actual: 1
--------------------------------------------------
Sample 9:
Predicted: 1 | Actual: 1
--------------------------------------------------
Sample 10:
Predicted: 0 | Actual: 0
--------------------------------------------------
Sample 11:
Predicted: 1 | Actual: 1
--------------------------------------------------
Sample 12:
Predicted: 1 | Actu