In [None]:

import pandas as pd
import numpy as np
from tqdm import tqdm
import time

# Location of the dataset, relative to the notebook's working directory
file_path = "updated_data.json"

# Parse the file as line-delimited JSON (one record per line) into a DataFrame
df = pd.read_json(path_or_buf=file_path, lines=True)

In [None]:
import torch
from torch.utils.data import DataLoader, TensorDataset
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score




In [None]:
# Draw a smaller random sample from the DataFrame: frac=0.002 keeps 0.2% of the rows
# (not 1%), and random_state=42 makes the draw repeatable
df_sampled = df.sample(frac=0.002, random_state=42)

In [None]:
# Show the (rows, columns) shape of the sampled DataFrame
df_sampled.shape

(6372, 6)

In [None]:
# Count rows per label in the sample to see how balanced the two classes are
df_sampled['class'].value_counts()

class
1    4574
0    1798
Name: count, dtype: int64

In [None]:

# Class distribution before balancing
print(df_sampled['class'].value_counts())

# Split the sample by label
class_0_data = df_sampled.loc[df_sampled['class'].eq(0)]
class_1_data = df_sampled.loc[df_sampled['class'].eq(1)]

# Draw (without replacement, fixed seed) as many class-1 rows as there are class-0 rows.
# This relies on class 1 having at least as many rows as class 0.
n_class_0 = class_0_data.shape[0]
class_1_downsampled = class_1_data.sample(n=n_class_0, replace=False, random_state=42)

# Stack all class-0 rows on top of the downsampled class-1 rows
balanced_df = pd.concat([class_0_data, class_1_downsampled])

# Class distribution after balancing
print(balanced_df['class'].value_counts())


class
1    4574
0    1798
Name: count, dtype: int64
class
0    1798
1    1798
Name: count, dtype: int64


In [None]:
# Rebind df_sampled to the balanced DataFrame. This is a plain assignment, not a copy:
# both names now refer to the same object, and the unbalanced sample is no longer
# reachable through df_sampled.
df_sampled = balanced_df

# Display the counts of each class in the final sampled DataFrame
print(df_sampled['class'].value_counts())


class
0    1798
1    1798
Name: count, dtype: int64


In [None]:
# NOTE(review): the original statement had no right-hand side (`df_sampled1 =`), which is a
# SyntaxError and stops "Restart & Run All". The intended value cannot be recovered from the
# notebook; an independent copy of the balanced sample is assumed here. Confirm the intent,
# or delete this cell if the name is not needed.
df_sampled1 = df_sampled.copy()

In [None]:
# Hold out 35% of the balanced sample as a test split.
# random_state pins the shuffle so the split is identical on every run (same seed as the
# sampling cells), and stratify keeps the 0/1 label ratio equal in both parts.
review_train, review_test, class_train, class_test = train_test_split(
    df_sampled['reviewText'],
    df_sampled['class'],
    test_size=0.35,
    random_state=42,
    stratify=df_sampled['class'],
)

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split


In [None]:
# Turn the training reviews into TF-IDF vectors; max_features=1000 caps the vocabulary
# size (tune as needed). The vectorizer is fitted on the training reviews only.
vectorizer = TfidfVectorizer(max_features=1000)
X_train_vectorized = vectorizer.fit_transform(raw_documents=review_train)


In [None]:

# Oversample the minority class of the vectorized training split with SMOTE (fixed seed).
# NOTE(review): the split comes from an already balanced sample, so there may be little
# to resample; the resampled arrays are not consumed by the BERT cells further down.
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X=X_train_vectorized, y=class_train)

In [None]:
# Row count per label in df_sampled. The SMOTE step above resampled the vectorized
# training split, not this DataFrame, so these counts are unaffected by it.
df_sampled['class'].value_counts()

In [None]:
import torch
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
from torch.utils.data import TensorDataset, DataLoader
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from tqdm import tqdm

In [None]:

# Split the balanced sample into 80% training / 20% validation data for BERT fine-tuning.
# random_state pins the shuffle so tokenization, training and the reported metrics are
# reproducible; stratify keeps the 0/1 label ratio equal in both parts.
# Note: this rebinds review_train / class_train, replacing the values produced by the
# earlier 65/35 split that fed the TF-IDF and SMOTE cells.
review_train, review_val, class_train, class_val = train_test_split(
    df_sampled['reviewText'],
    df_sampled['class'],
    test_size=0.2,
    random_state=42,
    stratify=df_sampled['class'],
)


In [None]:
# Tokenize both splits with the lower-casing 'bert-base-uncased' vocabulary.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

# Shared encoding options: pad each split to a common length, truncate over-long
# reviews, and return PyTorch tensors.
encode_kwargs = dict(padding=True, truncation=True, return_tensors='pt')

tokens_train = tokenizer(review_train.tolist(), **encode_kwargs)
tokens_val = tokenizer(review_val.tolist(), **encode_kwargs)

In [None]:
# Labels as tensors, in the same row order as the tokenized reviews
labels_train_tensor = torch.tensor(class_train.tolist())
labels_val_tensor = torch.tensor(class_val.tolist())

# Each dataset item is an (input_ids, attention_mask, label) triple
dataset_train = TensorDataset(tokens_train['input_ids'], tokens_train['attention_mask'], labels_train_tensor)
dataset_val = TensorDataset(tokens_val['input_ids'], tokens_val['attention_mask'], labels_val_tensor)

# Batches of 16; only the training data is shuffled, so validation predictions stay
# aligned with class_val
dataloader_train = DataLoader(dataset=dataset_train, batch_size=16, shuffle=True)
dataloader_val = DataLoader(dataset=dataset_val, batch_size=16, shuffle=False)


In [None]:

# Load the pretrained 'bert-base-uncased' checkpoint as a sequence-classification model;
# num_labels=2 matches the two values (0 and 1) of the 'class' column
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)


In [None]:

# Set up the optimizer. No separate loss function is created: the model returns the loss
# itself when `labels` are passed to its forward call.
# torch.optim.AdamW replaces the deprecated transformers.AdamW; eps and weight_decay are
# set explicitly to the values transformers.AdamW used by default so the optimisation
# behaviour stays the same (torch's own defaults are eps=1e-8, weight_decay=0.01).
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5, eps=1e-6, weight_decay=0.0)


In [None]:
# Early-stopping state:
#   patience         - how many epochs without improvement are tolerated
#   current_patience - how many such epochs have been seen so far
#   best_val_loss    - lowest validation loss observed; starts at +inf so any real loss beats it
patience, current_patience = 3, 0
best_val_loss = float('inf')


In [None]:
# Training loop with per-epoch validation and early stopping.
# Each epoch makes one optimisation pass over dataloader_train, then measures the mean loss
# on dataloader_val. Training stops once that loss has failed to improve for `patience`
# consecutive epochs. With epochs=2 and patience=3 the stop cannot trigger yet; it takes
# effect as soon as `epochs` is raised above `patience`.
epochs = 2

# Reset the early-stopping state so that re-running this cell starts from scratch.
best_val_loss = float('inf')
current_patience = 0

for epoch in range(epochs):
    # ---- training pass ----
    model.train()
    for batch in tqdm(dataloader_train, desc=f'Epoch {epoch + 1}/{epochs}', unit='batch'):
        input_ids, attention_mask, labels = batch
        optimizer.zero_grad()
        # Passing labels makes the model compute the classification loss itself
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()

    # ---- validation pass (no gradients, evaluation mode) ----
    model.eval()
    epoch_val_losses = []
    with torch.no_grad():
        for val_input_ids, val_attention_mask, val_labels in dataloader_val:
            val_outputs = model(val_input_ids, attention_mask=val_attention_mask, labels=val_labels)
            epoch_val_losses.append(val_outputs.loss.item())
    epoch_val_loss = sum(epoch_val_losses) / len(epoch_val_losses)
    print(f'Epoch {epoch + 1}/{epochs} - Validation Loss: {epoch_val_loss:.4f}')

    # ---- early-stopping check ----
    if epoch_val_loss < best_val_loss:
        best_val_loss = epoch_val_loss
        current_patience = 0
    else:
        current_patience += 1
        if current_patience >= patience:
            print(f'Early stopping after {epoch + 1} epochs without improvement.')
            break

In [None]:
# Run the model over the validation set, collecting per-batch losses and predictions
model.eval()  # evaluation mode

val_losses = []  # one loss value per validation batch
predicted_classes = []  # predicted label for every validation row, in dataloader order

with torch.no_grad():  # inference only: no gradients are tracked
    for val_input_ids, val_attention_mask, val_labels in dataloader_val:
        # Forward pass; supplying labels makes the model return the batch loss as well
        val_outputs = model(val_input_ids, attention_mask=val_attention_mask, labels=val_labels)
        val_losses.append(val_outputs.loss.item())

        # The predicted class is the index of the largest logit in each row
        predicted_class = val_outputs.logits.argmax(dim=1).numpy()
        predicted_classes.extend(predicted_class)


In [None]:
  # Calculate validation accuracy and loss
# Share of validation rows whose predicted label equals the true label
val_accuracy = accuracy_score(y_true=class_val, y_pred=predicted_classes)
# Mean of the per-batch validation losses
val_loss = sum(val_losses) / len(val_losses)

# `epoch` and `epochs` are whatever the training cell left behind
print(f'Epoch {epoch + 1}/{epochs} - Validation Loss: {val_loss:.4f}, Accuracy: {val_accuracy:.2%}')



Epoch 3/3 - Validation Loss: 0.2771, Accuracy: 90.98%


In [None]:
# Example loop structure
# NOTE(review): this loop does no training or validation itself. `val_loss` is never
# reassigned inside it, so every iteration compares the same value (the one left by the
# validation-metrics cell). Once best_val_loss has been set to that value, the remaining
# iterations can only increment current_patience. For real early stopping this check has
# to run inside the training loop, right after each epoch's validation loss is computed.
for epoch in range(epochs):
    # ... (previous code)

    # Check for improvement in validation loss
    if val_loss < best_val_loss:
        # New best: remember it and reset the no-improvement counter
        best_val_loss = val_loss
        current_patience = 0
    else:
        # No improvement: count it and stop once the patience budget is used up
        current_patience += 1
        if current_patience >= patience:
            print(f'Early stopping after {epoch + 1} epochs without improvement.')
            break



In [None]:
# Save the fine-tuned model with save_pretrained to the relative path 'new model01'
# (note the space in the name)
model.save_pretrained('new model01')


In [None]:
from sklearn.metrics import accuracy_score, precision_score

# Re-score the validation set and report accuracy together with precision
model.eval()
predicted_classes = []

with torch.no_grad():  # inference only: no gradients are tracked
    for val_input_ids, val_attention_mask, val_labels in dataloader_val:
        val_outputs = model(val_input_ids, attention_mask=val_attention_mask, labels=val_labels)

        # The predicted class is the index of the largest logit in each row
        predicted_class = val_outputs.logits.argmax(dim=1).numpy()
        predicted_classes.extend(predicted_class)

# Metrics compare predictions with the true validation labels; dataloader_val is not
# shuffled, so the two sequences are in the same order
val_accuracy = accuracy_score(class_val, predicted_classes)
val_precision = precision_score(class_val, predicted_classes)

print(f'Validation Accuracy: {val_accuracy:.2%}')
print(f'Validation Precision: {val_precision:.2%}')


Validation Accuracy: 90.98%
Validation Precision: 92.68%
