In [None]:
import pandas as pd
import numpy as np

In [None]:
!pip install transformers

In [None]:
!pip install sentencepiece

In [None]:
!pip install accelerate -U

In [None]:
!pip install transformers[torch]

In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments

# Checkpoint shared by the tokenizer and the classification model.
# NOTE(review): the model card for this checkpoint reportedly expects input text that has
# already been word-segmented with Juman++ — confirm, and add that step if it is required.
MODEL_NAME = "nlp-waseda/roberta-large-japanese-seq512"

# Tokenizer plus a sequence-classification model with two output labels
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=2)


In [None]:
!pip install datasets

In [None]:
from transformers import DataCollatorWithPadding

# Raw GitHub locations of the training and evaluation CSV files
url_train = 'https://raw.githubusercontent.com/masauppsala/Synthetic-Japanese-Data-by-GPT-4/main/train_outdomain_synth_latest.csv'
url_eval = 'https://raw.githubusercontent.com/masauppsala/Synthetic-Japanese-Data-by-GPT-4/main/eval_outdomain_synth_latest.csv'

# Read both splits straight from GitHub into pandas DataFrames (train first, then eval)
df_train, df_eval = (pd.read_csv(csv_url) for csv_url in (url_train, url_eval))


from sklearn.utils import shuffle
# Shuffle each split with the same fixed seed, then renumber the rows from 0
df_train, df_eval = (
    shuffle(frame, random_state=42).reset_index(drop=True)
    for frame in (df_train, df_eval)
)



from datasets import Dataset
def tokenize_function(examples):
    """Tokenize the 'sentence' field of a batch with the module-level ``tokenizer``.

    The tokenizer is called with ``padding='max_length'``, ``truncation=True`` and
    ``max_length=128``; whatever it returns is passed back unchanged.
    """
    sentences = examples['sentence']
    encoding_options = dict(padding='max_length', truncation=True, max_length=128)
    return tokenizer(sentences, **encoding_options)

# Wrap each DataFrame in a Hugging Face Dataset
dataset_train, dataset_eval = (
    Dataset.from_pandas(frame) for frame in (df_train, df_eval)
)


# Run the tokenizer over every split, batch by batch
tokenized_train, tokenized_eval = (
    split.map(tokenize_function, batched=True)
    for split in (dataset_train, dataset_eval)
)


def format_dataset(dataset):
    """Return the result of mapping ``dataset`` so that 'labels' mirrors its 'label' field.

    The mapping is applied in batched mode via ``dataset.map``.
    """
    def _copy_label(batch):
        # Expose the 'label' values under the key 'labels'
        return {'labels': batch['label']}

    return dataset.map(_copy_label, batched=True)

# Add the 'labels' column to both tokenized splits (train first, then eval)
formatted_train_dataset, formatted_eval_dataset = map(
    format_dataset, (tokenized_train, tokenized_eval)
)

# Batch collator built around the same tokenizer
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)


In [None]:
from collections import Counter

# Label column of the formatted training split
train_labels = formatted_train_dataset["labels"]

# Tally how often each label value occurs
label_counts = Counter()
label_counts.update(train_labels)

# Show the per-label tallies
print(label_counts)


In [None]:
# Size of the training set (one label per instance)
total_instances = len(train_labels)


def _balanced_weight(class_count):
    """Inverse class frequency, scaled by the dataset size and halved for the two classes."""
    return (1 / class_count) * (total_instances) / 2.0


# Per-class weights: the rarer a class, the larger its weight
weight_for_0 = _balanced_weight(label_counts[0])
weight_for_1 = _balanced_weight(label_counts[1])

# Weights keyed by class id
class_weights = {0: weight_for_0, 1: weight_for_1}

# Print the weights so they can be checked by eye
print(class_weights)

In [None]:
from transformers import Trainer
import torch

# Define training arguments

# transformers renamed the `evaluation_strategy` argument to `eval_strategy`. The notebook
# installs an unpinned transformers, so use whichever keyword the installed
# TrainingArguments accepts (the new name wins when both are present).
import inspect

_eval_strategy_key = (
    "eval_strategy"
    if "eval_strategy" in inspect.signature(TrainingArguments).parameters
    else "evaluation_strategy"
)

training_args = TrainingArguments(
    learning_rate=5e-5,
    output_dir='./results',
    # NOTE(review): an earlier comment here said "reduced from 5", which contradicts the
    # value 10 — confirm the intended number of epochs.
    num_train_epochs=10,
    per_device_train_batch_size=64,
    per_device_eval_batch_size=32,
    warmup_steps=300,
    weight_decay=0.02,  # weight decay used for regularization
    logging_dir='./logs',
    # Evaluate and checkpoint once per epoch; the two strategies are kept identical
    # alongside load_best_model_at_end=True.
    **{_eval_strategy_key: "epoch"},
    save_strategy="epoch",
    load_best_model_at_end=True,
    save_total_limit=1,
    lr_scheduler_type='linear',
    fp16=True
)

# Trainer subclass whose loss is a class-weighted cross-entropy
class CustomTrainer(Trainer):
    """Trainer that computes a class-weighted cross-entropy loss.

    The two class weights are read from the module-level ``class_weights`` dict
    (keys 0 and 1) every time the loss is computed.
    """

    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        """Run a forward pass and return the class-weighted cross-entropy loss.

        Args:
            model: Model called with ``**inputs``.
            inputs: Batch mapping; must contain "labels".
            return_outputs: When True, return ``(loss, outputs)`` instead of ``loss``.
            num_items_in_batch: Unused. Accepted because newer ``Trainer`` versions pass
                it by keyword; without this parameter those versions raise ``TypeError``.

        Returns:
            The loss tensor, or ``(loss, outputs)`` when ``return_outputs`` is True.
        """
        labels = inputs.get("labels")
        outputs = model(**inputs)
        logits = outputs.get("logits")
        # Build the weight tensor on the same device as the logits
        class_weight_tensor = torch.tensor(
            [class_weights[0], class_weights[1]], device=logits.device
        )
        loss_fct = torch.nn.CrossEntropyLoss(weight=class_weight_tensor)
        # Flatten to (N, num_labels) logits against (N,) labels
        loss = loss_fct(logits.view(-1, self.model.config.num_labels), labels.view(-1))
        return (loss, outputs) if return_outputs else loss

# Gather everything the class-weighted trainer needs, then build it
_trainer_kwargs = dict(
    model=model,
    args=training_args,
    train_dataset=formatted_train_dataset,
    eval_dataset=formatted_eval_dataset,
    data_collator=data_collator,
)
trainer = CustomTrainer(**_trainer_kwargs)

# Run fine-tuning
trainer.train()


In [None]:
from datasets import Dataset, load_metric
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, roc_auc_score, matthews_corrcoef

url_test = 'https://raw.githubusercontent.com/masauppsala/Synthetic-Japanese-Data-by-GPT-4/main/originaltestcorpora.csv'

# Read the comma-separated test corpus and wrap it in a Dataset
df_test = pd.read_csv(url_test, delimiter=',')
dataset_test = Dataset.from_pandas(df_test)

# Tokenize, then convert back to a pandas DataFrame so rows can be filtered per phenomenon
tokenized_test = dataset_test.map(tokenize_function, batched=True)
formatted_test_dataset = tokenized_test.to_pandas()

# Names of the test-set columns (one per syntactic phenomenon) to analyze
syntactic_phenomena = [
    'simple',
    'Arg. Str.',
    'ellipsis',
    'filler-gap',
    'control/raising',
    'island effects',
    'NPI/NCI',
    'verbal agr.',
    'binding',
    'morphology',
    'nominal structure',
    'quantifier',
]

# Collects one metrics record per phenomenon
results = list()

def get_predictions_for_subset(subset_df):
    """Predict on a DataFrame with the module-level ``trainer``.

    Args:
        subset_df: pandas DataFrame that is converted with ``Dataset.from_pandas``.

    Returns:
        Tuple ``(pred_labels, label_ids)``: the argmax over axis 1 of the prediction
        scores, and the ``label_ids`` reported by ``trainer.predict``.
    """
    prediction_output = trainer.predict(Dataset.from_pandas(subset_df))
    predicted = np.argmax(prediction_output.predictions, axis=1)
    return predicted, prediction_output.label_ids

# Accuracy and MCC on the complete test set
overall_pred_labels, overall_true_labels = get_predictions_for_subset(formatted_test_dataset)
overall_accuracy = accuracy_score(overall_true_labels, overall_pred_labels)
overall_mcc = matthews_corrcoef(overall_true_labels, overall_pred_labels)

for phenomenon in syntactic_phenomena:
    # Only phenomena that exist as a column in the test set can be analyzed
    if phenomenon not in formatted_test_dataset.columns:
        continue

    filtered_dataset = formatted_test_dataset[formatted_test_dataset[phenomenon] == True]
    if filtered_dataset.empty:
        # Accuracy and MCC are undefined without any rows, so skip instead of failing
        print(f"Skipping {phenomenon}: no test rows are flagged for it")
        continue

    pred_labels, true_labels = get_predictions_for_subset(filtered_dataset)

    # One record per phenomenon with its accuracy and MCC
    results.append({
        'Phenomenon': phenomenon,
        'Accuracy': accuracy_score(true_labels, pred_labels),
        'MCC': matthews_corrcoef(true_labels, pred_labels)
    })

# Add overall accuracy and MCC to the results
results.append({
    'Phenomenon': 'Overall',
    'Accuracy': overall_accuracy,
    'MCC': overall_mcc
})

# Create DataFrame and save as CSV.
# A later cell writes its own, differently shaped table to 'analysis_results.csv'; this
# table gets a separate file name so the accuracy/MCC results are not overwritten.
results_df = pd.DataFrame(results)
print(results_df)
results_df.to_csv('analysis_results_mcc.csv', index=False)


In [None]:
from datasets import Dataset, load_metric
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, roc_auc_score

# Test-set columns (one per syntactic phenomenon) covered by the classification reports
syntactic_phenomena = [
    'simple',
    'Arg. Str.',
    'ellipsis',
    'filler-gap',
    'control/raising',
    'island effects',
    'NPI/NCI',
    'verbal agr.',
    'binding',
    'morphology',
    'nominal structure',
    'quantifier',
]

# Start a fresh list of per-phenomenon metric records for this cell
results = list()

def get_predictions_for_subset(subset_df):
    """Return ``(predicted labels, true label ids)`` for the rows of ``subset_df``.

    The DataFrame is converted with ``Dataset.from_pandas`` and passed to the module-level
    ``trainer``; predicted labels are the argmax over axis 1 of its prediction scores.

    Note: a helper with this name is also defined in the earlier evaluation cell; this
    later definition shadows it.
    """
    as_dataset = Dataset.from_pandas(subset_df)
    output = trainer.predict(as_dataset)
    return np.argmax(output.predictions, axis=1), output.label_ids

for phenomenon in syntactic_phenomena:
    # Only phenomena that exist as a column in the test set can be analyzed
    if phenomenon not in formatted_test_dataset.columns:
        continue

    filtered_dataset = formatted_test_dataset[formatted_test_dataset[phenomenon] == True]
    if filtered_dataset.empty:
        # No rows means no metrics can be computed; skip instead of failing
        print(f"Skipping {phenomenon}: no test rows are flagged for it")
        continue

    # Predict directly here (not via get_predictions_for_subset) because ROC AUC needs the
    # raw class scores, not just the argmax labels.
    prediction_output = trainer.predict(Dataset.from_pandas(filtered_dataset))
    logits = np.asarray(prediction_output.predictions, dtype=np.float64)
    pred_labels = np.argmax(logits, axis=1)
    true_labels = prediction_output.label_ids

    # Get the full classification report
    print(f"Classification Report for {phenomenon}:")
    print(classification_report(true_labels, pred_labels))

    # Same report as a dictionary, to pull out individual metrics
    class_report_dict = classification_report(true_labels, pred_labels, output_dict=True)
    macro_avg = class_report_dict['macro avg']
    weighted_avg = class_report_dict['weighted avg']

    # ROC AUC is only defined when both classes occur among the true labels
    roc_auc = "N/A"
    if len(set(true_labels)) == 2:
        # ROC AUC ranks examples by a continuous score. The logit margin
        # (class 1 minus class 0) is monotonic in the class-1 softmax probability,
        # whereas hard 0/1 predictions reduce the curve to a single operating point.
        positive_scores = logits[:, 1] - logits[:, 0]
        roc_auc = roc_auc_score(true_labels, positive_scores)

    # Append results
    results.append({
        'Phenomenon': phenomenon,
        'Accuracy': class_report_dict['accuracy'],
        'Recall(macro)': macro_avg['recall'],
        'Recall(weighted)': weighted_avg['recall'],
        'F1-Score(macro)': macro_avg['f1-score'],
        'F1-Score(weighted)': weighted_avg['f1-score'],
        'ROC AUC': roc_auc
    })

# Create DataFrame and save as CSV
results_df = pd.DataFrame(results)
results_df.to_csv('analysis_results.csv', index=False)


In [None]:
from datasets import Dataset
import pandas as pd
import numpy as np
from sklearn.metrics import classification_report, roc_auc_score

# Predicted and true labels for the complete test set
overall_pred_labels, overall_true_labels = get_predictions_for_subset(formatted_test_dataset)

# Classification report as a dictionary of metrics
overall_report_dict = classification_report(
    overall_true_labels,
    overall_pred_labels,
    output_dict=True,
)

# One row per report entry, one column per metric
overall_report_df = pd.DataFrame(overall_report_dict).T

print(overall_report_df)
# Write the table, index included, to CSV
overall_report_df.to_csv('overall_performance_report.csv', index=True)