In [None]:
from base_imports import *


In [None]:
# KFold Train-Test on 2012
# Full training on 2012 for predicting 2013

# Load the data.
# Paths are assembled with os.path.join so the cell runs on any OS; the
# previous hard-coded backslash separators only resolved on Windows.
df = pd.read_excel(os.path.join('scraped_and_filtered_datasets', 'filtered_resmi_gazete_data_2012.xlsx'))
df_2013 = pd.read_excel(os.path.join('scraped_and_filtered_datasets', 'filtered_resmi_gazete_data_2013.xlsx'))

# Ensure the 'Target' column is of type integer
df['Target'] = df['Target'].astype(int)

# Define a Dataset class for PyTorch
class SubtitleDataset(torch.utils.data.Dataset):
    """Map-style dataset over tokenizer output with optional labels.

    ``encodings`` is a mapping from feature name to a per-example sequence;
    ``labels`` is an optional per-example sequence (leave it out for
    inference-only data).
    """

    def __init__(self, encodings, labels=None):
        self.labels = labels
        self.encodings = encodings

    def __len__(self):
        if self.labels is None:
            # Unlabelled data: count the entries stored under 'input_ids'.
            return len(self.encodings['input_ids'])
        return len(self.labels)

    def __getitem__(self, idx):
        # One tensor per encoding field for the requested example.
        example = {}
        for name, column in self.encodings.items():
            example[name] = torch.tensor(column[idx])
        if self.labels is None:
            return example
        example['labels'] = torch.tensor(self.labels[idx])
        return example

# Tokenizer that belongs to the Turkish BERT checkpoint
tokenizer = BertTokenizer.from_pretrained('dbmdz/bert-base-turkish-uncased')

# Tokenize the input text
def tokenize_function(texts):
    """Encode `texts` with the module-level tokenizer.

    Padding is enabled and inputs are truncated to at most 512 tokens.
    Returns whatever the tokenizer call returns.
    """
    encoded = tokenizer(
        texts,
        max_length=512,
        truncation=True,
        padding=True,
    )
    return encoded

# Encode every 2012 subtitle and collect the matching labels
train_labels = df['Target'].tolist()
train_encodings = tokenize_function(df['Subtitle'].tolist())

# Wrap encodings and labels in the PyTorch dataset
train_dataset = SubtitleDataset(train_encodings, train_labels)

# Make sure the output path is a directory: a regular file squatting on the
# name is removed, then the directory is created when it is missing
output_dir = './results'
if os.path.isfile(output_dir):
    os.remove(output_dir)
os.makedirs(output_dir, exist_ok=True)

# Model: Turkish BERT with a two-class classification head
model = BertForSequenceClassification.from_pretrained(
    'dbmdz/bert-base-turkish-uncased',
    num_labels=2,
)

# Training arguments: evaluate and save a checkpoint once per epoch
training_settings = {
    'output_dir': output_dir,  # the directory prepared above
    'num_train_epochs': 3,
    'per_device_train_batch_size': 16,
    'per_device_eval_batch_size': 64,
    'warmup_steps': 500,
    'weight_decay': 0.01,
    'logging_dir': './logs',
    'logging_steps': 10,
    'evaluation_strategy': "epoch",
    'save_strategy': "epoch",
}
training_args = TrainingArguments(**training_settings)

# Perform 10-Fold Cross-Validation
def _classification_metrics(eval_pred):
    """Return accuracy and weighted F1 for one Trainer evaluation pass."""
    predicted = eval_pred.predictions.argmax(-1)
    return {
        'accuracy': accuracy_score(eval_pred.label_ids, predicted),
        'f1': f1_score(eval_pred.label_ids, predicted, average='weighted'),
    }


kf = KFold(n_splits=10, shuffle=True, random_state=42)
accuracy_scores = []
f1_scores = []

for train_index, val_index in kf.split(df):
    fold_train_df = df.iloc[train_index]
    fold_val_df = df.iloc[val_index]

    fold_train_dataset = SubtitleDataset(
        tokenize_function(fold_train_df['Subtitle'].tolist()),
        fold_train_df['Target'].tolist(),
    )
    fold_val_dataset = SubtitleDataset(
        tokenize_function(fold_val_df['Subtitle'].tolist()),
        fold_val_df['Target'].tolist(),
    )

    # Every fold starts from the pretrained checkpoint. Reusing one model
    # object across folds would keep the weights fine-tuned on earlier folds,
    # whose training rows include this fold's validation rows, and the
    # reported scores would be inflated by that leakage.
    fold_model = BertForSequenceClassification.from_pretrained(
        'dbmdz/bert-base-turkish-uncased', num_labels=2
    )

    trainer = Trainer(
        model=fold_model,
        args=training_args,
        train_dataset=fold_train_dataset,
        eval_dataset=fold_val_dataset,
        compute_metrics=_classification_metrics,
    )

    trainer.train()
    metrics = trainer.evaluate()
    accuracy_scores.append(metrics['eval_accuracy'])
    f1_scores.append(metrics['eval_f1'])

print("10-Fold Cross-Validation Results:")
print("Average Accuracy: ", sum(accuracy_scores) / len(accuracy_scores))
print("Average F1 Score: ", sum(f1_scores) / len(f1_scores))

# Train on the full 2012 dataset
train_encodings = tokenize_function(df['Subtitle'].tolist())
train_labels = df['Target'].tolist()

train_dataset = SubtitleDataset(train_encodings, train_labels)

# Start the final model from the pretrained checkpoint rather than from the
# weights left behind by the cross-validation loop.
model = BertForSequenceClassification.from_pretrained('dbmdz/bert-base-turkish-uncased', num_labels=2)

# This run has no validation split. `training_args` requests an evaluation
# every epoch, and Trainer raises when asked to evaluate without an
# eval_dataset, so use the same hyper-parameters with evaluation left off.
final_training_args = TrainingArguments(
    output_dir=output_dir,
    num_train_epochs=3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=64,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
    save_strategy="epoch"
)

trainer = Trainer(
    model=model,
    args=final_training_args,
    train_dataset=train_dataset
)

trainer.train()

# Predict on the 2013 data
# Cast to str first: cells that Excel loads as NaN or numbers are not valid
# tokenizer input.
test_texts = df_2013['Subtitle'].astype(str).tolist()
test_encodings = tokenize_function(test_texts)
test_dataset = SubtitleDataset(test_encodings)

predictions = trainer.predict(test_dataset)
predicted_labels = predictions.predictions.argmax(-1)

df_2013['Predicted'] = predicted_labels

# Save the predictions
df_2013.to_excel('predicted_resmi_gazete_data_2013.xlsx', index=False)


In [None]:
# Merge validated datasets

# Folder holding the validated yearly files. The merged file is written here
# as well, because the training cell reads it back from this folder; writing
# it to the working directory left that read pointing at a missing or stale file.
validated_dir = 'validation_and_falsepred_datasets'
validated_columns = ["Date", "Category", "Subtitle", "Target"]

# Load the 2012 and 2013 validated data (os.path.join keeps the paths portable)
df_2012 = pd.read_excel(os.path.join(validated_dir, 'validated_resmi_gazete_data_2012.xlsx'), usecols=validated_columns)
df_2013 = pd.read_excel(os.path.join(validated_dir, 'validated_resmi_gazete_data_2013.xlsx'), usecols=validated_columns)

# Merge the DataFrames
merged_df = pd.concat([df_2012, df_2013], ignore_index=True)

# Save the merged DataFrame to a new Excel file
merged_df.to_excel(os.path.join(validated_dir, 'merged_validated_resmi_gazete_data_2012_2013.xlsx'), index=False)

In [None]:
## Train a BERT model on the annotated 2012 and 2013 dataset.
# Predict the filtered data from 2006 to 2024, excluding 2012 and 2013.
# Use 47 samples for testing 5029 entries of the data from 2006 onward.
# Store the samples, then validate 4 of them for evaluation.
# See the Evaluation notebook for metrics.


# Load the annotated data (os.path.join keeps the paths portable; the
# previous backslash separators only resolved on Windows)
annotated_df = pd.read_excel(os.path.join('validation_and_falsepred_datasets', 'merged_validated_resmi_gazete_data_2012_2013.xlsx'))

# Load the unannotated data
unannotated_df = pd.read_excel(os.path.join('scraped_and_filtered_datasets', 'filtered_resmi_gazete_data_from2006_not12_not13.xlsx'))

# Ensure reproducibility
unannotated_df = unannotated_df.sample(frac=1, random_state=42).reset_index(drop=True)

# Split into 47 random samples.
# The row positions are split instead of the DataFrame itself: calling
# np.array_split on a DataFrame goes through the deprecated
# DataFrame.swapaxes. The chunks are the same rows in the same order, and
# each one is an independent copy so it can be modified later without
# SettingWithCopyWarning.
row_chunks = np.array_split(np.arange(len(unannotated_df)), 47)
samples = [unannotated_df.iloc[rows].copy() for rows in row_chunks]

# Ensure the 'Target' column is of type integer
annotated_df['Target'] = annotated_df['Target'].astype(int)


# Prepare the data for training

# Define a Dataset class for PyTorch
class SubtitleDataset(torch.utils.data.Dataset):
    """Dataset that serves tokenizer encodings, with labels when available.

    Each item is a dict holding one tensor per encoding field; a 'labels'
    tensor is added only when labels were supplied at construction time.
    """

    def __init__(self, encodings, labels=None):
        self.encodings, self.labels = encodings, labels

    def __getitem__(self, idx):
        tensors = dict(
            (field, torch.tensor(values[idx]))
            for field, values in self.encodings.items()
        )
        if self.labels is not None:
            tensors['labels'] = torch.tensor(self.labels[idx])
        return tensors

    def __len__(self):
        # Without labels the size comes from the 'input_ids' entries.
        sized = self.encodings['input_ids'] if self.labels is None else self.labels
        return len(sized)

# Tokenizer that belongs to the Turkish BERT checkpoint
tokenizer = BertTokenizer.from_pretrained('dbmdz/bert-base-turkish-uncased')

# Tokenize the input text
def tokenize_function(texts):
    """Run the module-level tokenizer over `texts`.

    Padding is switched on and anything longer than 512 tokens is truncated.
    The tokenizer's return value is passed through unchanged.
    """
    tokenizer_options = {'padding': True, 'truncation': True, 'max_length': 512}
    return tokenizer(texts, **tokenizer_options)

# Encode the annotated subtitles and collect their labels
train_labels = annotated_df['Target'].tolist()
train_encodings = tokenize_function(annotated_df['Subtitle'].tolist())

# Wrap encodings and labels in the PyTorch dataset
train_dataset = SubtitleDataset(train_encodings, train_labels)

# Train the model

# Model: Turkish BERT with a two-class classification head
model = BertForSequenceClassification.from_pretrained(
    'dbmdz/bert-base-turkish-uncased',
    num_labels=2,
)

# Training arguments
training_settings = {
    'output_dir': './results',
    'num_train_epochs': 3,
    'per_device_train_batch_size': 16,
    'per_device_eval_batch_size': 64,
    'warmup_steps': 500,
    'weight_decay': 0.01,
    'logging_dir': './logs',
    'logging_steps': 10,
}
training_args = TrainingArguments(**training_settings)


def _classification_metrics(p):
    """Return accuracy and weighted F1 computed from a Trainer prediction object."""
    predicted = p.predictions.argmax(-1)
    return {
        'accuracy': accuracy_score(p.label_ids, predicted),
        'f1': f1_score(p.label_ids, predicted, average='weighted'),
    }


trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    compute_metrics=_classification_metrics,
)

trainer.train()



#Predict and save the results

# Function to predict and save each sample
def predict_and_save(sample, sample_num):
    """Predict a label for every row of `sample` and write the result to Excel.

    Args:
        sample: DataFrame with a 'Subtitle' column. It is left unmodified.
        sample_num: Number embedded in the output file name.

    Returns:
        A copy of `sample` with 'Subtitle' cast to str and a 'Predicted'
        column holding the arg-max class of the model output.
    """
    # Work on a copy: the caller may pass a slice of a larger frame, and
    # assigning columns on it would mutate the caller's data and trigger
    # pandas' SettingWithCopyWarning.
    result = sample.copy()

    # Ensure all subtitles are strings
    result['Subtitle'] = result['Subtitle'].astype(str)

    sample_encodings = tokenize_function(result['Subtitle'].tolist())
    sample_dataset = SubtitleDataset(sample_encodings)

    predictions = trainer.predict(sample_dataset)
    predicted_labels = predictions.predictions.argmax(-1)

    result['Predicted'] = predicted_labels

    # Save the predictions to a new Excel file
    result.to_excel(f'sample{sample_num}_prediction_2006_to_2024_not12_not13.xlsx', index=False)
    return result

# Run prediction for every sample; output files are numbered starting at 1
for sample_num, sample in enumerate(samples, start=1):
    predict_and_save(sample, sample_num)

