<a href="https://colab.research.google.com/github/muratkck/nlp-project/blob/main/nlp_clean_german.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from google.colab import drive
# Mount Google Drive at /content/drive; later cells read and write files below this path.
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
# Install the packages used below and download the German spaCy model that later cells load.
!pip install -q transformers
!pip install -q feedparser
!python3 -m spacy download de_core_news_sm

  Preparing metadata (setup.py) ... [?25l[?25hdone
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/81.3 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m81.3/81.3 kB[0m [31m3.3 MB/s[0m eta [36m0:00:00[0m
[?25h  Building wheel for sgmllib3k (setup.py) ... [?25l[?25hdone
Collecting de-core-news-sm==3.7.0
  Downloading https://github.com/explosion/spacy-models/releases/download/de_core_news_sm-3.7.0/de_core_news_sm-3.7.0-py3-none-any.whl (14.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m14.6/14.6 MB[0m [31m74.5 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: de-core-news-sm
Successfully installed de-core-news-sm-3.7.0
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('de_core_news_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all 

# First Step: Dataset Creation

## Why have we used the summary?
- _Summary_ contains complete sentences, so we expect it to be easier to translate into another language.

In [3]:
import feedparser

def feedparser_from_url(url):
    """Fetch a feed and return the summary text of its entries.

    Args:
        url (str): Address of the feed, passed straight to ``feedparser.parse``.

    Returns:
        list[str]: One summary per entry, in feed order. Entries without a
        non-empty ``summary`` field are skipped instead of raising ``KeyError``.
    """
    feed = feedparser.parse(url)
    summaries = []
    # A feed that could not be fetched or parsed may expose no "entries" key.
    for entry in feed.get('entries', []):
        summary = entry.get('summary')
        if summary:
            summaries.append(summary)
    return summaries

#function for feedparser usage

In [4]:
import spacy

# spaCy German language model
nlp = spacy.load("de_core_news_sm")

def split_summary_into_sentences(entries):
    """Break every German summary into its individual sentences.

    Each summary is run through the module-level spaCy pipeline ``nlp`` and
    the text of every sentence it reports is collected into one flat list.

    Args:
        entries: Iterable of summary strings.

    Returns:
        list[str]: All sentence texts, in the order of ``entries``.
    """
    return [
        sentence.text
        for summary in entries
        for sentence in nlp(summary).sents
    ]

In [12]:
# Read the whitespace-separated feed URLs; the `with` block closes the file handle.
with open("/content/drive/MyDrive/ceng534/data/rss_german.txt", "r", encoding="utf-8") as file:
    text = file.read()
urls = text.split()
entries = []   # All sentences are here

# Walk the feeds in order until at least 2000 sentences are collected
# or the URL list is exhausted.
for url in urls:
    if len(entries) >= 2000:
        break
    entry = feedparser_from_url(url)
    # Split the fetched summaries into sentences and append them one by one
    entry_sentences = split_summary_into_sentences(entry)
    entries += entry_sentences


import pandas as pd

pd.set_option("display.max_colwidth", None)  # Ensure full content visibility
# De-duplicate once and reuse the frame for both the count and the preview.
unique_entries_df = pd.DataFrame(entries).drop_duplicates(ignore_index=True)
print(f"Total entries (each one is sentence from summary): {len(unique_entries_df)}")
display(unique_entries_df)

Total entries (each one is sentence from summary): 2019


Unnamed: 0,0
0,"Vor zwei Tagen war er im Alter von 92 Jahren verstorben, nun wurde der frühere indische Premier Singh eingeäschert."
1,Zehn Jahre lang regierte er das Land.
2,Viele verbinden mit ihm vor allem seine Wirtschaftsreformen und den Aufstieg Indiens.
3,Am Mittwoch stürzte ein aserbaidschanischen Passagierflugzeug ab.
4,Nun hat sich Präsident Putin bei seinem aserbaidschanischen Amtskollegen Aliyev entschuldigt.
...,...
2014,Viele Gesetzesänderungen betreffen auch die Einkommen vieler Bürger.
2015,Wo ist für Sie nach dem Jahreswechsel mehr Geld drin und wo nicht?
2016,Das zeigt eine aktuelle Forsa-Umfrage.
2017,Wladimir Putin scheut den Einsatz von atomaren Streitkräften nicht.


In [17]:
import spacy
import pandas as pd

# Load German SpaCy model
nlp = spacy.load("de_core_news_sm")

# Create a DataFrame and drop duplicates
df = pd.DataFrame(entries, columns=["text"]).drop_duplicates(ignore_index=True)

# Function to keep entries that contain at least `min_entities` named entities
def filter_by_ner(dataframe, nlp_model, min_entities=1, max_rows=900):
    """Select texts in which the NER model finds enough named entities.

    Args:
        dataframe (pandas.DataFrame): Frame with a ``text`` column.
        nlp_model: Callable returning an object with an ``ents`` sequence
            (a loaded spaCy pipeline in this notebook).
        min_entities (int): Minimum number of entities a text must contain
            to be kept. Defaults to 1.
        max_rows (int | None): Stop once this many texts have been kept;
            ``None`` disables the cap. Defaults to 900.

    Returns:
        pandas.DataFrame: Single ``text`` column with the kept texts, in
        their original order.
    """
    filtered_texts = []
    for text in dataframe["text"]:
        doc = nlp_model(text)
        if len(doc.ents) >= min_entities:
            filtered_texts.append(text)
        # Stop scanning as soon as the cap is reached.
        if max_rows is not None and len(filtered_texts) >= max_rows:
            break
    return pd.DataFrame(filtered_texts, columns=["text"])

# Apply the filter
filtered_df = filter_by_ner(df, nlp)

# Display the filtered DataFrame to the user
display(filtered_df)



Unnamed: 0,text
0,"Vor zwei Tagen war er im Alter von 92 Jahren verstorben, nun wurde der frühere indische Premier Singh eingeäschert."
1,Zehn Jahre lang regierte er das Land.
2,Viele verbinden mit ihm vor allem seine Wirtschaftsreformen und den Aufstieg Indiens.
3,Nun hat sich Präsident Putin bei seinem aserbaidschanischen Amtskollegen Aliyev entschuldigt.
4,"Von einem Abschuss sprach Putin nicht - gab aber zu, dass die Flugabwehr aktiv war."
...,...
895,Von Anna Luca Kirchhoff.
896,Die Altersvorsorge dürfte also Wahlkampfthema werden.
897,Mit welchen Renten-Ideen werben die Parteien?
898,Von Hans-Joachim Vieweger.


# Dataset revision
- Texts that contain no named entities were removed from the dataset.
- Some short texts such as _"Von Hans-Joachim Vieweger."_ were removed from the dataset.
- Dataset length before removal: **900**
- Dataset length after removal: **799**

In [20]:
updated_df = pd.read_csv("/content/drive/MyDrive/ceng534/data/german_data_updated.csv")
display(updated_df.head())

Unnamed: 0,Text,Label
0,"Vor zwei Tagen war er im Alter von 92 Jahren verstorben, nun wurde der frühere indische Premier Singh eingeäschert.",negative
1,Viele verbinden mit ihm vor allem seine Wirtschaftsreformen und den Aufstieg Indiens.,positive
2,Mit den Großaktionären und Gläubigerbanken einigte sich die BayWa nun auf wichtige Schritte auf dem Weg zur angestrebten Sanierung.,positive
3,"Laut dem Deutschen Wetterdienst sammelt sich sich dadurch Feinstaub an, der für schlechtere Luftqualität sorgt.",negative
4,"Der ukrainische Präsident Selenskyj wirft der Slowakei vor, eine ""zweite Energiefront"" gegen die Ukraine zu eröffnen.",negative


In [37]:
# Only the first 78 rows have been labelled so far.
news_texts = updated_df["Text"].tolist()
news_labels = updated_df["Label"].tolist()[:78]
print(news_texts[0])
print(news_labels[77])

Vor zwei Tagen war er im Alter von 92 Jahren verstorben, nun wurde der frühere indische Premier Singh eingeäschert.
positive


# Second Step: Translation

In [None]:
# Use a pipeline as a high-level helper
from transformers import pipeline
import torch

translation_model = "facebook/nllb-200-distilled-600M"

# Run on the GPU when the runtime has one; otherwise fall back to the CPU
# so the cell does not fail on a CPU-only session.
translation_device = "cuda" if torch.cuda.is_available() else "cpu"

translation_pipeline = pipeline("translation", model=translation_model, device=translation_device, max_length = 800)


In [None]:
# Example translation (German -> Turkish).
# NLLB expects FLORES-200 language codes, so German is "deu_Latn" (not "de_Latn").
translated_tr = translation_pipeline(entries[0], src_lang="deu_Latn", tgt_lang="tur_Latn")

print(translated_tr[0]["translation_text"])
print(entries[0])

## Save the translation dataset as CSV

In [None]:
import pandas as pd

# Create a list to store the translations
data = []

# Translate all entries and append to the data list
for entry in entries:
    try:
        # Translate German -> Turkish. NLLB uses FLORES-200 language codes,
        # so the source language must be "deu_Latn" (not "de_Latn").
        translated_tr = translation_pipeline(entry, src_lang="deu_Latn", tgt_lang="tur_Latn")[0]["translation_text"]

        # Append to the data list
        data.append({"German": entry, "Turkish": translated_tr})
    except Exception as e:
        # Best effort: report the failing sentence and continue with the rest.
        print(f"Error translating entry: {entry}. Error: {e}")

# Create a DataFrame
df = pd.DataFrame(data)

# Save to CSV
csv_filename = "/content/drive/MyDrive/ceng534/data/summaries_translations.csv"
df.to_csv(csv_filename, index=False, encoding="utf-8")

print(f"Translations saved to {csv_filename}")

In [None]:
pd.set_option("display.max_colwidth", None)  # Ensure full content visibility

display(df.head())

In [None]:
import pandas as pd

pd.set_option("display.max_colwidth", None)  # Ensure full content visibility

news = pd.read_csv("/content/drive/MyDrive/ceng534/data/summaries_translations.csv")
turkish_text = pd.DataFrame(news["Turkish"])
german_text = pd.DataFrame(news["German"])

display(german_text.head())

## BLEU Score Calculation

### Explanations:
- Google Translate output is used as the reference translation when computing the BLEU score of the translation model.

In [None]:
!pip install deep-translator
!pip install sacrebleu

In [None]:
from deep_translator import GoogleTranslator
import sacrebleu

In [None]:
def translate_to_turkish_with_google(text):
    """Translate German text to Turkish with GoogleTranslator.

    Args:
        text: The German text to translate.

    Returns:
        The translated text, or ``None`` when the translation raised an
        exception (the error is printed, not re-raised).
    """
    try:
        result = GoogleTranslator(source="de", target="tr").translate(text)
    except Exception as err:
        print(f"Error translating text: {text}. Error: {err}")
        result = None
    return result

In [None]:
news["google_translation"] = news["German"].apply(translate_to_turkish_with_google)

In [None]:
news.head()

In [None]:
# Calculate BLEU score between HuggingFace-translated Turkish and Google-translated Turkish
# Keep only rows where BOTH translations are strings, so that hypothesis i is
# still scored against reference i after invalid rows are dropped.
translation_pairs = [
    (hypothesis, reference)
    for hypothesis, reference in zip(news["Turkish"].tolist(), news["google_translation"].tolist())
    if isinstance(hypothesis, str) and isinstance(reference, str)
]
huggingface_translations = [hypothesis for hypothesis, _ in translation_pairs]
google_translations = [reference for _, reference in translation_pairs]

# Calculate corpus BLEU score; the hypothesis list and the reference list
# must have the same length and the same order.
bleu_score = sacrebleu.corpus_bleu(huggingface_translations, [google_translations])
print(f"BLEU Score: {bleu_score.score}")

# Third Step: NER Tagging

## German text

In [None]:
import spacy
import pandas as pd

# Load SpaCy's German language model
nlp = spacy.load("de_core_news_sm")

# Example German text DataFrame (replace with your actual DataFrame)
# german_text = pd.read_csv("path_to_your_file.csv")  # Uncomment if reading from a file
# Assuming german_text is already loaded with a column 'German'

# Function to extract entities from text
def extract_entities(text):
    """Return the named entities the module-level ``nlp`` pipeline finds in ``text``.

    Args:
        text: A string, or a missing value as recognised by ``pd.isnull``.

    Returns:
        list[tuple[str, str]]: ``(entity text, entity label)`` pairs; an empty
        list when ``text`` is missing.
    """
    entities = []
    if not pd.isnull(text):
        for span in nlp(text).ents:
            entities.append((span.text, span.label_))
    return entities

# Apply the NER function to the German column
german_text['entities'] = german_text['German'].apply(extract_entities)


# Save the results to a CSV file for further analysis (optional)
#german_text.to_csv("labeled_german_text.csv", index=False)
#print("Labeled data saved to 'labeled_german_text.csv'.")


In [None]:
# Print the updated DataFrame
display(german_text.head(15))

## Turkish Text

In [None]:
import pandas as pd
from transformers import pipeline, AutoTokenizer, AutoModelForTokenClassification
from transformers import AutoModelForSequenceClassification
import torch

In [None]:
def _merge_subword_tokens(ner_result):
    """Join WordPiece continuation pieces ("##...") onto the token they continue."""
    merged = []
    for token in ner_result:
        word = token['word']
        is_piece = word.startswith("##")
        # Only glue a piece onto the previous token when it directly follows it
        # in the text; otherwise it would be attached to an unrelated word.
        # (When the pipeline reports no offsets both values are None and compare equal.)
        if is_piece and merged and token['start'] == merged[-1]["end"]:
            merged[-1]["word"] += word[2:]
            merged[-1]["end"] = token['end']
        else:
            merged.append({
                # An orphan piece starts its own entry, without the "##" marker.
                "word": word[2:] if is_piece else word,
                "entity": token['entity'],
                "start": token['start'],
                "end": token['end']
            })
    return merged


def process_ner(df):
    """
    Process Turkish text with NER tagging and fix splitting issues with "##".

    Args:
        df (pandas.DataFrame): DataFrame containing Turkish sentences in 'Turkish' column.

    Returns:
        pandas.DataFrame: Copy of ``df`` with an added 'NER_Tags' column holding
        "word: TAG, word: TAG" strings, or "No entities found" when the sentence
        is missing, empty, or contains no entities.
    """
    # Initialize NER pipeline
    ner_tokenizer = AutoTokenizer.from_pretrained("akdeniz27/bert-base-turkish-cased-ner")
    ner_model = AutoModelForTokenClassification.from_pretrained("akdeniz27/bert-base-turkish-cased-ner")
    ner = pipeline("ner", model=ner_model, tokenizer=ner_tokenizer)

    # Create a copy of the dataframe so the caller's frame is left untouched
    result_df = df.copy()

    ner_results = []

    for sentence in df['Turkish']:
        # Rows read back from CSV can be NaN; the pipeline only accepts text.
        if not isinstance(sentence, str) or not sentence.strip():
            ner_results.append("No entities found")
            continue

        fixed_ner_result = _merge_subword_tokens(ner(sentence))

        ner_results.append(", ".join([f"{e['word']}: {e['entity']}" for e in fixed_ner_result]) if fixed_ner_result else "No entities found")

    # Add results to dataframe
    result_df['NER_Tags'] = ner_results

    return result_df


In [None]:
# Read your dataset (adjust the path as needed)
df = pd.read_csv('/content/drive/MyDrive/ceng534/data/summaries_translations.csv')

In [None]:
# Process NER
ner_results = process_ner(df)
print("NER processing completed!")
display(ner_results.head(15))  # Display first few rows

In [None]:
def tagged_sentences(df):
    """
    Build the Tagged_Sentence column by wrapping every recognised entity word
    in its NER tag, e.g. [B-ORG:word], [I-ORG:word], [B-LOC:word].

    Args:
        df (pandas.DataFrame): DataFrame with 'NER_Tags' ("word: TAG, word: TAG"
            or "No entities found") and 'Turkish' columns.

    Returns:
        pandas.DataFrame: The SAME DataFrame object, modified in place with a
        'Tagged_Sentence' column added.
    """
    fixed_sentences = []

    for ner_tags, sentence in zip(df['NER_Tags'], df['Turkish']):
        # Missing sentences cannot be tagged; pass them through unchanged.
        if not isinstance(sentence, str):
            fixed_sentences.append(sentence)
            continue

        # Parse NER_Tags into (word, entity) pairs, keeping their listed order
        entities = []
        if isinstance(ner_tags, str) and ner_tags != "No entities found":
            for tag in ner_tags.split(", "):
                # Split on the LAST ": " so a word that itself contains ": " survives.
                word, separator, entity = tag.rpartition(": ")
                if separator and word:
                    entities.append((word, entity))

        corrected_sentence = sentence
        # Position just after the last inserted tag. Searching from here keeps a
        # repeated word from being matched inside a tag that was already written.
        cursor = 0

        for word, entity in entities:
            start_pos = corrected_sentence.find(word, cursor)
            if start_pos == -1:
                continue  # word not present after the previous tag; leave the text as is
            tagged_word = f"[{entity}:{word}]"
            corrected_sentence = (
                corrected_sentence[:start_pos]
                + tagged_word
                + corrected_sentence[start_pos + len(word):]
            )
            cursor = start_pos + len(tagged_word)

        fixed_sentences.append(corrected_sentence)

    df['Tagged_Sentence'] = fixed_sentences
    return df

In [None]:
ner_sentences_df = tagged_sentences(ner_results)
ner_sentences_df.to_csv("/content/drive/MyDrive/ceng534/data/ner_tagged_sentences.csv")
ner_sentences_df.head(15)

# Fourth Step: Sentiment Analysis

#### Models
1. "savasy/bert-base-turkish-sentiment-cased"
2. "Gorengoz/bert-turkish-sentiment-analysis-cased"
3. "saribasmetehan/bert-base-turkish-sentiment-analysis"

## Zero-Shot Sentiment Analysis

In [None]:
'''
# Load Turkish Sentiment Analysis model
model1 = "savasy/bert-base-turkish-sentiment-cased"
model2 = "Gorengoz/bert-turkish-sentiment-analysis-cased"
model3 = "saribasmetehan/bert-base-turkish-sentiment-analysis"

sentiment_pipeline = pipeline(
    "sentiment-analysis", model=model2
)
'''

In [None]:
def process_sentiment(df):
    """
    Run the Turkish sentiment model over every tagged sentence.

    Args:
        df (pandas.DataFrame): DataFrame containing tagged sentences in 'Tagged_Sentence' column

    Returns:
        pandas.DataFrame: Copy of ``df`` with 'Sentiment_Label' and
        'Sentiment_Score' (rounded to 3 decimals) columns added
    """
    # Build the sentiment-analysis pipeline from the pretrained checkpoint
    sentiment_tokenizer = AutoTokenizer.from_pretrained("savasy/bert-base-turkish-sentiment-cased")
    sentiment_model = AutoModelForSequenceClassification.from_pretrained("savasy/bert-base-turkish-sentiment-cased")
    sentiment = pipeline("sentiment-analysis", model=sentiment_model, tokenizer=sentiment_tokenizer)

    labels = []
    scores = []
    for tagged_sentence in df['Tagged_Sentence']:
        # The pipeline returns a list; its first item is the prediction for this sentence
        top = sentiment(tagged_sentence)[0]
        labels.append(top['label'])
        scores.append(round(top['score'], 3))

    # Attach the predictions to a copy so the input frame stays unchanged
    result_df = df.copy()
    result_df['Sentiment_Label'] = labels
    result_df['Sentiment_Score'] = scores

    return result_df

In [None]:
# Process Sentiment
final_results = process_sentiment(ner_results)
print("Sentiment analysis completed!")

In [None]:
display(final_results.head(15))  # Display first few rows

In [None]:
# Save results
final_results.to_csv('/content/drive/MyDrive/ceng534/data/zeroshot_sentiment.csv', index=False)
print("Results saved to '/content/drive/MyDrive/ceng534/data/zeroshot_sentiment.csv'")

## Fine-Tuning for Few-shot Sentiment Analysis

In [None]:
!pip install transformers[torch]
!pip install accelerate>=0.26.0
!pip install -q scikit-learn
!pip install datasets

In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from datasets import Dataset

In [None]:
def preprocess_dataset(df, model_name="savasy/bert-base-turkish-sentiment-cased"):
    """
    Preprocess the labeled dataset by tokenizing sentences and converting labels to integers.

    Args:
        df (pandas.DataFrame): DataFrame containing 'Tagged_Sentence' and 'Sentiment_Label' columns.
            The frame is not modified.
        model_name (str): Pretrained model name for tokenization.

    Returns:
        Dataset: Tokenized HuggingFace Dataset with labels included.

    Raises:
        ValueError: If a label is neither "positive" nor "negative"
            (compared case-insensitively, ignoring surrounding whitespace).
    """
    tokenizer = AutoTokenizer.from_pretrained(model_name)

    # Map sentiment labels to integers
    label_mapping = {"positive": 1, "negative": 0}

    # Work on a copy: mapping in place would change the caller's frame and
    # turn every label into NaN if the function were run a second time.
    prepared_df = df.copy()
    normalized = prepared_df["Sentiment_Label"].astype(str).str.strip().str.lower()
    mapped = normalized.map(label_mapping)
    if mapped.isna().any():
        unknown = sorted(set(normalized[mapped.isna()]))
        raise ValueError(
            f"Unsupported sentiment labels {unknown}; expected one of {sorted(label_mapping)}"
        )
    prepared_df["Sentiment_Label"] = mapped.astype(int)

    # Encode the dataset
    def preprocess_function(examples):
        tokenized = tokenizer(
            examples["Tagged_Sentence"],
            truncation=True,
            padding="max_length",
            max_length=128
        )
        tokenized["label"] = examples["Sentiment_Label"]  # Map the label column
        return tokenized

    # Convert DataFrame to Dataset and preprocess
    dataset = Dataset.from_pandas(prepared_df)
    encoded_dataset = dataset.map(preprocess_function, batched=True)

    return encoded_dataset

In [None]:
def split_dataset(encoded_dataset, train_ratio=0.8):
    """
    Cut the dataset into a leading training part and a trailing evaluation part.

    The rows keep their order: the first ``int(train_ratio * len)`` rows go to
    training and the remaining rows go to evaluation.

    Args:
        encoded_dataset (Dataset): Tokenized HuggingFace Dataset.
        train_ratio (float): Ratio of data to use for training (default: 0.8).

    Returns:
        train_dataset, eval_dataset: Training and evaluation datasets.
    """
    total = len(encoded_dataset)
    cutoff = int(train_ratio * total)

    head = encoded_dataset.select(range(cutoff))
    tail = encoded_dataset.select(range(cutoff, total))
    return head, tail

In [None]:
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
import numpy as np

def compute_metrics(pred):
    """
    Score predictions with accuracy, precision, recall, and F1.

    Args:
        pred: Object exposing ``predictions`` (per-class scores, one row per
            example) and ``label_ids`` (the reference labels).

    Returns:
        dict: Keys "accuracy", "f1", "precision", "recall".
    """
    gold = pred.label_ids
    # The predicted class is the index of the largest score in each row.
    guessed = np.argmax(pred.predictions, axis=1)

    # Binary averaging: precision/recall/F1 are reported for the positive class.
    scores = precision_recall_fscore_support(gold, guessed, average="binary")

    return {
        "accuracy": accuracy_score(gold, guessed),
        "f1": scores[2],
        "precision": scores[0],
        "recall": scores[1],
    }


In [None]:
def train_sentiment_model(train_dataset, eval_dataset, model_name="savasy/bert-base-turkish-sentiment-cased"):
    """
    Train a sentiment analysis model with the training dataset.

    Args:
        train_dataset (Dataset): Tokenized training dataset.
        eval_dataset (Dataset): Tokenized evaluation dataset.
        model_name (str): Pretrained model name.

    Returns:
        Trainer: Trained sentiment analysis model trainer.
        AutoModelForSequenceClassification: Trained sentiment analysis model.
    """
    import inspect

    # Load pretrained model
    model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)

    # Newer transformers releases renamed `evaluation_strategy` to `eval_strategy`.
    # Use whichever keyword the installed TrainingArguments accepts, so the
    # function runs on both older and newer versions.
    accepted = inspect.signature(TrainingArguments.__init__).parameters
    strategy_key = "eval_strategy" if "eval_strategy" in accepted else "evaluation_strategy"

    # Define the training arguments
    training_args = TrainingArguments(
        output_dir="./results",
        learning_rate=2e-5,
        per_device_train_batch_size=2,
        per_device_eval_batch_size=16,
        num_train_epochs=5,
        weight_decay=0.01,
        logging_dir="./logs",
        logging_steps=10,
        # Evaluation and checkpointing both run once per epoch, which
        # load_best_model_at_end needs in order to compare checkpoints.
        save_strategy="epoch",
        load_best_model_at_end=True,
        **{strategy_key: "epoch"}
    )

    # Initialize Trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        compute_metrics=compute_metrics
    )

    # Train the model
    trainer.train()

    return trainer, model

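- **WANDB API Key:** _(redacted — never commit credentials to a notebook; revoke the key that was published here and supply a new one through the `WANDB_API_KEY` environment variable or the interactive `wandb login` prompt)_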

In [None]:
import pandas as pd

# Step 1: Preprocess the dataset
model_name = "savasy/bert-base-turkish-sentiment-cased"

# Training labels come from the 'Sentiment_Label' column of the zero-shot results file.
labeled_df = pd.read_csv('/content/drive/MyDrive/ceng534/data/zeroshot_sentiment.csv')[["Tagged_Sentence", "Sentiment_Label"]]

encoded_dataset = preprocess_dataset(labeled_df, model_name)

# Step 2: Split the dataset
train_dataset, eval_dataset = split_dataset(encoded_dataset)

print(train_dataset)

# Step 3: Train the model
trainer, trained_model = train_sentiment_model(train_dataset, eval_dataset, model_name)

# Save the trained model
trained_model_dir = "/content/drive/MyDrive/ceng534/trained_models/trained_sentiment_model1"
trained_model.save_pretrained(trained_model_dir)
# Report the directory that was actually written.
print(f"Trained model saved to '{trained_model_dir}'")