In [6]:
import json

with open("data/merged.json", "r", encoding="utf-8") as f:
    data = json.load(f)


In [7]:
TRAIN_DATA = []

for item in data:
    text = item["text"]
    entities = []

    for ent in item["entities"]:
        start = ent["start_offset"]
        end = ent["end_offset"]
        label = ent["label"]
        entities.append((start, end, label))

    TRAIN_DATA.append((text, {"entities": entities}))


In [8]:
import spacy
from spacy.training import Example
import re

def preprocess_text(text):
    text = text.lower()
    text = re.sub(r'\s+', ' ', text).strip()
    return text

def adjust_entity_offsets(text, original_text, entities):
    adjusted_entities = []
    original_words = original_text.split()
    processed_words = text.split()

    offset_map = {}
    original_idx = 0
    for word in processed_words:
        while original_idx < len(original_words) and original_words[original_idx].lower() != word:
            original_idx += 1
        if original_idx < len(original_words):
            offset_map[original_idx] = len(' '.join(processed_words[:processed_words.index(word)]))
            original_idx += 1

    for start, end, label in entities:
        new_start = offset_map.get(start)
        new_end = offset_map.get(end-1) + len(original_words[end-1]) if end-1 in offset_map else end
        if new_start is not None and new_end is not None:
            adjusted_entities.append((new_start, new_end, label))

    return adjusted_entities

def preprocess_data(data):
    preprocessed_data = []
    for text, annotations in data:
        preprocessed_text = preprocess_text(text)
        adjusted_entities = adjust_entity_offsets(preprocessed_text, text, annotations.get("entities"))
        filtered_entities = filter_overlapping_entities(adjusted_entities)
        preprocessed_data.append((preprocessed_text, {"entities": filtered_entities}))
    return preprocessed_data

def filter_overlapping_entities(entities):
    entities = sorted(entities, key=lambda x: x[0]) 
    filtered_entities = []
    last_end = -1

    for start, end, label in entities:
        if start >= last_end:
            filtered_entities.append((start, end, label))
            last_end = end

    return filtered_entities

TRAIN_DATA = preprocess_data(TRAIN_DATA)
nlp = spacy.blank("tr")
ner = nlp.add_pipe("ner")


for _, annotations in TRAIN_DATA:
    for ent in annotations.get("entities"):
        ner.add_label(ent[2])

optimizer = nlp.begin_training()

for i in range(10):
    losses = {}
    for text, annotations in TRAIN_DATA:
        example = Example.from_dict(nlp.make_doc(text), annotations)
        nlp.update([example], losses=losses, drop=0.5, sgd=optimizer)
    print(losses)




{'ner': 469.71398532405635}
{'ner': 82.05353517820689}
{'ner': 72.85890368044286}
{'ner': 76.08226929968654}
{'ner': 75.654324968596}
{'ner': 54.67932655330826}
{'ner': 71.34317894768604}
{'ner': 62.58624419669633}
{'ner': 70.93654543873254}
{'ner': 70.46991727399815}


In [9]:
nlp.to_disk("ner_model")


In [10]:
import spacy
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

nlp = spacy.load("ner_model")
analyzer = SentimentIntensityAnalyzer()

text = "Turkcell akıllı cihaz eski tip olan t40 ve t50 yi kullandim ve t50 yi kullaniyorum hala gayet de iyi bi telefon t50 nin arka kapak yokdu sabit bir kasasi vardi ariza durumynda tlf cope gidiyordu tek sorun buydu yeni modellerde bunu ortadan kaldirmislar gordugum kadariyla seyyar kapak yok ise tavsiye etmiyorum geri kalan bu fiyatta normal ozellikler"
doc = nlp(text)

for ent in doc.ents:
    ent_text = ent.text
    ent_start = ent.start_char
    ent_end = ent.end_char
    ent_context = text[ent_start:ent_end]


    ent_sentiment = analyzer.polarity_scores(ent_context)
    sentiment_label = 'Neutral'
    if ent_sentiment['compound'] >= 0.05:
        sentiment_label = 'Positive'
    elif ent_sentiment['compound'] <= -0.05:
        sentiment_label = 'Negative'

    print(f"Entity: {ent.text}, Label: {ent.label_}, Sentiment: {sentiment_label}")

Entity: Turkcell akıllı cihaz eski tip olan t40 ve t50 yi kullandim ve t50 yi kullaniyorum hala gayet de iyi bi telefon t50 nin arka kapak yokdu sabit bir kasasi vardi ariza durumynda tlf cope gidiyordu tek sorun buydu yeni modellerde bunu ortadan kaldirmislar gordugum kadariyla seyyar kapak yok ise tavsiye etmiyorum geri kalan bu fiyatta normal ozellikler, Label: Turkcell, Sentiment: Neutral


In [11]:
nlp.to_disk('ner_model')

In [12]:
import spacy
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
import json

nlp = spacy.load("ner_model")
analyzer = SentimentIntensityAnalyzer()

text = "Turkcell akıllı cihaz eski tip olan t40 ve t50 yi kullandim ve t50 yi kullaniyorum hala gayet de iyi bi telefon t50 nin arka kapak yokdu sabit bir kasasi vardi ariza durumynda tlf cope gidiyordu tek sorun buydu yeni modellerde bunu ortadan kaldirmislar gordugum kadariyla seyyar kapak yok ise tavsiye etmiyorum geri kalan bu fiyatta normal ozellikler"
doc = nlp(text)

entity_list = []
results = []

for ent in doc.ents:
    ent_text = ent.text
    ent_start = ent.start_char
    ent_end = ent.end_char
    ent_context = text[ent_start:ent_end]
    ent_sentiment = analyzer.polarity_scores(ent_context)
    sentiment_label = 'nötr'
    if ent_sentiment['compound'] >= 0.05:
        sentiment_label = 'olumlu'
    elif ent_sentiment['compound'] <= -0.05:
        sentiment_label = 'olumsuz'
    entity_list.append(ent_text)
    results.append({
        "entity": ent_text,
        "sentiment": sentiment_label
    })
output = {
    "entity_list": entity_list,
    "results": results
}
print(json.dumps(output, ensure_ascii=False, indent=4))


{
    "entity_list": [
        "Turkcell akıllı cihaz eski tip olan t40 ve t50 yi kullandim ve t50 yi kullaniyorum hala gayet de iyi bi telefon t50 nin arka kapak yokdu sabit bir kasasi vardi ariza durumynda tlf cope gidiyordu tek sorun buydu yeni modellerde bunu ortadan kaldirmislar gordugum kadariyla seyyar kapak yok ise tavsiye etmiyorum geri kalan bu fiyatta normal ozellikler"
    ],
    "results": [
        {
            "entity": "Turkcell akıllı cihaz eski tip olan t40 ve t50 yi kullandim ve t50 yi kullaniyorum hala gayet de iyi bi telefon t50 nin arka kapak yokdu sabit bir kasasi vardi ariza durumynda tlf cope gidiyordu tek sorun buydu yeni modellerde bunu ortadan kaldirmislar gordugum kadariyla seyyar kapak yok ise tavsiye etmiyorum geri kalan bu fiyatta normal ozellikler",
            "sentiment": "nötr"
        }
    ]
}
