# Named Entity Extraction using GLiNER

### Team: Tensor Titans
### Ghazal Askari, Mohammadreza Vilani, Sepideh Soleimanian, Amirhosein Rajabi, Yasamin Sarrafi

In [None]:
import os
import warnings

# Print the full path of every file found under /kaggle/input.
for root, _, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# Suppress all warnings from this point on.
warnings.filterwarnings('ignore')

In [None]:
!pip install -q gliner
from gliner import GLiNER

In [None]:
# Other pretrained checkpoints are listed at: https://huggingface.co/urchade

# Load the "urchade/gliner_multi-v2.1" checkpoint by name.
model = GLiNER.from_pretrained("urchade/gliner_multi-v2.1")
# NOTE(review): eval() presumably switches the model to inference mode
# (no training-only behaviour) — confirm against the GLiNER documentation.
model.eval()
# Marker that the cell ran to completion.
print("ok")

#### Here, we verify that the model works on a simple sentence:

In [None]:
# TODO: edit the label set used for this quick check

text = """
پرستو در پایان هفته به رهنما کالج رفت و در اسنپ با دوستانش درباره کتاب موراکامی -کافکا در کرانه- صحبت کرد
"""

labels = [
    "person",
    "location",
    "date",
    "organization",
    "book",
]

# Lowering the threshold makes the model return more entities.
entities = model.predict_entities(text, labels, threshold=0.2)

# One "text => label" line per predicted entity.
for entity in entities:
    print(f'{entity["text"]} => {entity["label"]}')

In [None]:
import pandas as pd
import numpy as np

# Load the CSV and keep only the Persian and English columns.
df = pd.read_csv(
    "/kaggle/input/cleaned-infopannki-fa/cleaned_infopankki-fa.csv"
)[["Persian", "English"]]

In [None]:
# Entity types the model is asked to look for in every sentence.
labels = [
    "person",
    "location",
    "date",
    "organization",
    "language",
    "book",
    "movie",
]

# Function to get tags for a sentence
def get_tags_for_sentence(sentence, model, labels, threshold=0.3):
    """Tag every whitespace-separated word of ``sentence`` with an entity label.

    Words are aligned with entities through the ``"start"``/``"end"``
    character offsets reported by the model, so repeated words and words with
    attached punctuation are tagged at the right position. When an entity
    carries no offsets, the first exact run of matching words is used instead;
    an entity that cannot be located is skipped rather than raising.

    Args:
        sentence: Text to tag. A value that is not a ``str`` (for example a
            missing pandas cell) is treated as having no words.
        model: Object exposing ``predict_entities(text, labels, threshold=...)``
            returning an iterable of dicts with ``"text"`` and ``"label"`` keys
            and, optionally, ``"start"``/``"end"`` offsets into ``sentence``
            (``end`` exclusive).
        labels: Entity labels forwarded to the model.
        threshold: Score threshold forwarded to the model.

    Returns:
        One tag per word joined by single spaces: the entity label for words
        that overlap an entity, ``'0'`` otherwise. Returns ``''`` when the
        sentence has no words. If entities overlap, the later one wins.
    """
    # Non-string cells and blank strings have nothing to tag.
    if not isinstance(sentence, str):
        return ''
    words = sentence.split()
    if not words:
        return ''

    # Character span [begin, end) of each word inside the sentence.
    spans = []
    cursor = 0
    for word in words:
        begin = sentence.index(word, cursor)
        cursor = begin + len(word)
        spans.append((begin, cursor))

    tags = ['0'] * len(words)
    for entity in model.predict_entities(sentence, labels, threshold=threshold):
        start, end = entity.get("start"), entity.get("end")
        if start is not None and end is not None:
            # Any word sharing at least one character with the entity span.
            hits = [i for i, (b, e) in enumerate(spans) if b < end and e > start]
        else:
            # Fallback: locate the entity as an exact run of words.
            entity_words = entity["text"].split()
            width = len(entity_words)
            first = next(
                (i for i in range(len(words) - width + 1)
                 if words[i:i + width] == entity_words),
                None,
            )
            hits = range(first, first + width) if first is not None and width else ()
        for i in hits:
            tags[i] = entity["label"]

    return ' '.join(tags)

#### Using the function get_tags_for_sentence, we run NER on the Persian column of our dataset and save the tags as a string in the column 'Fa NER'

In [None]:
# Tag every Persian sentence; each 'Fa NER' cell holds the tag string
# returned by get_tags_for_sentence for that row.
df['Fa NER'] = df['Persian'].apply(lambda x: get_tags_for_sentence(x, model, labels))

# Save the DataFrame to a CSV file (path is relative to the working directory)
output_csv = 'ner_infopannki_output.csv'
df.to_csv(output_csv, index=False)

# Report the file that was actually written; the previous message named
# ner_output.csv, which this cell never creates.
print(f"NER tagging completed and saved to {output_csv}")

#### Now we can look at a slice of the rows (position 80 onward) to inspect the new NER column.

In [None]:
# Reload the saved results from disk.
df_output = pd.read_csv('/kaggle/working/ner_infopannki_output.csv')

# Show each sentence next to its tags, for rows at position 80 onward.
df_output.iloc[80:][["Persian", "Fa NER"]]