In [None]:
import pandas as pd
import zemberek as zp
from zemberek.normalization.turkish_sentence_normalizer import TurkishSentenceNormalizer
from tqdm import tqdm
import signal

In [None]:
# Load the annotated tweet sheet and keep only the two columns used below:
# position 1 is later read as the tweet text and position 2 as its label.
# The first data row is dropped — presumably a sub-header row; TODO confirm
# against the spreadsheet.
dataframe = pd.read_excel(
    "Türkçe Nefret Söylemi Veri Seti_1k.xlsx",
    sheet_name='1000 Tweet',
).iloc[1:, [1, 2]]

dataframe

In [None]:
# Create the Zemberek morphological analyzer with its default settings and a
# sentence normalizer built on top of it. Both objects are used by the
# feature-extraction loop that follows.
morphology = zp.TurkishMorphology.create_with_defaults()
normalizer = TurkishSentenceNormalizer(morphology)

In [None]:
# Build a bag-of-stems table: one row per tweet, one column per stem seen in
# the corpus, each cell holding how often that stem occurs in that tweet.
# Interrupting the kernel stops the loop after the current tweet and keeps the
# rows collected so far.
cancaled = False

def handler(signum, frame):
    """SIGINT handler: ask the extraction loop to stop at its next iteration."""
    global cancaled
    cancaled = True

tweets = []        # tweet text of every kept row, newlines flattened to spaces
tweet_labels = []  # label string of every kept row
stem_counts = []   # per kept row: dict mapping stem -> occurrences in the tweet

# signal.signal returns the handler that was active before, so it can be put
# back once the loop is done; otherwise later cells could not be interrupted.
previous_handler = signal.signal(signal.SIGINT, handler)
try:
    for index, row in tqdm(dataframe.iterrows(), total=len(dataframe)):
        if cancaled:
            break
        # Explicit positional access: integer keys on a label-indexed Series
        # are deprecated as positional lookups.
        tweet, label = row.iloc[0], row.iloc[1]
        # Rows without a usable text label cannot be used for training.
        if not isinstance(label, str) or label == "":
            continue
        try:
            normalizedSentence = normalizer.normalize(tweet)
            analysis = morphology.analyze_sentence(normalizedSentence)
            after = morphology.disambiguate(normalizedSentence, analysis)
        except Exception as error:
            # Report the raw tweet: the normalized text may not exist yet (or
            # may still belong to the previous row) when normalization fails.
            print("Error: ", tweet, "-", repr(error))
            continue
        words = {}
        for word in after.best_analysis():
            stem = word.get_stem()
            words[stem] = words.get(stem, 0) + 1
        tweets.append(str(tweet).replace('\n', ' '))
        tweet_labels.append(label)
        stem_counts.append(words)
finally:
    signal.signal(signal.SIGINT, previous_handler)

# Assemble the column-oriented table. Stem columns are created in first-seen
# order, so the layout is the same on every run.
resultDict = {'__tweet__': tweets, '__label__': tweet_labels}
for words in stem_counts:
    for stem in words:
        if stem not in resultDict:
            resultDict[stem] = [0] * len(tweets)
for position, words in enumerate(stem_counts):
    for stem, count in words.items():
        resultDict[stem][position] = count

In [None]:
# Spot check: show the second entry of the table as a (key, values) pair,
# which is the label column.
list(resultDict.items())[1]

In [None]:
# Turn the column-oriented dict into a frame: tweet text, label, then one
# count column per stem.
resultDataFrame = pd.DataFrame(resultDict)
resultDataFrame

In [None]:
# Persist the table. The row index is written as well, so reading the file
# back yields one extra leading column; the positional slicing in the
# information-gain section depends on that layout.
resultDataFrame.to_csv("result.csv")

Information Gain

In [None]:
import pandas as pd

In [None]:
# Reload the stem-count table written by the previous section. The saved row
# index comes back as the first column, so the tweet text sits at position 1,
# the label at position 2, and the stem counts from position 3 onwards.
data = pd.read_csv("result.csv")
data

In [None]:
# Export the full stem-frequency table to a spreadsheet.
data.to_excel("kelime_frekans.xlsx")

In [None]:
# Encode the label strings as integer class ids; both spellings of the second
# and of the third class map to the same id.
labels = { "hiçbiri" : 0, "nefret söylemi": 1, "nefret": 1,  "saldırgan": 2, "saldırganlık": 2 }
# Only encode while the column still holds text, so that re-running this cell
# after the labels were already converted is a no-op instead of an error.
if not pd.api.types.is_numeric_dtype(data["__label__"]):
    # strip() tolerates stray whitespace around a label; an unknown label still
    # raises KeyError so that it cannot slip through unnoticed.
    data["__label__"] = [labels[x.strip().lower()] for x in data["__label__"]]
data

In [None]:
# Split the table by position: column 2 holds the encoded label and every
# column from position 3 onwards is a stem-count feature.
labels = data.iloc[:, 2]
features = data.iloc[:, 3:]

In [None]:
import numpy as np
# Shape check: show the dimensions of the feature matrix and of the label vector.
tuple(np.shape(part) for part in (features, labels))

In [None]:
from sklearn.feature_selection import mutual_info_classif

# Score every stem column by its mutual information with the class label.
# The features are passed as discrete because they are occurrence counts.
info_gain = mutual_info_classif(
    features.to_numpy(),
    labels.to_numpy(),
    discrete_features=True,
)


In [None]:
# Rank feature positions by information gain, highest first. The sort is
# stable, so features with equal scores keep their original column order.
info_gain_set = sorted(enumerate(info_gain), key=lambda pair: pair[1], reverse=True)
top_1000 = info_gain_set[:1000]

# Show the complete ranking as (stem, score) pairs.
[(features.columns[position], score) for position, score in info_gain_set]

In [None]:
# Keep the tweet text, its label, and the highest-ranked stem columns.
result = data[
    [
        "__tweet__",
        "__label__",
        *(features.columns[position] for position, _score in top_1000),
    ]
]
result

In [None]:
# Export the reduced table (tweet, label, selected stem counts) to a spreadsheet.
result.to_excel("top_1000_kelime_frekans.xlsx")

In [None]:
# Optional: uncomment the lines below to evaluate on every stem feature, skipping the information-gain selection
# result = data.iloc[:, 1:]
# result

In [None]:
from sklearn.metrics import accuracy_score, f1_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split

# Repeated hold-out evaluation of a 1-nearest-neighbour classifier: 1000 random
# 70/30 splits, recording accuracy and macro-averaged F1 for each split.
accuracy_result = []

# The feature matrix and the target do not change between repetitions, so they
# are sliced once: column 1 is the label, columns 2 onwards are the features.
X_all = result.iloc[:, 2:]
y_all = result.iloc[:, 1]

for i in tqdm(range(1000)):
    knn = KNeighborsClassifier(n_neighbors=1, metric='euclidean')

    # Seed each split with the repetition number: every repetition still gets
    # a different split, but the reported averages are the same on every run.
    X_train, X_test, y_train, y_test = train_test_split(
        X_all, y_all, test_size=0.3, random_state=i
    )

    knn.fit(X_train, y_train)
    y_pred = knn.predict(X_test)

    accuracy = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred, average="macro")

    accuracy_result.append((accuracy, f1))

# NOTE: `knn` stays bound to the model of the last repetition; the manual
# prediction section below uses it.

# Mean accuracy and mean macro-F1 over all repetitions.
(sum([x[0] for x in accuracy_result]) / len(accuracy_result), sum([x[1] for x in accuracy_result]) / len(accuracy_result))

Manual Prediction

In [None]:
# Re-create the Zemberek analyzer and sentence normalizer used by predSentence
# (the same two calls as in the feature-extraction section).
morphology = zp.TurkishMorphology.create_with_defaults()
normalizer = TurkishSentenceNormalizer(morphology)

In [None]:
# Empty frame whose columns are the selected stem features, i.e. everything
# after the tweet and label columns of `result`. Only its column index is used
# later, as the feature layout for single-sentence predictions.
test_data = pd.DataFrame(data=[], columns=result.columns[2:])
result.columns[2:]

In [None]:

def predSentence(sentence: str, columns, model=None):
    """Classify one sentence from its stem counts.

    The sentence is normalized and morphologically disambiguated with the
    notebook-level `normalizer` and `morphology`, reduced to stem counts, and
    laid out as a single-row frame with one column per entry of `columns`.

    Args:
        sentence: Raw text to classify.
        columns: Feature (stem) names; expected to match the columns the
            classifier was fitted on.
        model: Fitted classifier exposing `predict`. When omitted, the
            notebook-level `knn` is used, which keeps existing two-argument
            calls working.

    Returns:
        A two-item list `[prediction, matches]`, where `matches` maps every
        feature stem that occurs in the sentence to its count.
    """
    classifier = knn if model is None else model

    normalizedSentence = normalizer.normalize(sentence)
    analysis = morphology.analyze_sentence(normalizedSentence)
    after = morphology.disambiguate(normalizedSentence, analysis)

    # Count stems; the loop variable is deliberately not called `sentence`,
    # so the parameter is not shadowed.
    words = {}
    for word in after.best_analysis():
        stem = word.get_stem()
        words[stem] = words.get(stem, 0) + 1

    # Stems outside `columns` are ignored; absent features are filled with 0.
    matches = {column: words[column] for column in columns if column in words}
    feature_row = {column: words.get(column, 0) for column in columns}

    parsed_data = pd.DataFrame(data=[feature_row])
    return [classifier.predict(parsed_data)[0], matches]

In [None]:
# Paste the text to classify between the triple quotes.
sentence = """"""
# predSentence requires the feature columns as its second argument; calling it
# with the sentence alone raises TypeError.
predSentence(sentence, test_data.columns)

In [None]:
# Reload the annotated sheet and keep the same slice as in the first section:
# all rows after the first, columns at positions 1 (tweet) and 2 (label).
dataset_1k = pd.read_excel("Türkçe Nefret Söylemi Veri Seti_1k.xlsx", sheet_name="1000 Tweet")
dataframe_1k = dataset_1k.iloc[1:, [1, 2]]
dataframe_1k

In [None]:
# Predict a label for every tweet of the sheet and keep the matched stems
# alongside the annotated label for inspection.
columns = test_data.columns
# NOTE: this rebinds `result` from the selected-feature frame to a list of
# prediction records. Re-run the feature-selection cell before re-running the
# evaluation or `test_data` cells.
result = []
for index, row in tqdm(dataframe_1k.iterrows(), total=len(dataframe_1k)):
    # Explicit positional access: integer keys on a label-indexed Series are
    # deprecated as positional lookups.
    tweet, label = row.iloc[0], row.iloc[1]
    if not isinstance(tweet, str):
        # Empty cells are read as NaN; the normalizer expects text, so skip them.
        continue
    pred, matches = predSentence(tweet, columns)
    result.append(
        {
            "tweet": tweet,
            "label": label,
            "pred": pred,
            "matches": matches
        }
    )

In [None]:
# One row per predicted tweet: text, annotated label, predicted class, and the
# matched stems. Written to a spreadsheet for manual review.
df = pd.DataFrame(result)
df.to_excel("pred_1k.xlsx")