In [5]:
import nltk
import time
import pandas as pd

nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\mehmet\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\mehmet\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [6]:
dataset_df = pd.read_excel("Dataset/raw_dataset_20_02_2022.xlsx")
dataset_df.head()

Unnamed: 0,Source,Category,Link,Title,Summary,Context,Date
0,Ensonhaber,Automobile,https://www.ensonhaber.com/otomobil/turkiye-20...,"Türkiye, 2020'de 9.5 milyar dolarlık binek oto...",Türkiye'den 2020'de 118 ülke ve özerk bölgeye ...,Uludağ Otomotiv Endüstrisi İhracatçıları Birli...,2021/01/20
1,Ensonhaber,Living,https://www.ensonhaber.com/yasam/mpi-4-subat-2...,MPİ 3 Şubat 2022 Süper Loto sonuçları: Büyük i...,Milli Piyango İdaresi tarafından canlı çekilen...,3 Şubat 2022 Perşembe tarihli çekiliş sonuçlar...,2022/02/03
2,Ensonhaber,Automobile,https://www.ensonhaber.com/otomobil/ilk-8-ayda...,İlk 8 ayda otomotiv üretimi yüzde 14 arttı,Ağustos sonu itibarıyla toplam otomotiv üretim...,"Otomotiv Sanayii Derneği (OSD), ocak-ağustos d...",2021/09/14
3,Ensonhaber,Health,https://www.ensonhaber.com/saglik/etten-daha-f...,Etten daha fazla protein içeren yer fıstığının...,İyi bir protein kaynağı olan ve aynı zamanda k...,Cips gibi tipik atıştırmalık yiyeceklerin çoğu...,2022/01/24
4,Ensonhaber,Living,https://www.ensonhaber.com/kadin/iletisim-kura...,İletişim kurarken güven vermenin en etkili 6 yolu,Sosyal ya da iş hayatında iletişim kurarken ka...,"Kendine güven, becerilerinize, niteliklerinize...",2022/02/01


In [7]:
# Get Data count per class
dataset_df.groupby("Category")["Category"].count()

Category
Automobile    5218
Daily         5345
Economy       5390
Health        5383
Living        5271
Magazine      5329
Sport         5356
Technology    5371
Name: Category, dtype: int64

Processing Dataset for Training

In [8]:
# processing data with NLTK

# I tried "from snowballstemmer import TurkishStemmer" for stemming but it takes too long
# snowballTurkishStemmer took 0.018998384475708008 seconds! for one doc
# I also tried "from TurkishStemmer import TurkishStemmer". It is slow too but it is faster than snowballstemmer
# TurkishStemmer took 0.005014657974243164 seconds! for one doc

from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from TurkishStemmer import TurkishStemmer


stemmer = TurkishStemmer()

processed_dataset_by_category = {

}

unique_words_by_class_with_count = {

}

def is_word_has_digit(word_string):
    for char in word_string:
        if char.isdigit():
            return True
    return False

dataset_length = len(dataset_df)
start_time = time.time()


turkish_stop_words = stopwords.words('turkish')
# appanding more turkish stop words
with open("Models/more_turkish_stop_words.txt", "r", encoding="utf-8") as f:
    new_stop_words = f.read().splitlines()
    turkish_stop_words.extend(new_stop_words)

# using set will make all words unique and it will make "if" condition faster (because it is use hashtable)
turkish_stop_words = set(turkish_stop_words)

# create regex
regex_tokenizer = nltk.RegexpTokenizer(r"\w+")

for index, row in dataset_df.iterrows():
    unique_words_by_class_with_count.setdefault(row["Category"], {})
    processed_dataset_by_category.setdefault(row["Category"], [])

    tokenized_words = regex_tokenizer.tokenize(row["Context"])
    # make all words lower case
    tokenized_words = [word.lower() for word in tokenized_words]
    # remove turkish stop words
    tokenized_words = [word for word in tokenized_words if word not in turkish_stop_words]
    # remove digits 
    tokenized_words = [word for word in tokenized_words if not is_word_has_digit(word)]
    # remove char lenth smaller than 2
    tokenized_words = [word for word in tokenized_words if len(word) > 2]
    # stem words
    tokenized_words = [stemmer.stem(word) for word in tokenized_words]

    
    # adding processed data to a list
    processed_dataset_by_category[row["Category"]].append(tokenized_words)
    # set and count words
    for word in tokenized_words:
        unique_words_by_class_with_count[row["Category"]].setdefault(word, 0)
        unique_words_by_class_with_count[row["Category"]][word] += 1

print(f"{dataset_length} data processing took {time.time() - start_time} seconds")

42663 data processing took 379.8416533470154 seconds


In [9]:
# Calculate most frequent words per class

total_unique_word_count = 0
max_values = 10
for category, words_with_count in unique_words_by_class_with_count.items():
    total_unique_word_count += len(words_with_count)
    sorted_words_count = sorted(words_with_count.items(), key=lambda kv: kv[1])[-max_values:]
    print(f"{category}: {sorted_words_count}")

print(f"Total Unique Word Count: {total_unique_word_count}")

Automobile: [('sahip', 3798), ('elektrik', 4428), ('motor', 5910), ('model', 6154), ('satış', 6189), ('yen', 8117), ('yüz', 8376), ('yıl', 8475), ('otomobil', 10997), ('araç', 14040)]
Living: [('mah', 5775), ('zaman', 5872), ('gün', 6622), ('yer', 6898), ('saat', 8095), ('bel', 10147), ('sokak', 14658), ('köy', 15136), ('mahalle', 15732), ('merkez', 18653)]
Health: [('özellik', 6064), ('fazl', 6113), ('yardımç', 6271), ('etki', 6298), ('zaman', 6397), ('önem', 6567), ('gün', 6693), ('tedavi', 6895), ('hastalık', 12264), ('sağlık', 13989)]
Daily: [('cumhurbaşkan', 3159), ('devam', 3197), ('son', 3276), ('ifade', 3310), ('karar', 3391), ('erdoğan', 3561), ('gün', 3777), ('yer', 4146), ('yıl', 5018), ('türki', 5547)]
Sport: [('süper', 2662), ('kulüp', 2937), ('sezon', 3006), ('beşiktaş', 3220), ('son', 4029), ('galatasaray', 4587), ('fenerbahçe', 4628), ('lig', 6870), ('takım', 7025), ('maç', 11077)]
Technology: [('oyun', 3145), ('özellik', 3360), ('apple', 3510), ('şirket', 3590), ('son'

In [10]:
# set dataset for training 

dataset_x = []
dataset_y = []

categories = list(processed_dataset_by_category.keys())
default_label = [0 for i in range(len(categories))]

for category, docs in processed_dataset_by_category.items():
    for doc in docs:
        dataset_x.append(" ".join(doc))
        label = default_label.copy()
        label[categories.index(category)] = 1
        dataset_y.append(label)

In [11]:
import json
import pickle

from sklearn.feature_extraction.text import TfidfVectorizer


tf_idf_dataset = {
    "Dataset": []
}

vectorizer = TfidfVectorizer(sublinear_tf=True,
                                ngram_range=(1, 2),
                                max_features=5000)

vectors = vectorizer.fit_transform(dataset_x)
dense = vectors.todense()

# saving vectorizer for using in api
# vectorizer fitted with using processed dataset
# for getting tf-idf values for a unseen data (which is not include dataset) we need to use this vectorizer 
pickle.dump(vectorizer, open(f"Models/tfidf_vectorizer.pkl", "wb"))


In [12]:
# save processed tf-idf data
import random
# we are using tolist() because we want to save as json
tf_idf_dataset["FeatureNames"] = vectorizer.get_feature_names_out().tolist()

# get 15 extracted feature from tf-idf vectorizer
random.sample(tf_idf_dataset["FeatureNames"], 15)


['ikna',
 'ulusal',
 'görmek',
 'harcama',
 'yaklaş',
 'gerçekleşen',
 'etkileyen',
 'lif',
 'hayati',
 'il',
 'dans',
 'özel',
 'pilot',
 'mağlubiyet',
 'zafer']

In [13]:
tf_idf_dataset["ClassNames"] = categories

# also i will save class names for api
with open("Models/class_names.txt", "a") as f:
    for category in categories:
        f.write(f"{category}\n")

for data_x, data_y in zip(dense, dataset_y):
    tf_idf_dataset["Dataset"].append({"x": data_x.tolist()[0], "y": data_y}) 
   
# save dataset for any crash  
# i can't upload this file to github because it is 1GB so i will upload this to drive and i will share link in README of Dataset folder or you can run this .ipynb to extract it
json.dump(tf_idf_dataset, open("Dataset/tf-idf-dataset.json", "w"))