# Authentication and Data Reading

In [1]:
 #GPU kullanmak için kimlik doğrulama işlemi
from google.colab import drive
drive.mount('/content/drive/')
#%cd /gdrive

Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount("/content/drive/", force_remount=True).


In [2]:
import os
os.chdir("/content/drive/My Drive/BertTest/")

In [3]:
#Dataseti okuma
#Verinin okunması 
import pandas as pd
df = pd.read_csv('preprocessed_clean_dataset.csv', encoding='utf-8-sig')

# Labeling

In [4]:
possible_labels = df.category.unique()

In [5]:
label_dict = {}
for index, possible_label in enumerate(possible_labels):
    label_dict[possible_label] = index

In [6]:
#Verinin kategoriye dair etiketini alır
df['label'] = df.category.replace(label_dict)

# Identify Training and Validation Sets

In [7]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV

#GridsearchCV'nin içine estimatorda algoritmanı param_grid ile oluşturduğun parametre gridi
#CV ile cross validation overfiti engelleme sayısı veriliyor
knn = KNeighborsClassifier()
k_range = list(range(1,100))
weight_options = ["uniform", "distance"]
param_grid = dict(n_neighbors = k_range, weights = weight_options)
grid = GridSearchCV(estimator = knn, param_grid = param_grid, cv = 2)

X_train, X_test, y_train, y_test = train_test_split(df['final'], df['category'],  test_size=0.10)
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(X_train)
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)
knn = KNeighborsClassifier(n_neighbors = 41).fit(X_train_tfidf, y_train)

# Find Optimum GridSearch Parameter

In [8]:
#fit the model
grid.fit(X_train_tfidf, y_train)

#returns the estimator with the best performance
print(grid.best_estimator_)

#returns the best score
print(grid.best_score_)

#returns the best parameters
print(grid.best_params_)

KNeighborsClassifier(n_neighbors=40)
0.8084045584045585
{'n_neighbors': 40, 'weights': 'uniform'}


# Balance the Dataset

In [None]:
# Oversample and plot imbalanced dataset with SMOTE
from collections import Counter
from imblearn.over_sampling import SMOTE

# summarize class distribution
counter = Counter(y_train)

print(counter)
# transform the dataset
oversample = SMOTE()
X_train_new_counts = count_vect.transform(X_train)
X_train_new_tfidf = tfidf_transformer.transform(X_train_new_counts)
X_train, y_train = oversample.fit_resample(X_train_new_tfidf, y_train)
# summarize the new class distribution
counter = Counter(y_train)
print(counter)

Counter({'Yapay Zeka': 502, 'Görüntü İşleme': 381, 'Haberleşme ve Sinyalleşme': 181, 'IoT': 158, 'Artırılmış ve Sanal Gerçeklik': 104})
Counter({'Yapay Zeka': 502, 'Görüntü İşleme': 502, 'Artırılmış ve Sanal Gerçeklik': 502, 'IoT': 502, 'Haberleşme ve Sinyalleşme': 502})


# Testing

In [8]:
from sklearn.model_selection import KFold, cross_val_score
from sklearn.metrics import confusion_matrix, classification_report

print("Start applying on testing data set")
X_test_new_counts = count_vect.transform(X_test)
X_test_new_tfidf = tfidf_transformer.transform(X_test_new_counts)
predictions = knn.predict(X_test_new_tfidf)
print("Model applied on testing data set")
print("***********Classification Report***********")
print(classification_report(y_test, predictions))

Start applying on testing data set
Model applied on testing data set
***********Classification Report***********
                               precision    recall  f1-score   support

Artırılmış ve Sanal Gerçeklik       1.00      0.77      0.87        13
               Görüntü İşleme       0.71      0.88      0.78        33
    Haberleşme ve Sinyalleşme       0.85      0.77      0.81        22
                          IoT       0.65      0.61      0.63        18
                   Yapay Zeka       0.84      0.81      0.83        70

                     accuracy                           0.79       156
                    macro avg       0.81      0.77      0.78       156
                 weighted avg       0.80      0.79      0.80       156



In [9]:
from sklearn import metrics

print("Accuracy:",metrics.accuracy_score(y_test, predictions))


Accuracy: 0.7948717948717948


In [10]:
from sklearn.metrics import f1_score

# f1 score
score = f1_score(predictions, y_test, average='weighted')

# print
print("F1 Score:", score)

F1 Score: 0.7941082745430572
