# Authentication and Data Reading

In [1]:
 #GPU kullanmak için kimlik doğrulama işlemi
from google.colab import drive
drive.mount('/content/drive/')
#%cd /gdrive

Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount("/content/drive/", force_remount=True).


In [2]:
import os
os.chdir("/content/drive/My Drive/BertTest/")

In [3]:
#Dataseti okuma
#Verinin okunması 
import pandas as pd
df = pd.read_csv('preprocessed_clean_dataset.csv', encoding='utf-8-sig')

# Labeling

In [4]:
possible_labels = df.category.unique()

In [5]:
label_dict = {}
for index, possible_label in enumerate(possible_labels):
    label_dict[possible_label] = index

In [6]:
#Verinin kategoriye dair etiketini alır
df['label'] = df.category.replace(label_dict)

# Identify Training and Validation Sets

In [7]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB

X_train, X_test, y_train, y_test = train_test_split(df['final'], df['category'],  test_size=0.20)
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(X_train)
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)
clf = MultinomialNB().fit(X_train_tfidf, y_train)

In [25]:
from sklearn.model_selection import GridSearchCV

naiveBayes = MultinomialNB()

param_grid = {
    
    'alpha': [1.0],
    'fit_prior':[True],
    'class_prior': [None]

}

grid = GridSearchCV(estimator=naiveBayes, param_grid = param_grid, cv = 2)


In [10]:
grid.get_params().keys()

dict_keys(['cv', 'error_score', 'estimator__alpha', 'estimator__class_prior', 'estimator__fit_prior', 'estimator', 'n_jobs', 'param_grid', 'pre_dispatch', 'refit', 'return_train_score', 'scoring', 'verbose'])

In [26]:
#fit the model
grid.fit(X_train_tfidf, y_train)

#returns the estimator with the best performance
print(grid.best_estimator_)

#returns the best score
print(grid.best_score_)

#returns the best parameters
print(grid.best_params_)

MultinomialNB()
0.5240384615384616
{'alpha': 1.0, 'class_prior': None, 'fit_prior': True}


# Balance the Dataset

In [11]:
# Oversample and plot imbalanced dataset with SMOTE
from collections import Counter
from imblearn.over_sampling import SMOTE

# summarize class distribution
counter = Counter(y_train)

print(counter)
# transform the dataset
oversample = SMOTE()
X_train_new_counts = count_vect.transform(X_train)
X_train_new_tfidf = tfidf_transformer.transform(X_train_new_counts)
X_train, y_train = oversample.fit_resample(X_train_new_tfidf, y_train)
# summarize the new class distribution
counter = Counter(y_train)
print(counter)

Counter({'Yapay Zeka': 488, 'Görüntü İşleme': 348, 'Haberleşme ve Sinyalleşme': 176, 'IoT': 142, 'Artırılmış ve Sanal Gerçeklik': 94})
Counter({'Artırılmış ve Sanal Gerçeklik': 488, 'Haberleşme ve Sinyalleşme': 488, 'Görüntü İşleme': 488, 'Yapay Zeka': 488, 'IoT': 488})


# Testing

In [12]:
from sklearn.model_selection import KFold, cross_val_score
from sklearn.metrics import confusion_matrix, classification_report

print("Start applying on testing data set")
X_test_new_counts = count_vect.transform(X_test)
X_test_new_tfidf = tfidf_transformer.transform(X_test_new_counts)
predictions = clf.predict(X_test_new_tfidf)
print("Model applied on testing data set")
print("***********Classification Report***********")
print(classification_report(y_test, predictions))

Start applying on testing data set
Model applied on testing data set
***********Classification Report***********
                               precision    recall  f1-score   support

Artırılmış ve Sanal Gerçeklik       1.00      0.03      0.06        30
               Görüntü İşleme       0.82      0.44      0.58        90
    Haberleşme ve Sinyalleşme       1.00      0.39      0.56        41
                          IoT       1.00      0.12      0.22        41
                   Yapay Zeka       0.45      0.99      0.62       110

                     accuracy                           0.55       312
                    macro avg       0.85      0.40      0.41       312
                 weighted avg       0.75      0.55      0.49       312



In [None]:
from sklearn import metrics

print("Accuracy:",metrics.accuracy_score(y_test, predictions))


Accuracy: 0.5865384615384616


In [None]:
from sklearn.metrics import f1_score

# f1 score
score = f1_score(predictions, y_test, average='weighted')

# print
print("F1 Score:", score)

F1 Score: 0.655042244330117
