# Authentication and Data Reading

In [1]:
 #GPU kullanmak için kimlik doğrulama işlemi
from google.colab import drive
drive.mount('/content/drive/')
#%cd /gdrive

Mounted at /content/drive/


In [2]:
import os
os.chdir("/content/drive/My Drive/BertTest/")

In [3]:
#Dataseti okuma
#Verinin okunması 
import pandas as pd
df = pd.read_csv('preprocessed_clean_dataset.csv', encoding='utf-8-sig')

# Labeling

In [4]:
possible_labels = df.category.unique()

In [5]:
label_dict = {}
for index, possible_label in enumerate(possible_labels):
    label_dict[possible_label] = index

In [6]:
#Verinin kategoriye dair etiketini alır
df['label'] = df.category.replace(label_dict)

# Identify Training and Validation Sets

In [7]:
# Optimum Gridsearch parametreleri tanımlanır

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfTransformer

from sklearn.ensemble import RandomForestClassifier
#hyper parametre için kullanılan parametre gridi
param_grid = {
    'max_depth': [32],
    'n_estimators': [512]
}

randomForest = RandomForestClassifier()

#GridsearchCV'nin içine estimatorda algoritmanı param_grid ile oluşturduğun parametre gridi
#CV ile cross validation overfiti engelleme sayısı veriliyor
grid = GridSearchCV(estimator = randomForest, param_grid = param_grid, cv = 2)

X_train, X_test, y_train, y_test = train_test_split(df['final'], df['category'],  test_size=0.10)
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(X_train)
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)
clf = randomForest.fit(X_train_tfidf, y_train)

# Find Optimum GridSearch Parameter

In [8]:
#fit the model
grid.fit(X_train_tfidf, y_train)

#returns the estimator with the best performance
print(grid.best_estimator_)

#returns the best score
print(grid.best_score_)

#returns the best parameters
print(grid.best_params_)

RandomForestClassifier(max_depth=32, n_estimators=512)
0.863960113960114
{'max_depth': 32, 'n_estimators': 512}


# Balance the Dataset

In [None]:
# Oversample and plot imbalanced dataset with SMOTE
from collections import Counter
from imblearn.over_sampling import SMOTE

# summarize class distribution
counter = Counter(y_train)

print(counter)
# transform the dataset
oversample = SMOTE()
X_train_new_counts = count_vect.transform(X_train)
X_train_new_tfidf = tfidf_transformer.transform(X_train_new_counts)
X_train, y_train = oversample.fit_resample(X_train_new_tfidf, y_train)
# summarize the new class distribution
counter = Counter(y_train)
print(counter)

Counter({'Yapay Zeka': 500, 'Görüntü İşleme': 381, 'Haberleşme ve Sinyalleşme': 182, 'IoT': 165, 'Artırılmış ve Sanal Gerçeklik': 98})
Counter({'IoT': 500, 'Yapay Zeka': 500, 'Haberleşme ve Sinyalleşme': 500, 'Görüntü İşleme': 500, 'Artırılmış ve Sanal Gerçeklik': 500})


# Testing

In [9]:
from sklearn.model_selection import KFold, cross_val_score
from sklearn.metrics import confusion_matrix, classification_report

print("Start applying on testing data set")
X_test_new_counts = count_vect.transform(X_test)
X_test_new_tfidf = tfidf_transformer.transform(X_test_new_counts)
predictions = clf.predict(X_test_new_tfidf)
print("Model applied on testing data set")
print("***********Classification Report***********")
print(classification_report(y_test, predictions))

Start applying on testing data set
Model applied on testing data set
***********Classification Report***********
                               precision    recall  f1-score   support

Artırılmış ve Sanal Gerçeklik       0.90      0.90      0.90        10
               Görüntü İşleme       0.89      0.82      0.85        49
    Haberleşme ve Sinyalleşme       1.00      0.81      0.89        21
                          IoT       0.89      0.89      0.89        19
                   Yapay Zeka       0.80      0.91      0.85        57

                     accuracy                           0.87       156
                    macro avg       0.90      0.87      0.88       156
                 weighted avg       0.87      0.87      0.87       156



In [10]:
from sklearn import metrics

print("Accuracy:",metrics.accuracy_score(y_test, predictions))


Accuracy: 0.8653846153846154


In [11]:
from sklearn.metrics import f1_score

# f1 score
score = f1_score(predictions, y_test, average='weighted')

# print
print("F1 Score:", score)

F1 Score: 0.8648604789858256


In [None]:
from sklearn.metrics import accuracy_score

#Overall accuraccy scoreun içerisinde
score = accuracy_score(y_test, predictions)

#crosstab ile a'yı oluşturuyorsun sonra matris yapısı ile class-wise accuracye ulaşıyorsun
a = pd.crosstab(y_test,predictions)
print(score)
print(a.max(axis=1)/a.sum(axis=1))

0.8846153846153846
category
Artırılmış ve Sanal Gerçeklik    0.846154
Görüntü İşleme                   0.807018
Haberleşme ve Sinyalleşme        0.971429
IoT                              0.777778
Yapay Zeka                       0.928571
dtype: float64
