In [1]:
import pandas as pd
import numpy as np
import seaborn as sns

In [2]:
from matplotlib import pyplot as plt

In [3]:
from sklearn.metrics import confusion_matrix, accuracy_score, f1_score, classification_report

## Podela na trening i test skup

In [4]:
from sklearn.model_selection import train_test_split

```
# Train on 70% of the samples and hold out the remaining 30% for testing.
# stratify=Y keeps the class proportions the same in both splits; random_state makes the split repeatable.
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, train_size=0.7, random_state=23, stratify=Y)
```

## Provera balansiranosti klasa (plt.hist)
```
import numpy as np

# Draw the class histogram and keep the per-bin counts together with the bin edges.
counts, edges, _ = plt.hist(Y, bins=2, edgecolor='white')

# Centre of every bin: its left edge plus half of the bin width.
centers = edges[:-1] + 0.5 * np.diff(edges)

# Write each bin's count directly above its bar.
for count, center in zip(counts, centers):
    bar_height = int(count)
    plt.annotate(
        "{}".format(bar_height),
        xy=(center, bar_height),       # anchor the label at the top of the bar
        xytext=(0, 0.2),               # small gap (in points) between the bar and the text
        textcoords="offset points",
        ha='center',
        va='bottom',
    )

plt.show()
```

## Standardizacija

In [5]:
from sklearn.preprocessing import StandardScaler

```
# Fit the scaler on the training split only, so nothing about the test split
# influences the learned scaling parameters.
scaler = StandardScaler().fit(X_train)

# Apply the training-set scaling to both splits.
X_train, X_test = scaler.transform(X_train), scaler.transform(X_test)
```

## Normalizacija

In [13]:
from sklearn.preprocessing import MinMaxScaler

```
# Fit the min-max scaler on the training split only; the test split is
# transformed with the ranges learned from the training data.
scaler = MinMaxScaler().fit(X_train)

X_train, X_test = scaler.transform(X_train), scaler.transform(X_test)
```

## Stabla odlučivanja

In [6]:
from sklearn.tree import DecisionTreeClassifier, plot_tree

```
# Fit a decision tree with default hyper-parameters on the training split.
dtc = DecisionTreeClassifier()
dtc.fit(X_train, Y_train)

Y_test_pred = dtc.predict(X_test)

# A notebook cell only displays its last expression, so the metrics are printed
# explicitly; classification_report returns a multi-line string meant for print().
print(confusion_matrix(Y_test, Y_test_pred))
print(classification_report(Y_test, Y_test_pred))

plt.figure(figsize=(7,7))
plot_tree(dtc, class_names=['B','M'], feature_names=feature_names, filled=True)
# tree_.node_count is the total number of nodes (internal + leaves);
# get_n_leaves() would report the leaves only and contradict the title text.
plt.title("Decision tree of depth {} with {} nodes".format(dtc.get_depth(), dtc.tree_.node_count))
plt.show()
```

## GridSearchCV

In [7]:
from sklearn.model_selection import GridSearchCV

```
# Hyper-parameter grid: every split criterion is combined with every depth limit.
params = dict(
    criterion=['gini', 'entropy'],
    max_depth=[2, 4, 6, 8],
)

# 5-fold cross-validated search using the estimator's default score.
# To select on another metric pass e.g. scoring='precision' or scoring='accuracy'.
estimator = GridSearchCV(DecisionTreeClassifier(), params, cv=5, verbose=4)
estimator.fit(X_train, Y_train)

# Results of the search: the refitted model, its parameters and its mean CV score.
estimator.best_estimator_
estimator.best_params_
estimator.best_score_

# Evaluate the best model found by the search on the held-out test split.
Y_test_pred = estimator.best_estimator_.predict(X_test)

confusion_matrix(Y_test, Y_test_pred)
```

## Slučajne šume

In [8]:
from sklearn.ensemble import RandomForestClassifier

```
# Small forest of 5 trees; random_state fixes the bootstrap samples and feature choices.
random_forest = RandomForestClassifier(n_estimators=5, random_state=42)
random_forest.fit(X_train, Y_train)

Y_test_pred = random_forest.predict(X_test)

# Printed explicitly: only the last expression of a cell is displayed, and
# classification_report returns a multi-line string meant for print().
print(confusion_matrix(Y_test, Y_test_pred))
print(classification_report(Y_test, Y_test_pred))

# Draw every fitted tree side by side. The number of subplot columns follows the
# actual ensemble size, so changing n_estimators does not break the grid.
n_trees = len(random_forest.estimators_)
for i, tree in enumerate(random_forest.estimators_, start=1):
    plt.subplot(1, n_trees, i)
    plot_tree(tree, filled=True)
plt.show()
```

## ROC kriva

In [9]:
from sklearn.metrics import roc_curve, roc_auc_score

```
models = [dtc, estimator.best_estimator_, random_forest]
model_names = ['DecisionTree', 'GridSearchCV', 'RandomForest']

for model, model_name in zip(models, model_names):
    # A ROC curve needs a continuous score per sample. Hard class predictions
    # collapse the curve to a single operating point, so the predicted
    # probability of the positive class is used instead.
    # The positive class is taken to be classes_[1], i.e. the second column of predict_proba.
    pos_label = model.classes_[1]
    y_score = model.predict_proba(X_test)[:, 1]

    fpr, tpr, _ = roc_curve(Y_test, y_score, pos_label=pos_label)
    auc = roc_auc_score(Y_test, y_score)

    lab = model_name + " (auc: " + str(round(auc, 2)) + ")"
    plt.plot(fpr, tpr, label=lab)

# Diagonal = performance of a random classifier, drawn for reference.
plt.plot([0, 1], [0, 1], label='Random (auc: 0.5)', color='red')
plt.title("Poređenje modela")
plt.xlabel("FPR")
plt.ylabel("TPR")
plt.legend()
plt.show()
```

## KNN

```
# Potrebna normalizacija (nakon podele na test/trening skup)

def IQR(data, feature_names):
    """Summarise outliers per feature using the 1.5 * IQR rule.

    For every feature the quartiles Q1 and Q3 are computed, and values below
    ``Q1 - 1.5 * (Q3 - Q1)`` or above ``Q3 + 1.5 * (Q3 - Q1)`` are counted as outliers.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame that contains the columns listed in ``feature_names``.
    feature_names : iterable of str
        Names of the columns to analyse.

    Returns
    -------
    pandas.DataFrame
        One row per feature (indexed by feature name) with the columns
        ``lower``/``upper`` (outlier bounds), ``min``/``max`` (observed extremes),
        ``num_lower``/``num_upper`` (number of values outside each bound) and
        ``percantage`` (share of outliers in percent, rounded to an integer;
        the column name is kept as-is for callers that already use it).
    """
    columns = ['lower', 'min', 'num_lower', 'upper', 'max', 'num_upper', 'percantage']
    feature_names = list(feature_names)

    rows = []
    for name in feature_names:
        # Read from the `data` argument, not from a notebook-level global.
        column = data[name]

        q1, q3 = column.quantile([0.25, 0.75])
        spread = q3 - q1
        lower = q1 - (1.5 * spread)
        upper = q3 + (1.5 * spread)

        num_of_out_lower = (column < lower).sum()
        num_of_out_upper = (column > upper).sum()

        percentage = round((num_of_out_lower + num_of_out_upper) / data.shape[0] * 100)

        rows.append({
            'lower': lower,
            'min': column.min(),
            'num_lower': num_of_out_lower,
            'upper': upper,
            'max': column.max(),
            'num_upper': num_of_out_upper,
            'percantage': percentage,
        })

    # Build the frame once from the collected rows so every column gets its
    # natural dtype, rather than writing floats into an int-initialised frame.
    return pd.DataFrame(rows, index=feature_names, columns=columns)
```

In [14]:
from sklearn.neighbors import KNeighborsClassifier

```
# Baseline k-nearest-neighbours classifier with default hyper-parameters.
knn = KNeighborsClassifier()
knn.fit(X_train, Y_train)

Y_test_pred = knn.predict(X_test)

# Printed explicitly: expressions in the middle of a cell are not displayed, and
# classification_report returns a multi-line string meant for print().
print(confusion_matrix(Y_test, Y_test_pred))
print(classification_report(Y_test, Y_test_pred))

#### Grid search ####

from sklearn.model_selection import GridSearchCV

# Candidate settings: neighbourhood sizes 10, 15, ..., 45, both weighting
# schemes and the values 1 and 2 for the `p` parameter.
params_grid = dict(
    n_neighbors=range(10, 50, 5),
    weights=['uniform', 'distance'],
    p=[1, 2],
)

# 6-fold cross-validated search over every combination in the grid.
estimator = GridSearchCV(KNeighborsClassifier(), params_grid, cv=6, verbose=4)
estimator.fit(X_train, Y_train)

#### Ensembles ####

from sklearn.ensemble import BaggingClassifier

# Bagging ensemble of 20 KNN classifiers. random_state fixes the bootstrap
# samples so the reported metrics are reproducible between runs.
baggingKnn = BaggingClassifier(estimator=KNeighborsClassifier(), n_estimators=20, random_state=42)
baggingKnn.fit(X_train, Y_train)

Y_test_pred = baggingKnn.predict(X_test)

# Printed explicitly: classification_report returns a multi-line string that is
# only readable through print(), and mid-cell expressions are not displayed.
print(confusion_matrix(Y_test, Y_test_pred))
print(classification_report(Y_test, Y_test_pred))
```

## Categorical Naive Bayes

In [15]:
# 1. Potrebno je pretvoriti kategoričke atribute u brojeve
from sklearn.preprocessing import OrdinalEncoder

```
# Learn the category -> integer code mapping from the training split only.
oe = OrdinalEncoder().fit(X_train)

# Categories discovered for every feature, in encoding order.
oe.categories_

# Encode both splits with the mapping learned from the training data.
X_train, X_test = oe.transform(X_train), oe.transform(X_test)
```

In [16]:
from sklearn.naive_bayes import CategoricalNB

```
# Naive Bayes for categorical (ordinal-encoded) features.
bayes = CategoricalNB()
bayes.fit(X_train, Y_train)

# Fitted counts: per-feature category counts and per-class sample counts.
bayes.category_count_
bayes.class_count_

Y_test_pred = bayes.predict(X_test)

# Use the same names the rest of this snippet defines (Y_test / Y_test_pred);
# the lowercase variants are never assigned here and raise NameError.
confusion_matrix(Y_test, Y_test_pred)
```

In [19]:
from sklearn.feature_extraction import DictVectorizer
from sklearn.naive_bayes import MultinomialNB

```
# Learn the vocabulary (one column per distinct dict key) from the training documents.
dv = DictVectorizer()
dv.fit(X_train)

dv.feature_names_

# Vectorise the training documents and wrap them in a frame with named columns.
sparse_matrix = dv.transform(X_train)
X_train = pd.DataFrame(sparse_matrix.toarray(), columns = dv.feature_names_)

# The test documents must go through the same fitted vectoriser, otherwise the
# model would be asked to predict on a different representation than it was trained on.
# NOTE(review): assumes X_test holds the same kind of {word: count} dicts as X_train — confirm.
X_test = pd.DataFrame(dv.transform(X_test).toarray(), columns = dv.feature_names_)

###

bayes = MultinomialNB()
bayes.fit(X_train, Y_train)

# Fitted state: class labels, samples per class and per-class feature counts.
bayes.classes_
bayes.class_count_
bayes.feature_count_

Y_test_pred = bayes.predict(X_test)

# Use the names this snippet actually defines (Y_test / Y_test_pred);
# the lowercase variants are never assigned here and raise NameError.
confusion_matrix(Y_test, Y_test_pred)
```

### Klasifikacija teksta za dobijanje podataka

In [21]:
import os

```
def read_data(root_dir):
    """Load a corpus of word-count files organised as one sub-directory per class.

    Expected layout: ``root_dir/<class_name>/<file>``, where every non-blank line
    of a file consists of two whitespace-separated tokens, ``<word> <count>``.

    Parameters
    ----------
    root_dir : str
        Directory whose sub-directories are the classes.

    Returns
    -------
    tuple[list[dict[str, int]], list[str]]
        ``corpus`` holds one ``{word: count}`` dict per file and ``classes`` holds
        the matching class name (the sub-directory name) at the same position.
        Classes and files are visited in sorted order, so the result is
        deterministic across platforms and runs.

    Raises
    ------
    ValueError
        If a non-blank line does not consist of exactly two tokens or the
        count is not an integer.
    """
    corpus = []
    classes = []
    # os.listdir returns entries in arbitrary order; sort for reproducible output.
    for class_name in sorted(os.listdir(root_dir)):
        class_dir = os.path.join(root_dir, class_name)
        # Skip stray files in the root (e.g. OS metadata files) that are not class directories.
        if not os.path.isdir(class_dir):
            continue
        for file_name in sorted(os.listdir(class_dir)):
            file_path = os.path.join(class_dir, file_name)
            # Skip nested directories; only regular files hold word counts.
            if not os.path.isfile(file_path):
                continue
            word_counts = {}
            with open(file_path, 'r') as f:
                for line in f:
                    # Tolerate blank lines, e.g. a trailing newline at the end of the file.
                    if not line.strip():
                        continue
                    word, count = line.split()
                    word_counts[word] = int(count)
            corpus.append(word_counts)
            classes.append(class_name)
    return corpus, classes

# Load the training corpus: X_train is a list of {word: count} dicts (one per file)
# and y_train the matching class names, as returned by read_data.
# NOTE(review): the labels are bound to lowercase `y_train`, while other snippets in this
# notebook fit on `Y_train` — confirm which name is intended.
X_train, y_train = read_data('./ebart/VektoriEbart-5/Skup/')
```