In [115]:
import os
import math
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.neighbors import NearestNeighbors
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neighbors import NearestCentroid
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LinearRegression
from sklearn.metrics import confusion_matrix
from sklearn import metrics
from sklearn import tree
import warnings
warnings.filterwarnings("ignore")

In [116]:
data_path = "./data/"
file_list = os.listdir(data_path)
func = lambda  a : a.replace(".txt", "").split("-")
count_size = lambda a : os.stat(a).st_size
data = [[func(title)[0], func(title)[1], data_path + title, count_size(data_path + title)] for title in file_list]
files = pd.DataFrame(data, columns=['Author', 'Name of The work', 'FilePath', 'FileSize'])

In [117]:
def get_stop_words():
    delimeters = ["\\n", "'"]
    stop_words = []
    with open("./stopyPL.txt", encoding="utf8") as f:
        for line in f:
            word = repr(line)
            for c in delimeters:
                word = word.replace(c, "")
            stop_words.append(word)
    return stop_words

In [118]:
def normalize_word(word):
    word = word.lower()
    special_char = [".", ",", "-", "?", "(", ")", "!", "\\", "\"", ":", ";", "*"]
    for char in special_char:
        word = word.replace(char, "")
    return word

# 0 - generete bag with all words, 1 - bag without stopwords, 2 - bag with only stopwords
def generate_word_bag(FileName, mode=0):
    stop_word = get_stop_words()
    word_bag = {}
    words_in_bag = 0
    with open(FileName, "r", encoding="utf8") as f:
        for line in f:
            words = line.split()
            words = [normalize_word(word) for word in words]
            for word in words:
                if word != '':
                    if (mode==0) or (mode==1 and not word in stop_word) or (mode==2 and word in stop_word):
                        words_in_bag += 1
                        if word in word_bag.keys():
                            word_bag[word] += 1
                        else:
                            word_bag[word] = 1
    return word_bag   

In [119]:
def merge_dics_to_df(dics, labels=None):
    dfs = []
    for dic in dics:
        df = pd.DataFrame(columns=dic.keys())
        df.loc[0] = dic.values()
        df = df.dropna(axis=1, how='all')
        dfs.append(df)
    df = pd.concat(dfs, axis=0, ignore_index=True)
    df = df.replace(np.nan, 0)
    return df

In [120]:
def load_data(mode=2):
    word_bags = []
    index = []
    labels = []

    for path, name, label in zip(files['FilePath'], files['Name of The work'], files['Author']):
        word_bags.append(generate_word_bag(path, mode=mode))
        index.append(name)
        labels.append(label)
        
    df = merge_dics_to_df(word_bags, index)
    df['label'] = labels
    return df

In [121]:
def get_learning_data(df, proporcja):
    opis_ucz, opis_test, dec_ucz, dec_test = train_test_split(df.iloc[:,0:-1], df.iloc[:,-1].astype('category').cat.codes, test_size=proporcja)#, random_state=0)
    return {"opis_ucz":opis_ucz, "opis_test":opis_test, "dec_ucz":dec_ucz, "dec_test":dec_test}

In [122]:
def tfidf(data):
    df = data.copy()
    df = df.drop(columns=['label'])
    nwords = df.shape[1]
    nbooks = df.shape[0]
    tf = np.empty((nbooks, nwords))
    sm = df.sum(axis=1)
    for i in range(nbooks):
        tf[i,:] = np.array(df.iloc[i,: ] / sm[i])
    inbooks = np.array((df > 0)*1).sum(axis=0)
    idf = [math.log(nbooks / val, 10) for val in inbooks]
    tfidf = pd.DataFrame((tf*np.array([idf,]*nbooks)*100), columns=df.columns)
    return tfidf

In [123]:
def get_learning_data(df, proporcja):
    opis_ucz, opis_test, dec_ucz, dec_test = train_test_split(df.iloc[:,0:-1], df.iloc[:,-1].astype('category').cat.codes, test_size=proporcja)#, random_state=0)
    return {"opis_ucz":opis_ucz, "opis_test":opis_test, "dec_ucz":dec_ucz, "dec_test":dec_test}

In [124]:
def weryfikuj(model,dane,show=True):
    model.fit(dane["opis_ucz"], dane["dec_ucz"])
    wynik_ucz = model.predict(dane["opis_ucz"])
    wynik_test = model.predict(dane["opis_test"])

    learn_s = model.score(dane['opis_ucz'], dane['dec_ucz'])
    test_s = model.score(dane['opis_test'], dane['dec_test'])
    
    if show:
        print("\tWynik dla danych uczących: ", end="")
        print(learn_s)
        print("\tWynik dla danych testowych: ", end="")
        print(test_s)
    
    return learn_s, test_s

In [125]:
def get_model():
    models = {"NS": KNeighborsClassifier(n_neighbors=1), 
                "KNS" : KNeighborsClassifier(n_neighbors=5),
                "NP" : NearestCentroid(),
                "BK" : GaussianNB(),
                "DT" : tree.DecisionTreeClassifier(max_depth=5)}

    names = ["NS", "KNS", "NP", "BK", "DT"]
    for name in names:
        yield models[name]
    yield None

In [127]:
tries = 100
df = tfidf(load_data(mode=2))
model = KNeighborsClassifier(n_neighbors=1)

for mod in get_model():
    if mod is None:
        break
    sum_t = 0
    sum_l = 0
    for i in range(tries):
        data = get_learning_data(df, 0.2)
        d_l, d_t= weryfikuj(model, data,show=False)
        sum_l += d_l
        sum_t += d_t
    print("Dla modelu: " + str(mod))
    print("    Dla danych uczących : " + str(sum_l/tries))
    print("    Dla danych testowych: " + str(sum_t/tries))

Dla modelu: KNeighborsClassifier(n_neighbors=1)
    Dla danych uczących : 1.0
    Dla danych testowych: 0.9444444444444434
Dla modelu: KNeighborsClassifier()
    Dla danych uczących : 1.0
    Dla danych testowych: 0.93111111111111
Dla modelu: NearestCentroid()
    Dla danych uczących : 1.0
    Dla danych testowych: 0.9388888888888879
Dla modelu: GaussianNB()
    Dla danych uczących : 1.0
    Dla danych testowych: 0.9366666666666658
Dla modelu: DecisionTreeClassifier(max_depth=5)
    Dla danych uczących : 1.0
    Dla danych testowych: 0.9277777777777765
