In [20]:
import string
import numpy as np
import random as rand
import collections as col
from pprint import pprint as ppr
from collections import defaultdict as dd
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.metrics import confusion_matrix, accuracy_score

def extract_feature_FR(letters_text, NL_text, accuracy):
    """Return the relative frequency of every symbol in ``letters_text``.

    Args:
        letters_text: Iterable of symbols (one entry per occurrence).
        NL_text: Divisor applied to every count; the caller passes
            ``len(letters_text)``.
        accuracy: Number of decimal places passed to ``round``.

    Returns:
        A dict keyed by symbol, in sorted key order, whose values are
        ``count / NL_text`` rounded to ``accuracy`` places. Only symbols
        that actually occur in ``letters_text`` appear as keys.
    """
    occurrences = col.Counter(letters_text)
    feature_FR = {}
    for symbol in sorted(occurrences):
        feature_FR[symbol] = occurrences[symbol] / NL_text
    # round_dict rounds in place and hands back the same dict object.
    return round_dict(feature_FR, accuracy)

def extract_feature_WL(alphabet, words, word_length, letters_times, accuracy):
    """Score each letter by its presence in words of a given length class.

    Length classes selected by ``word_length``:

    * ``1``      -- one-symbol words; the score is a 0/1 flag.
    * ``2..4``   -- words whose length equals ``word_length`` exactly.
    * ``5..7``   -- all words of length 5 to 7.
    * ``8..10``  -- all words of length 8 to 10.
    * otherwise  -- all words with ``len(word) >= word_length``.

    Except for the one-symbol case, every occurrence of a letter in a
    matching word adds ``1 / letters_times[letter]`` to that letter's score.

    Args:
        alphabet: Iterable of symbols used as the keys of the result.
        words: Iterable of words to scan.
        word_length: Selector for the length class (see above).
        letters_times: Mapping from letter to its occurrence count.
        accuracy: Number of decimal places passed to ``round``.

    Returns:
        A dict with one entry per symbol of ``alphabet``, rounded to
        ``accuracy`` places.
    """
    feature_WL = dict.fromkeys(alphabet, 0)

    if word_length == 1:
        for word in words:
            if len(word) != word_length:
                continue
            if feature_WL[word] == 0:
                feature_WL[word] = 1
    else:
        # Translate the selector into an inclusive [shortest, longest] range.
        if has_domain(word_length, 2, 4):
            shortest, longest = word_length, word_length
        elif has_domain(word_length, 5, 7):
            shortest, longest = 5, 7
        elif has_domain(word_length, 8, 10):
            shortest, longest = 8, 10
        else:
            shortest, longest = word_length, float("inf")

        for word in words:
            if has_domain(len(word), shortest, longest):
                for letter in word:
                    feature_WL[letter] += 1 / letters_times.get(letter)

    round_dict(feature_WL, accuracy)
    return feature_WL

def extract_feature_SW(alphabet, case, words, letters_times, accuracy):
    """Score each letter by how often it starts and/or ends a word.

    Args:
        alphabet: Iterable of symbols used as the keys of the result.
        case: Mode string. If it contains ``"first"`` the first letter of
            every word is scored; otherwise, if it contains ``"last"`` the
            last letter is scored; otherwise a word is scored only when its
            first and last letters are equal.
        words: Iterable of non-empty words.
        letters_times: Mapping from letter to its occurrence count.
        accuracy: Number of decimal places passed to ``round``.

    Returns:
        A dict with one entry per symbol of ``alphabet``; each scored letter
        accumulates ``1 / letters_times[letter]`` per hit, rounded to
        ``accuracy`` places.
    """
    feature_SW = dict.fromkeys(alphabet, 0)

    # Decide once which letter (if any) each word contributes.
    if "first" in case:
        scored = (word[0] for word in words)
    elif "last" in case:
        scored = (word[-1] for word in words)
    else:
        scored = (word[0] for word in words if word[0] == word[-1])

    for letter in scored:
        feature_SW[letter] += 1 / letters_times.get(letter)

    round_dict(feature_SW, accuracy)
    return feature_SW

def extract_feature_DL(alphabet, words, letters_times, accuracy):
    """Score each letter by how often it appears doubled inside a word.

    Every pair of equal adjacent letters adds ``1 / letters_times[letter]``
    to that letter's score, so a run of three equal letters counts twice.

    Args:
        alphabet: Iterable of symbols used as the keys of the result.
        words: Iterable of words (sliceable sequences of letters).
        letters_times: Mapping from letter to its occurrence count.
        accuracy: Number of decimal places the scores are rounded to.

    Returns:
        A dict with one entry per symbol of ``alphabet``, rounded to
        ``accuracy`` places like the other feature extractors.
    """
    feature_DL = dict.fromkeys(alphabet, 0)
    for word in words:
        # Walking adjacent pairs needs no sentinel "previous" letter, so a
        # word that happens to start with the sentinel is not miscounted.
        for prev_letter, letter in zip(word, word[1:]):
            if prev_letter == letter:
                feature_DL[letter] += 1 / letters_times.get(letter)

    # Apply the requested precision (previously accepted but ignored).
    for key, value in feature_DL.items():
        feature_DL[key] = round(value, accuracy)
    return feature_DL

def has_domain(var, point1, point2):
    """Return True when ``var`` lies in the closed interval [point1, point2]."""
    return bool(point1 <= var <= point2)

def minimize_dataset(old_dataset):  # removes duplicate words from text
    """Return the items of ``old_dataset`` without duplicates.

    The first occurrence of every item is kept and the original order is
    preserved. Items must be hashable (the words handled here are strings).

    Args:
        old_dataset: Iterable of words.

    Returns:
        A new list containing each distinct word once.
    """
    # dict keys are unique and keep insertion order, which gives linear-time
    # de-duplication instead of a list membership scan for every word.
    return list(dict.fromkeys(old_dataset))

def round_dict(dict, accuracy):
    """Round every value of ``dict`` in place and return the same dict.

    Args:
        dict: Mapping whose values are numbers; it is modified in place.
        accuracy: Number of decimal places passed to ``round``.

    Returns:
        The very same mapping object that was passed in.
    """
    # Only values are reassigned, so iterating the live view is safe.
    for key, value in dict.items():
        dict[key] = round(value, accuracy)
    return dict

def get_letters(words):
    """Flatten an iterable of words into one list of their letters, in order."""
    return [letter for word in words for letter in word]

def process(data_file, alphabet):
    """Build the per-letter feature matrix for one text file.

    The file is split on whitespace into words. Letter frequencies are
    computed over the full text; every other feature is computed over the
    de-duplicated word list.

    Args:
        data_file: Path of the text file to read.
        alphabet: Sequence of letters; it defines both the rows of the
            result and their order.

    Returns:
        A NumPy array with one row per letter of ``alphabet`` (in that
        order) and one column per selected feature. A letter that a feature
        has no value for (e.g. it never occurs in the text) gets 0.
    """
    accuracy = 10  # decimal places kept in every feature value

    with open(data_file, 'r') as f:
        words_text = f.read().split()

    letters_text = get_letters(words_text)
    NL_text = len(letters_text)

    words = minimize_dataset(words_text)
    letters = get_letters(words)
    letters_times = dict(sorted(col.Counter(letters).items()))

    # All candidate features, indexed 0..11. For the 5-7 and 8-10 buckets any
    # word_length inside the range selects the same words, so fixed
    # representatives (5 and 8) are used.
    candidates = [
        extract_feature_FR(letters_text, NL_text, accuracy),
        extract_feature_WL(alphabet, words, 1, letters_times, accuracy),
        extract_feature_WL(alphabet, words, 2, letters_times, accuracy),
        extract_feature_WL(alphabet, words, 3, letters_times, accuracy),
        extract_feature_WL(alphabet, words, 4, letters_times, accuracy),
        extract_feature_WL(alphabet, words, 5, letters_times, accuracy),
        extract_feature_WL(alphabet, words, 8, letters_times, accuracy),
        extract_feature_WL(alphabet, words, 11, letters_times, accuracy),
        extract_feature_SW(alphabet, "first", words, letters_times, accuracy),
        extract_feature_SW(alphabet, "last", words, letters_times, accuracy),
        extract_feature_SW(alphabet, "both", words, letters_times, accuracy),
        extract_feature_DL(alphabet, words, letters_times, accuracy),
    ]

    # Indices of the features that make up the columns of X; edit this tuple
    # to experiment with other feature combinations.
    selected = (0, 1, 2, 11)

    # Rows follow ``alphabet`` explicitly so they stay aligned with labels
    # derived from the same alphabet, and a letter absent from a feature
    # yields 0 rather than a shorter (ragged) row.
    rows = [
        [candidates[index].get(letter, 0) for index in selected]
        for letter in alphabet
    ]

    X = np.array(rows)

    return X
        
def main():
    """Train an SVC on the training text and evaluate it on a test text.

    Each letter of the lowercase ASCII alphabet is one sample: its feature
    row comes from ``process`` and its label is the letter itself. The
    accuracy on the test text and the list of "actual + predicted" letter
    pairs are printed.
    """
    # Available test texts with their ground-truth label strings; only the
    # v2 pair is used below.
    text_v1 = "TESTING-tolstoy-anna-karenina-v1.txt"
    alphabet_v1 = "abcdefghijklmnopqrstuvwxyz"

    text_v2 = "TESTING-tolstoy-war-and-peace.txt"
    alphabet_v2 = "krlpfibtovyzqjnsmhcwexudag"

    alphabet = list(string.ascii_lowercase)
    np.set_printoptions(suppress=True)  # to avoid scientific notation when printing

    # Fit: one row per letter, labelled with that letter.
    svc = SVC()
    svc.fit(process("TRAINING-tolstoy-anna-karenina.txt", alphabet), np.array(alphabet))

    # Predict on the test text and compare with its known labels.
    y_test = list(alphabet_v2)  # the actual y values of the test-text
    y_pred = svc.predict(process(text_v2, alphabet))

    print(accuracy_score(y_test, y_pred))

    comparison = [actual + predicted for actual, predicted in zip(y_test, y_pred)]
    print(comparison)
  
"""
    confusion = confusion_matrix(y_test, y_pred)
    print(confusion)
"""

# Run the experiment only when this file is executed as a script.
if __name__ == "__main__":
    main()

0.07692307692307693
['ky', 'rg', 'ls', 'pb', 'fu', 'ie', 'bz', 'tr', 'of', 'vn', 'ya', 'zc', 'qq', 'jo', 'ni', 'sd', 'mm', 'hb', 'cp', 'wh', 'ew', 'xq', 'ut', 'dv', 'ak', 'gl']
