In [2]:
import pandas as pd
import numpy as np

df = pd.read_csv('../datasets/alldata_1_for_kaggle.csv', encoding="latin-1")
df.head(3)

Unnamed: 0.1,Unnamed: 0,0,a
0,0,Thyroid_Cancer,Thyroid surgery in children in a single insti...
1,1,Thyroid_Cancer,""" The adopted strategy was the same as that us..."
2,2,Thyroid_Cancer,coronary arterybypass grafting thrombosis ï¬b...


In [3]:
df = df.drop('Unnamed: 0', axis = 1)
df = df.rename({'0': 'target', 'a': 'feature'}, axis = 1)
df.head(3)

Unnamed: 0,target,feature
0,Thyroid_Cancer,Thyroid surgery in children in a single insti...
1,Thyroid_Cancer,""" The adopted strategy was the same as that us..."
2,Thyroid_Cancer,coronary arterybypass grafting thrombosis ï¬b...


In [4]:
"""
Now we do Data Preprocessing.
"""
import string, re, nltk
from string import punctuation
from nltk.tokenize import word_tokenize, RegexpTokenizer
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer

regexp = RegexpTokenizer("[\w']+")

def convert_to_lowercase(text):
    return text.lower()
def remove_whitespace(text):
    return text.strip()
def remove_punctuation(text):
    punct_str = string.punctuation
    punct_str = punct_str.replace("'", "") 
    return text.translate(str.maketrans("", "", punct_str))
def remove_html(text):
    html = re.compile(r'<.*?>')
    return html.sub(r'', text)
def remove_http(text):
    http = "https?://\S+|www\.\S+" 
    pattern = r"({})".format(http) 
    return re.sub(pattern, "", text)
# Stopwords
stops = stopwords.words("english") 
addstops = ["among", "onto", "shall", "thrice", "thus", "twice", "unto", "us", "would"] 
allstops = stops + addstops
def remove_stopwords(text):
    return " ".join([word for word in regexp.tokenize(text) if word not in allstops])
stemmer = PorterStemmer()
def text_stemmer(text):
    text_stem = " ".join([stemmer.stem(word) for word in regexp.tokenize(text)])
    return text_stem
def discard_non_alpha(text):
    word_list_non_alpha = [word for word in regexp.tokenize(text) if word.isalpha()]
    text_non_alpha = " ".join(word_list_non_alpha)
    return text_non_alpha

In [5]:
def text_normalizer(text):
    text = convert_to_lowercase(text)
    text = remove_whitespace(text)
    text = re.sub('\n' , '', text) 
    text = re.sub('\[.*?\]', '', text) 
    text = remove_http(text)
    text = remove_punctuation(text)
    text = remove_html(text)
    text = remove_stopwords(text)
    text = discard_non_alpha(text)
    return text

In [6]:
df['feature'] = df['feature'].apply(text_normalizer)

In [7]:
from sklearn.model_selection import train_test_split
df_train, df_test = train_test_split(df, test_size=0.2, random_state=42)

In [8]:
import gzip
# Calculate compression ratio
def calculate_compression_ratio(text):
    compressed = len(gzip.compress(text.encode()))
    original = len(text.encode())
    compression_ratio = original / compressed
    return compression_ratio

# Print compression ratios for train and test sets
train_compression_ratio = calculate_compression_ratio(" ".join(df_train['feature']))
test_compression_ratio = calculate_compression_ratio(" ".join(df_test['feature']))

print("Compression Ratio - Train Set:", train_compression_ratio)
print("Compression Ratio - Test Set:", test_compression_ratio)


Compression Ratio - Train Set: 3.397251619353535
Compression Ratio - Test Set: 3.401567560866739


In [11]:
#Statistical data about the train and test sets
# N - Number of training and test set examples
N_train = len(df_train)
N_test = len(df_test)

# C - Number of classes
C = df['target'].nunique()

# Calculate average number of words (W) and characters (L) in each example
#text is tokenized by spaces
train_word_counts = df_train['feature'].apply(lambda x: len(x.split()))
test_word_counts = df_test['feature'].apply(lambda x: len(x.split()))
train_char_counts = df_train['feature'].apply(lambda x: len(x))
test_char_counts = df_test['feature'].apply(lambda x: len(x))
W_train = train_word_counts.mean()
W_test = test_word_counts.mean()
L_train = train_char_counts.mean()
L_test = test_char_counts.mean()

# V - Vocabulary size
# count unique words in the entire dataset
all_text = " ".join(df['feature'])
vocabulary = set(all_text.split())
V = len(vocabulary)

# Print the statistics
print("N - Number of training examples:", N_train)
print("N - Number of test examples:", N_test)
print("C - Number of classes:", C)
print("W - Average number of words in each example (Train):", W_train)
print("W - Average number of words in each example (Test):", W_test)
print("L - Average number of characters in each example (Train):", L_train)
print("L - Average number of characters in each example (Test):", L_test)
print("V - Vocabulary size:", V)

N - Number of training examples: 6056
N - Number of test examples: 1514
C - Number of classes: 3
W - Average number of words in each example (Train): 2180.092470277411
W - Average number of words in each example (Test): 2203.9101717305152
L - Average number of characters in each example (Train): 18438.607661822985
L - Average number of characters in each example (Test): 18681.858652575957
V - Vocabulary size: 157912


In [9]:
import gzip

#calculate normalized compression distance (NCD)
def calculate_ncd(x1, x2):
    Cx1 = len(gzip.compress(x1.encode()))
    Cx2 = len(gzip.compress(x2.encode()))
    x1x2 = " ".join([x1, x2])
    Cx1x2 = len(gzip.compress(x1x2.encode()))
    
    ncd = (Cx1x2 - min(Cx1, Cx2)) / max(Cx1, Cx2)
    return ncd

In [10]:
#normale Gzip-Klassifikationsvariante

from tqdm import tqdm
from collections import Counter

k = 2

predicted_classes = []

for row_test in tqdm(df_test.iterrows(), total=df_test.shape[0]):
    test_text = row_test[1]["feature"]
    test_label = row_test[1]["target"]
    c_test_text = len(gzip.compress(test_text.encode()))
    distance_from_test_instance = []
    
    for row_train in df_train.iterrows():
        train_text = row_train[1]["feature"]
        train_label = row_train[1]["target"]
        c_train_text = len(gzip.compress(train_text.encode()))
        
        train_plus_test = " ".join([test_text, train_text])
        c_train_plus_test = len(gzip.compress(train_plus_test.encode()))
        
        ncd = ( (c_train_plus_test - min(c_train_text, c_test_text))
                / max(c_test_text, c_train_text) )
        distance_from_test_instance.append(ncd)
        
    sorted_idx = np.argsort(np.array(distance_from_test_instance))
    
    top_k_class = list(df_train.iloc[sorted_idx[:k]]["target"].values)
    predicted_class = max(set(top_k_class), key=top_k_class.count)
    #top_k_class = df_train.iloc[sorted_idx[:k]]["target"].values
    #predicted_class = np.argmax(np.bincount(top_k_class))
    
    predicted_classes.append(predicted_class)
     
print("Accuracy:", np.mean(np.array(predicted_classes) == df_test["target"].values))

  0%|          | 0/1514 [00:04<?, ?it/s]


KeyboardInterrupt: 

In [None]:
"""#with Tie-breaking fix (MORE ACCURACY)

from tqdm import tqdm
from collections import Counter

k = 2

predicted_classes = []

for row_test in tqdm(df_test.iterrows(), total=df_test.shape[0]):
    test_text = row_test[1]["feature"]
    test_label = row_test[1]["target"]
    c_test_text = len(gzip.compress(test_text.encode()))
    distance_from_test_instance = []
    
    for row_train in df_train.iterrows():
        train_text = row_train[1]["feature"]
        train_label = row_train[1]["target"]
        c_train_text = len(gzip.compress(train_text.encode()))
        
        train_plus_test = " ".join([test_text, train_text])
        c_train_plus_test = len(gzip.compress(train_plus_test.encode()))
        
        ncd = ( (c_train_plus_test - min(c_train_text, c_test_text))
                / max(c_test_text, c_train_text) )
        distance_from_test_instance.append(ncd)
        
    sorted_idx = np.argsort(np.array(distance_from_test_instance))
    top_k_class = np.array(df_train["target"])[sorted_idx[:k]]
    predicted_class = Counter(top_k_class).most_common()[0][0]
    
    predicted_classes.append(predicted_class)
        
print("Accuracy:", np.mean(np.array(predicted_classes) == df_test["target"].values))"""

In [None]:
from sklearn.metrics import classification_report, accuracy_score

#true labels for the test set
true_labels = df_test['target']

# Compute the classification report
classification_report_output = classification_report(true_labels, predicted_classes)

# Print the classification report
print(classification_report_output)