In [1]:
import pandas as pd
import numpy as np

df = pd.read_csv('../datasets/ecommerceDataset.csv', names =['target', 'feature'])
df.head(2)

Unnamed: 0,target,feature
0,Household,Paper Plane Design Framed Wall Hanging Motivat...
1,Household,"SAF 'Floral' Framed Painting (Wood, 30 inch x ..."


In [2]:
df.dropna(inplace=True)

## Now we preprocess the data.

- convert_to_lowercase:  converts the text to lowercase 
- remove_whitespaces: removes unnecessary empty whitespaces from the text 
- remove_punctuations: removes punctuations but we keep the apostrophes 
- remove_html: removes html links from the text 
- remove_http: removes http links from the text 
- remove_stopwords: removing stop words since they have no impact on the classification procedure 
- text_stemmer: converting the words to their root form 
- discard_non_alpha: discarding non-alphabetic words because they create unnecessary diversions 

In [3]:
"""
Now we do Data Preprocessing.
"""
import string, re, nltk
from string import punctuation
from nltk.tokenize import word_tokenize, RegexpTokenizer
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer

regexp = RegexpTokenizer("[\w']+")

def convert_to_lowercase(text):
    return text.lower()
def remove_whitespace(text):
    return text.strip()
def remove_punctuation(text):
    punct_str = string.punctuation
    punct_str = punct_str.replace("'", "") 
    return text.translate(str.maketrans("", "", punct_str))
def remove_html(text):
    html = re.compile(r'<.*?>')
    return html.sub(r'', text)
def remove_http(text):
    http = "https?://\S+|www\.\S+" 
    pattern = r"({})".format(http) 
    return re.sub(pattern, "", text)
# Stopwords
stops = stopwords.words("english") 
addstops = ["among", "onto", "shall", "thrice", "thus", "twice", "unto", "us", "would"] 
allstops = stops + addstops
def remove_stopwords(text):
    return " ".join([word for word in regexp.tokenize(text) if word not in allstops])
stemmer = PorterStemmer()
def text_stemmer(text):
    text_stem = " ".join([stemmer.stem(word) for word in regexp.tokenize(text)])
    return text_stem
def discard_non_alpha(text):
    word_list_non_alpha = [word for word in regexp.tokenize(text) if word.isalpha()]
    text_non_alpha = " ".join(word_list_non_alpha)
    return text_non_alpha


# Integration process 

- We integrate the text normalization processes in appropriate order. We also converted the text into one line and removed square brackets.

In [4]:
def text_normalizer(text):
    text = convert_to_lowercase(text)
    text = remove_whitespace(text)
    text = re.sub('\n' , '', text)
    text = re.sub('\[.*?\]', '', text) 
    text = remove_http(text)
    text = remove_punctuation(text)
    text = remove_html(text)
    text = remove_stopwords(text)
    text = discard_non_alpha(text)
    return text

In [5]:
df['feature'] = df['feature'].apply(text_normalizer)
df['feature'][0]

'paper plane design framed wall hanging motivational office decor art prints x inch set painting made synthetic frame uv textured print gives multi effects attracts towards special series paintings makes wall beautiful gives royal touch painting ready hang proud possess unique painting niche apart use modern efficient printing technology prints inks precision epson roland hp printers innovative hd printing technique results durable spectacular looking prints highest last lifetime print solely topnotch inks achieve brilliant true colours due high level uv resistance prints retain beautiful colours many years add colour style living space digitally printed painting pleasure eternal blissso bring home elegant print lushed rich colors makes nothing sheer elegance friends familyit treasured forever whoever lucky recipient liven place intriguing paintings high definition hd graphic digital prints home office room'

In [6]:

from sklearn.model_selection import train_test_split
df_train, df_test = train_test_split(df, test_size=0.2, random_state=42)

## Calculating the compression ratio $CR$

- method calculate_compression_ratio:
  param text: text that is used to calculate the compression ratio
  - the compression ratio is computed by: original encoded text length devided by the gzip compressed text length

In [8]:
import gzip
# Calculate compression ratio
def calculate_compression_ratio(text):
    compressed = len(gzip.compress(text.encode()))
    original = len(text.encode())
    compression_ratio = original / compressed
    return compression_ratio

# Print compression ratios for train and test sets
train_compression_ratio = calculate_compression_ratio(" ".join(df_train['feature']))
test_compression_ratio = calculate_compression_ratio(" ".join(df_test['feature']))

print("Compression Ratio - Train Set:", train_compression_ratio)
print("Compression Ratio - Test Set:", test_compression_ratio)


Compression Ratio - Train Set: 2.8164242110043336
Compression Ratio - Test Set: 2.820859067511082


## Calculating numerical characteristics about the dataset

In [10]:
#Statistical data about the train and test sets
# N - Number of training and test set examples
N_train = len(df_train)
N_test = len(df_test)

# C - Number of classes
C = df['target'].nunique()

#Calculate average number of words (W) and characters (L) in each example
#text is tokenized by spaces
train_word_counts = df_train['feature'].apply(lambda x: len(x.split()))
test_word_counts = df_test['feature'].apply(lambda x: len(x.split()))
train_char_counts = df_train['feature'].apply(lambda x: len(x))
test_char_counts = df_test['feature'].apply(lambda x: len(x))
W_train = train_word_counts.mean()
W_test = test_word_counts.mean()
L_train = train_char_counts.mean()
L_test = test_char_counts.mean()

# V - Vocabulary size
all_text = " ".join(df['feature'])
vocabulary = set(all_text.split())
V = len(vocabulary)

# Print the statistics
print("N - Number of training examples:", N_train)
print("N - Number of test examples:", N_test)
print("C - Number of classes:", C)
print("W - Average number of words in each example (Train):", W_train)
print("W - Average number of words in each example (Test):", W_test)
print("L - Average number of characters in each example (Train):", L_train)
print("L - Average number of characters in each example (Test):", L_test)
print("V - Vocabulary size:", V)

N - Number of training examples: 40339
N - Number of test examples: 10085
C - Number of classes: 4
W - Average number of words in each example (Train): 70.23545452291827
W - Average number of words in each example (Test): 70.15815567674764
L - Average number of characters in each example (Train): 513.0067428543098
L - Average number of characters in each example (Test): 511.7854238968765
V - Vocabulary size: 86662


## Applied gzip compressor-based text classification on the dataset

- for each compressed test set record, join with compressed training record & compute the distance between compressed test record and concatenated train + test record
- $kNN$ majority vote at the end (get most freuquent class among top k neighbors)

In [None]:
#normale Gzip-Klassifikationsvariante

from tqdm import tqdm
from collections import Counter

k = 2

predicted_classes = []

for row_test in tqdm(df_test.iterrows(), total=df_test.shape[0]):
    test_text = row_test[1]["feature"]
    test_label = row_test[1]["target"]
    c_test_text = len(gzip.compress(test_text.encode()))
    distance_from_test_instance = []
    
    for row_train in df_train.iterrows():
        train_text = row_train[1]["feature"]
        train_label = row_train[1]["target"]
        c_train_text = len(gzip.compress(train_text.encode()))
        
        train_plus_test = " ".join([test_text, train_text])
        c_train_plus_test = len(gzip.compress(train_plus_test.encode()))
        
        ncd = ( (c_train_plus_test - min(c_train_text, c_test_text))
                / max(c_test_text, c_train_text) )
        distance_from_test_instance.append(ncd)
        
    sorted_idx = np.argsort(np.array(distance_from_test_instance))
    
    top_k_class = list(df_train.iloc[sorted_idx[:k]]["label"].values)
    predicted_class = max(set(top_k_class), key=top_k_class.count)
    #top_k_class = df_train.iloc[sorted_idx[:k]]["target"].values
    #predicted_class = np.argmax(np.bincount(top_k_class))
    
    predicted_classes.append(predicted_class)
     
print("Accuracy:", np.mean(np.array(predicted_classes) == df_test["target"].values))

 11%|█         | 1091/10085 [1:50:43<15:12:46,  6.09s/it]


KeyboardInterrupt: 

## Applied the tie-breaking-fix (tbf) variant of the gzip compressor based text classification method

- we improved tie-breaking using a Counter which selects the first label in case of a tie. If the labels are sorted by the distance we ensure it's picking the closest neighbor in case of a tie.

In [None]:
"""#with Tie-breaking fix (MORE ACCURACY)

from tqdm import tqdm
from collections import Counter

k = 2

predicted_classes = []

for row_test in tqdm(df_test.iterrows(), total=df_test.shape[0]):
    test_text = row_test[1]["feature"]
    test_label = row_test[1]["target"]
    c_test_text = len(gzip.compress(test_text.encode()))
    distance_from_test_instance = []
    
    for row_train in df_train.iterrows():
        train_text = row_train[1]["feature"]
        train_label = row_train[1]["target"]
        c_train_text = len(gzip.compress(train_text.encode()))
        
        train_plus_test = " ".join([test_text, train_text])
        c_train_plus_test = len(gzip.compress(train_plus_test.encode()))
        
        ncd = ( (c_train_plus_test - min(c_train_text, c_test_text))
                / max(c_test_text, c_train_text) )
        distance_from_test_instance.append(ncd)
        
    sorted_idx = np.argsort(np.array(distance_from_test_instance))
    top_k_class = np.array(df_train["target"])[sorted_idx[:k]]
    predicted_class = Counter(top_k_class).most_common()[0][0]
    
    predicted_classes.append(predicted_class)
        
print("Accuracy:", np.mean(np.array(predicted_classes) == df_test["target"].values))"""

In [None]:
from sklearn.metrics import classification_report, accuracy_score

#true labels for the test set
true_labels = df_test['target']

# Compute the classification report
classification_report_output = classification_report(true_labels, predicted_classes)

# Print the classification report
print(classification_report_output)