## Preprocessing

#### Paste the following lines into an Anaconda Prompt (run as administrator) and press Enter:
#### conda install -c huggingface -c conda-forge datasets
#### conda install -c anaconda gensim


In [54]:
from datasets import load_dataset
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import re, string, nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer
from gensim.models import Word2Vec, FastText
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold, GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC

# Train test split and class distribution

In [None]:
# fetch the NLTK resources needed for stop words, tokenizing, tagging and lemmatizing
for nltk_resource in ('wordnet', 'stopwords', 'punkt', 'averaged_perceptron_tagger'):
    nltk.download(nltk_resource)

# train-test split: use the 'train' and 'test' splits provided by the dataset
dataset = load_dataset("ag_news")
train_df, test_df = (pd.DataFrame(dataset[split]) for split in ('train', 'test'))
del dataset

def class_distribution():
    """Show the label counts of train_df and test_df as two side-by-side bar charts."""
    plt.figure(figsize=(10, 5))

    # one subplot per data frame: train on the left, test on the right
    panels = (('train_df', train_df), ('test_df', test_df))
    for position, (title, frame) in enumerate(panels, start=1):
        plt.subplot(1, 2, position)
        label_counts = pd.Series(frame['label']).value_counts()
        plt.title(title)
        plt.bar(label_counts.index, label_counts)
        plt.tight_layout()

    plt.show()

class_distribution()

In [None]:
# lemmatizer and English stop-word set used by preprocessing()
wnl = WordNetLemmatizer()
stop_words = {*stopwords.words('english')}


# Patterns are compiled once here instead of on every preprocessing() call.
_HTML_TAG_RE = re.compile(r'</?[a-z][^<>]*>')
_PUNCTUATION_RE = re.compile('[%s]' % re.escape(string.punctuation))
_SPECIAL_CHAR_RE = re.compile(r'[^\w\s]')
_DIGIT_RE = re.compile(r'\d')
_WHITESPACE_RE = re.compile(r'\s+')


def preprocessing(text):
    """Normalize a raw text into a string of lemmatized tokens.

    Steps, in order: lowercase and strip; drop HTML tags; replace ASCII
    punctuation with spaces; drop any remaining non-word characters; replace
    digits with spaces; collapse whitespace; remove English stop words;
    POS-tag and lemmatize what is left.

    Args:
        text: the raw text as a str.

    Returns:
        The lemmatized tokens joined by single spaces (may be an empty string).
    """
    # convert to lowercase and remove spaces at beginning and ending
    text = text.lower().strip()

    # remove html code -- this has to happen BEFORE punctuation removal:
    # once '<' and '>' have been replaced, a tag can no longer be recognized.
    # Only tag-shaped spans ('<' directly followed by a letter or '/letter')
    # are removed, so a stray "a < b" comparison is left alone. Tags become a
    # space so the words around them do not get glued together.
    text = _HTML_TAG_RE.sub(' ', text)

    # remove punctuation
    text = _PUNCTUATION_RE.sub(' ', text)

    # remove special characters (non-ASCII punctuation and symbols)
    text = _SPECIAL_CHAR_RE.sub('', text)

    # remove digits
    text = _DIGIT_RE.sub(' ', text)

    # replace multiple whitespaces with one
    text = _WHITESPACE_RE.sub(' ', text)

    # stop word removal
    clean_text = ' '.join(word for word in text.split() if word not in stop_words)

    # tokenize & lemmatize
    # pos_tag returns a list of (word, pos_tag) tuples, e.g. [('computer', 'NN'), ...]
    word_pos_tags = nltk.pos_tag(word_tokenize(clean_text))
    return ' '.join(wnl.lemmatize(word, get_wordnet_pos(pos)) for word, pos in word_pos_tags)

 
def get_wordnet_pos(tag):
    """Translate a POS tag into the WordNet POS constant used by the lemmatizer.

    The first letter of the tag decides: 'J' -> adjective, 'V' -> verb,
    'N' -> noun, 'R' -> adverb. Every other tag falls back to noun.
    """
    pos_by_initial = {
        'J': wordnet.ADJ,
        'V': wordnet.VERB,
        'N': wordnet.NOUN,
        'R': wordnet.ADV,
    }
    # default pos is noun
    return pos_by_initial.get(tag[:1], wordnet.NOUN)
    


In [None]:
# execute preprocessing for training set
train_df['text'] = train_df['text'].apply(preprocessing)
# Save under the file name that the vectorization step reads back
# ('preprocessed_training_data.csv'). The previous name 'training_data.csv'
# did not match, so a fresh top-to-bottom run could not find the file.
train_df.to_csv('preprocessed_training_data.csv', sep=';', encoding='utf-8', index=False)
train_df.head()

## Vectorization

In [2]:
# read in preprocessed training data if necessary
train_df = pd.read_csv('preprocessed_training_data.csv', sep=';', encoding='utf-8')
# A text that ended up empty after preprocessing is stored as an empty CSV
# field, which read_csv parses as NaN. Turn those back into empty strings so
# the tokenizer and the vectorizers always receive a str.
train_df['text'] = train_df['text'].fillna('')
# for quick-testing with small memory: keep a reproducible 1% sample
train_df = train_df.sample(frac=0.01, random_state=42)
# token lists for the word embedding models
train_tokens = [word_tokenize(text) for text in train_df['text']]

### Count vectors and Tf-idf vectors

In [3]:
# fit a term-count model and a tf-idf model on the same preprocessed texts
count_vectorizer = CountVectorizer()
tfidf_vectorizer = TfidfVectorizer()
count_vectors = count_vectorizer.fit_transform(train_df['text'])
tfidf_vectors = tfidf_vectorizer.fit_transform(train_df['text'])

### Word2Vec SkipGram & CBOW

In [None]:
# both models are trained on the same tokens with the same settings;
# sg=1 selects skip-gram, leaving it out gives the CBOW model
w2v_params = {'min_count': 2, 'window': 5}
w2v_cbow = Word2Vec(train_tokens, **w2v_params)
w2v_skipg = Word2Vec(train_tokens, sg=1, **w2v_params)

In [None]:
# returns mean embedding vector for list of specified words
def get_embedding(model, text):
    """Return the mean word vector of the tokens in *text* that *model* knows.

    Args:
        model: a trained embedding model exposing ``wv`` (word -> vector
            lookup) and ``vector_size``.
        text: iterable of tokens.

    Returns:
        A 1-D float32 array of length ``model.vector_size``: the mean of the
        vectors of all in-vocabulary tokens, or all zeros when none of the
        tokens is in the vocabulary.
    """
    # gensim 4 replaced ``wv.vocab`` with ``wv.key_to_index``; fall back to the
    # old attribute so the function also works with gensim 3 models.
    vocabulary = getattr(model.wv, 'key_to_index', None)
    if vocabulary is None:
        vocabulary = model.wv.vocab

    known_words = [word for word in text if word in vocabulary]
    if not known_words:
        # same dtype as the mean vector, so stacked results stay float32
        return np.zeros(model.vector_size, dtype=np.float32)

    vectors = np.array([model.wv[word] for word in known_words], dtype=np.float32)
    return vectors.mean(axis=0)

In [None]:
# one mean vector per article description, computed for the CBOW and the skip-gram model
embeddings_w2v_cbow, embeddings_w2v_skipg = (
    np.array([get_embedding(w2v_model, tokens) for tokens in train_tokens])
    for w2v_model in (w2v_cbow, w2v_skipg)
)

### fastText

In [None]:
# Train a fastText model with 300-dimensional vectors on the tokenized texts:
# create the untrained model, build its vocabulary from the corpus, then train
# for 10 epochs over the same corpus.
fasttext = FastText(vector_size=300, window=5, min_count=2)
fasttext.build_vocab(corpus_iterable=train_tokens)
fasttext.train(corpus_iterable=train_tokens, total_examples=len(train_tokens), epochs=10)

In [None]:
# one mean fastText vector per article description
embeddings_fasttext = np.array([get_embedding(fasttext, tokens) for tokens in train_tokens])

## Classification

### Hyperparameter Tuning

In [41]:
# hyperparameter tuning
def param_search(vector_matrix, vector_matrix_name, estimator, labels=None):
    """Estimate the accuracy of a grid-searched classifier with nested cross-validation.

    An inner 5-fold grid search picks the hyperparameters; an outer 5-fold
    cross-validation scores that whole search procedure.

    Args:
        vector_matrix: feature matrix. Sparse matrices (anything with a
            ``toarray`` method) are converted to dense; dense arrays such as
            averaged word embeddings are used unchanged.
        vector_matrix_name: label for the matrix, used only in the printout.
        estimator: dict with the keys 'name' (label for the printout),
            'estimator' (the classifier instance) and 'parameters' (the
            parameter grid).
        labels: target labels; defaults to ``train_df['label']``.

    Returns:
        Tuple ``(accuracy, search)``: the mean outer-fold accuracy and the
        GridSearchCV instance. cross_val_score fits clones of the search, so
        the returned instance itself is NOT fitted.
    """
    estimator_name = estimator['name']
    param_grid = estimator['parameters']
    base_estimator = estimator['estimator']

    if labels is None:
        labels = train_df['label']

    # Calling .toarray() unconditionally would fail on plain numpy arrays.
    if hasattr(vector_matrix, 'toarray'):
        features = vector_matrix.toarray()
    else:
        features = vector_matrix

    # outer folds: score the tuned model
    outer_cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

    # inner folds: select the hyperparameters
    inner_cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

    # create the grid search instance
    search = GridSearchCV(base_estimator, param_grid, scoring='accuracy', cv=inner_cv, n_jobs=-1)

    # run the nested cross validation
    cross_val_acc = cross_val_score(search, features, labels, scoring='accuracy', cv=outer_cv)
    accuracy = cross_val_acc.mean()

    # report the result for this vectorizer/classifier combination
    print("Classifier: {}".format(estimator_name))
    print("Vectorizer: {}".format(vector_matrix_name))
    print("Accuracy: {}".format(accuracy))
    print()

    return accuracy, search

In [61]:
# tune for the best vectorizer-estimator combination
def hyper_tune():
    """Evaluate every vectorizer/estimator pair and return the best grid search, fitted.

    Each pair is scored with param_search(). The pair with the highest
    accuracy wins; its GridSearchCV is then fitted on the complete feature
    matrix and ``train_df['label']`` so the returned object can predict.

    Returns:
        The fitted GridSearchCV of the best combination.

    Raises:
        ValueError: if no combination produced a comparable score
            (for example when every score is NaN).
    """
    kNeighbors_parameters = {
        'n_neighbors': range(2, 9)
    }

    decisionTree_parameters = {
        'criterion':['gini', 'entropy'],
        'max_depth':[ 2, 3, 4, 5, None]
    }

    gaussianNB_parameters = {
        'var_smoothing': [0.00000001, 0.000000001, 0.0000001]
    }

    # NOTE(review): gamma='auto' is the only value searched; consider adding
    # other gamma values to the grid if SVC keeps scoring poorly.
    svc_parameters = {
        'gamma': ['auto'],
        'kernel': ['rbf']
    }

    #TODO: add the other matrices, when done
    vectorizers = {
        'tfidf_vectors': tfidf_vectors,
        'count_vectors': count_vectors
    }

    #TODO: add more estimators
    estimators = {
        'KNeighborsClassifier': { 'name': 'KNeighborsClassifier', 'estimator': KNeighborsClassifier(), 'parameters': kNeighbors_parameters },
#         'DecisionTreeClassifier': { 'name': 'DecisionTreeClassifier', 'estimator': DecisionTreeClassifier(), 'parameters': decisionTree_parameters },
#         'GaussianNB': { 'name': 'GaussianNB', 'estimator': GaussianNB(), 'parameters': gaussianNB_parameters },
        'SVC': { 'name': 'SVC', 'estimator': SVC(), 'parameters': svc_parameters }
    }

    # Start below every possible accuracy so that a score of 0.0 still
    # selects a winner and the best_* names are always defined on success.
    best_score = float('-inf')
    best_estimator = None
    best_vectorizer = None
    best_model = None
    for vectorizer_name, matrix in vectorizers.items():
        for estimator_name, estimator_spec in estimators.items():

            score, search = param_search(matrix, vectorizer_name, estimator_spec)
            if score > best_score:
                best_score = score
                best_estimator = estimator_name
                best_vectorizer = vectorizer_name
                best_model = search

    if best_model is None:
        raise ValueError("no vectorizer/estimator combination produced a valid accuracy")

    # param_search only scores clones of the search, so the winner has not
    # been fitted yet; fit it on the full data before handing it back.
    best_features = vectorizers[best_vectorizer]
    if hasattr(best_features, 'toarray'):
        best_features = best_features.toarray()
    best_model.fit(best_features, train_df['label'])

    print("\nThe best performance is reached with the estimator " + best_estimator + " and the vectorizer " + best_vectorizer + " with an accuracy of " + str(best_score) )
    return best_model
model = hyper_tune()

Classifier: KNeighborsClassifier
Vectorizer: tfidf_vectors
Accuracy: 0.7808333333333334

Classifier: SVC
Vectorizer: tfidf_vectors
Accuracy: 0.2741666666666667

Classifier: KNeighborsClassifier
Vectorizer: count_vectors
Accuracy: 0.4125

Classifier: SVC
Vectorizer: count_vectors
Accuracy: 0.2741666666666667


The best performance is reached with the estimator KNeighborsClassifier and the vectorizer tfidf_vectors with an accuracy of 0.7808333333333334
