## Preprocessing

#### paste following lines into anaconda prompt (as admin) and press enter: 
#### conda install -c huggingface -c conda-forge datasets
#### conda install -c anaconda gensim


In [1]:
from datasets import load_dataset
import pandas as pd
import re, string, nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer
import numpy as np

nltk.download('wordnet')
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')

from sklearn.model_selection import train_test_split
from gensim.models import Word2Vec, FastText

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

  from .autonotebook import tqdm as notebook_tqdm
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\vince\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\vince\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\vince\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\vince\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [None]:
#load the data set: text & labels [World (0), Sports (1), Business (2), Sci/Tech (3)]
dataset = load_dataset("ag_news")

In [None]:
test_df = pd.DataFrame(dataset['test'])

from pathlib import Path
filepath = Path(r"C:\Users\vince\Desktop\abcd\out.csv")  
filepath.parent.mkdir(parents=True, exist_ok=True)  
test_df.to_csv(filepath)

# import os  
# os.makedirs('folder/subfolder', exist_ok=True)  
# df.to_csv('folder/subfolder/out.csv')  

# Train test split and class distribution

In [None]:
import matplotlib.pyplot as plt

# train test split
train_df = pd.DataFrame(dataset['train'])
test_df = pd.DataFrame(dataset['test'])


# checking class distribution
plt.figure(figsize=(10,5))

## train data
plt.subplot(1,2,1)
train_df_target = train_df['label']
class_dist = pd.Series(train_df_target).value_counts()
plt.title('train_df')
plt.bar(class_dist.index, class_dist)
plt.tight_layout()

## test data
plt.subplot(1,2,2)
test_df_target = test_df['label']
class_dist = pd.Series(test_df_target).value_counts()
plt.title('test_df')
plt.bar(class_dist.index, class_dist)
plt.tight_layout()

plt.show()

In [None]:
#initialize stopwords and lemmatizer
stop_words = set(stopwords.words('english'))
wnl = WordNetLemmatizer()


def preprocessing(text):
    
    # convert to lowercase and remove spaces at beginning and ending
    text = text.lower().strip()
    
    # remove punctuation
    text = re.compile('[%s]' % re.escape(string.punctuation)).sub(' ', text)
    
    # remove html code
    text= re.sub('<.*?>', '', text) 
    
    # remove special characters
    text=re.sub(r'[^\w\s]', '', str(text).lower().strip())
    
    # remove digits
    text = re.sub(r'\d',' ',text)
    
    # replace multiple whitespaces with one
    text = re.sub('\s+', ' ', text)
    
    # stop word removal
    clean_text = ' '.join([word for word in text.split() if word not in stop_words])
    
    # tonkenize & lemmatize
    word_pos_tags = nltk.pos_tag(word_tokenize(clean_text)) # -> list of tuples (word, pos_tag) [('computer', 'NN'), ('word', 'tag')]
    lem_text = ' '.join([wnl.lemmatize(tag[0], get_wordnet_pos(tag[1])) for tag in word_pos_tags])

    return lem_text

 
def get_wordnet_pos(tag):
    if tag.startswith('J'):
        return wordnet.ADJ
    elif tag.startswith('V'):
        return wordnet.VERB
    elif tag.startswith('N'):
        return wordnet.NOUN
    elif tag.startswith('R'):
        return wordnet.ADV
    else:
        # default pos
        return wordnet.NOUN
    


In [None]:
# execute preprocessing for training set
train_df['text'] = train_df['text'].apply(lambda x: preprocessing(x))
train_df.to_csv('training_data.csv', sep=';', encoding='utf-8', index=False)
train_df.head()

## Vectorization

In [2]:
# read in preprocessed training data if necessary
train_df = pd.read_csv('preprocessed_training_data.csv', sep=';', encoding='utf-8')
#for quick-testing with small memory
train_df = train_df.sample(frac=0.01, random_state=42)
# for word embedding models
train_tokens = [word_tokenize(text) for text in train_df['text']]

### Count vectors and Tf-idf vectors

In [3]:
tfidf_vectorizer = TfidfVectorizer()
count_vectorizer = CountVectorizer()
tfidf_vectors = tfidf_vectorizer.fit_transform(train_df['text'])
count_vectors = count_vectorizer.fit_transform(train_df['text'])

### Word2Vec SkipGram & CBOW

In [23]:
w2v_cbow = Word2Vec(train_tokens, min_count=2, window=5)
w2v_skipg = Word2Vec(train_tokens, min_count=2, window=5, sg = 1)

In [28]:
# returns mean w2v vector for list of specified words
def get_embedding(model, text):
    existing_words = [word for word in text if word in model.wv.vocab]
    if existing_words:
        embedding = np.zeros((len(existing_words), model.vector_size), dtype=np.float32)
        for i, w in enumerate(existing_words):
                embedding[i] = model.wv[w]
        return np.mean(embedding, axis=0)
    else:
        return np.zeros(model.vector_size)

In [29]:
# get mean vector for each article description for both models
embeddings_w2v_cbow = np.array([get_embedding(w2v_cbow, text) for text in train_tokens])
embeddings_w2v_skipg = np.array([get_embedding(w2v_skipg, text) for text in train_tokens])

AttributeError: The vocab attribute was removed from KeyedVector in Gensim 4.0.0.
Use KeyedVector's .key_to_index dict, .index_to_key list, and methods .get_vecattr(key, attr) and .set_vecattr(key, attr, new_val) instead.
See https://github.com/RaRe-Technologies/gensim/wiki/Migrating-from-Gensim-3.x-to-4

### fastText

In [31]:
fasttext = FastText(window=5, min_count=2)
fasttext.build_vocab(sentences=train_tokens)
fasttext.train(sentences=train_tokens, total_examples=len(train_tokens), epochs=10)

TypeError: Either one of corpus_file or corpus_iterable value must be provided

In [None]:
embeddings_fasttext = np.array([get_embedding(fasttext, text) for text in train_tokens])

## Classification

### Embeddings Dictionary

In [57]:
from sklearn.neighbors import KNeighborsClassifier

#TODO: add the other matrices, when done
embeddings = {
    'tfidf_vectors': tfidf_vectors,
    'count_vectors': count_vectors
}

### Hyperparameter Setup

In [58]:
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import GridSearchCV

def KNeighbors_hyperParameterTune(matrix):
    
    knn_estimator = KNeighborsClassifier()
    
    decisionTree_parameters = {
    'n_neighbors': range(2, 9)
    }
    
    # specify the cross validation
    stratified_10_fold_cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
    
    # create the grid search instance
    grid_search_estimator = GridSearchCV(knn_estimator, decisionTree_parameters, scoring='accuracy', cv=stratified_10_fold_cv, return_train_score=False)
    
    # run the grid search
    grid_search_estimator.fit(matrix.toarray(), train_df['label'])
    
    # print the results of all hyper-parameter combinations
    results = pd.DataFrame(grid_search_estimator.cv_results_)
    display(results)
    
    # print the best parameter setting
    print("KNeigborsClassifier: " + key)
    print("best score is {} with params {}".format(grid_search_estimator.best_score_, grid_search_estimator.best_params_))
    
    return grid_search_estimator.best_score_

In [60]:
best_score = 0
for key in embeddings:
    
    score = KNeighbors_hyperParameterTune(embeddings[key])
    
    estimators = {
        'KNeighborsClassifier': score
    }
    
    best_estimator_score = 0
    for estimator in estimators:
        estimator_score = estimators[estimator]
        if estimator_score > best_estimator_score:
            best_estimator_score = estimator_score
            best_estimator = estimator
    if score > best_score:
        best_key = key
        best_score = score
        
print("\nThe best performance is reached with the estimator " + best_estimator + " and the embedding " + best_key + " with an accuracy of " + str(best_score) )

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_n_neighbors,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,split5_test_score,split6_test_score,split7_test_score,split8_test_score,split9_test_score,mean_test_score,std_test_score,rank_test_score
0,0.033577,0.008574,0.050535,0.008889,2,{'n_neighbors': 2},0.733333,0.725,0.691667,0.691667,0.733333,0.7,0.741667,0.658333,0.708333,0.75,0.713333,0.026926,7
1,0.032132,0.00395,0.051711,0.010885,3,{'n_neighbors': 3},0.766667,0.775,0.75,0.766667,0.766667,0.725,0.758333,0.733333,0.766667,0.783333,0.759167,0.01726,6
2,0.032814,0.004681,0.051563,0.010003,4,{'n_neighbors': 4},0.808333,0.725,0.741667,0.808333,0.791667,0.725,0.766667,0.741667,0.75,0.816667,0.7675,0.034044,5
3,0.031401,0.007004,0.049461,0.009694,5,{'n_neighbors': 5},0.791667,0.75,0.716667,0.825,0.8,0.741667,0.783333,0.808333,0.8,0.825,0.784167,0.034651,3
4,0.028126,0.006247,0.048439,0.00841,6,{'n_neighbors': 6},0.791667,0.791667,0.716667,0.791667,0.808333,0.766667,0.75,0.758333,0.775,0.858333,0.780833,0.035949,4
5,0.029435,0.0084,0.051205,0.015122,7,{'n_neighbors': 7},0.808333,0.783333,0.75,0.8,0.8,0.758333,0.783333,0.783333,0.791667,0.866667,0.7925,0.03015,2
6,0.028128,0.006249,0.053478,0.009867,8,{'n_neighbors': 8},0.808333,0.766667,0.791667,0.816667,0.808333,0.783333,0.791667,0.791667,0.758333,0.866667,0.798333,0.028577,1


KNeigborsClassifier: tfidf_vectors
best score is 0.7983333333333335 with params {'n_neighbors': 8}


Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_n_neighbors,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,split5_test_score,split6_test_score,split7_test_score,split8_test_score,split9_test_score,mean_test_score,std_test_score,rank_test_score
0,0.023821,0.008255,0.074302,0.009827,2,{'n_neighbors': 2},0.366667,0.375,0.383333,0.333333,0.358333,0.383333,0.416667,0.375,0.4,0.375,0.376667,0.021344,2
1,0.015625,2.4e-05,0.073446,0.012217,3,{'n_neighbors': 3},0.3,0.266667,0.291667,0.275,0.333333,0.316667,0.308333,0.341667,0.533333,0.3,0.326667,0.072342,4
2,0.015628,5e-06,0.072847,0.011317,4,{'n_neighbors': 4},0.291667,0.25,0.316667,0.291667,0.308333,0.291667,0.283333,0.35,0.391667,0.291667,0.306667,0.037231,7
3,0.022567,0.007378,0.08444,0.011462,5,{'n_neighbors': 5},0.291667,0.25,0.3,0.275,0.291667,0.291667,0.3,0.483333,0.466667,0.275,0.3225,0.077643,5
4,0.023582,0.006882,0.080065,0.009572,6,{'n_neighbors': 6},0.283333,0.25,0.283333,0.258333,0.275,0.283333,0.283333,0.516667,0.5,0.266667,0.32,0.094883,6
5,0.021858,0.005354,0.083294,0.007701,7,{'n_neighbors': 7},0.366667,0.275,0.325,0.291667,0.308333,0.325,0.341667,0.5,0.508333,0.341667,0.358333,0.07701,3
6,0.022247,0.007882,0.076744,0.005108,8,{'n_neighbors': 8},0.45,0.316667,0.366667,0.341667,0.366667,0.383333,0.4,0.466667,0.491667,0.425,0.400833,0.053677,1


KNeigborsClassifier: count_vectors
best score is 0.4008333333333334 with params {'n_neighbors': 8}

The best performance is reached with the estimator KNeighborsClassifier and the embedding tfidf_vectors with an accuracy of 0.7983333333333335


In [5]:
from sklearn.model_selection import cross_val_score, StratifiedKFold
from sklearn.naive_bayes import MultinomialNB

# TODO: print(type(tfidf_vectors))
cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)

cross_val_acc = cross_val_score(MultinomialNB(), tfidf_vectors.toarray(), train_df['label'], scoring='accuracy', cv=cv)
print(cross_val_acc.mean())

MemoryError: Unable to allocate 30.9 GiB for an array with shape (80000, 51879) and data type float64