In [8]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import string
import spacy
from gensim.models import KeyedVectors
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.preprocessing import MinMaxScaler, MaxAbsScaler
from sklearn.metrics import silhouette_score
import cloudpickle
from sklearn.decomposition import PCA, TruncatedSVD
from numpy.linalg import norm
import warnings
warnings.filterwarnings("ignore")

In [None]:
# data = pd.read_json(os.path.join("..", "data", "News_Category_Dataset_v3.json"), lines=True)

In [None]:
# data.to_parquet(os.path.join("..", "data", "text_data.parquet"))

In [None]:
# data['description'] = data['headline'] + " " + data['short_description']

In [None]:
# data = data['description'].copy()

In [None]:
# data

In [None]:
# data.to_csv(os.path.join("..", "data", "news_text.csv"), index=False)

In [None]:
class TextPreprocessor:
    """Configurable cleaning pipeline for a ``pd.Series`` of text documents.

    ``preprocess`` lower-cases the text and then applies, in this order and only when the
    matching flag is set: lemmatization, punctuation removal, digit removal, stop-word
    removal and short-word removal. Finally, words in ``words_to_remove`` (learned from
    the most/least frequent vocabulary of the training corpus) are dropped.

    Parameters
    ----------
    lemmatize, remove_punct, remove_digits, remove_stop_words, remove_short_words : bool
        Switches for the individual cleaning steps.
    minlen, maxlen : int
        Words whose length lies in ``[minlen, maxlen]`` are dropped by the short-word step.
    top_p, bottom_p : float or None
        Fraction of the vocabulary (ranked by frequency) to drop from the top / bottom.
        ``None`` is treated as 0 (nothing dropped).
    """

    def __init__(self, lemmatize: bool = True, remove_punct: bool = True, remove_digits: bool = True,
                 remove_stop_words: bool = True,
                 remove_short_words: bool = True, minlen: int = 1, maxlen: int = 1, top_p: float = None,
                 bottom_p: float = None):
        self.lemmatize = lemmatize
        self.remove_punct = remove_punct
        self.remove_digits = remove_digits
        self.remove_stop_words = remove_stop_words
        self.remove_short_words = remove_short_words
        self.minlen = minlen
        self.maxlen = maxlen
        self.top_p = top_p
        self.bottom_p = bottom_p
        # Vocabulary learned on the training corpus; reused verbatim for other datasets.
        self.words_to_remove = []
        self.stop_words = ['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've",
                           "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself',
                           'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them',
                           'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll",
                           'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has',
                           'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or',
                           'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against',
                           'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from',
                           'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once',
                           'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more',
                           'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than',
                           'too', 'very', 's', 't', 'can', 'will', 'just', 'don', "don't", 'should', "should've",
                           'now', 'd', 'll', 'm', 'o', 're', 've', 'y', 'ain', 'aren', "aren't", 'couldn', "couldn't",
                           'didn', "didn't", 'doesn', "doesn't", 'hadn', "hadn't", 'hasn', "hasn't", 'haven',
                           "haven't", 'isn', "isn't", 'ma', 'mightn', "mightn't", 'mustn', "mustn't", 'needn',
                           "needn't", 'shan', "shan't", 'shouldn', "shouldn't", 'wasn', "wasn't", 'weren', "weren't",
                           'won', "won't", 'wouldn', "wouldn't"]

    @staticmethod
    def __remove_double_whitespaces(text: str):
        """Collapse every run of whitespace into a single space and trim both ends."""
        return " ".join(text.split())

    @staticmethod
    def __lemmatize(string_series: pd.Series):
        """Replace every token by its spaCy lemma, using the pipeline stored next to the notebook."""
        nlp = spacy.load(os.path.join("..", "en_core_web_sm-3.4.1"))

        def str_lemmatize(text: str):
            return " ".join([token.lemma_ for token in nlp(text)])

        return string_series.map(str_lemmatize)

    def __remove_punct(self, string_series: pd.Series):
        """Replace every ASCII punctuation character with a space, then normalize whitespace."""
        # A translation table is used instead of the regex class f'[{string.punctuation}]':
        # inside that class the sequence "\]" is parsed as an escaped bracket, so the
        # backslash itself was never matched and survived the cleaning.
        punct_to_space = str.maketrans(string.punctuation, " " * len(string.punctuation))
        clean_string_series = string_series.str.translate(punct_to_space)
        return clean_string_series.map(self.__remove_double_whitespaces)

    def __remove_digits(self, string_series: pd.Series):
        """Replace every digit with a space, then normalize whitespace."""
        clean_string_series = string_series.str.replace(pat=r'\d', repl=" ", regex=True)
        return clean_string_series.map(self.__remove_double_whitespaces)

    @staticmethod
    def __remove_short_words(string_series: pd.Series, minlen: int = 1, maxlen: int = 1):
        """Drop words whose length is within ``[minlen, maxlen]``; keep all others."""
        def keep_word(word: str):
            return (len(word) > maxlen) or (len(word) < minlen)

        return string_series.map(lambda text: " ".join([word for word in text.split() if keep_word(word)]))

    def __remove_stop_words(self, string_series: pd.Series):
        """Drop every whitespace-separated token that appears in ``self.stop_words``."""
        # Built once per call: set membership is O(1) instead of scanning the list per token.
        stops = set(self.stop_words)

        def str_remove_stop_words(text: str):
            return " ".join([token for token in text.split() if token not in stops])

        return string_series.map(str_remove_stop_words)

    def __remove_top_bottom_words(self, string_series: pd.Series, top_p: float = None,
                                  bottom_p: float = None, dataset: str = 'train'):
        """Drop the most / least frequent vocabulary.

        With ``dataset == 'train'`` the vocabulary to drop is (re)learned from
        ``string_series`` and stored in ``self.words_to_remove``; for any other value the
        previously learned list is applied unchanged.
        """
        if dataset == 'train':
            top_p = 0 if top_p is None else top_p
            bottom_p = 0 if bottom_p is None else bottom_p

            # Start from scratch on every training pass; extending the old list made
            # repeated fits accumulate duplicates and words from earlier corpora.
            self.words_to_remove = []

            if top_p > 0 or bottom_p > 0:
                # value_counts() sorts by descending frequency, so the head holds the
                # most common words and the tail the rarest ones.
                word_freq = pd.Series(" ".join(string_series).split()).value_counts()
                n_words = len(word_freq)

                if top_p > 0:
                    self.words_to_remove.extend([*word_freq.index[: int(np.ceil(top_p * n_words))]])

                if bottom_p > 0:
                    self.words_to_remove.extend([*word_freq.index[-int(np.ceil(bottom_p * n_words)):]])

        if len(self.words_to_remove) == 0:
            return string_series

        banned = set(self.words_to_remove)
        return string_series.map(lambda text: " ".join([word for word in text.split() if word not in banned]))

    def preprocess(self, string_series: pd.Series, dataset: str = "train"):
        """Run the configured cleaning steps and return a new Series of cleaned strings.

        Parameters
        ----------
        string_series : pd.Series
            Raw documents.
        dataset : str
            ``"train"`` learns the frequency-based removal list; any other value reuses it.

        Returns
        -------
        pd.Series
            Cleaned documents. Documents that end up empty are replaced by the
            placeholder ``"this is an empty message"``.
        """
        string_series = string_series.str.lower().copy()
        if self.lemmatize:
            string_series = self.__lemmatize(string_series=string_series)
        if self.remove_punct:
            string_series = self.__remove_punct(string_series=string_series)
        if self.remove_digits:
            string_series = self.__remove_digits(string_series=string_series)
        if self.remove_stop_words:
            string_series = self.__remove_stop_words(string_series=string_series)
        if self.remove_short_words:
            string_series = self.__remove_short_words(string_series=string_series,
                                                      minlen=self.minlen,
                                                      maxlen=self.maxlen)
        string_series = self.__remove_top_bottom_words(string_series=string_series,
                                                       top_p=self.top_p,
                                                       bottom_p=self.bottom_p, dataset=dataset)

        string_series = string_series.str.strip()
        # Substitute a placeholder so later stages never receive an empty document.
        string_series = string_series.replace(to_replace="", value="this is an empty message")

        return string_series

In [None]:
class TextVectorizer:
    """Turn preprocessed documents into numeric features.

    With ``use_w2v=True`` every document becomes the mean of the word2vec vectors of its
    tokens; otherwise a ``TfidfVectorizer`` is fitted on the training data and reused for
    other datasets.
    """

    def __init__(self, use_w2v: bool = True):
        self.use_w2v = use_w2v
        # Fitted TfidfVectorizer; stays None until vectorize(..., dataset="train") with use_w2v=False.
        self.tfidf = None

    @staticmethod
    def __get_doc2vec(x: pd.Series):
        """Return one averaged word vector per document as a 2-D array.

        Tokens missing from the word2vec vocabulary are skipped; a document without any
        known token is represented by an all-zero vector.
        """
        google_w2v = KeyedVectors.load(os.path.join("..", "google_word2vec", "google_w2v_100k.bin"),
                                       mmap='r')
        # Take the dimensionality from the loaded model rather than hard-coding 300, so
        # the fallback row always matches the width of the averaged rows.
        vector_size = google_w2v.vector_size
        empty_doc = np.zeros(vector_size)

        corpus_w2v = []
        for doc in x:
            token_vectors = []
            for token in doc.split():
                try:
                    token_vectors.append(google_w2v[token])
                except KeyError:
                    # Out-of-vocabulary token: ignore it. Any other error is a real
                    # problem and must surface instead of being silently swallowed.
                    continue
            if token_vectors:
                corpus_w2v.append(np.mean(token_vectors, axis=0))
            else:
                corpus_w2v.append(empty_doc)

        if not corpus_w2v:
            # Keep the result 2-D even for an empty corpus.
            return np.empty((0, vector_size))
        return np.array(corpus_w2v)

    def vectorize(self, x: pd.Series, dataset: str = "train"):
        """Vectorize ``x``.

        Parameters
        ----------
        x : pd.Series
            Preprocessed documents (whitespace-separated tokens).
        dataset : str
            ``"train"`` fits the TF-IDF vocabulary (only relevant when ``use_w2v`` is
            False); any other value reuses the fitted one.

        Returns
        -------
        The TF-IDF matrix returned by ``TfidfVectorizer.transform`` or, for word2vec, a
        ``np.ndarray`` with one row per document.

        Raises
        ------
        sklearn.exceptions.NotFittedError
            If TF-IDF features are requested for a non-training dataset before training.
            (It subclasses both ``ValueError`` and ``AttributeError``.)
        """
        x = x.copy()
        if self.use_w2v:
            return self.__get_doc2vec(x).copy()

        if dataset == "train":
            self.tfidf = TfidfVectorizer()
            self.tfidf.fit(x)
        elif self.tfidf is None:
            from sklearn.exceptions import NotFittedError
            raise NotFittedError(
                "TextVectorizer has no fitted TF-IDF model; call vectorize(x, dataset='train') first."
            )
        return self.tfidf.transform(x).copy()


In [None]:
class DimensionalityReduction:
    """Scale a feature matrix and project it onto ``n_components`` dimensions.

    Inputs that are a ``pd.DataFrame`` or ``np.ndarray`` are scaled with ``MinMaxScaler``
    and reduced with ``PCA``; every other input type is scaled with ``MaxAbsScaler`` and
    reduced with ``TruncatedSVD``. The estimators are fitted when ``dataset == "train"``
    and reused for any other dataset value.
    """

    def __init__(self, n_components: int):
        self.n_components = n_components
        # Fitted estimators; each one stays None until its branch has been trained.
        self.pca = None
        self.tsvd = None
        self.minmax = None
        self.maxabs = None

    def reduce_dimensions(self, x, dataset: str = "train"):
        """Return the scaled and reduced version of ``x``.

        Parameters
        ----------
        x : pd.DataFrame, np.ndarray or another matrix type exposing ``copy()``
            Feature matrix with one row per sample.
        dataset : str
            ``"train"`` fits scaler and reducer first; anything else only transforms.
        """
        features = x.copy()
        training = dataset == "train"

        if isinstance(features, (pd.DataFrame, np.ndarray)):
            # DataFrame / ndarray branch: min-max scaling followed by PCA.
            if training:
                self.minmax = MinMaxScaler()
                self.minmax.fit(features)
            scaled = self.minmax.transform(features).copy()
            if training:
                self.pca = PCA(n_components=self.n_components, random_state=42)
                self.pca.fit(scaled)
            return self.pca.transform(scaled).copy()

        # Every other input type: max-abs scaling followed by truncated SVD.
        if training:
            self.maxabs = MaxAbsScaler()
            self.maxabs.fit(features)
        scaled = self.maxabs.transform(features).copy()
        if training:
            self.tsvd = TruncatedSVD(n_components=self.n_components, random_state=42)
            self.tsvd.fit(scaled)
        return self.tsvd.transform(scaled).copy()

In [None]:
class Cluster():
    """K-means clustering on min-max scaled features with automatic choice of ``k``.

    ``fit`` scales the data, picks the number of clusters in
    ``[min_cluster, max_cluster]`` with the highest silhouette score and fits the final
    ``KMeans`` model with that ``k``.

    Parameters
    ----------
    min_cluster, max_cluster : int
        Inclusive range of candidate cluster counts.
    random_state : int
        Seed passed to ``KMeans`` and ``silhouette_score``.
    """

    def __init__(self, min_cluster:int = 2, max_cluster:int = 10, random_state:int = 42):
        self.scaler = None
        self.k = None
        self.min_cluster = min_cluster
        self.max_cluster = max_cluster
        self.kmeans_model = None
        self.random_state = random_state

    def __fit_scaler(self, X):
        """Fit a fresh ``MinMaxScaler`` on ``X`` and store it in ``self.scaler``."""
        self.scaler = MinMaxScaler()
        self.scaler.fit(X)

    def __find_best_k(self, X_scaled):
        """Store in ``self.k`` the candidate cluster count with the best silhouette score."""
        if self.min_cluster > self.max_cluster:
            raise ValueError(
                f"min_cluster ({self.min_cluster}) must not exceed max_cluster ({self.max_cluster})"
            )

        if self.min_cluster == self.max_cluster:
            # Only one candidate: the search could not pick anything else, so skip the
            # trial fit and the silhouette computation entirely.
            self.k = int(self.min_cluster)
            return

        silhouette_scores = []
        for k in range(self.min_cluster, self.max_cluster + 1):
            kmeans = KMeans(n_clusters=k, random_state=self.random_state)
            kmeans.fit(X_scaled)
            silhouette_scores.append(
                silhouette_score(X=X_scaled, labels=kmeans.labels_, random_state=self.random_state)
            )
        # argmax is an offset into the candidate range; store k as a plain Python int.
        self.k = int(self.min_cluster + np.argmax(silhouette_scores))

    def fit(self, X):
        """Fit the scaler, choose ``k`` and fit the final ``KMeans`` model on ``X``.

        Returns ``self`` so calls can be chained.
        """
        self.__fit_scaler(X)
        X_scaled = self.scaler.transform(X)
        self.__find_best_k(X_scaled)
        self.kmeans_model = KMeans(n_clusters=self.k, random_state=self.random_state)
        self.kmeans_model.fit(X_scaled)
        return self

    def predict(self, X):
        """Scale ``X`` with the fitted scaler and return the cluster index of every row."""
        X_scaled = self.scaler.transform(X)
        return self.kmeans_model.predict(X_scaled)

    def fit_predict(self, X):
        """Fit on ``X`` and return the cluster index of every row of ``X``."""
        self.fit(X)
        # The prediction used to be computed and then dropped, so callers received None.
        return self.predict(X)

In [None]:
# text_preprocess = TextPreprocessor(lemmatize=True)
# preprocessed_text = text_preprocess.preprocess(data)
# preprocessed_text.to_csv(os.path.join("..", "data", "preprocessed_text.csv"), index=False)
# with open(os.path.join("..", "models", "preprocessor.bin"), "wb") as file:
    # cloudpickle.dump(text_preprocess, file)

In [None]:
# preprocessed_text = pd.read_csv(os.path.join("..", "data", "preprocessed_text.csv")).squeeze("columns")
# preprocessed_text

In [None]:
# vectorizer = TextVectorizer()
# vectorized_text = vectorizer.vectorize(preprocessed_text)
# pd.DataFrame(vectorized_text).to_csv(os.path.join("..", "data", "vectorized_text.csv"), index=False)
# with open(os.path.join("..", "models", "vectorizer.bin"), "wb") as file:
    # cloudpickle.dump(vectorizer, file)

In [None]:
# vectorized_text = pd.read_csv(os.path.join("..", "data", "vectorized_text.csv"))
# vectorized_text.head()

In [None]:
# dimensionality_reduction = DimensionalityReduction(n_components=220)
# pca_text = dimensionality_reduction.reduce_dimensions(vectorized_text)
# plt.plot(np.cumsum(dimensionality_reduction.pca.explained_variance_ratio_))

In [None]:
# np.cumsum(dimensionality_reduction.pca.explained_variance_ratio_)[-1] # explained variance ratio of 93.5

In [None]:
# pd.DataFrame(pca_text).to_csv(os.path.join("..", "data", "pca_text.csv"), index=False)
# with open(os.path.join("..", "models", "pca.bin"), "wb") as file:
#     cloudpickle.dump(dimensionality_reduction, file)

In [None]:
# pca_text = pd.read_parquet(os.path.join("..", "data", "pca_text.parquet"))

In [None]:
# cluster = Cluster(min_cluster=42, max_cluster=42)
# cluster.fit(pca_text)

In [None]:
# with open(os.path.join("..", "models", "clustering.bin"), "wb") as file:
#     cloudpickle.dump(cluster, file)

In [None]:
# clusters = cluster.predict(pca_text)
# len(clusters), len(pca_text)

In [None]:
# len(set(clusters))

In [None]:
# pd.Series(clusters).sort_values().unique()

In [None]:
# text_data = pd.read_parquet(os.path.join("..", "data", "text_data.parquet"))

In [None]:
# len(text_data)

In [None]:
# text_data['cluster'] = clusters

In [None]:
# len(clusters)

In [None]:
# clusters

In [None]:
# text_data.to_parquet(os.path.join("..", "data", "text_data.parquet"))

In [None]:
# text_data['cluster'].unique()

In [None]:
# text_data.loc[text_data['cluster'] == 1]