In [None]:
# Colab-only setup: mount Google Drive under /content/drive and change the
# working directory to the project folder on the drive.
# NOTE(review): presumably this is what lets the later cells import the local
# modules (`eval`, `utils`, `config`) -- confirm. The captured outputs further
# down come from a local machine, where this cell was not executed.
from google.colab import drive
drive.mount('/content/drive')
%cd /content/drive/MyDrive/majid_project/doc_proj

In [None]:
# Dependency installation for the notebook kernel (`%pip` targets the kernel's
# own environment). The bare string below is only a note and has no effect.
"""
env: ml
pip install zeugma 
"""
# Only gluonnlp, nltk and scipy are pinned; the remaining packages resolve to
# whatever version is current at install time.
# NOTE(review): numpy is installed last and unpinned, so pip may change the
# numpy version the earlier packages were resolved against -- confirm.
%pip install transformers
%pip install mxnet
%pip install gluonnlp==0.7.0
%pip install nltk==3.2.5
%pip install zeugma
%pip install sentence_transformers
%pip install scipy==1.12
%pip install openpyxl
%pip install numpy

In [None]:
###############################################################################
# elmo
# Optional ELMo representation: installs simple-elmo, loads pre-trained weights
# from a local zip and exposes them as the scikit-learn transformer `elmo_fn`.
# In this chunk `elmo_fn` is only referenced from a commented-out line inside
# `get_tests`.

%pip install simple-elmo

from sklearn.preprocessing import FunctionTransformer
from simple_elmo import ElmoModel

elmo = ElmoModel()
# The bare string below is a note on where the weight archives come from.
# NOTE(review): "size" presumably refers to the embedding dimensionality of
# each archive -- confirm against the repository.
"""
http://vectors.nlpl.eu/repository/#
225.zip size is 2048
209.zip size is 1024
"""
# Path is relative to the current working directory.
elmo.load("../data/weights/225.zip")

# Wrap the averaged-vector lookup so it can be used as a ColumnTransformer step.
elmo_fn = FunctionTransformer(lambda item: elmo.get_elmo_vector_average(item))


In [1]:
import os
import json
# import io
import itertools

import pandas as pd
import numpy as np

from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.cluster import MiniBatchKMeans as KMeans

from tqdm.notebook import tqdm
import nltk

from eval import *
from utils import *
import config

# Fetch the NLTK resources this notebook relies on; 'punkt' is the tokenizer
# model behind `nltk.word_tokenize`, which `read_data` calls.
# The last call is the final expression of the cell, so its return value is
# what the cell displays.
nltk.download('perluniprops')
nltk.download('nonbreaking_prefixes')
nltk.download('punkt')

  from tqdm.autonotebook import tqdm, trange


[nltk_data] Downloading package perluniprops to
[nltk_data]     /Users/zhouyf/nltk_data...
[nltk_data]   Package perluniprops is already up-to-date!
[nltk_data] Downloading package nonbreaking_prefixes to
[nltk_data]     /Users/zhouyf/nltk_data...
[nltk_data]   Package nonbreaking_prefixes is already up-to-date!
[nltk_data] Downloading package punkt to /Users/zhouyf/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


No sentence-transformers model found with name bert-base-uncased. Creating a new one with mean pooling.


[nltk_data] Downloading package perluniprops to
[nltk_data]     /Users/zhouyf/nltk_data...
[nltk_data]   Package perluniprops is already up-to-date!
[nltk_data] Downloading package nonbreaking_prefixes to
[nltk_data]     /Users/zhouyf/nltk_data...
[nltk_data]   Package nonbreaking_prefixes is already up-to-date!
[nltk_data] Downloading package punkt to /Users/zhouyf/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

## Helper functions

In [2]:
import os
import pandas as pd
import json


def make_result(tmp_res_path, dst_file):
    """Merge the per-test JSON metric files of one run into an Excel sheet.

    Every ``*.json`` file located directly inside ``tmp_res_path`` is loaded
    as one row. The file name without its extension is stored in the row
    under the ``'features'`` key. The rows are written to ``dst_file`` with
    ``DataFrame.to_excel`` (without the index column).

    Args:
        tmp_res_path: Directory containing one JSON object per file.
        dst_file: Path of the Excel file to write. Its parent directory is
            created when it does not exist yet.

    Raises:
        FileNotFoundError: If ``tmp_res_path`` does not exist.
    """
    rows = []
    # Sorted so the row order is stable; os.listdir returns entries in
    # arbitrary order.
    for file_name in sorted(os.listdir(tmp_res_path)):
        file_path = os.path.join(tmp_res_path, file_name)
        stem, ext = os.path.splitext(file_name)
        # Ignore sub-directories and stray non-JSON files (e.g. .DS_Store),
        # which would otherwise make json.load fail.
        if ext.lower() != '.json' or not os.path.isfile(file_path):
            continue
        with open(file_path) as json_file:
            row = json.load(json_file)
        row['features'] = stem
        rows.append(row)

    dst_dir = os.path.dirname(dst_file)
    # dirname is '' for a bare file name, and os.makedirs('') raises.
    if dst_dir and not os.path.exists(dst_dir):
        os.makedirs(dst_dir, exist_ok=True)
        print(f"Creating: {dst_dir}")
    pd.DataFrame(rows).to_excel(dst_file, index=False)

In [3]:
def get_tests(res=None):
    """Build the list of representation tests to run.

    Each entry of ``res`` is a string of the form
    ``NAME|transformer_expression|column_expression``, for example
    ``ELMO225|elmo_fn|'tweet'``. The second and third fields are evaluated
    with ``eval``, so they must be valid Python expressions in this
    notebook's namespace.

    Args:
        res: Iterable of spec strings. When ``None`` the default set
            (BERT, GPT, FASTEXT, GloVe, TFIDF) is used.

    Returns:
        A list with one single-element list per spec; the element is the
        ``(name, transformer, column)`` tuple.
    """
    if res is None:
        default_specs = (
            "BERT|embedd_fn|'tweet' "
            "GPT|gpt_fn|'tweet' "
            "FASTEXT|fastText_fn|'tokens' "
            "GloVe|glove_fn|'tweet' "
            "TFIDF|tf_idf_fn|'tweet'"
        )
        res = default_specs.split()

    tests = []
    for spec in res:
        fields = spec.split('|')
        # CAUTION: eval executes arbitrary code -- only pass trusted specs.
        step = (fields[0], eval(fields[1]), eval(fields[2]))
        tests.append([step])
    print(f"tests:({len(tests)}): {tests} ")

    return tests

In [4]:
def get_data_cfg(targeted_data='google'):
    """Look up the data file and label-map file configured for a dataset.

    Args:
        targeted_data: One of ``'google'``, ``'tweets'`` or
            ``'stackOverflow'``.

    Returns:
        A ``(data_path, label_map)`` tuple read from the ``config`` module.

    Raises:
        ValueError: If ``targeted_data`` is not one of the known names.
    """
    if targeted_data == 'google':
        return config.googel_news, config.goole_label_map
    if targeted_data == 'tweets':
        return config.tweets, config.tweet_label_map
    if targeted_data == 'stackOverflow':
        return config.stack_overflow, config.stackOverflow_label_map
    raise ValueError(
        'targeted_data must be google ,stackOverflow or tweets')

In [9]:
# Load one dataset and prepare the columns used by the clustering tests.
def read_data(targeted_data='google'):
    """Read a dataset and derive the label and token columns.

    The label map (a JSON object) is loaded first; its number of entries is
    reported as the number of clusters. The data file is read with
    ``pd.read_json`` and its last row is dropped. Column ``clean_text`` is
    renamed to ``tweet``, ``Y`` holds the label-map value of each ``label``,
    and ``tokens`` holds ``nltk.word_tokenize`` applied to each ``tweet``.
    Finally ``fit_init`` is called with the ``tweet`` column.

    Args:
        targeted_data: Dataset name understood by ``get_data_cfg``.

    Returns:
        A ``(df, n_clusters)`` tuple.
    """
    data_path, label_map_path = get_data_cfg(targeted_data)
    with open(label_map_path) as map_file:
        label_to_id = json.load(map_file)
    n_clusters = len(label_to_id)
    print(f"n_clusters: {n_clusters}")

    # NOTE(review): the last row is discarded here; the reason is not visible
    # in this notebook -- confirm it is intentional.
    frame = pd.read_json(data_path)[:-1].rename(columns={'clean_text': 'tweet'})
    frame['Y'] = frame['label'].map(lambda lbl: label_to_id[lbl])
    frame['tokens'] = frame['tweet'].map(nltk.word_tokenize)
    fit_init(frame['tweet'])

    print(f"samples of '{targeted_data}' data:\n{frame[:5]}")

    return frame, n_clusters

In [6]:
def run_make_metrics(obj_name="obj1"):
    """Run one experiment over all three datasets and export the results.

    For each of ``google``, ``tweets`` and ``stackOverflow`` the data is
    loaded with ``read_data``, the function named ``make_metrics_<obj_name>``
    writes per-test JSON files into a temporary folder under
    ``config.ROOD_DATA``, and ``make_result`` merges them into
    ``<obj_name>_<dataset>.xlsx`` under ``config.result_path``.

    Args:
        obj_name: Suffix selecting the ``make_metrics_*`` function to run.
    """
    tests = get_tests()
    for dataset_name in tqdm(["google", "tweets", "stackOverflow"]):
        print(f"\n\n******** processing target data: {dataset_name}")
        df, n_clusters = read_data(dataset_name)

        run_tag = f"{obj_name}_{dataset_name}"
        save_res_name = os.path.join(config.result_path, f"{run_tag}.xlsx")
        tmp_res_path = os.path.join(config.ROOD_DATA, "tmp", run_tag)

        # The worker is resolved by name; eval raises NameError when no
        # function called make_metrics_<obj_name> has been defined yet.
        metrics_fn = eval(f"make_metrics_{obj_name}")
        metrics_fn(df, n_clusters, tmp_res_path, tests)
        make_result(tmp_res_path, save_res_name)

## OBJ1: Representation

In [7]:

def make_metrics_obj1(df, n_clusters, tmp_res_path, tests):
    """Cluster every text representation with k-means and save its metrics.

    For each test, the configured transformer is applied to its column of
    ``df`` through a ``ColumnTransformer``, the resulting feature matrix is
    clustered with ``KMeans`` (aliased to ``MiniBatchKMeans`` in this
    notebook's import cell), and the value returned by ``get_metrics`` is
    written to ``<tmp_res_path>/<test_name>.json``.

    Args:
        df: Data frame holding the input columns and the true labels in
            column ``'Y'``.
        n_clusters: Number of clusters to fit.
        tmp_res_path: Directory receiving one JSON file per test. It is
            created up front, so it exists even when ``tests`` is empty.
        tests: List of tests; each test is a list of
            ``(name, transformer, column)`` tuples as accepted by
            ``ColumnTransformer``.
    """
    os.makedirs(tmp_res_path, exist_ok=True)
    y_true = np.array(df['Y'])

    for test in tqdm(tests):
        test_name = '_'.join([i[0] for i in test])
        print("-" * 120 + f"\n{test_name}")

        # Keep only the transformer output. Dropping the remaining columns
        # here replaces the former "passthrough + slice off the last 4
        # columns", which silently depended on the frame having exactly four
        # unused columns and forced an object-dtype matrix.
        preprocessor = ColumnTransformer(
            transformers=test,
            remainder='drop'
        )
        X = preprocessor.fit_transform(df)

        kmeans = KMeans(
            n_clusters=n_clusters,
            init="k-means++",
            n_init=50,
            max_iter=500,
            random_state=42,
        )
        kmeans.fit(X)
        metrics = get_metrics(X, kmeans.labels_, y_true, kmeans)

        with open(os.path.join(tmp_res_path, f"{test_name}.json"), 'w') as fp:
            json.dump(metrics, fp, indent=2)

In [10]:
# Run the OBJ1 experiment (`make_metrics_obj1`) on all three datasets.
run_make_metrics("obj1")

tests:(5): [[('BERT', FunctionTransformer(func=<function <lambda> at 0x35101b880>), 'tweet')], [('GPT', FunctionTransformer(func=<function <lambda> at 0x35101bd00>), 'tweet')], [('FASTEXT', FunctionTransformer(func=<function <lambda> at 0x35101bc70>), 'tokens')], [('GloVe', FunctionTransformer(func=<function <lambda> at 0x35101bac0>), 'tweet')], [('TFIDF', FunctionTransformer(func=<function <lambda> at 0x35101b910>), 'tweet')]] 


  0%|          | 0/3 [00:00<?, ?it/s]



******** processing target data: google
n_clusters: 152
samples of 'google' data:
                                                text label  \
0               centrepoint winter white gala london    65   
1                      mourinho seek killer instinct    96   
2   roundup golden globe won seduced johansson voice    72   
3  travel disruption mount storm cold air sweep s...   140   
4                   wes welker blame costly turnover    89   

                                               tweet    Y  \
0               centrepoint winter white gala london   67   
1                      mourinho seek killer instinct   72   
2       roundup golden globe seduced johansson voice   46   
3  travel disruption mount storm cold air sweep s...   87   
4                   wes welker blame costly turnover  134   

                                              tokens  
0         [centrepoint, winter, white, gala, london]  
1                 [mourinho, seek, killer, instinct]  
2  [roundup

  0%|          | 0/5 [00:00<?, ?it/s]

------------------------------------------------------------------------------------------------------------------------
BERT


Batches:   0%|          | 0/348 [00:00<?, ?it/s]

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


------------------------------------------------------------------------------------------------------------------------
GPT
No GPU available, using the CPU instead.


  0%|          | 0/695 [00:00<?, ?it/s]

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


------------------------------------------------------------------------------------------------------------------------
FASTEXT
------------------------------------------------------------------------------------------------------------------------
GloVe
------------------------------------------------------------------------------------------------------------------------
TFIDF
Creating: /Users/zhouyf/Documents/data/majid/drive/MyDrive/project2/data/result


******** processing target data: tweets
n_clusters: 89
samples of 'tweets' data:
                                                text label  \
0           brain fluid buildup delay giffords rehab    37   
1  trailer talk week movie rite mechanic week opp...    14   
2  rnc appoints chairman tampa convention effort ...   100   
3         gbagbo camp futile cut ivory coast economy   110   
4  chinese president lost translation powerful le...    61   

                                               tweet   Y  \
0           brain flu

  0%|          | 0/5 [00:00<?, ?it/s]

------------------------------------------------------------------------------------------------------------------------
BERT


Batches:   0%|          | 0/78 [00:00<?, ?it/s]

------------------------------------------------------------------------------------------------------------------------
GPT
No GPU available, using the CPU instead.


  0%|          | 0/155 [00:00<?, ?it/s]

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


------------------------------------------------------------------------------------------------------------------------
FASTEXT
------------------------------------------------------------------------------------------------------------------------
GloVe
------------------------------------------------------------------------------------------------------------------------
TFIDF


******** processing target data: stackOverflow
n_clusters: 20
samples of 'stackOverflow' data:
                                             text label  \
0    fill dataset datatable linq query resultset     18   
1    best subversion clients windows vista 64bit      3   
2    best practice environment bin directory svn      3   
3  visual studio setup project per user settings      7   
4     express left join aggregate sql linq query     18   

                                          tweet   Y  \
0   fill dataset datatable linq query resultset  19   
1       best subversion client window vista bit   8   


  0%|          | 0/5 [00:00<?, ?it/s]

------------------------------------------------------------------------------------------------------------------------
BERT


Batches:   0%|          | 0/513 [00:00<?, ?it/s]

------------------------------------------------------------------------------------------------------------------------
GPT
No GPU available, using the CPU instead.


  0%|          | 0/1026 [00:00<?, ?it/s]

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


------------------------------------------------------------------------------------------------------------------------
FASTEXT
------------------------------------------------------------------------------------------------------------------------
GloVe
------------------------------------------------------------------------------------------------------------------------
TFIDF


## OBJ1: PCA

In [11]:
from sklearn.preprocessing import MinMaxScaler
from sklearn.decomposition import PCA


def make_metrics_obj1_pca(df, n_clusters, tmp_res_path, tests):
    """Cluster every representation after min-max scaling + PCA and save metrics.

    For each test, the configured transformer is applied to its column of
    ``df`` through a ``ColumnTransformer``. The features are then passed
    through ``MinMaxScaler`` and ``PCA`` before being clustered with
    ``KMeans`` (aliased to ``MiniBatchKMeans`` in this notebook's import
    cell). The value returned by ``get_metrics`` is written to
    ``<tmp_res_path>/<test_name>.json``.

    Args:
        df: Data frame holding the input columns and the true labels in
            column ``'Y'``.
        n_clusters: Number of clusters to fit.
        tmp_res_path: Directory receiving one JSON file per test. It is
            created up front, so it exists even when ``tests`` is empty.
        tests: List of tests; each test is a list of
            ``(name, transformer, column)`` tuples as accepted by
            ``ColumnTransformer``.
    """
    os.makedirs(tmp_res_path, exist_ok=True)
    Y = np.array(df['Y'])

    for test in tqdm(tests):
        test_name = '_'.join([i[0] for i in test])
        print("-" * 120 + f"\n{test_name}")

        # Keep only the transformer output. Dropping the remaining columns
        # here replaces the former "passthrough + slice off the last 4
        # columns", which silently depended on the frame having exactly four
        # unused columns and forced an object-dtype matrix.
        preprocessor = ColumnTransformer(
            transformers=test,
            remainder='drop'
        )
        X = preprocessor.fit_transform(df)

        kmeans = KMeans(
            n_clusters=n_clusters,
            init="k-means++",
            n_init=50,
            max_iter=500,
            random_state=42,
        )
        # NOTE(review): PCA is created without n_components, which in
        # scikit-learn keeps all components -- confirm whether an actual
        # dimensionality reduction was intended.
        pipe = Pipeline(
            [
                ("scaler", MinMaxScaler()),
                ("pca", PCA(random_state=42)),
                ("kmeans", kmeans),
            ]
        )
        # Y is forwarded to every step's fit, exactly as before.
        pipe.fit(X, Y)

        # NOTE(review): the metrics receive the untransformed X although the
        # model was fitted on the scaled/PCA-projected features -- confirm
        # this is what get_metrics expects.
        metrics = get_metrics(X, kmeans.labels_, Y, kmeans)

        with open(os.path.join(tmp_res_path, f"{test_name}.json"), 'w') as fp:
            json.dump(metrics, fp, indent=2)

In [12]:
# Run the OBJ1 PCA experiment (`make_metrics_obj1_pca`) on all three datasets.
run_make_metrics("obj1_pca")

tests:(5): [[('BERT', FunctionTransformer(func=<function <lambda> at 0x35101b880>), 'tweet')], [('GPT', FunctionTransformer(func=<function <lambda> at 0x35101bd00>), 'tweet')], [('FASTEXT', FunctionTransformer(func=<function <lambda> at 0x35101bc70>), 'tokens')], [('GloVe', FunctionTransformer(func=<function <lambda> at 0x35101bac0>), 'tweet')], [('TFIDF', FunctionTransformer(func=<function <lambda> at 0x35101b910>), 'tweet')]] 


  0%|          | 0/3 [00:00<?, ?it/s]



******** processing target data: google
n_clusters: 152
samples of 'google' data:
                                                text label  \
0               centrepoint winter white gala london    65   
1                      mourinho seek killer instinct    96   
2   roundup golden globe won seduced johansson voice    72   
3  travel disruption mount storm cold air sweep s...   140   
4                   wes welker blame costly turnover    89   

                                               tweet    Y  \
0               centrepoint winter white gala london   67   
1                      mourinho seek killer instinct   72   
2       roundup golden globe seduced johansson voice   46   
3  travel disruption mount storm cold air sweep s...   87   
4                   wes welker blame costly turnover  134   

                                              tokens  
0         [centrepoint, winter, white, gala, london]  
1                 [mourinho, seek, killer, instinct]  
2  [roundup

  0%|          | 0/5 [00:00<?, ?it/s]

------------------------------------------------------------------------------------------------------------------------
BERT


Batches:   0%|          | 0/348 [00:00<?, ?it/s]

------------------------------------------------------------------------------------------------------------------------
GPT
No GPU available, using the CPU instead.


  0%|          | 0/695 [00:00<?, ?it/s]

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


------------------------------------------------------------------------------------------------------------------------
FASTEXT
------------------------------------------------------------------------------------------------------------------------
GloVe
------------------------------------------------------------------------------------------------------------------------
TFIDF


******** processing target data: tweets
n_clusters: 89
samples of 'tweets' data:
                                                text label  \
0           brain fluid buildup delay giffords rehab    37   
1  trailer talk week movie rite mechanic week opp...    14   
2  rnc appoints chairman tampa convention effort ...   100   
3         gbagbo camp futile cut ivory coast economy   110   
4  chinese president lost translation powerful le...    61   

                                               tweet   Y  \
0           brain fluid buildup delay giffords rehab  34   
1  trailer talk week movie rite mechanic 

  0%|          | 0/5 [00:00<?, ?it/s]

------------------------------------------------------------------------------------------------------------------------
BERT


Batches:   0%|          | 0/78 [00:00<?, ?it/s]

------------------------------------------------------------------------------------------------------------------------
GPT
No GPU available, using the CPU instead.


  0%|          | 0/155 [00:00<?, ?it/s]

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


------------------------------------------------------------------------------------------------------------------------
FASTEXT
------------------------------------------------------------------------------------------------------------------------
GloVe
------------------------------------------------------------------------------------------------------------------------
TFIDF


******** processing target data: stackOverflow
n_clusters: 20
samples of 'stackOverflow' data:
                                             text label  \
0    fill dataset datatable linq query resultset     18   
1    best subversion clients windows vista 64bit      3   
2    best practice environment bin directory svn      3   
3  visual studio setup project per user settings      7   
4     express left join aggregate sql linq query     18   

                                          tweet   Y  \
0   fill dataset datatable linq query resultset  19   
1       best subversion client window vista bit   8   


  0%|          | 0/5 [00:00<?, ?it/s]

------------------------------------------------------------------------------------------------------------------------
BERT


Batches:   0%|          | 0/513 [00:00<?, ?it/s]

------------------------------------------------------------------------------------------------------------------------
GPT
No GPU available, using the CPU instead.


  0%|          | 0/1026 [00:00<?, ?it/s]

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


------------------------------------------------------------------------------------------------------------------------
FASTEXT
------------------------------------------------------------------------------------------------------------------------
GloVe
------------------------------------------------------------------------------------------------------------------------
TFIDF
