In [2]:
import pandas
import pickle

import pandas as pd
import seaborn as sns
import platform
import re
from tqdm import tqdm

In [3]:
# Month-start labels from 2019-07-01 to 2020-07-01 (13 entries). Each label is
# appended to PERIOD_DATA_PATH to locate that month's pickle.
monthly_file = (['2019-{:02d}-01'.format(month) for month in range(7, 13)] +
                ['2020-{:02d}-01'.format(month) for month in range(1, 8)])

# Dataset locations: one complete set of paths per platform.
if platform.system() == 'Windows':
    # Windows path
    YEWNO_DATA_PATH = 'K:\\Lbpam\\DG_Gestion_Quant\\GERANT\\Khoa\\yewno_ms_2020.pickle'
    EDF_DATA_PATH = 'K:\\Lbpam\\DG_Gestion_Quant\\GERANT\\Khoa\\Data\\MSFT\\MSFT_Jul2019_2020_linebreak'
    FILTERED_YEWNO_PATH = 'K:\\Lbpam\\DG_Gestion_Quant\\GERANT\\Khoa\\Data\\filtered_yewno.pickle'
    YEWNO_CONCEPT_DICT_PATH = 'K:\\Lbpam\\DG_Gestion_Quant\\GERANT\\Khoa\\yewno_concept_dict.pickle'
    PERIOD_DATA_PATH = 'K:\\Lbpam\\DG_Gestion_Quant\\GERANT\\Khoa\\Data\\MSFT\\'
    CONCEPT_COUNT_PATH = 'K:\\Lbpam\\DG_Gestion_Quant\\GERANT\\Khoa\\Data\\yewno-edf_concept_count.pickle'
    BIGRAM_CONCEPT_COUNT_PATH = 'K:\\Lbpam\\DG_Gestion_Quant\\GERANT\\Khoa\\Data\\yewno-edf_bigram_concept_count.pickle'
else:
    # Default (non-Windows) paths
    YEWNO_DATA_PATH = '/Users/khoanguyen/Workspace/dataset/Yewno/ms_yewno_2020.pickle'
    EDF_DATA_PATH = '/Users/khoanguyen/Workspace/dataset/edf_msft/MSFT_Jul2019_2020_linebreak'
    FILTERED_YEWNO_PATH = '/Users/khoanguyen/Workspace/dataset/Yewno/filtered_yewno.pickle'
    YEWNO_CONCEPT_DICT_PATH = '/Users/khoanguyen/Workspace/dataset/Yewno/yewno_concept_dict.pickle'
    PERIOD_DATA_PATH = '/Users/khoanguyen/Workspace/dataset/edf_msft/'
    CONCEPT_COUNT_PATH = '/Users/khoanguyen/Workspace/dataset/Yewno/yewno-edf_concept_count.pickle'
    BIGRAM_CONCEPT_COUNT_PATH = '/Users/khoanguyen/Workspace/dataset/Yewno/yewno-edf_bigram_concept_count.pickle'

In [4]:
# Deserialize the raw Yewno dataset from its pickle file.
# NOTE: pickle.load can execute arbitrary code; only load trusted files.
with open(YEWNO_DATA_PATH, mode='rb') as yewno_file:
    yewno_data = pickle.load(yewno_file)

In [5]:
# Deserialize the EDF Microsoft text dataset from its pickle file.
# NOTE: pickle.load can execute arbitrary code; only load trusted files.
with open(EDF_DATA_PATH, mode='rb') as edf_file:
    edf_data = pickle.load(edf_file)

In [6]:
# Deserialize the Yewno concept dictionary from its pickle file.
# NOTE: pickle.load can execute arbitrary code; only load trusted files.
with open(YEWNO_CONCEPT_DICT_PATH, mode='rb') as concept_dict_file:
    yewno_concept_dict = pickle.load(concept_dict_file)

In [9]:
# Give the positional columns 0-4 descriptive labels (the frame is modified in place).
column_labels = dict(enumerate(['ID', 'Definition', 'Concept', 'Hypernym', 'Misc']))
yewno_concept_dict.rename(columns=column_labels, inplace=True)

In [12]:
# Write the concept dictionary back to the same pickle file it was loaded from,
# replacing the previous contents of that file.
with open(YEWNO_CONCEPT_DICT_PATH, mode='wb') as concept_dict_file:
    pickle.dump(yewno_concept_dict, concept_dict_file)

In [7]:
# The rename cell relabels column 3 as 'Hypernym', and the re-saved pickle keeps
# that label, so indexing with the bare position 3 raises KeyError whenever this
# cell runs after the rename (i.e. in top-to-bottom order). Use the label when it
# exists and fall back to 3 only for a frame that has not been renamed yet.
hypernym_column = 'Hypernym' if 'Hypernym' in yewno_concept_dict.columns else 3
yewno_structure = yewno_concept_dict[hypernym_column].tolist()

In [None]:
# Concepts to search for, in yewno_data row order, plus an empty list that
# will receive the counts.
concept_list = yewno_data['Concept'].to_list()
concept_count = list()

In [None]:
# Count, for every Yewno concept, how many times it occurs in the EDF text.
# Start from an empty list so that re-running this cell does not append a
# second set of counts (which would no longer line up with yewno_data's rows).
concept_count = []

for concept in tqdm(concept_list):
    # Always escape the concept so it is matched literally. Escaping only when
    # '+', '#' or '.' is present leaves other regex metacharacters
    # ('(', ')', '*', '?', '[', '|', ...) active, which either miscounts or
    # raises re.error on unbalanced brackets.
    # \b is word boundary, which doesn't work with string like 'C++' where
    # there's no word boundary after ++, hence the explicit whitespace /
    # start / end anchors.
    re_string = r'(?:^|(?<=\s))' + re.escape(concept) + r'(?=\s|$)'

    # re.IGNORECASE avoids converting the case of either the concept or the text.
    # .str.count is the same per-document match count used by the per-month cell.
    concept_per_doc = edf_data['text'].str.count(re_string, flags=re.IGNORECASE)
    concept_count.append(concept_per_doc.sum())

In [None]:
# Store the collected counts as the 'edf_count' column; concept_count must hold
# exactly one entry per row of yewno_data, in row order.
yewno_data['edf_count'] = concept_count

In [None]:
# Keep only the concepts whose edf_count is at least 10.
frequent_enough = yewno_data['edf_count'].ge(10)
filtering_results = yewno_data.loc[frequent_enough]

In [None]:
# Drop rows whose Pureplay or Contribution value is not strictly positive.
has_pureplay = filtering_results['Pureplay'] > 0
has_contribution = filtering_results['Contribution'] > 0
filtering_results = filtering_results[has_pureplay & has_contribution]

In [None]:
# Persist the filtered Yewno rows so later cells can reload them.
with open(FILTERED_YEWNO_PATH, mode='wb') as filtered_file:
    pickle.dump(filtering_results, filtered_file)

In [None]:
# Reload the filtered Yewno rows from disk.
# NOTE: pickle.load can execute arbitrary code; only load trusted files.
with open(FILTERED_YEWNO_PATH, mode='rb') as filtered_file:
    filter_yewno_data = pickle.load(filtered_file)

In [None]:
# Load one pickled object per month, in the order given by monthly_file.
text_df = []

for month_label in monthly_file:
    with open(PERIOD_DATA_PATH + month_label, 'rb') as month_file:
        text_df.append(pickle.load(month_file))

In [None]:
# Per-month occurrence counts of every filtered concept:
# rows = monthly_file labels, columns = concepts.
concept_list = filter_yewno_data['Concept'].tolist()

# Build each search pattern once instead of once per month. The concept is
# always escaped so it is matched literally; escaping only when '+', '#' or '.'
# is present leaves other regex metacharacters ('(', ')', '*', '?', '[', ...)
# active, which miscounts or raises re.error.
concept_patterns = [r'(?:^|(?<=\s))' + re.escape(concept) + r'(?=\s|$)'
                    for concept in concept_list]

period_count = []  # not used in this cell
word_count = [[] for _ in concept_list]

for df in tqdm(text_df):
    for monthly_counts, re_string in zip(word_count, concept_patterns):
        kw_per_doc = df['text'].str.count(re_string, flags=re.IGNORECASE)
        monthly_counts.append(kw_per_doc.sum())

# Assemble the frame in one step rather than inserting one column at a time,
# which fragments the frame when there are many concepts.
word_count_df = pd.DataFrame(dict(zip(concept_list, word_count)), index=monthly_file)
word_count_df = word_count_df.fillna(0)

In [None]:
%%script false --no-raise-error
# dask testing
import dask.dataframe as dd

# Wrap each monthly pandas frame in a dask DataFrame split into 4 partitions.
dask_dataset = [dd.from_pandas(monthly_df, npartitions=4) for monthly_df in text_df]


In [None]:
%%script false --no-raise-error

# Dask variant of the per-month concept count:
# rows = monthly_file labels, columns = concepts.

# Build each search pattern once. The concept is always escaped so it is
# matched literally; escaping only when '+', '#' or '.' is present leaves other
# regex metacharacters ('(', ')', '*', '?', '[', ...) active.
concept_patterns = [r'(?:^|(?<=\s))' + re.escape(concept) + r'(?=\s|$)'
                    for concept in concept_list]

period_count = []  # not used in this cell
word_count = [[] for _ in concept_list]

for ddf in tqdm(dask_dataset):
    for monthly_counts, re_string in zip(word_count, concept_patterns):
        findall_doc = ddf['text'].str.count(re_string, flags=re.IGNORECASE)
        # .compute() evaluates the lazy dask sum into a concrete number.
        monthly_counts.append(findall_doc.sum().compute())

word_count_df = pd.DataFrame(dict(zip(concept_list, word_count)), index=monthly_file)
word_count_df = word_count_df.fillna(0)

In [None]:
# Persist the per-month concept counts.
with open(CONCEPT_COUNT_PATH, mode='wb') as count_file:
    pickle.dump(word_count_df, count_file)

In [None]:
# Reload the per-month concept counts from disk.
# NOTE: pickle.load can execute arbitrary code; only load trusted files.
with open(CONCEPT_COUNT_PATH, mode='rb') as count_file:
    word_count_df = pickle.load(count_file)

In [None]:
# Select the concept columns whose name contains a space (this keeps every
# multi-word concept, not only two-word ones).
bigram_concept_list = [name for name in word_count_df.columns.tolist() if ' ' in name]

bigram_word_count_df = word_count_df[bigram_concept_list]

In [None]:
# Persist the counts restricted to multi-word concepts.
with open(BIGRAM_CONCEPT_COUNT_PATH, mode='wb') as bigram_file:
    pickle.dump(bigram_word_count_df, bigram_file)

In [None]:
# Reload the multi-word concept counts from disk.
# NOTE: pickle.load can execute arbitrary code; only load trusted files.
with open(BIGRAM_CONCEPT_COUNT_PATH, mode='rb') as bigram_file:
    bigram_word_count_df = pickle.load(bigram_file)

In [None]:
# Show the counts transposed: one row per concept, one column per month.
bigram_word_count_df.transpose()

In [None]:
# Rows of the filtered Yewno data whose concept is one of the multi-word concepts.
is_bigram_concept = filter_yewno_data['Concept'].isin(bigram_concept_list)
yewno_filtered_bigram = filter_yewno_data[is_bigram_concept]

## Notes on this snippet of Yewno data

* A total of 16460 concepts, covering the 1st half of 2020
* Some concepts with a Contribution/PurePlay score of 0 did not appear in this period
* Some concepts are not really relevant (e.g. "Hello, World!" program, ABCDE, etc.) or are too generic (e.g. Computer, Software, etc.)

## Observation

Counting how often each concept appears in the EDF Microsoft data:
* Only ~960 concepts appeared at least 10 times (the `edf_count >= 10` filter)

From the list of keywords initially picked in previous work,
some keywords did not have any Contribution/PurePlay score in this snippet of Yewno data:
* Digital Transformation
* Microsoft Team

Keywords that appeared as part of other concepts:
* Healthcare (Healthcare Holdings Group, Healthcare Product Holdings)

Keywords that did not appear in Yewno (yet have some related concepts):
* Remote work
* Storage Server
* Data Center
* Cloud Solution

## Remarks

* Some concepts appear in Yewno before they reach the mass media, or have not made any appearance there at all
*

In [None]:
# Show every Yewno row whose concept contains 'Remote', ignoring case.
mentions_remote = yewno_data['Concept'].str.contains('Remote', case=False)
yewno_data[mentions_remote]