In [42]:
import os
import random
import re
import string

import nltk
import numpy as np
import pandas as pd

from scipy import spatial

import gensim.downloader as api

from nltk import word_tokenize
from nltk.corpus import stopwords

nltk.download("stopwords")
nltk.download("punkt")

SEED = 42
random.seed(SEED)
os.environ["PYTHONHASHSEED"] = str(SEED)
np.random.seed(SEED)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\gosha\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\gosha\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [43]:
def clean_text(text, tokenizer, stopwords):
    """Обработка текстовых данных, генерация токенов
    Аргументы:
        text: текст, подвергающийся токенизации
    Выходные данные:
        Токенизированный текст в виде структуры python list
    Source: https://dylancastillo.co/nlp-snippets-cluster-documents-using-word2vec/#clean-and-tokenize-data
    """
    # Очистка текста для токенизации
    text = str(text).lower()  # приведение к одному регистру
    text = re.sub(r"\[(.*?)\]", "", text)  # удаление лишних символов [+XYZ chars]
    text = re.sub(r"\s+", " ", text)  # удаление нескольких пробелов
    text = re.sub(r"\w+…|…", "", text)  # удаление повторений
    text = re.sub(r"(?<=\w)-(?=\w)", " ", text)  # замена тире
    text = re.sub(
        f"[{re.escape(string.punctuation)}]", "", text
    )  # удаление пунктуации

    tokens = tokenizer(text)  # токенизация
    tokens = [t for t in tokens if not t in stopwords]  # удаление стоп слов
    tokens = ["" if t.isdigit() else t for t in tokens]  # удаление цифр
    tokens = list(set([t for t in tokens if len(t) > 1]))  # удаление токенов длиной меньше 1
    return tokens

In [44]:
#список стоп слов
stop_list = set(stopwords.words('english') + ['also', 'including', 'offering', 'equity'
    'tend', 'less', 'whose', 'involved', 'related', 'includes', 'engaged', 'using', 'contains'])

In [45]:
def vectorize(element, model):
    """Векторизация элемента с помощью эмбеддингов
    Аргументы:
        element: элемент для векторизации
        model: эмбеддинги или натренированная модель 

    Возвращаемые данные:
        Векторное представление элемента
    """
    zero_vector = np.zeros(model.vector_size) #создаём нулевой вектор
    vectors = [] #подготавливаем массив для хранения всех векторизированных токенов 
    for token in element: #цикл получения эмбеддингов для токенов текста 
        if token in model:
            try:
                vectors.append(model[token])
            except KeyError:
                continue
    if vectors: #возврат извлчённых признаков после использования эмбеддингов 
        vectors = np.asarray(vectors)
        avg_vec = vectors.mean(axis=0)
        return avg_vec
    else:
        return zero_vector #либо возврат нулевого вектора в случае отсутствия достаточных данных об эмбеддинагх

In [46]:
#загрузка Word2Vec
w2vec = api.load('glove-wiki-gigaword-300')

In [47]:
#Загрузка текстовых описаний категорий
with open('class_description.txt', 'r') as file: #импорт описаний из файла
    content = file.read().split('---\n')[1:]

cat_dict = {} #Словарь категорий
for i in range(len(content)): #Формирование описаний
    key = content[i].split('\n')[0].replace('Sector', '').strip()
    text = ' '.join(content[i].split('\n')[1:]).strip()
    token = clean_text(text, word_tokenize, stop_list)
    emb = vectorize(token, w2vec)
    cat_dict.update({key: pd.DataFrame([[i, text, token, emb]], columns = ['cat_num', 'desc', 'token', 'embedding'])})

In [48]:
cat_dict.keys()

dict_keys(['Energy', 'Materials', 'Industrials', 'Consumer Discretionary', 'Consumer Staples', 'Health Care', 'Financials', 'Information Technology', 'Communication Services', 'Utilities', 'Real Estate'])

In [49]:
if os.path.exists(os.getcwd()+'\\data\\'):
    path = os.getcwd()+'\\data\\'
    print('OK - Path exists')
else:
    raise EnvironmentError('Нет пути к данным')

OSError: Нет пути к данным

In [50]:
df_raw = pd.read_csv('D:\\Notebooks\\DS_Ya_Praktikum\\workshop1-YaPraktikum\\data\\'+"kaggle_startups_train_27042024.csv")
df = df_raw.copy()

In [51]:
df['category_list'] = df['category_list'].str.replace('|', ' ')

In [52]:
df['category_list'] = df['category_list'].fillna('')

Unnamed: 0,name,category_list,funding_total_usd,status,country_code,state_code,region,city,funding_rounds,founded_at,first_funding_at,last_funding_at,closed_at
0,Lunchgate,Online Reservations Restaurants,828626.0,operating,CHE,25,Zurich,Zürich,2,2009-12-31,2011-05-01,2014-12-01,
1,EarLens,Manufacturing Medical Medical Devices,42935019.0,operating,USA,CA,SF Bay Area,Redwood City,4,2005-01-01,2010-05-04,2014-02-25,
2,Reviva Pharmaceuticals,Biotechnology,35456381.0,operating,USA,CA,SF Bay Area,San Jose,3,2006-01-01,2012-08-20,2014-07-02,
3,Sancilio and Company,Health Care,22250000.0,operating,,,,,3,2004-01-01,2011-09-01,2014-07-18,
4,WireTough Cylinders,Manufacturing,,operating,USA,VA,VA - Other,Bristol,1,2010-09-30,2012-02-01,2012-02-01,
5,Connected Sports Ventures,Mobile,4300000.0,operating,USA,NJ,Newark,Princeton,1,2011-01-20,2012-11-12,2012-11-12,
6,Attensity,Analytics Business Analytics Social CRM Social...,90000000.0,operating,USA,CA,SF Bay Area,Redwood City,1,2000-01-01,2014-05-14,2014-05-14,
7,Mesh Networks,Software,4300000.0,operating,USA,TX,Houston,Houston,1,2005-01-01,2014-11-09,2014-11-09,
8,AngioScore,Biotechnology,42000000.0,operating,USA,CA,SF Bay Area,Fremont,2,2003-01-01,2007-10-09,2011-04-20,
9,Vidatronic,Semiconductors,1250500.0,operating,USA,TX,Austin,College Station,2,2010-01-01,2011-08-23,2013-03-21,


In [54]:
df['tokens'] = df['category_list'].apply(lambda x: clean_text(x, word_tokenize, stop_list))

In [55]:
df['embedding'] = df['tokens'].apply(lambda x: vectorize(x, w2vec))

In [86]:
'''def sim_check(arg1, arg2):
    Вычисление косинусного расстояния
    Аргументы:
    arg1, arg2 - векторы np.array
    return (1-spatial.distance.cosine(arg1, arg2))
spatial.distance.cosine(np.array([], arg2)'''
                        
def semantic_similarity(element, sim_dict):
    dist = []
    if np.any(element):
        for i in sim_dict.keys():
            emb = sim_dict.get(i)['embedding'][0]
            dist.append([i, 1-spatial.distance.cosine(element, emb)])
        dist = np.array(dist)
        similar_id = np.argmax(dist[:, 1])
        return dist[similar_id]
    else:
        return ['Generic', 0]

In [88]:
df['categories'] = df['embedding'].apply(lambda x: semantic_similarity(x, cat_dict)[0])

In [87]:
semantic_similarity(df['embedding'][0], cat_dict)

array(['Communication Services', '0.5646764481322059'], dtype='<U32')

In [89]:
df[['category_list', 'categories']].head(30)

Unnamed: 0,category_list,categories
0,Online Reservations Restaurants,Communication Services
1,Manufacturing Medical Medical Devices,Health Care
2,Biotechnology,Health Care
3,Health Care,Health Care
4,Manufacturing,Consumer Discretionary
5,Mobile,Information Technology
6,Analytics Business Analytics Social CRM Social...,Communication Services
7,Software,Information Technology
8,Biotechnology,Health Care
9,Semiconductors,Information Technology


In [90]:
df['categories'].value_counts()

categories
Communication Services    17459
Information Technology    10678
Health Care                9580
Industrials                3872
Financials                 3028
Consumer Discretionary     2561
Generic                    2482
Real Estate                1197
Consumer Staples            821
Energy                      395
Utilities                   360
Materials                    81
Name: count, dtype: int64