In [1]:
# загрузка данных
from zipfile import ZipFile

# Location of the zip archive that holds the JSON documents.
PATH_TO_DATA = 'data.zip'

# The handle is not closed here; later cells read archive members through it.
zip_data = ZipFile(PATH_TO_DATA)
# Member names in archive order (directory entries such as 'data/' are included).
filenames = zip_data.namelist()
filenames[:2]

['data/', 'data/00e58afe-3ef5-42a6-92f3-8ee7abf868e1.json']

In [2]:
# Inspect the parsed contents of a single JSON file from the archive
# (index 1, because index 0 is the 'data/' directory entry).
import json
json.loads(zip_data.read(filenames[1]))

[{'id': '60229527',
  'text': 'Curraghglass, an Anglicisation of the Gaelic, ‘Currach Glas’ meaning The Green Moor, is a townland in the civil parish of Templeport, County Cavan, Ireland. It lies in the Roman Catholic parish of Glangevlin and barony of Tullyhaw. Road at Curraghglass (geograph 3586592) ==Geography== Curraghglass is bounded on the north by Garvalt Lower and Gub (Glangevlin) townlands, on the west by Altshallan, Carrick West and Knockgorm townlands, on the south by Legatraghta and Moneensauran townlands and on the east by Tullynacross (Glangevlin) townland. Its chief geographical features are the Owenmore River (County Cavan), mountain streams, waterfalls, gravel pits and spring wells. The townland is traversed by minor public roads and rural lanes. The townland covers 175 statute acres. ==History== In earlier times the townland was probably uninhabited as it consists mainly of bog and poor clay soils. It was not seized by the English during the Plantation of Ulster in 16

In [3]:
# Show the records of one JSON file as a table.
import pandas as pd
sample_member = filenames[1]
file_content = json.loads(zip_data.read(sample_member))
data_from_dict = pd.DataFrame(data=file_content)
data_from_dict.head()  # head() shows the first 5 rows by default

Unnamed: 0,id,text,title
0,60229527,"Curraghglass, an Anglicisation of the Gaelic, ...",Curraghglass
1,60229537,Anthony Joshua vs. Andy Ruiz Jr. was a heavywe...,Anthony Joshua vs. Andy Ruiz Jr.
2,60229541,The men's freestyle welterweight competition a...,Wrestling at the 1956 Summer Olympics – Men's ...
3,60229580,The Munich-Montserrat Book of Hours is a 1535 ...,Munich-Montserrat Book of Hours
4,60229585,Valdó Szűcs (born 29 June 1995) is a Hungarian...,Valdó Szűcs


In [4]:
# Repeat the same parsing for every member of the archive.
data_list = []
for filename in filenames:
    text_content = zip_data.read(filename)
    # Some members are empty, so skip anything that has no bytes to parse.
    if not text_content:
        continue
    file_content = json.loads(text_content)
    data_from_dict = pd.DataFrame(file_content)
    data_list.append(data_from_dict)

# Stack the per-file tables into one DataFrame with a fresh 0..n-1 index.
data = pd.concat(data_list, ignore_index=True)
data.head()

Unnamed: 0,id,text,title
0,60229527,"Curraghglass, an Anglicisation of the Gaelic, ...",Curraghglass
1,60229537,Anthony Joshua vs. Andy Ruiz Jr. was a heavywe...,Anthony Joshua vs. Andy Ruiz Jr.
2,60229541,The men's freestyle welterweight competition a...,Wrestling at the 1956 Summer Olympics – Men's ...
3,60229580,The Munich-Montserrat Book of Hours is a 1535 ...,Munich-Montserrat Book of Hours
4,60229585,Valdó Szűcs (born 29 June 1995) is a Hungarian...,Valdó Szűcs


In [5]:
# Size of the combined table as (rows, columns).
data.shape

(24661, 3)

In [6]:
# Drop the columns that are not needed for text processing.
# Rebinding (instead of inplace=True) together with errors='ignore' keeps this
# cell idempotent: re-running it after the columns are already gone no longer
# raises KeyError.
data = data.drop(columns=['id', 'title'], errors='ignore')
data.head(5)

Unnamed: 0,text
0,"Curraghglass, an Anglicisation of the Gaelic, ..."
1,Anthony Joshua vs. Andy Ruiz Jr. was a heavywe...
2,The men's freestyle welterweight competition a...
3,The Munich-Montserrat Book of Hours is a 1535 ...
4,Valdó Szűcs (born 29 June 1995) is a Hungarian...


In [7]:
# Sorted UTF-8 byte values of the lowercase Cyrillic alphabet.
# These are bytes of the encoded text, not Unicode code points.
# (The literal contains the letter 'а' twice, so its bytes appear twice as well.)
sorted("абвгдаеёжзийклмнопрстуфхцчшщьъыэюя".encode("utf-8"))

[128,
 129,
 130,
 131,
 132,
 133,
 134,
 135,
 136,
 137,
 138,
 139,
 140,
 141,
 142,
 143,
 145,
 176,
 176,
 177,
 178,
 179,
 180,
 181,
 182,
 183,
 184,
 185,
 186,
 187,
 188,
 189,
 190,
 191,
 208,
 208,
 208,
 208,
 208,
 208,
 208,
 208,
 208,
 208,
 208,
 208,
 208,
 208,
 208,
 208,
 208,
 209,
 209,
 209,
 209,
 209,
 209,
 209,
 209,
 209,
 209,
 209,
 209,
 209,
 209,
 209,
 209,
 209]

In [8]:
# Sorted UTF-8 byte values of the lowercase Latin alphabet (one byte per letter).
sorted("abcdefghijklmnopqrstuvwxyz".encode("utf-8"))

[97,
 98,
 99,
 100,
 101,
 102,
 103,
 104,
 105,
 106,
 107,
 108,
 109,
 110,
 111,
 112,
 113,
 114,
 115,
 116,
 117,
 118,
 119,
 120,
 121,
 122]

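В кодировке UTF-8 каждая строчная кириллическая буква занимает два байта: первый байт равен 208 или 209, второй лежит в диапазоне от 128 до 191. Поэтому все байты кириллических символов попадают в диапазон от 128 до 209 и не пересекаются с байтами строчных латинских букв (97–122). Это значения байтов UTF-8, а не кодовые позиции Unicode.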

In [9]:
# удаление пунктуации, служебных символов
# конвертация чисел в слова
def remove_punct(text):
    """Keep only alphanumeric characters and plain spaces, then lowercase.

    Every other character (punctuation, line breaks, tabs, symbols) is deleted
    without a replacement, so the words on either side of it are joined.
    """
    kept_chars = []
    for char in text:
        # Punctuation is neither alphanumeric nor a space, so this single
        # test also filters out everything in string.punctuation.
        if char == ' ' or char.isalnum():
            kept_chars.append(char)
    return ''.join(kept_chars).lower()

def numbers_to_words(text):
    """Replace whitespace-separated numeric tokens with their English spelling.

    The text is split on whitespace, every token that starts with a digit is
    passed through ``int`` and ``num2words``, and the tokens are joined back
    with single spaces (so runs of whitespace are collapsed).

    Tokens that cannot be converted are left unchanged:
    * ``int`` raises ValueError for tokens that only start with digits
      (for example "1990s");
    * ``num2words`` raises OverflowError for integers above its supported
      maximum, which would otherwise abort a whole ``Series.apply`` run.
    """
    import re
    from num2words import num2words

    words = text.split()
    for i, word in enumerate(words):
        if re.match(r'\d+', word):
            try:
                words[i] = num2words(int(word), lang='en')
            except (ValueError, OverflowError):
                # Best effort: keep the original token when it is not convertible.
                pass
    return ' '.join(words)

def remove_tags(text):
    """Strip markup tags, control characters and digits from ``text``.

    The argument is converted with ``str`` first, so non-string values are
    accepted. Steps, in order:
    1. delete anything that looks like a tag (from '<' up to the next '>');
    2. turn carriage returns and line feeds into spaces;
    3. delete backspace characters;
    4. delete every run of digits;
    5. collapse runs of spaces into a single space.
    """
    import re
    text = str(text)
    # Markup tags.
    text = re.sub(r"<[^>]+>", "", text)
    # Line breaks become spaces so neighbouring words stay separated.
    text = re.sub(r"[\r\n]", " ", text)
    # Backspace characters. The previous pattern r"\b" is the zero-width
    # word-boundary assertion in a regex, so it never removed anything.
    text = re.sub(r"\x08", "", text)
    # Leftover digits.
    text = re.sub(r"\d+", "", text)
    # Repeated spaces.
    text = re.sub(r" +", " ", text)
    return text

# Ideally we would take a range of the Unicode table and keep only the characters
# that fall inside it, to avoid odd tokens like the ones above.
def remove_non_latin_letter(text):
    """Drop every space-separated token that is not made of 'a'..'z' only.

    A token is kept when all bytes of its UTF-8 encoding lie between 97 and 122
    inclusive. Empty tokens (produced by consecutive spaces) have no bytes and
    are therefore kept, which preserves those spaces in the result.
    """
    lower_a, lower_z = 97, 122  # UTF-8 byte values of 'a' and 'z'
    kept_tokens = [
        token
        for token in text.split(' ')
        if all(lower_a <= byte <= lower_z for byte in token.encode("utf-8"))
    ]
    return ' '.join(kept_tokens)

# Cleaning stages, applied to every document in exactly this order.
# remove_punct runs first, so angle brackets and line breaks are already gone
# by the time remove_tags sees the text. The second remove_punct pass presumably
# cleans punctuation that num2words puts into spelled-out numbers — TODO confirm.
cleaning_steps = (
    remove_punct,
    numbers_to_words,
    remove_tags,
    remove_punct,
    remove_non_latin_letter,
)
prepared_text = data['text']
for cleaning_step in cleaning_steps:
    prepared_text = prepared_text.apply(cleaning_step)
data['text_prepared'] = prepared_text
data.head(20)

Unnamed: 0,text,text_prepared
0,"Curraghglass, an Anglicisation of the Gaelic, ...",curraghglass an anglicisation of the gaelic cu...
1,Anthony Joshua vs. Andy Ruiz Jr. was a heavywe...,anthony joshua vs andy ruiz jr was a heavyweig...
2,The men's freestyle welterweight competition a...,the mens freestyle welterweight competition at...
3,The Munich-Montserrat Book of Hours is a 1535 ...,the munichmontserrat book of hours is a one th...
4,Valdó Szűcs (born 29 June 1995) is a Hungarian...,born twentynine june one thousand nine hundred...
5,Backstop behind home plate at alt=Half of a ba...,backstop behind home plate at althalf of a bas...
6,Muhammad Latif (born 13 February 1939) is a Pa...,muhammad latif born thirteen february one thou...
7,Hephaestus is the name of a fictional characte...,hephaestus is the name of a fictional characte...
8,Wilfred A Randall (1914 – date of death unknow...,wilfred a randall one thousand nine hundred an...
9,"John ""Jack"" Harrison Prescott (fourth ¼ 1890 –...",john jack harrison prescott fourth one thousan...


In [10]:
# Tokens containing anything other than lowercase Latin letters were removed above,
# which leaves some documents with an empty cleaned text; count them.
(data['text_prepared'] == '').sum()

6

In [11]:
# Remove the documents whose cleaned text is empty (in place; remaining index labels are kept).
is_empty_text = data['text_prepared'] == ''
data.drop(index=data.loc[is_empty_text].index, inplace=True)

In [12]:
#токенизация, лемматизация и удаление стоп-слов
import pymorphy3
from nltk.corpus import stopwords
from functools import lru_cache
from tqdm import tqdm

# Memoise lookups so that a repeated word is parsed only once.
@lru_cache(maxsize=None)
def norm_form(word):
    """Return the normal form of the first parse that ``morph`` gives for ``word``.

    Uses the module-level ``morph`` analyzer, which is looked up at call time.
    """
    first_parse = morph.parse(word)[0]
    return first_parse.normal_form

morph = pymorphy3.MorphAnalyzer()
# Build the stop-word set once. Calling stopwords.words('english') inside the
# comprehension re-evaluated it for every single word and searched a list;
# a set built up front gives the same result with constant-time lookups.
english_stopwords = set(stopwords.words('english'))

lemmatized = []
for text in tqdm(data['text_prepared']):
    # Drop stop words, map the remaining words through norm_form and rebuild the text.
    kept_words = [norm_form(word) for word in text.split(' ')
                  if word not in english_stopwords]
    lemmatized.append(' '.join(kept_words).strip())
data['text_lemmatized'] = lemmatized
data.head(10)

100%|█████████████████████████████████████| 24655/24655 [11:23<00:00, 36.09it/s]


Unnamed: 0,text,text_prepared,text_lemmatized
0,"Curraghglass, an Anglicisation of the Gaelic, ...",curraghglass an anglicisation of the gaelic cu...,curraghglass anglicisation gaelic currach glas...
1,Anthony Joshua vs. Andy Ruiz Jr. was a heavywe...,anthony joshua vs andy ruiz jr was a heavyweig...,anthony joshua vs andy ruiz jr heavyweight pro...
2,The men's freestyle welterweight competition a...,the mens freestyle welterweight competition at...,mens freestyle welterweight competition one th...
3,The Munich-Montserrat Book of Hours is a 1535 ...,the munichmontserrat book of hours is a one th...,munichmontserrat book hours one thousand five ...
4,Valdó Szűcs (born 29 June 1995) is a Hungarian...,born twentynine june one thousand nine hundred...,born twentynine june one thousand nine hundred...
5,Backstop behind home plate at alt=Half of a ba...,backstop behind home plate at althalf of a bas...,backstop behind home plate althalf baseball st...
6,Muhammad Latif (born 13 February 1939) is a Pa...,muhammad latif born thirteen february one thou...,muhammad latif born thirteen february one thou...
7,Hephaestus is the name of a fictional characte...,hephaestus is the name of a fictional characte...,hephaestus name fictional character appearing ...
8,Wilfred A Randall (1914 – date of death unknow...,wilfred a randall one thousand nine hundred an...,wilfred randall one thousand nine hundred four...
9,"John ""Jack"" Harrison Prescott (fourth ¼ 1890 –...",john jack harrison prescott fourth one thousan...,john jack harrison prescott fourth one thousan...


Текст фактически не лемматизирован: pymorphy работает только с русским и украинским языками, поэтому английские слова остаются без изменений.

In [13]:
# Search for significant words / bigrams / trigrams.
# TF-IDF is used right away to reduce the influence of Zipf's law, i.e. so that
# terms which are frequent in every document do not dominate.
from sklearn.feature_extraction.text import TfidfVectorizer

# Unigrams, bigrams and trigrams are all extracted as features.
vectorizer_all = TfidfVectorizer(ngram_range=(1,3))

# Learn the vocabulary and weights and build the document-term matrix in one pass.
tfidf_matrix_all = vectorizer_all.fit_transform(data["text_lemmatized"])

# Feature names in the column order of the matrix.
feature_names_tfidf = vectorizer_all.get_feature_names_out()
feature_names_tfidf

array(['aa', 'aa aa', 'aa aa indicate', ..., 'zzzyva', 'zzzyva two',
       'zzzyva two thousand'], dtype=object)

In [15]:
# !pip install gensim

Defaulting to user installation because normal site-packages is not writeable
[33mDEPRECATION: gpg 1.14.0-unknown has a non-standard version number. pip 24.0 will enforce this behaviour change. A possible replacement is to upgrade to a newer version of gpg or contact the author to suggest that they release a version with a conforming version number. Discussion can be found at https://github.com/pypa/pip/issues/12063[0m[33m
[0m

In [18]:
from gensim import corpora
import numpy as np
# Build a gensim dictionary (token -> id mapping) from the tokenized documents,
# then filter it: keep tokens that occur in at least 1 document (no_below=1)
# and in no more than 80% of all documents (no_above=0.8).
# NOTE(review): filter_extremes also has a keep_n argument that is left at its
# default here — confirm that this vocabulary cap is intended.
data['tokenized'] = data['text_lemmatized'].apply(lambda x: np.array(x.split(' ')))
dictionary = corpora.Dictionary(data['tokenized'])
dictionary.filter_extremes(no_below=1, no_above=0.8)

In [22]:
from gensim import models
# Convert every document to its bag-of-words form: which dictionary tokens
# occur in it and how many times.
corpus = [dictionary.doc2bow(text) for text in data['tokenized']]
# LDA training is stochastic; a fixed seed makes the topics reproducible
# when the notebook is re-run.
LDA_RANDOM_STATE = 42
lda = models.LdaModel(corpus, num_topics=40, id2word=dictionary,
                      random_state=LDA_RANDOM_STATE)
lda.show_topics(10)

[(1,
  '0.119*"japan" + 0.090*"japanese" + 0.028*"tokyo" + 0.027*"baltimore" + 0.023*"prefecture" + 0.018*"massachusetts" + 0.013*"categoryjapanese" + 0.012*"haitian" + 0.010*"nova" + 0.009*"candle"'),
 (0,
  '0.138*"de" + 0.050*"la" + 0.031*"san" + 0.024*"el" + 0.020*"del" + 0.016*"spanish" + 0.014*"mexico" + 0.012*"two" + 0.012*"province" + 0.012*"spain"'),
 (29,
  '0.057*"nine" + 0.015*"art" + 0.014*"two" + 0.014*"new" + 0.012*"york" + 0.011*"museum" + 0.008*"work" + 0.007*"american" + 0.006*"book" + 0.006*"published"'),
 (28,
  '0.056*"two" + 0.025*"players" + 0.019*"season" + 0.017*"football" + 0.017*"nine" + 0.017*"league" + 0.014*"team" + 0.010*"club" + 0.010*"first" + 0.009*"played"'),
 (35,
  '0.024*"two" + 0.012*"john" + 0.009*"valigntop" + 0.007*"david" + 0.007*"michael" + 0.007*"pa" + 0.006*"smith" + 0.006*"three" + 0.006*"brown" + 0.006*"norton"'),
 (10,
  '0.024*"two" + 0.024*"nine" + 0.015*"eight" + 0.015*"district" + 0.012*"party" + 0.010*"election" + 0.010*"state" + 0.

In [25]:
import pyLDAvis
import pyLDAvis.gensim_models as gensimvis

# Render the interactive pyLDAvis view of the fitted topic model inline.
pyLDAvis.enable_notebook()
vis = gensimvis.prepare(lda, corpus, dictionary=lda.id2word)
vis