# 1. Data Wrangling

## 1.1 Import and Load 

In [None]:
import pandas as pd
import re 
from html import unescape
from tqdm.notebook import tqdm
tqdm.pandas()
from nltk.corpus import stopwords
from natasha import Doc, MorphVocab, Segmenter, NewsEmbedding, NewsMorphTagger

In [None]:
df = pd.read_json('../data/raw/healthcare_facilities_reviews.jsonl.gz', lines=True, compression='gzip')

## 1.2 Inspect Dataframe

In [3]:
df.columns.tolist()

['review_id', 'category', 'title', 'content', 'sentiment', 'source_url']

In [4]:
df.shape

(70597, 6)

In [5]:
df.info

<bound method DataFrame.info of        review_id                       category  \
0              0  Поликлиники стоматологические   
1              1  Поликлиники стоматологические   
2              2  Поликлиники стоматологические   
3              3  Поликлиники стоматологические   
4              4  Поликлиники стоматологические   
...          ...                            ...   
70592      70592          Водительские комиссии   
70593      70593          Водительские комиссии   
70594      70594          Водительские комиссии   
70595      70595          Водительские комиссии   
70596      70596          Водительские комиссии   

                                   title  \
0                        Классный мастер   
1                     Замечательный врач   
2      Благодарность работникам рентгена   
3                       Доктор Рабинович   
4              Есть кому сказать спасибо   
...                                  ...   
70592          Хуже районной поликлиники   
705

## 1.3 Null values

In [6]:
df.isnull().sum()

review_id     0
category      0
title         0
content       0
sentiment     0
source_url    0
dtype: int64

## 1.4 Duplicates

In [7]:
df['content'].duplicated().sum()

np.int64(209)

In [8]:
df = df.drop_duplicates(subset='content', keep='first')
df.shape

(70388, 6)

## 1.5 Filter reviews

In [9]:
df = df[df['content'].str.split().apply(len) > 5]
df.shape

(70359, 6)

In [10]:
df = df[df['content'].str.match(r'.*[а-яА-Яa-zA-Z].*')]
df.shape

(70349, 6)

## 1.6 Encoding

In [11]:
sentiment_map = {
    "positive": 1,
    "negative": 0
}

df['sentiment_score'] = df['sentiment'].map(sentiment_map)

## 1.7 Sample Selection

In [20]:
df_sample = df.sample(10000, random_state=42)
reviews_sample = df_sample["content"].tolist()
sentiment_sample = df['sentiment'][df_sample.index].tolist()

sample_data = pd.DataFrame({
    'reviews' : reviews_sample,
    'sentiment' : sentiment_sample})

## 1.8 Text Preprocessing

### 1.8.1 General Text Preprocessing

In [13]:
def clean_text(text):
    text = unescape(text)
    text = re.sub(r"<br\s*/?>", " ", text)
    text = re.sub(r"<.*?>", " ", text)
    
    text = text.lower()
    text = re.sub(r"http\S+", "", text)
    text = re.sub(r"[^а-яА-Яa-zA-Z0-9\s]", "", text)
    
    return text

In [14]:
sample_data['processed_reviews'] = sample_data['reviews'].progress_apply(clean_text)

  0%|          | 0/10000 [00:00<?, ?it/s]

### 1.8.2 Lemmatisation and Stopwords removal

In [15]:
segmenter = Segmenter()
morph_vocab = MorphVocab()
embedding = NewsEmbedding()
morph_tagger = NewsMorphTagger(embedding)
russian_stopwords = set(stopwords.words("russian"))

In [16]:
def lemmatise_russian(text):
    doc = Doc(text)
    doc.segment(segmenter)
    doc.tag_morph(morph_tagger)
    for token in doc.tokens:
        token.lemmatize(morph_vocab)
    return " ".join([token.lemma for token in doc.tokens if token.lemma not in russian_stopwords])

In [17]:
sample_data['processed_reviews'] = sample_data['processed_reviews'].progress_apply(lemmatise_russian)

  0%|          | 0/10000 [00:00<?, ?it/s]

## 2. Save Sample Data as CSV

In [None]:
sample_data.to_csv('../data/sample/sample_processed_reviews.csv', index=False)