In [1]:
from pathlib import Path

import pandas as pd
import textstat

from tqdm import tqdm

from config import FULL, CORPUS, PROBLEM, STOPWORDS

textstat.set_lang("pl")

In [2]:
def read_corpus(f: Path) -> list:
    """Read a blank-line-delimited corpus file into a list of documents.

    Consecutive non-blank lines are concatenated into one document string,
    with every newline character replaced by a single space. A blank line
    (exactly ``"\\n"``) terminates the current document.

    Args:
        f: Path of the UTF-8 encoded corpus file.

    Returns:
        A list of document strings in file order. Consecutive blank lines
        produce empty-string entries. A trailing document that is not
        followed by a blank line is included as well.
    """
    data = []
    parts = []  # pieces of the document currently being assembled
    with f.open(mode="r", encoding="utf-8") as file:
        for line in tqdm(file):
            if line == "\n":
                data.append("".join(parts))
                parts = []
            else:
                parts.append(line.replace("\n", " "))
    # Flush the final document when the file does not end with a blank line;
    # without this the last record would be silently dropped.
    if parts:
        data.append("".join(parts))
    return data

In [3]:
# Bind the corpus locations imported from config to notebook-level names.
full_file, twitter_file, problem_file = FULL, CORPUS, PROBLEM

In [4]:
# Load the full corpus as a list of document strings and display how many were read.
full_data = read_corpus(full_file)
len(full_data)

13471299it [00:09, 1389557.84it/s]


3401364

In [5]:
# Load the corpus at twitter_file as a list of document strings and display how many were read.
twitter_data = read_corpus(twitter_file)
len(twitter_data)

2178988it [00:01, 1663972.33it/s]


987259

In [6]:
# Load the problem corpus as a list of document strings and display how many were read.
problem_data = read_corpus(problem_file)
len(problem_data)

22844it [00:00, 1640887.12it/s]


11053

In [7]:
# Build the stop-word set from the headerless CSV whose location is config.STOPWORDS.
# The location is re-imported under its own name because this cell rebinds
# `STOPWORDS` to the resulting set; reading from `STOPWORDS` directly would
# make the cell fail when it is run a second time in the same kernel.
from config import STOPWORDS as STOPWORDS_PATH

stop_words_df = pd.read_csv(STOPWORDS_PATH, header=None)
# First CSV column holds the stop words; 'user' is added as an extra stop word.
STOPWORDS = set(stop_words_df[0].to_list()).union({'user'})

In [8]:
# Statistics for the full corpus

# Raw file contents as a single string, used for the word and letter counts.
full_text = full_file.read_text(encoding="utf-8")

# Sentences are counted per document and summed over the whole corpus.
sent_count_full = sum(textstat.sentence_count(doc) for doc in full_data)

print("Sentence count:", sent_count_full)
print("Word count:", textstat.lexicon_count(full_text))
print("Letter count:", textstat.letter_count(full_text, ignore_spaces=True))

Sentence count: 10419233
Word count: 140393827
Letter count: 827606132


In [9]:
# Statistics for the reference corpus

# Raw file contents as a single string, used for the word and letter counts.
twitter_text = twitter_file.read_text(encoding="utf-8")

# Sentences are counted per document and summed over the whole corpus.
sent_count_twitter = sum(textstat.sentence_count(doc) for doc in twitter_data)

print("Sentence count:", sent_count_twitter)
print("Word count:", textstat.lexicon_count(twitter_text))
print("Letter count:", textstat.letter_count(twitter_text, ignore_spaces=True))

Sentence count: 1472396
Word count: 16865493
Letter count: 83185373


In [10]:
# Statistics for the problem corpus

# Raw file contents as a single string, used for the word and letter counts.
problem_text = problem_file.read_text(encoding="utf-8")

# Sentences are counted per document and summed over the whole corpus.
sent_count_problem = sum(textstat.sentence_count(doc) for doc in problem_data)

print("Sentence count:", sent_count_problem)
print("Word count:", textstat.lexicon_count(problem_text))
print("Letter count:", textstat.letter_count(problem_text, ignore_spaces=True))

Sentence count: 14063
Word count: 133222
Letter count: 859579
