In [10]:
from pathlib import Path

import pandas as pd
import textstat

from tqdm import tqdm

# Switch textstat to Polish ("pl") before any statistics are computed below.
textstat.set_lang("pl")

In [15]:
def read_corpus(f: Path):
    """Read a blank-line-delimited corpus file into a list of documents.

    Consecutive non-blank lines are glued together into one document, with
    every newline character replaced by a single space. A line that consists
    of nothing but a newline terminates the current document.

    Args:
        f: Path to a UTF-8 encoded text file.

    Returns:
        A list of document strings in file order. Consecutive blank lines
        produce empty-string documents. A final document that is not
        followed by a blank line is included as well (it used to be
        silently dropped).
    """
    data = []
    parts = []  # pieces of the document currently being assembled
    with f.open(mode="r", encoding="utf-8") as file:
        for line in tqdm(file):
            if line == "\n":
                data.append("".join(parts))
                parts = []
            else:
                parts.append(line.replace("\n", " "))
    # The file may end without a closing blank line; flush whatever is left
    # so the last document is not lost.
    if parts:
        data.append("".join(parts))
    return data

In [16]:
# Locations of the three corpus files, relative to the working directory.
full_file, twitter_file, problem_file = map(
    Path, ("full.txt", "corpus.txt", "problem.txt")
)

In [17]:
# Parse the full corpus into a list of documents; the cell displays how many were read.
full_data = read_corpus(full_file)
len(full_data)

13471299it [00:09, 1423540.16it/s]


3401364

In [18]:
# Parse the Twitter corpus into a list of documents; the cell displays how many were read.
twitter_data = read_corpus(twitter_file)
len(twitter_data)

2203079it [00:00, 2249730.44it/s]


991200

In [19]:
# Parse the problem corpus into a list of documents; the cell displays how many were read.
problem_data = read_corpus(problem_file)
len(problem_data)

22844it [00:00, 2173555.66it/s]


11053

In [20]:
# One row per document of the full corpus, held in a single "text" column.
full_df = pd.DataFrame({"text": full_data})
full_df.head()

Unnamed: 0,text
0,"Zatrzasnął drzwi od mieszkania, dwa razy przek..."
1,"Bohaterem powieści Paźniewskiego jest miasto, ..."
2,Ale dzisiaj? Jaką dzisiaj odegra rolę poetyka ...
3,Halina Auderska we wszystkich książkach każe s...
4,"Paźniewski w ""Krótkich dniach"" ofiarował Kreso..."


In [21]:
# One row per document of the Twitter corpus, held in a single "text" column.
twitter_df = pd.DataFrame({"text": twitter_data})
twitter_df.head()

Unnamed: 0,text
0,@user ten w lewym dolnym rogu wyglada ładnie
1,@user @user @user Był mój. Już przeprosiłem. J...
2,@user @user @user @user @user @user @user @use...
3,@user zaprowadzic cie do lazienki andziu?? :]
4,@user @user @user @user @user Chętni na darmow...


In [22]:
# One row per document of the problem corpus, held in a single "text" column.
problem_df = pd.DataFrame({"text": problem_data})
problem_df.head()

Unnamed: 0,text
0,Dla mnie faworytem do tytułu będzie Cracovia. ...
1,@anonymized_account @anonymized_account Brawo ...
2,"@anonymized_account @anonymized_account Super,..."
3,@anonymized_account @anonymized_account Musi. ...
4,"Odrzut natychmiastowy, kwaśna mina, mam problem"


In [23]:
# Load the stop-word list from a header-less file and take its first column.
# dtype=str and keep_default_na=False keep every entry a plain string:
# with the read_csv defaults, entries such as "nan", "null" or "None" would
# become float NaN and digit-only lines would be parsed as numbers.
stop_words_df = pd.read_csv(
    'stopwords.txt', header=None, dtype=str, keep_default_na=False
)
# 'user' is added on top of the entries from the file
# (presumably because of the "@user" placeholders in the Twitter corpus).
STOPWORDS = set(stop_words_df[0].to_list()) | {'user'}

In [48]:
# import spacy
# import seaborn as sns

# nlp = spacy.load("pl_core_news_md")
# df = pd.DataFrame()
# df['doc'] = [nlp(text) for text in problem_df.text]
# df['num_tokens'] = [len(token) for token in df.doc]

# g = sns.histplot(df.num_tokens)


In [45]:
# Statistics for the full corpus

# The raw file is loaded once more as a single string; the word and letter
# counts below are computed on that string.
full_text = full_file.read_text(encoding="utf-8")

# Sentences are counted document by document and summed.
sent_count_full = sum(textstat.sentence_count(doc) for doc in full_data)

print("Sentence count:", sent_count_full)
print("Word count:", textstat.lexicon_count(full_text))
print("Letter count:", textstat.letter_count(full_text, ignore_spaces=True))

Sentence count: 10419233
Word count: 140393827
Letter count: 827606132


In [44]:
# Statistics for the reference (Twitter) corpus

# The raw file is loaded once more as a single string; the word and letter
# counts below are computed on that string.
twitter_text = twitter_file.read_text(encoding="utf-8")

# Sentences are counted document by document and summed.
sent_count_twitter = sum(textstat.sentence_count(doc) for doc in twitter_data)

print("Sentence count:", sent_count_twitter)
print("Word count:", textstat.lexicon_count(twitter_text))
print("Letter count:", textstat.letter_count(twitter_text, ignore_spaces=True))

Sentence count: 1520562
Word count: 17857055
Letter count: 88231086


In [43]:
# Statistics for the problem corpus

# The raw file is loaded once more as a single string; the word and letter
# counts below are computed on that string.
problem_text = problem_file.read_text(encoding="utf-8")

# Sentences are counted document by document and summed.
sent_count_problem = sum(textstat.sentence_count(doc) for doc in problem_data)

print("Sentence count:", sent_count_problem)
print("Word count:", textstat.lexicon_count(problem_text))
print("Letter count:", textstat.letter_count(problem_text, ignore_spaces=True))

Sentence count: 14063
Word count: 133222
Letter count: 859579
