## Load The Data

In [1]:
import pandas as pd
import datetime
import numpy as np
import spacy

df = pd.read_csv("colocates.csv", parse_dates= ['date'])
nlp = spacy.load('pl_core_news_sm')
df.colocate = df.colocate.apply(lambda colocate: " ".join([token.lemma_ for token in nlp(colocate)]) )
df['words'] = df.colocate.str.split(" ")
df_exploded = df.explode('words')
df_exploded.drop('colocate', inplace=True, axis=1)

In [47]:
start_date = df_exploded.date.min()
interval_days = 6 * 30
end_date = start_date + datetime.timedelta(days=interval_days)
max_date = df_exploded.date.max()
corpus_list = []
while start_date < max_date:
    df = df_exploded[df_exploded.date.between(start_date, end_date)]
    corpus_list.append(df)
    end_date += datetime.timedelta(days=interval_days/2)
    start_date += datetime.timedelta(days=interval_days/2)

## Windows With Raw Word Count

## Selection of collocates

In [49]:
word_set = set()

for corpus in corpus_list:
    word_set.update(corpus.words)
    
print(f"Number of unique words {len(word_set)}")

Number of unique words 15492


# Vector

1. To establish the relative minimum collocation frequency, we first identify a subcorpus, which provides the smallest amount of evidence for collocation due to the lowest number
of occurrences of the node in that particular subcorpus
2. In other subcorpora with a higher frequency of the node, the minimum frequency threshold is proportionally stricter


For example, if the threshold for the smallest subcorpus is set to be
at least three co-occurrences of the collocate and the node, then the requirement
for a subcorpus that includes twice the number of nodes would be to include at
least twice this number of co-occurrences with the collocate (six in this example)

In [50]:
corpus_counts = [corpus.words.value_counts() for corpus in corpus_list]
node_counts = np.array([corp.text_id.unique().size for corp in corpus_list])

In [51]:
def count_vector(word, corpus_counts = corpus_counts):
    count_list = [corpus_count[word] if word in corpus_count else 0 for corpus_count in corpus_counts]    
    return np.array(count_list)

def count_as_colocate(c_vector, node_counts):
    if (c_vector == 0).any():
        return False
    min_colocation_index = np.argmin(c_vector)
    min_freq = np.min(c_vector)
    count_min = node_counts[min_colocation_index].min()
    for freq, node_count in zip(c_vector, node_counts):
        if freq < min_freq * node_count / count_min:
            return False
    return True

for word in word_set:
    if count_as_colocate(count_vector(word), node_counts):
        print(word)

zmian
i
zmianach
przed
też
globalne
jednak
które
także
coraz
nie
to
ale
są
klimatycznych
który
zmianami
dla
a
ze
na
tylko
do
co
in
zmiany
już


In [None]:
def isPresentAtWindow(window, word):
    return window[word] > 10

vectors = []
for window in df_by_window:
    vector = [isPresentAtWindow(window, word) for word in results]
    vector = np.array(vector)
    vectors.append(vector)

In [None]:
vectors = np.stack(vectors, axis = 0)
time_data = pd.Series(time_periods)

In [None]:
time_data = time_data.reset_index().rename(columns={0: 'date'})

## Save the data

In [None]:
np.save('vectors.npy',vectors)
time_data.to_csv("time_periods.csv", index=False)

## Check

In [None]:
pd.read_csv("time_periods.csv", index_col='index')


In [None]:
np.load("vectors.npy")