## Load The Data

In [62]:
import pandas as pd
import datetime
import numpy as np
import spacy

df = pd.read_csv("colocates.csv", parse_dates= ['date'])
nlp = spacy.load('pl_core_news_sm')
df.colocate = df.colocate.apply(lambda colocate: " ".join([token.lemma_ for token in nlp(colocate)]) )
df['words'] = df.colocate.str.split(" ")
df_exploded = df.explode('words')
df_exploded.drop('colocate', inplace=True, axis=1)

## Windows With Raw Word Count

In [63]:
df_by_window = []
time_periods = []
start_date = df_exploded.date.min()
max_date = df_exploded.date.max()
interval_days = 4 * 30

while True:
    next_date = start_date + datetime.timedelta(interval_days)
    df = df_exploded[df_exploded.date.between(start_date, next_date)]
    words = df.words.value_counts()
    df_by_window.append(words)
    time_periods.append(start_date)
    start_date = next_date
    if next_date > max_date:
        break

## Selection of collocates

In [64]:
word_set = set()

for window in df_by_window:
    word_set.update(window.index)
    
print(f"Number of unique word {len(word_set)}")

Number of unique word 9384


# Vector

In [65]:
def count_as_colocate(word, df_by_window = df_by_window):
    return all([word in window for window in df_by_window])

results = list(filter(count_as_colocate, word_set))
vec_len = len(results)
print(f"Found {vec_len} words which count as collocates")

Found 45 words which count as collocates


In [68]:
def isPresentAtWindow(window, word):
    return window[word] > 10

vectors = []
for window in df_by_window:
    vector = [isPresentAtWindow(window, word) for word in results]
    vector = np.array(vector)
    vectors.append(vector)

In [69]:
vectors = np.stack(vectors, axis = 0)
time_data = pd.Series(time_periods)

In [77]:
time_data = time_data.reset_index().rename(columns={0: 'date'})

## Save the data

In [80]:
np.save('vectors.npy',vectors)
time_data.to_csv("time_periods.csv", index=False)

## Check

In [84]:
pd.read_csv("time_periods.csv", index_col='index')


Unnamed: 0_level_0,date
index,Unnamed: 1_level_1
0,2015-01-02
1,2015-05-02
2,2015-08-30
3,2015-12-28
4,2016-04-26
5,2016-08-24
6,2016-12-22
7,2017-04-21
8,2017-08-19
9,2017-12-17


In [85]:
np.load("vectors.npy")

array([[ True,  True, False, ...,  True,  True, False],
       [ True,  True, False, ...,  True, False, False],
       [ True,  True,  True, ...,  True, False,  True],
       ...,
       [ True,  True, False, ..., False,  True,  True],
       [ True,  True,  True, ...,  True,  True,  True],
       [ True,  True, False, ..., False, False,  True]])