In [1]:
import pandas as pd
import os
import regex as re

# Directory holding the corpus CSV files
path = "../Corpus"

# os.listdir() returns entries in arbitrary order, so the names are sorted to
# keep the load order (and anything derived from row order downstream)
# reproducible between runs and machines. Only CSV files are kept so that
# stray directory entries (hidden files, checkpoints) never reach the CSV
# reader.
corp_files = sorted(
    entry for entry in os.listdir(path) if entry.lower().endswith(".csv")
)
corp_files

['eco_dorzeczy.csv',
 'eco_gpc.csv',
 'eco_newsweek.csv',
 'eco_polityka.csv',
 'eco_rzepa.csv',
 'eco_wpolityce.csv',
 'eco_wprost.csv',
 'eco_wyborcza_colocates.csv',
 'eco_wyborcza.csv']

## Load and concatenate data

In [3]:
# Columns every corpus file contributes to the combined frame
cols = ["text", "id", 'date', 'source']

data_list = []
for file in corp_files:
    # Load one corpus file; os.path.join builds the path portably
    data = pd.read_csv(os.path.join(path, file), parse_dates=['date'])

    # Files without an 'id' column get their row index as a stand-in id.
    # NOTE(review): such ids are only unique within one file, not across
    # files - confirm that is acceptable for downstream use.
    if 'id' not in data.columns:
        print(f"No id in {file}")
        data['id'] = data.index

    # Keep only the columns of interest, in a fixed order
    data = data[cols]
    data_list.append(data)

# ignore_index=True gives the combined frame a fresh 0..n-1 index. Without it
# every file restarts at 0 and the duplicated labels make label-based
# operations (alignment, .loc, joins) ambiguous.
df = pd.concat(data_list, ignore_index=True)

No id in eco_gpc.csv
No id in eco_wyborcza.csv


## Preprocessing

In [None]:
def preprocess_text(text):
    """Return the cleaned, lower-cased version of one raw text.

    Steps, in order:
      1. newlines, full stops and hyphens become spaces, so they separate words;
      2. every run of whitespace collapses to a single space;
      3. every character that is not a digit, one of the listed letters or a
         space is removed;
      4. spaces are collapsed once more, because step 3 can delete a symbol
         that stood between two spaces and leave a double space behind (the
         key-phrase patterns used later expect exactly one space);
      5. the text is lower-cased.
    """
    text = re.sub(r'[\n\.\-]', ' ', text)
    text = re.sub(r'\s+', ' ', text)
    text = re.sub(r'[^\dAaĄąBbCcĆćDdEeĘęFfGgHhIiJjKkLlŁłMmNnŃńOoÓóPpRrSsŚśTtUuWwYyZzŹźŻż ]', '', text)
    text = re.sub(r' +', ' ', text)
    return text.lower()


# Missing texts (NaN) are treated as empty strings instead of raising a
# TypeError inside re.sub.
df['text_processed'] = df['text'].fillna('').apply(preprocess_text)

## Extracting collocates

In [None]:
# One context "word": a run of digits/listed letters followed by whitespace
word_regex = r'(?:[AaĄąBbCcĆćDdEeĘęFfGgHhIiJjKkLlŁłMmNnŃńOoÓóPpRrSsŚśTtUuWwYyZzŹźŻż\d]+\s+)'

# The key phrase: a token starting with "zmian", one space, a token starting
# with "klimat", then whitespace. Written as a raw string because "\s" in a
# normal string literal is an invalid escape sequence.
keyword_regex = r"zmian[^ ]* klimat[^ ]*\s+"

# Key phrase with one to five context words on each side
regex_expression = word_regex + r'{1,5}' + keyword_regex + word_regex + r'{1,5}'

# NOTE(review): at least one context word (each followed by whitespace) is
# required on both sides, and findall returns non-overlapping matches, so a
# phrase at the very start or end of a text, or two occurrences close
# together, can be missed or get a shortened window.
df['raw_colocates'] = df['text_processed'].apply(lambda x: re.findall(regex_expression, x))

## Checking the extraction process

In [None]:
# Number of extracted context windows per text, then the total over all texts
df['count'] = df['raw_colocates'].map(len)
print(df['count'].sum())

In [None]:
# Reference figure: occurrences of the bare key phrase (no context words
# required) in each processed text, then the total over all texts
phrase_hits = df['text_processed'].map(
    lambda text: sum(1 for _ in re.finditer("zmian[^ ]* klimat[^ ]*", text))
)
df['real_count'] = phrase_hits
print(df['real_count'].sum())

## Texts containing the key phrase, by source

In [None]:
# Per source, the number of texts in which the key phrase occurs at least once
has_phrase = df['real_count'] > 0
df.loc[has_phrase, 'source'].value_counts()

## Saving

In [None]:
# One output row per extracted context window:
#   - keep only texts with at least one window and unpack the list column,
#   - renumber the rows, and use that running number as the row 'id',
#   - keep the originating text's id under 'text_id',
#   - drop the bulky text columns and the helper counters.
df_exploded = (
    df.loc[df['count'] > 0]
    .explode('raw_colocates')
    .reset_index(drop=True)
    .rename(columns={'raw_colocates': 'colocate', 'id': 'text_id'})
    .assign(id=lambda frame: frame.index)
    .drop(columns=['text', 'text_processed', 'count', 'real_count'])
)
df_exploded.head()

In [None]:
# Write the table to the working directory; the frame's index is not written
# because the 'id' column already carries the row number
output_file = "colocates.csv"
df_exploded.to_csv(output_file, index=False)