In [1]:
from classify import data_pipeline, eco_selector
import pandas as pd
from datetime import datetime

In [12]:
def validator(df, cutoff=datetime(2023, 1, 1)):
    """Clean a corpus frame before export and return the cleaned result.

    Steps, in order: parse ``date`` with ``pd.to_datetime``; drop fully
    duplicated rows; drop rows whose ``text`` is missing; drop the
    ``vectorized`` column if present; make sure an ``id`` column exists;
    drop rows whose ``date`` is missing; keep only rows dated strictly
    before ``cutoff``. A message is printed for every problem found and the
    final row count is printed at the end.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with at least ``date`` and ``text`` columns. It is not
        modified; all work happens on a new frame.
    cutoff : datetime, optional
        Exclusive upper bound for ``date``. Defaults to 2023-01-01.

    Returns
    -------
    pandas.DataFrame
        The cleaned frame.
    """
    # assign() returns a new frame, so the caller's `date` column is untouched.
    df = df.assign(date=pd.to_datetime(df['date']))

    if df.duplicated().any():
        print("Found duplicated rows")
        df = df.drop_duplicates()
    if df['text'].isna().any():
        print("Found NaN values in text column")
        df = df.dropna(subset=['text'])
    if 'vectorized' in df.columns:
        print("Found vectorized column")
        df = df.drop(columns=['vectorized'])

    if df.index.duplicated().any():
        print("Duplicated index!!!")
        # The old index values replace any existing `id` column. Dropping
        # first, with errors='ignore', means a frame without an `id` column
        # no longer raises and an index that is itself named `id` survives.
        # NOTE(review): `id` therefore still carries the duplicated labels.
        df = df.drop(columns=['id'], errors='ignore').reset_index()
        df = df.rename(columns={'index': 'id'})
    elif 'id' not in df.reset_index().columns:
        # Neither a column nor the index is called `id`: promote the index.
        print("No id column found")
        df = df.reset_index()
        df = df.rename(columns={'index': 'id'})

    if df['date'].isna().any():
        print("Found NaN values in date column")
        df = df.dropna(subset=['date'])
    df = df[df['date'] < cutoff]
    print(f"Found {len(df)} files")
    return df


# Big corpora

## Rzepa

In [3]:
# Run the pipeline for this corpus, append eco_selector's picks (main=True)
# to df_eco, validate the combined frame and write it to eco_<corp>.csv.
corp = 'rzepa'
df_rest, df_eco = data_pipeline(corp)
df_final = (
    pd.concat([df_eco, eco_selector(df_rest, main=True)])
    .pipe(validator)
)
df_final.to_csv(f"eco_{corp}.csv")

{1: 'rzepa', 2: 'gpc', 3: 'newsweek', 4: 'wprost', 5: 'dorzeczy', 6: 'polityka', 7: 'wyborcza', 8: 'wpolityce'}
rzepa
Configuration finished


## Wyborcza

In [13]:
# Run the pipeline for this corpus, append eco_selector's picks (main=True)
# to df_eco, validate the combined frame and write it to eco_<corp>.csv.
corp = 'wyborcza'
df_rest, df_eco = data_pipeline(corp)
df_final = (
    pd.concat([df_eco, eco_selector(df_rest, main=True)])
    .pipe(validator)
)
df_final.to_csv(f"eco_{corp}.csv")

{1: 'rzepa', 2: 'gpc', 3: 'newsweek', 4: 'wprost', 5: 'dorzeczy', 6: 'polityka', 7: 'wyborcza', 8: 'wpolityce'}
wyborcza
Configuration finished
Dataset of 10248 samples
label
1    5124
0    5124
Name: count, dtype: int64
Preprocessing ended
Accuracy of model 0.9980487804878049


100%|██████████| 130/130 [03:08<00:00,  1.45s/it]


55153 1117
Found vectorized column
Duplicated index!!!
Found 6241 files


## Gazeta Polska Codziennie

In [14]:
# Run the pipeline for this corpus, append eco_selector's picks (second
# positional argument False) to df_eco, validate and write eco_<corp>.csv.
corp = 'gpc'
df_rest, df_eco = data_pipeline(corp)
df_eco = (
    pd.concat([df_eco, eco_selector(df_rest, False)])
    .pipe(validator)
)
df_eco.to_csv(f"eco_{corp}.csv")

{1: 'rzepa', 2: 'gpc', 3: 'newsweek', 4: 'wprost', 5: 'dorzeczy', 6: 'polityka', 7: 'wyborcza', 8: 'wpolityce'}
gpc
Configuration finished
Dataset of 14640 samples
label
0    7454
1    7186
Name: count, dtype: int64
Preprocessing ended
Accuracy of model 0.9986338797814208


100%|██████████| 31/31 [01:06<00:00,  2.15s/it]


10645 1788
Found duplicated rows
Found vectorized column
Duplicated index!!!
Found 3874 files


## Polityka

In [15]:
# Run the pipeline for this corpus, append eco_selector's picks (default
# arguments) to df_eco, validate and write eco_<corp>.csv.
corp = 'polityka'
df_rest, df_eco = data_pipeline(corp)
df_eco = (
    pd.concat([df_eco, eco_selector(df_rest)])
    .pipe(validator)
)
df_eco.to_csv(f"eco_{corp}.csv")

{1: 'rzepa', 2: 'gpc', 3: 'newsweek', 4: 'wprost', 5: 'dorzeczy', 6: 'polityka', 7: 'wyborcza', 8: 'wpolityce'}
polityka
Configuration finished
Dataset of 8508 samples
label
1    4338
0    4170
Name: count, dtype: int64
Preprocessing ended
Accuracy of model 0.9917743830787309


100%|██████████| 6/6 [00:47<00:00,  7.89s/it]


2162 281
Found vectorized column
Found 950 files


# Small corpora

## Dorzeczy

In [3]:

# Run the pipeline for this corpus, append eco_selector's picks (default
# arguments) to df_eco, validate and write eco_<corp>.csv.
corp = 'dorzeczy'
df_rest, df_eco = data_pipeline(corp)
df_eco = (
    pd.concat([df_eco, eco_selector(df_rest)])
    .pipe(validator)
)
df_eco.to_csv(f"eco_{corp}.csv")

{1: 'rzepa', 2: 'gpc', 3: 'newsweek', 4: 'wprost', 5: 'dorzeczy', 6: 'polityka', 7: 'wyborcza', 8: 'wpolityce'}
dorzeczy
Configuration finished
Dataset of 7186 samples
label
0    3620
1    3566
Name: count, dtype: int64
Preprocessing ended
Accuracy of model 0.9944367176634215


100%|██████████| 3/3 [00:13<00:00,  4.44s/it]


1032 100
Found vectorized column
Found 309 files


## Wprost

In [4]:
# Run the pipeline for this corpus, append eco_selector's picks (default
# arguments) to df_eco, validate and write eco_<corp>.csv.
corp = 'wprost'
df_rest, df_eco = data_pipeline(corp)
df_eco = (
    pd.concat([df_eco, eco_selector(df_rest)])
    .pipe(validator)
)
df_eco.to_csv(f"eco_{corp}.csv")

{1: 'rzepa', 2: 'gpc', 3: 'newsweek', 4: 'wprost', 5: 'dorzeczy', 6: 'polityka', 7: 'wyborcza', 8: 'wpolityce'}
wprost
Configuration finished
Dataset of 7454 samples
label
1    3850
0    3604
Name: count, dtype: int64
Preprocessing ended
Accuracy of model 0.9932975871313673


100%|██████████| 3/3 [00:09<00:00,  3.19s/it]


857 113
Found vectorized column
Found 538 files


## Newsweek

In [5]:
# Run the pipeline for this corpus, append eco_selector's picks (default
# arguments) to df_eco, validate and write eco_<corp>.csv.
corp = 'newsweek'
df_rest, df_eco = data_pipeline(corp)
df_eco = (
    pd.concat([df_eco, eco_selector(df_rest)])
    .pipe(validator)
)
df_eco.to_csv(f"eco_{corp}.csv")

{1: 'rzepa', 2: 'gpc', 3: 'newsweek', 4: 'wprost', 5: 'dorzeczy', 6: 'polityka', 7: 'wyborcza', 8: 'wpolityce'}
newsweek
Configuration finished


Dataset of 8340 samples
label
0    4408
1    3932
Name: count, dtype: int64
Preprocessing ended
Accuracy of model 0.9940047961630696


100%|██████████| 6/6 [00:38<00:00,  6.46s/it]


2109 179
Found vectorized column
Found 645 files


## wPolityce

In [6]:
# Run the pipeline for this corpus, append eco_selector's picks (default
# arguments) to df_eco, validate and write eco_<corp>.csv.
corp = 'wpolityce'
df_rest, df_eco = data_pipeline(corp)
df_eco = (
    pd.concat([df_eco, eco_selector(df_rest)])
    .pipe(validator)
)
df_eco.to_csv(f"eco_{corp}.csv")

{1: 'rzepa', 2: 'gpc', 3: 'newsweek', 4: 'wprost', 5: 'dorzeczy', 6: 'polityka', 7: 'wyborcza', 8: 'wpolityce'}
wpolityce
Configuration finished


Dataset of 7666 samples
label
0    3882
1    3784
Name: count, dtype: int64
Preprocessing ended
Accuracy of model 0.9921773142112125


100%|██████████| 4/4 [00:49<00:00, 12.41s/it]


1536 145
Found vectorized column
Found 470 files
