In [None]:
from classify import data_pipeline, eco_selector
from resources.get_data import get_current_data
from resources.setup import get_setup
import pandas as pd

In [None]:
def validator(df):
    """Clean a corpus frame and make sure it exposes an ``id`` column.

    Steps, in order:

    1. drop the ``vectorized`` column if it is present;
    2. drop fully duplicated rows and rows whose ``text`` is missing;
    3. if the index contains duplicate labels, discard any existing ``id``
       column; then, whenever there is no ``id`` column, move the index into
       the columns and name it ``id``;
    4. parse ``date`` with ``pd.to_datetime``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with at least ``text`` and ``date`` columns.

    Returns
    -------
    pandas.DataFrame
        A new frame; the frame passed in is left untouched.

    Raises
    ------
    KeyError
        If ``text`` or ``date`` is missing.

    Notes
    -----
    When the index had duplicate labels, the resulting ``id`` column holds
    those same labels, so it is not unique either.
    An index with a name other than ``id`` (or no name) that is not called
    ``index`` after ``reset_index`` is not renamed.
    """
    # Remove `vectorized` before de-duplicating: list/array cells are
    # unhashable and make drop_duplicates() raise.
    df = df.drop(columns=['vectorized'], errors='ignore')
    df = df.drop_duplicates()
    df = df[df['text'].notna()]

    if df.index.duplicated().any():
        print("Duplicated index!!!")
        # Drop the old ids first so that (a) a missing `id` column is not an
        # error and (b) an index that is itself named `id` survives the reset.
        df = df.drop(columns=['id'], errors='ignore')

    if 'id' not in df.columns:
        df = df.reset_index().rename(columns={'index': 'id'})

    # assign() returns a new frame, so no write happens on a filtered view.
    df = df.assign(date=pd.to_datetime(df['date']))
    print(f"Found {len(df)} files")
    return df

# Big corpora

## Rzepa

In [None]:
# data_pipeline yields two frames: the second is kept as is, the first is
# passed through eco_selector; both are stacked, validated and exported.
corp = 'rzepa'
df_rest, df_eco = data_pipeline(corp)
df_final = pd.concat([df_eco, eco_selector(df_rest)]).pipe(validator)
df_final.to_csv(f"eco_{corp}.csv")

## Wyborcza

In [None]:
# Stack the pipeline's second frame with the eco_selector pick from the first,
# validate the result and export it as eco_wyborcza.csv.
corp = 'wyborcza'
df_rest, df_eco = data_pipeline(corp)
df_final = pd.concat([df_eco, eco_selector(df_rest)]).pipe(validator)
df_final.to_csv(f"eco_{corp}.csv")

In [None]:
# NOTE(review): validates whatever `df_final` currently holds and writes it to
# the file named after the current `corp`. On a top-to-bottom run this repeats
# the end of the previous cell — looks like a leftover; confirm and remove.
df_final = df_final.pipe(validator)
df_final.to_csv(f"eco_{corp}.csv")

## Gazeta Polska Codziennie

In [None]:
# Same flow as the other outlets, except that eco_selector receives a second
# positional argument (False).
# NOTE(review): the meaning of that flag is not visible in this notebook — confirm.
corp = 'gpc'
df_rest, df_eco = data_pipeline(corp)
df_eco = pd.concat([df_eco, eco_selector(df_rest, False)]).pipe(validator)
df_eco.to_csv(f"eco_{corp}.csv")

## Polityka

In [None]:
# Stack the pipeline's second frame with the eco_selector pick from the first,
# validate the result and export it as eco_polityka.csv.
corp = 'polityka'
df_rest, df_eco = data_pipeline(corp)
df_eco = pd.concat([df_eco, eco_selector(df_rest)]).pipe(validator)
df_eco.to_csv(f"eco_{corp}.csv")

# Small corpora

## Dorzeczy

In [None]:
from datetime import datetime

corp = 'dorzeczy'
df_rest, df_eco = data_pipeline(corp)

# This outlet needs its ids rebuilt: the existing `id` column is discarded and
# the `Unnamed: 0` column takes its place as the index.
df_eco = (
    pd.concat([df_eco, eco_selector(df_rest)])
    .reset_index()
    .drop(columns='id')
    .rename(columns={'Unnamed: 0': 'id'})
    .set_index('id')
)

# Keep only rows dated before 1 January 2023, then validate and export.
df_eco = df_eco.loc[df_eco['date'] < datetime(2023, 1, 1)].pipe(validator)
df_eco.to_csv(f"eco_{corp}.csv")

## Wprost

In [None]:
# Stack the pipeline's second frame with the eco_selector pick from the first,
# validate the result and export it as eco_wprost.csv.
corp = 'wprost'
df_rest, df_eco = data_pipeline(corp)
df_eco = pd.concat([df_eco, eco_selector(df_rest)]).pipe(validator)
df_eco.to_csv(f"eco_{corp}.csv")

## Newsweek

In [None]:
# Stack the pipeline's second frame with the eco_selector pick from the first,
# validate the result and export it as eco_newsweek.csv.
corp = 'newsweek'
df_rest, df_eco = data_pipeline(corp)
df_eco = pd.concat([df_eco, eco_selector(df_rest)]).pipe(validator)
df_eco.to_csv(f"eco_{corp}.csv")

## wPolityce

In [None]:
# Stack the pipeline's second frame with the eco_selector pick from the first,
# validate the result and export it as eco_wpolityce.csv.
corp = 'wpolityce'
df_rest, df_eco = data_pipeline(corp)
df_eco = pd.concat([df_eco, eco_selector(df_rest)]).pipe(validator)
df_eco.to_csv(f"eco_{corp}.csv")

# Corpus

In [None]:
import os

# Collect the per-outlet exports from the working directory.
# Restricted to *.csv so that other entries whose name happens to start with
# "eco" are not handed to read_csv later, and sorted because os.listdir()
# returns names in arbitrary order.
corps = sorted(
    name for name in os.listdir()
    if name.startswith("eco") and name.endswith(".csv")
)
print(corps)

## Dates

## Most Common ngrams

In [None]:
# Stack every export listed in `corps` into one frame.
# The files are read here directly instead of relying on a `data` variable:
# at this point of a top-to-bottom run `data` does not exist yet, and the cell
# that creates it further down leaves it as a DataFrame, which pd.concat rejects.
df = pd.concat([pd.read_csv(path, parse_dates=['date']) for path in corps])
len(df)

In [None]:
from files.ngram.ngrams import ngram_counter

# Rows with a missing `clean_text` are excluded before counting.
df = df.dropna(subset=['clean_text'])

# For n = 1, 2, 3: keep the first 100 rows returned by ngram_counter
# (presumably the most frequent ones — ordering is decided inside
# ngram_counter) and write them to most_common_<n>-gram.csv.
for ngram in (1, 2, 3):
    df_ngram = ngram_counter(ngram, df).head(100)
    df_ngram.to_csv(f"most_common_{ngram}-gram.csv", index=False)

In [None]:
import numpy as np
from scipy.optimize import curve_fit
import matplotlib.pyplot as plt
# Fit inputs: row labels of `df_ngram` as x, its column labelled 1 as y.
# `df_ngram` is whatever the export loop left behind, i.e. the table for the
# last n it processed.
x, y = df_ngram.index, df_ngram[1]
# Model to fit
def func(x, A, B, C):
    """Power law with a constant offset: ``A * x**B + C``.

    ``x`` may be a scalar or an array-like; the result has the shape
    ``np.power(x, B)`` produces.
    """
    power_term = np.power(x, B)
    return A * power_term + C

# Fit the model to the (x, y) pairs prepared at the top of this cell.
# NOTE(review): if x starts at 0, a negative exponent B makes the model
# infinite at that point — consider fitting on a 1-based rank instead.
params, _ = curve_fit(func, x, y)

# Extract the fitted parameters
A, B, C = params

# Print the fitted parameters
print(f"A: {A}, B: {B}, C: {C}")

# Draw both curves against the same x values so they stay aligned even when
# the index is not 0..n-1, and label them so the lines can be told apart.
plt.plot(x, y, label="data")
plt.plot(x, func(x, A, B, C), label="power-law fit")
plt.legend()
plt.show()

## Selected ngrams distribution

In [None]:
import regex as re
import matplotlib.pyplot as plt

In [None]:
# Load only the columns the keyword counts need from every export in `corps`
# and stack them into one frame.
# A comprehension is used so that the notebook-level `df` and `corp` are not
# overwritten as a side effect, and so `data` is a DataFrame from the start
# rather than a list that later turns into a frame.
data = pd.concat([
    pd.read_csv(path, usecols=['clean_text', 'date', 'source'], parse_dates=['date'])
    for path in corps
])

In [None]:
# Per-row, case-insensitive occurrence counts of each phrase in `clean_text`,
# stored in one new column per phrase.
keyword_patterns = {
    'climate_change_count': r"zmiana klimat",
    'global_warming_count': r"globalny ocieplenie",
}
for column_name, pattern in keyword_patterns.items():
    data[column_name] = data['clean_text'].str.count(pattern, flags=re.IGNORECASE)


In [None]:
# Total 'globalny ocieplenie' mentions per calendar year, over all loaded files.
global_warming_count = data.groupby(data.date.dt.year).global_warming_count.sum()

fig, ax = plt.subplots(figsize=(12, 8))
ax.bar(global_warming_count.index, global_warming_count)
ax.set(
    xlabel="Year",
    ylabel="Number of mentions",
    title="How many times 'globalny ocieplenie' appeared in the lemmatized text in a given year",
)
# Saving is the last step: labels set after savefig would not reach the file.
fig.savefig("global_warming.png")

In [None]:
# Total 'zmiana klimat' mentions per calendar year, over all loaded files.
# (For a monthly series, group with pd.Grouper(key='date', freq='M') instead
# and adjust the title and x label accordingly.)
climate_change_count = data.groupby(data.date.dt.year).climate_change_count.sum()

fig, ax = plt.subplots(figsize=(12, 8))
ax.bar(climate_change_count.index, climate_change_count)
# The data is aggregated per year, so the title says "year" to match the axis.
fig.suptitle("How many times 'zmiana klimat' appeared in the lemmatized text in a given year", fontsize=15)
ax.set(xlabel="Year", ylabel="Number of mentions")
fig.savefig("climate_change.png")