In [None]:
import pandas as pd
import numpy as np
import spacy
from google.colab import drive
from pathlib import Path
from multiprocessing import Pool
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score
from tqdm import tqdm


# Fetch the small Polish spaCy pipeline. The leading "!" is an IPython shell
# escape, so this line only works inside a notebook kernel.
!python -m spacy download pl_core_news_sm

# Mount Google Drive so the shared-drive paths below are reachable (Colab only).
drive.mount('/content/drive')
# Polish pipeline object; not referenced by the cells shown here.
nlp = spacy.load('pl_core_news_sm')
# Root folders for the raw press corpora and for the derived NLP materials.
# NOTE(review): the shared-drive name ends with a space ("SKN AI FUW ") —
# presumably this matches the real Drive folder name; confirm before editing.
corpus_path = Path("/content/drive/Shareddrives/SKN AI FUW /Projekt z Dagmarą Mateją/Korpusy")
materials_path = Path("/content/drive/Shareddrives/SKN AI FUW /Projekt z Dagmarą Mateją/NLP materials")

In [None]:
# Stop-word file is read as a single unnamed column (no header row); trailing
# whitespace is stripped from every entry before it goes into the lookup set.
stopwords = pd.read_csv(materials_path / 'polish_stopwords.txt', header=None)
STOPWORDS = {entry.rstrip() for entry in stopwords[0]}

# CSV locations for each corpus. Multi-part corpora are numbered from 0.
links_gw = [
    corpus_path / "Gazeta Wyborcza (do 2018 r.)" / "data" / f"wyborcza_{part}.csv"
    for part in range(22)
]
links_wprost = [corpus_path / "Wprost 2015-2022" / "data" / "Wprost.csv"]
links_newsweek = [corpus_path / "Newsweek 2015-2022" / 'data' / "newsweek.csv"]
links_dorzeczy = [corpus_path / "DoRzeczy" / "data" / "dorzeczy.csv"]
links_polityka = [
    corpus_path / "Polityka" / "data" / f"polityka_{part}.csv"
    for part in range(5)
]
links_gpc = [
    corpus_path / "Gazeta Polska Codziennie" / "data" / f"gpc_{part}.csv"
    for part in range(13)
]
links_rzepa = [
    corpus_path / "Rzeczpospolita 2015-2022" / "data" / f"rzepa_{part}.csv"
    for part in range(22)
]
links_dataset = [materials_path / "dataset.csv"]
links_wpolityce = [corpus_path / "wPolityce" / 'data' / "wPolityce.csv"]

# Corpus name -> list of CSV files that make it up.
corpus_links = {
    "newsweek": links_newsweek,
    "rzepa": links_rzepa,
    "gpc": links_gpc,
    "wprost": links_wprost,
    "polityka": links_polityka,
    "dorzeczy": links_dorzeczy,
    "dataset": links_dataset,
    "wyborcza": links_gw,
    "wpolityce": links_wpolityce,
}

In [None]:
def generate_ngrams(text, n_gram=1, stopwords=None):
    """Split ``text`` into lower-cased n-grams with stop words removed.

    Parameters
    ----------
    text : str
        Raw text. It is lower-cased and split on single spaces; empty tokens
        (produced by consecutive spaces) are dropped.
    n_gram : int, default 1
        Number of consecutive tokens per n-gram.
    stopwords : collection of str, optional
        Tokens to discard. Defaults to the module-level ``STOPWORDS`` set.

    Returns
    -------
    list of str
        Space-joined n-grams in order of appearance. Empty when fewer than
        ``n_gram`` tokens survive filtering.

    Notes
    -----
    Non-string input (for example a NaN cell coming from pandas) does not
    raise: it is replaced by three ``"Nan"`` placeholder tokens. Only the
    errors caused by a non-string ``text`` are handled; anything else
    (including ``KeyboardInterrupt``) propagates instead of being silently
    swallowed.
    """
    if stopwords is None:
        stopwords = STOPWORDS

    try:
        raw_tokens = text.lower().split(' ')
    except (AttributeError, TypeError):
        # ``text`` is not a str, so fall back to placeholder tokens.
        tokens = ["Nan", "Nan", "Nan"]
    else:
        tokens = [tok for tok in raw_tokens if tok != '' and tok not in stopwords]

    # zip over n_gram progressively shifted views yields the sliding windows.
    windows = zip(*(tokens[shift:] for shift in range(n_gram)))
    return [' '.join(window) for window in windows]

def get_ngrams():
    """Load the n-gram table and the weak-n-gram table from ``materials_path / "ngram"``.

    Returns
    -------
    (DataFrame, DataFrame)
        ``ngrams.csv`` re-indexed by its ``phrase`` column, with the original
        row position kept in a new ``id`` column, followed by
        ``weak_grams.csv`` exactly as read.
    """
    ngram_dir = materials_path / "ngram"
    phrase_table = pd.read_csv(ngram_dir / "ngrams.csv")
    weak_table = pd.read_csv(ngram_dir / "weak_grams.csv")

    # The positional row number becomes the phrase's slot in the count vector.
    phrase_table = (
        phrase_table
        .reset_index()
        .rename(columns={"index": "id"})
        .set_index("phrase")
    )
    return phrase_table, weak_table

def get_vocab(df_ngram):
    """Build the two phrase lookups used by the vectorizer.

    Parameters
    ----------
    df_ngram : DataFrame
        Indexed by phrase, with ``id`` and ``origin`` columns.

    Returns
    -------
    (dict, dict)
        ``phrase -> id`` and ``phrase -> origin``.
    """
    phrases = df_ngram.index
    phrase_to_id = {phrase: slot for phrase, slot in zip(phrases, df_ngram["id"])}
    phrase_to_origin = {phrase: topic for phrase, topic in zip(phrases, df_ngram["origin"])}
    return phrase_to_id, phrase_to_origin

def get_topic_counter(df_ngram):
    """Return a fresh ``origin -> 0`` counter with one key per distinct origin.

    Keys follow the order in which origins first appear in ``df_ngram``.
    """
    origins = df_ngram.origin.unique()
    zeros = np.zeros_like(origins)
    return {origin: zero for origin, zero in zip(origins, zeros)}

def count_vectorizer(article):
    """Turn one article row into a numeric feature vector.

    Reads the module-level ``eco_vocab``, ``topic_vocab``, ``df_ngram`` and
    ``df_weak``.

    Parameters
    ----------
    article : mapping (e.g. a DataFrame row)
        Must provide string fields ``clean_text`` and ``clean_title``.

    Returns
    -------
    numpy.ndarray
        Three concatenated segments:

        1. one slot per vocabulary phrase: every 1-, 2- or 3-gram match in
           the text adds ``n`` to its slot, every match in the title adds
           ``2 * n``;
        2. four statistics: ``ngram_sum`` (sum of segment 1),
           ``ngram_sum ** 2``, ``ngram_sum ** 2`` divided by the number of
           space-separated pieces of ``clean_text``, and the weighted count
           of weak phrases found in the text;
        3. one slot per topic (origin) with the weighted count of text
           matches for that topic.

        When ``ngram_sum`` is non-zero, the weak count and the topic slots
        are divided by it.
    """
    vect = np.zeros(len(eco_vocab)).tolist()
    topic_counter = get_topic_counter(df_ngram)
    # Build the weak-phrase set once per call: membership tests against the
    # raw array would rescan every weak phrase for every single n-gram.
    weak_phrases = set(df_weak.phrase.values)
    weak_count = 0

    for ngram in range(1, 4):
        for word in generate_ngrams(article['clean_text'], ngram):
            if word in eco_vocab:
                vect[eco_vocab[word]] += ngram
                topic_counter[topic_vocab[word]] += ngram
            if word in weak_phrases:
                weak_count += ngram

        # Title matches count double, but feed neither the topic counter
        # nor the weak-phrase count.
        for word in generate_ngrams(article['clean_title'], ngram):
            if word in eco_vocab:
                vect[eco_vocab[word]] += 2 * ngram

    ngram_sum = np.sum(vect)
    ngram_sum_squared = ngram_sum ** 2
    # Denominator is the raw split length (stop words and empty pieces
    # included); it is never zero because ''.split(' ') == [''].
    ngram_sum_squared_to_total = ngram_sum ** 2 / len(article['clean_text'].split(' '))
    topics = np.array(list(topic_counter.values()))

    if ngram_sum != 0:
        topics = topics / ngram_sum
        weak_count /= ngram_sum

    stats = np.array([ngram_sum, ngram_sum_squared, ngram_sum_squared_to_total, weak_count])

    return np.concatenate((vect, stats, topics))


def get_word_list(vectorized, vocab=None):
    """List the vocabulary phrases that have a non-zero count in a feature vector.

    Parameters
    ----------
    vectorized : sequence of numbers
        Feature vector whose first ``len(vocab)`` entries are the per-phrase
        counts; anything after that is ignored.
    vocab : dict, optional
        ``phrase -> slot index`` mapping. Defaults to the module-level
        ``eco_vocab``.

    Returns
    -------
    list of str
        Matching phrases ordered by slot index (phrases sharing a slot keep
        their order in ``vocab``).
    """
    if vocab is None:
        vocab = eco_vocab

    # Invert the vocabulary once instead of rescanning it for every hit.
    phrases_by_slot = {}
    for phrase, slot in vocab.items():
        phrases_by_slot.setdefault(slot, []).append(phrase)

    hit_slots = np.nonzero(vectorized[:len(vocab)])[0]
    return [phrase for slot in hit_slots for phrase in phrases_by_slot.get(slot, [])]

# Module-level state shared by the vectorizer and the statistics helpers.
df_ngram, df_weak = get_ngrams()

# Names of the trailing entries of every feature vector: the four summary
# statistics first, then one entry per topic.
stat_list = [
    'ngram_sum',
    'ngram_sum_squared',
    'ngram_sum_squared_to_total',
    'weak_count',
]
stat_list.extend(get_topic_counter(df_ngram))

# Each phrase must map to exactly one vector slot.
assert not df_ngram.index.duplicated().any()
eco_vocab, topic_vocab = get_vocab(df_ngram)

In [None]:
def get_model(df, random_state=42):
    """Fit a random-forest classifier on vectorized articles and print its hold-out precision.

    Parameters
    ----------
    df : DataFrame
        Needs a ``label`` column. If it has no ``vectorized`` column, one is
        added in place by applying ``count_vectorizer`` to every row.
    random_state : int or None, default 42
        Seed shared by the train/test split and the forest so that repeated
        runs give the same model and the same printed precision. Pass
        ``None`` for an unseeded, non-reproducible run.

    Returns
    -------
    RandomForestClassifier
        The fitted model.
    """
    if "vectorized" not in df.columns:
        df['vectorized'] = df.apply(count_vectorizer, axis=1)

    df_train, df_test = train_test_split(df, random_state=random_state)
    train_x = np.array(df_train['vectorized'].to_list())
    test_x = np.array(df_test['vectorized'].to_list())
    train_y = df_train['label']
    test_y = df_test['label']

    params = {'n_estimators': 386, 'max_depth': 13, 'min_samples_split': 9}
    rf = RandomForestClassifier(random_state=random_state, **params)
    rf.fit(train_x, train_y)

    # Column 1 of predict_proba is the probability of the second class in
    # rf.classes_ (label 1 when the labels are 0/1).
    y_pred = rf.predict_proba(test_x)[:, 1] > 0.50
    print(precision_score(test_y, y_pred))
    return rf

def naive_filter(link, rf):
    """Score every article of one corpus CSV with the fitted classifier.

    Parameters
    ----------
    link : path-like
        CSV file with at least ``clean_title`` and ``clean_text`` columns.
    rf : fitted classifier
        Must expose ``predict_proba``.

    Returns
    -------
    DataFrame
        The CSV contents plus ``vectorized`` (feature vector), ``proba``
        (second column of ``predict_proba``), ``words`` (matched vocabulary
        phrases) and ``num_words`` (how many of them).
    """
    df = pd.read_csv(link)

    # Missing text is replaced by a three-token placeholder string.
    for column in ('clean_title', 'clean_text'):
        df[column] = df[column].fillna("None None None")

    df['vectorized'] = df.apply(count_vectorizer, axis=1)
    features = np.array(df['vectorized'].tolist())
    df['proba'] = rf.predict_proba(features)[:, 1]
    df['words'] = df['vectorized'].apply(get_word_list)
    df['num_words'] = df['words'].apply(len)
    return df

def get_trainset(corpus):
    """Load the labelled training frame appropriate for ``corpus``.

    * ``"rzepa"``: reads ``links_dataset[0]`` indexed by ``id`` and
      ``translated``, and renames its ``class`` column to ``label``.
    * anything else: stacks ``eco_rzepa.csv`` (label 1) on top of
      ``non_eco_result.csv`` (label 0) from ``materials_path / "rzepa"`` and
      parses their stored ``vectorized`` strings back into lists of floats.

    In both cases remaining missing values are filled with the string
    ``"None"``.
    """
    if corpus == "rzepa":
        labelled = pd.read_csv(links_dataset[0], index_col=['id', 'translated'])
        labelled = labelled.rename(columns={"class": "label"})
        return labelled.fillna("None")

    def parse_vector(serialized):
        # Drop the first and last character (presumably the surrounding
        # brackets), split on spaces, and skip the empty pieces left by
        # runs of spaces.
        return [float(piece) for piece in serialized[1:-1].split(" ") if piece != '']

    rzepa_dir = materials_path / "rzepa"
    df_eco = pd.read_csv(rzepa_dir / "eco_rzepa.csv", index_col='id')
    df_non_eco = pd.read_csv(rzepa_dir / "non_eco_result.csv", index_col='id')
    df_eco['label'] = 1
    df_non_eco['label'] = 0

    combined = pd.concat([df_eco, df_non_eco])
    combined['vectorized'] = combined['vectorized'].apply(parse_vector)
    return combined.fillna("None")

def get_resulting_df(links, rf, corpus):
    """Score every file of a corpus and return the combined frame indexed by ``id``.

    Corpora with fewer than 50 000 rows are returned whole, except
    ``"dorzeczy"``. For ``"dorzeczy"`` and for larger corpora the row count
    is printed and only rows with ``proba > 0.1`` are kept.
    """
    scored_parts = [naive_filter(link, rf) for link in tqdm(links)]
    df_result = pd.concat(scored_parts).set_index('id')

    keep_everything = corpus != "dorzeczy" and len(df_result) < 5 * 10 ** 4
    if keep_everything:
        return df_result

    print(f"{corpus}: {len(df_result)}")
    return df_result[df_result['proba'] > 0.1]

def get_statistics(df_result):
    """Unpack the trailing entries of ``vectorized`` into named columns.

    The last ``len(stat_list)`` entries of each vector are copied into
    columns named after ``stat_list`` (last name <- last entry, and so on).
    ``df_result`` is modified in place and also returned.
    """
    for offset, column in enumerate(reversed(stat_list), start=1):
        df_result[column] = df_result['vectorized'].apply(
            lambda vector, position=-offset: vector[position]
        )
    return df_result

def save(corpus, df_result, df_eco, df_non_eco):
    """Write the three result frames as CSV files under ``materials_path / corpus``.

    The directory is created if needed. Files written, in this order:
    ``eco_result.csv``, ``non_eco_result.csv``, ``results.csv``.
    """
    out_dir = materials_path / corpus
    out_dir.mkdir(parents=True, exist_ok=True)

    outputs = (
        ("eco_result.csv", df_eco),
        ("non_eco_result.csv", df_non_eco),
        ("results.csv", df_result),
    )
    for filename, frame in outputs:
        frame.to_csv(out_dir / filename)

def determine_fate(df_result, corpus):
    """Split scored articles into undecided, eco and non-eco frames.

    A row is *eco* when ``proba``, ``ngram_sum_squared_to_total`` and
    ``num_words`` are all strictly above their thresholds, and *non-eco*
    when ``proba`` is at or below the non-eco threshold. ``"rzepa"`` uses
    stricter thresholds than every other corpus. Rows whose index value
    appears in neither group stay in the undecided frame.

    Prints ``(undecided, eco, non_eco)`` row counts.

    Returns
    -------
    (DataFrame, DataFrame, DataFrame)
        Undecided, eco and non-eco rows, in that order.
    """
    if corpus == 'rzepa':
        min_proba, min_ratio, min_words, max_non_eco_proba = .9, 0.75, 3, .25
    else:
        min_proba, min_ratio, min_words, max_non_eco_proba = .5, 0.5, 2, .2

    is_eco = (
        (df_result['proba'] > min_proba)
        & (df_result['ngram_sum_squared_to_total'] > min_ratio)
        & (df_result['num_words'] > min_words)
    )
    df_eco = df_result[is_eco]
    df_non_eco = df_result[df_result['proba'] <= max_non_eco_proba]

    # Exclusion is by index value, not by row position.
    in_non_eco = df_result.index.isin(df_non_eco.index.values)
    in_eco = df_result.index.isin(df_eco.index.values)
    df_result = df_result[~in_non_eco & ~in_eco]

    print((len(df_result), len(df_eco), len(df_non_eco)))
    return df_result, df_eco, df_non_eco

In [None]:
def preprocess_corpus(corpus):
    """Run the whole filtering pipeline for one corpus unless it already has output.

    The corpus is treated as done, and skipped, as soon as any one of
    ``eco_result.csv``, ``non_eco_result.csv`` or ``results.csv`` exists
    under ``materials_path / corpus``. Otherwise: load the training set, fit
    the model, score the corpus files, derive the statistics columns, split
    the rows and save the three frames.
    """
    out_dir = materials_path / corpus
    result_files = ("eco_result.csv", "non_eco_result.csv", "results.csv")
    if any((out_dir / filename).is_file() for filename in result_files):
        print("Done: " + corpus)
        return

    print("Processing: " + corpus)
    links = corpus_links[corpus]
    rf = get_model(get_trainset(corpus))
    df_result = get_resulting_df(links, rf, corpus)
    get_statistics(df_result)  # adds the statistics columns in place
    df_result, df_eco, df_non_eco = determine_fate(df_result, corpus)
    save(corpus, df_result, df_eco, df_non_eco)


# Process every corpus. `corpus_links` is the mapping declared once in the
# configuration cell near the top of the notebook; keeping a second verbatim
# copy here would let the two drift apart.
#
# "rzepa" is processed first: for every other corpus get_trainset() reads
# materials_path / "rzepa" / "non_eco_result.csv", which is written by the
# rzepa run itself. sorted() is stable, so the remaining corpora keep their
# declared order.
# NOTE(review): get_trainset() also needs rzepa/eco_rzepa.csv, which no cell
# shown here writes — presumably prepared by hand after the rzepa run; confirm.
#
# "dataset" is the labelled training file, not a corpus to filter.
corpora = sorted(
    (name for name in corpus_links if name != 'dataset'),
    key=lambda name: name != 'rzepa',
)
for corpus in corpora:
    preprocess_corpus(corpus)