# Random forest (RF) plot

In [None]:
from sklearn.ensemble import RandomForestClassifier
import pickle
from sklearn import tree
import matplotlib.pyplot as plt

# Load the pickled model and draw the first of its estimators as a tree diagram.
# NOTE(review): pickle.load can execute arbitrary code while deserialising;
# only load model files that come from a trusted source.
with open('plots/model.pkl', 'rb') as model_file:
    clf = pickle.load(model_file)

# presumably `clf` is a fitted RandomForestClassifier (it must expose
# `estimators_`) -- TODO confirm against the code that wrote model.pkl
first_tree = clf.estimators_[0]

plt.figure(figsize=(30, 15))
tree.plot_tree(first_tree, filled=True)
plt.savefig('plots/rf_tree.png', dpi=900)


# Rzepa plots

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
from classify import data_pipeline, eco_selector
import pandas as pd

def validator(df):
    """Clean an article DataFrame and make sure it has an ``id`` column.

    Steps, in order:

    1. Drop fully duplicated rows.
    2. Drop rows whose ``text`` is missing.
    3. Drop the ``vectorized`` column if it is present.
    4. If the index contains duplicated labels, move the index into a
       column, discard any existing ``id`` column and use the former index
       as the new ``id``. Otherwise, if there is no ``id`` column, create
       one from the index.
    5. Parse the ``date`` column with ``pd.to_datetime``.

    Args:
        df: DataFrame with at least the columns ``text`` and ``date``.

    Returns:
        A new, cleaned DataFrame; the number of remaining rows is printed.

    Raises:
        KeyError: If ``text`` or ``date`` is missing from ``df``.
    """
    df = df.drop_duplicates()
    df = df[~df['text'].isna()]
    if 'vectorized' in df.columns.values:
        df = df.drop(columns=['vectorized'])
    if df.index.duplicated().any():
        print("Duplicated index!!!")
        df = df.reset_index()
        # errors='ignore': a frame with a duplicated index does not
        # necessarily carry an 'id' column; a plain drop would raise KeyError.
        df = df.drop(columns=['id'], errors='ignore')
        # NOTE(review): the new 'id' holds the old index labels, which are
        # the duplicated ones -- confirm that non-unique ids are acceptable.
        df = df.rename(columns={'index': 'id'})
    elif 'id' not in df.columns.values:
        df = df.reset_index()
        df = df.rename(columns={'index': 'id'})
    df['date'] = pd.to_datetime(df['date'])
    print(f"Found {len(df)} files")
    return df

In [None]:
# Run the classification pipeline for one corpus, merge the two result sets
# into a single validated frame and store it as CSV.
corp = 'rzepa'
df_rest, df_eco = data_pipeline(corp)
selected_from_rest = eco_selector(df_rest)
df_final = validator(pd.concat([df_eco, selected_from_rest]))
df_final.to_csv(f"eco_{corp}.csv")

# Histogram of the `proba` column of df_rest; the count axis is logarithmic.
plt.style.use('ggplot')
plt.figure(figsize=(10, 5))
sns.histplot(data=df_rest, x='proba')
plt.xlabel('Probability of an article being related to climate change')
plt.ylabel('Number of articles')
plt.yscale('log')
plt.savefig('plots/proba_hist.png', dpi=600)

In [None]:

# Histogram of `ngram_sum_squared_to_total` over df_rest, limited to values
# between 0 and 100 (both ends included); the count axis is logarithmic.
plt.figure(figsize=(10, 5))
ngram_in_range = df_rest[df_rest['ngram_sum_squared_to_total'].between(0, 100)]
sns.histplot(ngram_in_range, x='ngram_sum_squared_to_total', bins=50)
# Label typo fixed: "devided" -> "divided".
plt.xlabel('Squared number of selected ngrams found in the article divided by the number of tokens')
plt.ylabel('Number of articles')
plt.yscale('log')
plt.savefig('plots/ngarm_hist.png', dpi=600)

In [None]:
# Two-dimensional histogram of `ngram_sum_squared_to_total` against `proba`
# for df_final, with a colour bar; the colour scale is capped at vmax=50.
plt.figure(figsize=(20, 10))
sns.histplot(
    df_final,
    x='ngram_sum_squared_to_total',
    y='proba',
    bins=(150, 25),
    cbar=True,
    cmap='viridis',
    vmax=50,
)
# Label typo fixed: "devided" -> "divided".
plt.xlabel('Squared number of selected ngrams found in the article divided by the number of tokens')
plt.ylabel('Probability of an article being related to climate change')
# Only the window x in [0, 100], y in [0.9, 1] is shown.
plt.xlim(0, 100)
plt.ylim(0.9, 1)
plt.savefig('plots/ngarm_proba_hist.png', dpi=600)

# t-SNE plots

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
# Scatter plot of the t-SNE coordinates stored in plots/tsne.csv, with one
# series per value of the `class` column (0 and 1).
plt.style.use('ggplot')
df_tsne = pd.read_csv("plots/tsne.csv")
plt.figure(figsize=(16,10))
# Title in English: "t-SNE results on input data for the classification
# layer of neural network"
plt.title("Rezultat t-SNE na danych wejściowych warstwy klasyfikującej sieci neuronowej")
# Legend labels in English: "Inne" = "Other",
# "Artykuł związany z klimatem" = "Climate-related article".
for class_id, legend_label in ((0, "Inne"), (1, "Artykuł związany z klimatem")):
    points = df_tsne[df_tsne["class"] == class_id]
    plt.scatter(points["x"], points["y"], label=legend_label, alpha=0.5)
plt.legend()
# Axis labels in English: "t-SNE dimension x" / "t-SNE dimension y".
plt.xlabel("t-SNE wymiar x")
plt.ylabel("t-SNE wymiar y")
plt.savefig("plots/tsne.png", dpi=600)