In [None]:
import pandas as pd 
import numpy as np
import sys
sys.path.append('..')
import Classification.resources.preprocessing as preprocessing
import Classification.resources.setup as setup

In [None]:
# Load the labelled corpus and expose the target column under the name
# 'label' (the CSV calls it 'class').
df = pd.read_csv('../Classification/files/dataset.csv').rename(columns={'class': 'label'})

# Class balance: number of rows per label value.
df['label'].value_counts()

In [None]:
# Read the stop-word list; header=None makes pandas treat the first line
# as data rather than as column names.
stopwords = pd.read_csv(
    '../Classification/files/polish_stopwords.txt',
    header=None,
)

# NOTE(review): the first three config arguments are passed as empty strings;
# their meaning is not visible from this notebook -- confirm against setup.config.
config = setup.config('', '', '', stopwords)

In [None]:
# Enrich the frame via preprocessing.meta_features. `df` is overwritten with
# the result, so re-running this cell feeds the already-processed frame back in.
df = preprocessing.meta_features(df, stopwords)

# Split/vectorise the data; the fitted scaler, vectorizer and selector are
# returned alongside the train/test arrays so they can be inspected later.
(
    train_x, train_y,
    test_x, test_y,
    scaler, vectorizer, selector,
) = preprocessing.vectorize_dataset(config, df)

In [None]:
# Export the n-grams kept by the feature selector.
#
# `vocabulary_` maps each term to its column index in the vectorised matrix.
# Dict iteration order is NOT column order, so the terms are sorted by their
# index before applying the selector's boolean mask; masking the raw key
# order would pair mask positions with the wrong terms.
# NOTE(review): assumes the selector's support mask covers exactly the
# vectorizer's columns, in the same order -- confirm in vectorize_dataset.
vocab = vectorizer.vocabulary_
selected = selector.get_support()
keys = np.array(sorted(vocab, key=vocab.get))
ngrams = keys[selected]

# One n-gram per line. UTF-8 is pinned so non-ASCII (e.g. Polish) characters
# do not depend on the platform's default text encoding.
np.savetxt("ngrams_7500.txt", ngrams, fmt='%s', delimiter='\n', encoding='utf-8')

In [None]:
import tensorflow as tf

# Width of the input layer, taken from the training matrix instead of the
# previously hard-coded 7506 so the model keeps working when the number of
# selected features changes.
n_features = train_x.shape[1]

# Binary classifier: three L2-regularised ReLU layers (128 -> 64 -> 32), each
# followed by 20% dropout, and a single sigmoid output unit.
# `input_dim` stays on the first Dense layer (rather than a separate Input
# layer) because a later cell rebuilds a model from `model.layers`.
model = tf.keras.models.Sequential([
    tf.keras.layers.Dense(128, input_dim=n_features, activation='relu', kernel_regularizer=tf.keras.regularizers.l2(0.01)),
    tf.keras.layers.Dropout(0.2),
    tf.keras.layers.Dense(64, activation='relu', kernel_regularizer=tf.keras.regularizers.l2(0.01)),
    tf.keras.layers.Dropout(0.2),
    tf.keras.layers.Dense(32, activation='relu', kernel_regularizer=tf.keras.regularizers.l2(0.01)),
    tf.keras.layers.Dropout(0.2),
    tf.keras.layers.Dense(1, activation='sigmoid'),
])
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

In [None]:
# Train for five epochs. The test split doubles as the validation set here,
# so the per-epoch "val_*" metrics are measured on the test data.
model.fit(
    train_x,
    train_y,
    validation_data=(test_x, test_y),
    epochs=5,
    verbose=1,
)

In [None]:
# Build a feature extractor from the trained network: every layer except the
# final one. The layer objects are reused, not copied, so `new_model` shares
# its weights with `model`.
layers = model.layers
all_but_last = layers[:-1]
new_model = tf.keras.Sequential(all_but_last)

In [None]:
# Sanity check: display the input shape the truncated model expects.
new_model.input_shape

In [None]:
# Stack the train and test splits back together along the row axis
# (train rows first, then test rows).
x_data = np.concatenate([train_x, test_x])
y_data = np.concatenate([train_y, test_y])

# Forward pass of every sample through the truncated network; the result is
# the activation of its last remaining layer.
x_values = new_model(x_data)

In [None]:
from sklearn.manifold import TSNE

In [None]:
import inspect

# scikit-learn 1.5 renamed TSNE's `n_iter` argument to `max_iter`, and the old
# name was removed in a later release. Pick whichever keyword the installed
# version accepts, so the cell runs on both old and new scikit-learn.
_tsne_iter_kwarg = "max_iter" if "max_iter" in inspect.signature(TSNE).parameters else "n_iter"

# Project the learned representations to 2-D; random_state fixes the seed so
# the embedding is repeatable.
tsne = TSNE(
    n_components=2,
    perplexity=50,
    verbose=1,
    random_state=123,
    **{_tsne_iter_kwarg: 10000},
)
v = tsne.fit_transform(x_values)

In [None]:
# One row per embedded sample: its label plus the two t-SNE coordinates.
df_tsne = pd.DataFrame({"class": y_data, "x": v[:, 0], "y": v[:, 1]})

# NOTE(review): `text` and `id` are taken from `df` in its own row order,
# whereas `class`/`x`/`y` follow the train-then-test concatenation order.
# If vectorize_dataset shuffles or filters rows, these columns will not line
# up with the coordinates -- verify against the split logic.
df_tsne["text"] = df["text"]
df_tsne["id"] = df.index

# Persist the table for plotting.
df_tsne.to_csv("../Classification/plots/tsne.csv")