In [9]:
import numpy as np
import pandas as pd
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from sklearn.neighbors import KNeighborsClassifier

In [10]:
TRAINING_SAMPLES = 15000
VALIDATION_SAMPLES = 6000
MAX_WORDS = 10000
MAX_FEATURE_LEN = 1000

In [11]:
train_data = pd.read_csv('../Data/train.csv')

nan_idx = train_data[pd.isnull(train_data.text)].index.tolist()
train_data.loc[nan_idx, 'text'] = ' '

In [12]:
tokenizer = Tokenizer(num_words=MAX_WORDS)

tokenizer.fit_on_texts(train_data.text)
sequences = tokenizer.texts_to_sequences(train_data.text)

print(f'Znaleziono {len(tokenizer.word_index)} unikatowych tokenów.')

data = pad_sequences(sequences, maxlen=MAX_FEATURE_LEN)
labels = train_data.label

print(f'Kształt tensora danych: {data.shape}')
print(f'Kształt tensora etykiet: {labels.shape}')

Znaleziono 238051 unikatowych tokenów.
Kształt tensora danych: (20800, 1000)
Kształt tensora etykiet: (20800,)


In [13]:
indices = np.arange(data.shape[0])
np.random.shuffle(indices)
data = data[indices]
labels = labels[indices]

x_train = data[:TRAINING_SAMPLES]
y_train = labels[:TRAINING_SAMPLES]
x_val = data[TRAINING_SAMPLES : TRAINING_SAMPLES + VALIDATION_SAMPLES]
y_val = labels[TRAINING_SAMPLES : TRAINING_SAMPLES + VALIDATION_SAMPLES]

In [15]:
knn = KNeighborsClassifier(n_neighbors=7)

knn.fit(x_train, y_train)

KNeighborsClassifier(n_neighbors=7)

In [17]:
test_data = pd.read_csv('../Data/test.csv')
nan_idx = test_data[pd.isnull(test_data.text)].index.tolist()
test_data.loc[nan_idx, 'text'] = ' '

tokenizer = Tokenizer(num_words=MAX_WORDS)
tokenizer.fit_on_texts(test_data.text)
sequences = tokenizer.texts_to_sequences(test_data.text)

x_test = pad_sequences(sequences, maxlen=MAX_FEATURE_LEN)
y_test = test_data.label


predicted = knn.predict(x_test)
print(f'Accuracy: {np.mean(predicted == y_test) * 100}%')

Accuracy: 55.3076923076923%
