# Подготовка к работе

In [35]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding, MaxPooling1D, Conv1D, Dropout, Flatten
from tensorflow.keras import utils
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.callbacks import ModelCheckpoint
from google.colab import files
import pandas as pd
import numpy as np

В качестве датасета будут использоваться данные с соревнования kaggle, поэтому необходимо загрузить файл с oauth_token.

In [36]:
f = files.upload()

Saving kaggle.json to kaggle.json


In [37]:
!sudo apt-get update
!sudo apt-get install unzip

0% [Working]            Get:1 https://cloud.r-project.org/bin/linux/ubuntu jammy-cran40/ InRelease [3,626 B]
Hit:2 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64  InRelease
Hit:3 http://archive.ubuntu.com/ubuntu jammy InRelease
Get:4 https://cloud.r-project.org/bin/linux/ubuntu jammy-cran40/ Packages [44.8 kB]
Get:5 http://security.ubuntu.com/ubuntu jammy-security InRelease [110 kB]
Get:6 http://archive.ubuntu.com/ubuntu jammy-updates InRelease [119 kB]
Get:7 https://ppa.launchpadcontent.net/c2d4u.team/c2d4u4.0+/ubuntu jammy InRelease [18.1 kB]
Get:8 http://archive.ubuntu.com/ubuntu jammy-backports InRelease [109 kB]
Get:9 http://security.ubuntu.com/ubuntu jammy-security/universe amd64 Packages [1,009 kB]
Get:10 http://archive.ubuntu.com/ubuntu jammy-updates/universe amd64 Packages [1,274 kB]
Hit:11 https://ppa.launchpadcontent.net/deadsnakes/ppa/ubuntu jammy InRelease
Get:12 http://archive.ubuntu.com/ubuntu jammy-updates/main amd64 Packages [1,398 kB]
Ge

In [38]:
!mkdir /root/.kaggle
!mv kaggle.json /root/.kaggle

mkdir: cannot create directory ‘/root/.kaggle’: File exists


In [39]:
!chmod 600 /root/.kaggle/kaggle.json

In [40]:
!kaggle competitions download -c nlp-getting-started

nlp-getting-started.zip: Skipping, found more recently modified local copy (use --force to force download)


In [41]:
!unzip nlp-getting-started.zip

Archive:  nlp-getting-started.zip
  inflating: sample_submission.csv   
  inflating: test.csv                
  inflating: train.csv               


# Работа с данными

In [42]:
max_words = 10000
max_len = 100

In [43]:
train_data = 'train.csv'
test_data = 'test.csv'

In [44]:
df_train = pd.read_csv(train_data)
df_test = pd.read_csv(test_data)

In [45]:
df_test.head()

Unnamed: 0,id,keyword,location,text
0,0,,,Just happened a terrible car crash
1,2,,,"Heard about #earthquake is different cities, s..."
2,3,,,"there is a forest fire at spot pond, geese are..."
3,9,,,Apocalypse lighting. #Spokane #wildfires
4,11,,,Typhoon Soudelor kills 28 in China and Taiwan


In [46]:
x_train = df_train.text.fillna('')
y_train = df_train['target']

In [47]:
test_ids = df_test['id']

Посмотрим, как выглядят данные для обучения

In [49]:
x_train[120]

'#WisdomWed BONUS - 5 Minute Daily Habits that could really improve your life. How many do you already do? #lifehacks http://t.co/TBm9FQb8cW'

Перед началом работы также нужно преобразовать слова в числа. Для этого используется tokenizer

In [50]:
tokenizer = Tokenizer(num_words=max_words)

In [51]:
tokenizer.fit_on_texts(x_train)

In [52]:
x_train = tokenizer.texts_to_sequences(x_train)

Разделение данных на тестовые данные (20% от всего датасета) и на обучающие

In [53]:
proportion_data = len(x_train) - int(len(x_train) * 0.2)
x_test = x_train[proportion_data:]
y_test = y_train[proportion_data:]
x_train = x_train[:proportion_data]
y_train = y_train[:proportion_data]

In [54]:
x_train = pad_sequences(x_train, maxlen=max_len, padding='post')
x_test = pad_sequences(x_test, maxlen=max_len, padding='post')

# Работа с моделью

Сохраним лучшую обученную модель в best_model.h5

In [55]:
best_model_path = 'best_model.h5'
callback_checpoint = ModelCheckpoint(
    best_model_path,
    monitor='val_accuracy',
    save_best_only=True,
    verbose=1
)
callbacks_list = [callback_checpoint]

Модель будет состоять из 3-х групп слоёв, 2 из которых состоят из 4 слоев:
* 2 слоя одномерной свертки
* слой подвыборки максимального значения
* слой для снижения переобучения

Последняя группа слоев состоит из:
* слой для преобразования массива в плоский вектор
* полносвязный слой
* слой для снижения переобучения
* полносвязный слой для выдачи конечного результата (0 или 1)

In [56]:
model = Sequential()
model.add(Embedding(max_words, 64, input_length=max_len))

model.add(Conv1D(250, 5, activation='relu'))
model.add(Conv1D(250, 5, activation='relu'))
model.add(MaxPooling1D(pool_size=2))
model.add(Dropout(0.3))

model.add(Conv1D(500, 5, activation='relu'))
model.add(Conv1D(500, 5, activation='relu'))
model.add(MaxPooling1D(pool_size=2))
model.add(Dropout(0.3))

model.add(Flatten())
model.add(Dense(256, activation='relu'))
model.add(Dropout(0.3))
model.add(Dense(1, activation='sigmoid'))

In [60]:
model.compile(optimizer=Adam(learning_rate=0.0001),
              loss='binary_crossentropy',
              metrics=['accuracy'])

In [58]:
model.fit(x_train,
          y_train,
          epochs=10,
          validation_split=0.15,
          batch_size=50,
          verbose=1,
          callbacks=callbacks_list)

Epoch 1/10
Epoch 1: val_accuracy improved from -inf to 0.57877, saving model to best_model.h5
Epoch 2/10
  1/104 [..............................] - ETA: 2s - loss: 0.7352 - accuracy: 0.4800

  saving_api.save_model(


Epoch 2: val_accuracy improved from 0.57877 to 0.77681, saving model to best_model.h5
Epoch 3/10
Epoch 3: val_accuracy did not improve from 0.77681
Epoch 4/10
Epoch 4: val_accuracy did not improve from 0.77681
Epoch 5/10
Epoch 5: val_accuracy did not improve from 0.77681
Epoch 6/10
Epoch 6: val_accuracy did not improve from 0.77681
Epoch 7/10
Epoch 7: val_accuracy did not improve from 0.77681
Epoch 8/10
Epoch 8: val_accuracy did not improve from 0.77681
Epoch 9/10
Epoch 9: val_accuracy did not improve from 0.77681
Epoch 10/10
Epoch 10: val_accuracy did not improve from 0.77681


<keras.src.callbacks.History at 0x7c63abcdeaa0>

Проверим обученную модель на тестовом наборе данных

In [59]:
model.evaluate(x_test, y_test, verbose=1)



[2.2084503173828125, 0.7411301136016846]

Таким образом, точность модели составила ~70%