In [11]:
import pandas as pd
import numpy as np

from ydata_profiling import ProfileReport
import plotly.express as px

import nltk
# import spacy

from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer

from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

import torch
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments


nltk.download('punkt')
nltk.download('wordnet')
nltk.download('stopwords')
nltk.download('punkt_tab')

# Загрузим английскую модель spaCy
# nlp = spacy.load('en_core_web_sm')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\McHomak\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\McHomak\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\McHomak\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\McHomak\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [12]:
import torch

# Проверка доступности CUDA
print("CUDA available:", torch.cuda.is_available())

if torch.cuda.is_available():
    # Вывод информации о GPU
    print("Device name:", torch.cuda.get_device_name(0))
    print("Number of GPUs:", torch.cuda.device_count())
else:
    print("CUDA is not available.")


CUDA available: True
Device name: NVIDIA GeForce RTX 4060
Number of GPUs: 1


In [13]:
df =  pd.read_csv("website_classification.csv")
df.head()

Unnamed: 0,website_url,text,category
0,https://www.booking.com/index.html?aid=1743217,official site good hotel accommodation big sav...,Travel
1,https://travelsites.com/expedia/,expedia hotel book sites like use vacation wor...,Travel
2,https://travelsites.com/tripadvisor/,tripadvisor hotel book sites like previously d...,Travel
3,https://www.momondo.in/?ispredir=true,cheap flights search compare flights momondo f...,Travel
4,https://www.ebookers.com/?AFFCID=EBOOKERS-UK.n...,bot create free account create free account si...,Travel


In [14]:
# profile = ProfileReport(df, title="Profiling Report", explorative=True)
# profile.to_file('data_report.html')

In [15]:
unique_categories_1 = df['category'].unique()
print(unique_categories_1)
print(len(unique_categories_1))

['Travel' 'Social Networking and Messaging' 'News' 'Streaming Services'
 'Sports' 'Photography' 'Law and Government' 'Health and Fitness' 'Games'
 'E-Commerce' 'Forums' 'Food' 'Education' 'Computers and Technology'
 'Business/Corporate' 'Adult']
16


In [16]:
df["text"].values[0]

'official site good hotel accommodation big saving hotel destination worldwide browse hotel review find guarantee good price hotel budget lodging accommodation hotel hotels special offer package special weekend break city break deal budget cheap discount saving select language find deal hotel home try search connect traveller india travel talk community recommend destination flamborough boreland colvend catfield harberton warleggan inspiration trip spot winter wildlife beautiful snowy island bye bye work want spontechnaity tech drive travel vital value maximise travel homes guest love browse property type hotels apartments resorts villa cabins cottage glamping serviced apartment holiday home guest house hostels motels ryokans riads holiday park homestays campsites country house farm stay boats luxury tent self catering accommodation tiny house chapel saint leonards wuqing wuchang saint eval great rowsley instow verified review real guest work start booking follow trip finally review sk

In [17]:
# Подсчет количества записей в каждой категории
category_counts = df['category'].value_counts().reset_index()
category_counts.columns = ['category', 'count']

# Построение горизонтального столбчатого графика
fig = px.bar(
    category_counts,
    x='count',
    y='category',
    orientation='h',
    title='Распределение категорий по количеству',
    labels={'count': 'Количество', 'category': 'Категория'},
    width=800,
    height=600
)

# Сортировка категорий по количеству для лучшей визуализации
fig.update_layout(yaxis={'categoryorder':'total ascending'})

# Отображение графика
fig.show()

In [18]:
# Инициализация лемматизатора и стоп-слов
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

def preprocess_text_nltk(text):
    # Токенизация
    tokens = word_tokenize(text)
    
    # Приведение к нижнему регистру и удаление пунктуации
    tokens = [word.lower() for word in tokens if word.isalpha()]
    
    # Удаление стоп-слов
    tokens = [word for word in tokens if word not in stop_words]
    
    # Лемматизация
    tokens = [lemmatizer.lemmatize(word) for word in tokens]
    
    # Объединение токенов обратно в строку
    processed_text = ' '.join(tokens)
    return processed_text

In [19]:
df['processed_text'] = df['text'].apply(preprocess_text_nltk)

In [20]:
def generate_ngrams(text, n):
    tokens = text.split()
    ngrams = zip(*[tokens[i:] for i in range(n)])
    return ['_'.join(ngram) for ngram in ngrams]

# Пример для добавления биграмм
df['bigrams'] = df['processed_text'].apply(lambda x: generate_ngrams(x, 2))

In [21]:
df.head()

Unnamed: 0,website_url,text,category,processed_text,bigrams
0,https://www.booking.com/index.html?aid=1743217,official site good hotel accommodation big sav...,Travel,official site good hotel accommodation big sav...,"[official_site, site_good, good_hotel, hotel_a..."
1,https://travelsites.com/expedia/,expedia hotel book sites like use vacation wor...,Travel,expedia hotel book site like use vacation work...,"[expedia_hotel, hotel_book, book_site, site_li..."
2,https://travelsites.com/tripadvisor/,tripadvisor hotel book sites like previously d...,Travel,tripadvisor hotel book site like previously de...,"[tripadvisor_hotel, hotel_book, book_site, sit..."
3,https://www.momondo.in/?ispredir=true,cheap flights search compare flights momondo f...,Travel,cheap flight search compare flight momondo fin...,"[cheap_flight, flight_search, search_compare, ..."
4,https://www.ebookers.com/?AFFCID=EBOOKERS-UK.n...,bot create free account create free account si...,Travel,bot create free account create free account si...,"[bot_create, create_free, free_account, accoun..."


In [22]:
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(df['processed_text'])
y = df['category']

In [23]:
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X, y)
print(X_resampled.shape, y_resampled.shape)

(1824, 54326) (1824,)


In [24]:
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42)

In [25]:
model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)

In [26]:
y_pred = model.predict(X_test)
print(classification_report(y_test, y_pred))

                                 precision    recall  f1-score   support

                          Adult       1.00      1.00      1.00        21
             Business/Corporate       0.96      0.86      0.91        29
       Computers and Technology       0.62      0.95      0.75        21
                     E-Commerce       1.00      0.86      0.93        22
                      Education       0.90      0.95      0.92        19
                           Food       0.91      1.00      0.95        21
                         Forums       0.90      1.00      0.95        19
                          Games       0.94      0.94      0.94        16
             Health and Fitness       0.96      1.00      0.98        24
             Law and Government       0.96      1.00      0.98        23
                           News       1.00      0.83      0.91        29
                    Photography       1.00      0.97      0.98        30
Social Networking and Messaging       1.00      0.