# Импорт библиотек

In [1]:
import pandas as pd

from sklearn.pipeline import Pipeline
from nltk.tokenize import RegexpTokenizer
from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
from catboost import CatBoostClassifier
from sklearn.ensemble import RandomForestClassifier

In [3]:
import warnings
# Silence every warning category for the rest of the session.
warnings.simplefilter("ignore")

In [4]:
# Single seed handed to both classifiers below so that runs are repeatable.
random_state = 42

# Загрузка данных

In [5]:
# Load the training set and discard rows that are exact duplicates.
train = pd.read_csv("Data/train.csv").drop_duplicates()

# Model input is the `url` column; the target is the `result` column.
X_train, y_train = train["url"], train["result"]

In [6]:
# Load the test set; its `url` column is the model input.
test = pd.read_csv("Data/test.csv")
X_test = test["url"]

# Предобработка данных

In [7]:
# Tokenizer that extracts runs of word characters / digits from a string.
tokenizer = RegexpTokenizer(pattern=r'[\w\d]+')

# TF-IDF vectorizer whose vocabulary is capped at 2048 terms.
vectorizer = TfidfVectorizer(
    max_features=2048,
)

In [8]:
def _tokens_as_text(url):
    """Split *url* with ``tokenizer`` and re-join the tokens with spaces."""
    return ' '.join(tokenizer.tokenize(url))


# Read the URL columns straight from the data frames rather than from
# X_train / X_test: those names are rebound to sparse TF-IDF matrices on the
# next two lines, so sourcing from them makes a second run of this cell fail
# with AttributeError on `.apply`.
# NOTE(review): TfidfVectorizer's default token_pattern already splits on
# non-word characters, so the pre-tokenization may be redundant -- confirm.
X_train = vectorizer.fit_transform(train["url"].apply(_tokens_as_text))
X_test = vectorizer.transform(test["url"].apply(_tokens_as_text))

# Создание файлов с предсказаниями

## RandomForestClassifier

In [9]:
# Fit a random forest (default hyper-parameters, fixed seed) on the TF-IDF
# matrix, then predict labels for the test matrix.
model = RandomForestClassifier(random_state=random_state).fit(X_train, y_train)
y_pred = model.predict(X_test)

In [10]:
# Start from the sample submission file and fill in the predicted labels.
submit = pd.read_csv("Data/sample_submit.csv").assign(Predicted=y_pred)

In [11]:
# Show how many rows were assigned to each predicted class.
submit["Predicted"].value_counts()

Predicted
0    10262
1     5738
Name: count, dtype: int64

In [12]:
# Write the random-forest submission to disk without the DataFrame index.
filename = "submit_tfidf_2048_rfc_d.csv"
submit.to_csv(path_or_buf=filename, index=False)

In [13]:
# Read the file that was just written back in and display it.
pd.read_csv(filename)

Unnamed: 0,Id,Predicted
0,0,0
1,1,1
2,2,1
3,3,0
4,4,0
...,...,...
15995,15995,0
15996,15996,1
15997,15997,0
15998,15998,0


## CatBoostClassifier

In [None]:
# Train CatBoost on the same TF-IDF matrix with the shared seed and with
# training output silenced, then predict labels for the test matrix.
model = CatBoostClassifier(
    random_seed=random_state,
    eval_metric='Accuracy',
    logging_level='Silent',
)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

In [None]:
# Start from the sample submission file and fill in the predicted labels.
submit = pd.read_csv("Data/sample_submit.csv").assign(Predicted=y_pred)

In [None]:
# Show how many rows were assigned to each predicted class.
submit["Predicted"].value_counts()

In [None]:
# Write the CatBoost submission to disk without the DataFrame index.
filename = "submit_tfidf_2048_cbc_d.csv"
submit.to_csv(path_or_buf=filename, index=False)

In [None]:
# Read the file that was just written back in and display it.
pd.read_csv(filename)