In [1146]:
import pandas as pd 
import numpy as np 

## Читаем данные, смотрим на тренировочный набор

In [1147]:
df_train = pd.read_csv ("train.tsv", sep="\t")
df_test = pd.read_csv ("test.tsv", sep="\t")
df_val = pd.read_csv ("val.tsv", sep="\t")

df_train

Unnamed: 0,is_virus,filename,libs
0,1,VirusShare_44a578f74f71c566b2ea3702a581bf0d,kernel32.dll
1,1,VirusShare_852c5ccb274e5cac09cbaa0bf6ee9d76,"kernel32.dll,advapi32.dll,comctl32.dll,comdlg3..."
2,1,VirusShare_d7e9e47398dd04225d21fa3afc3acc5a,"msvcrt.dll,kernel32.dll,user32.dll,advapi32.dl..."
3,1,VirusShare_04dee410f3aef6ff2fbbf58cc6a38fbf,"kernel32.dll,user32.dll,gdi32.dll,advapi32.dll..."
4,1,VirusShare_04ce876f84647d16e6ef35a6deae629f,"kernel32.dll,advapi32.dll,msvcrt.dll,user32.dl..."
...,...,...,...
16285,1,VirusShare_0602df3a416a0fd1342f5e88fc6ecc46,"kernel32.dll,user32.dll,advapi32.dll,oleaut32...."
16286,0,bitsadmin.exe,"msvcrt.dll,api-ms-win-downlevel-kernel32-l1-1-..."
16287,1,2019-03-15T04_42_24+00_00_19739.exe,c:\\\\\\/\/\/\/\//\\\\\\\\\\\\\\//////\\/\/\/w...
16288,1,VirusShare_20ad408d92588f2871f5b2ffb0de819c,msvbvm60.dll


## Оставляем только те строки, которые содержат .dll

In [1148]:
pattern = r'\.dll'

df_train = df_train[df_train['libs'].str.contains(pattern)]
df_train

Unnamed: 0,is_virus,filename,libs
0,1,VirusShare_44a578f74f71c566b2ea3702a581bf0d,kernel32.dll
1,1,VirusShare_852c5ccb274e5cac09cbaa0bf6ee9d76,"kernel32.dll,advapi32.dll,comctl32.dll,comdlg3..."
2,1,VirusShare_d7e9e47398dd04225d21fa3afc3acc5a,"msvcrt.dll,kernel32.dll,user32.dll,advapi32.dl..."
3,1,VirusShare_04dee410f3aef6ff2fbbf58cc6a38fbf,"kernel32.dll,user32.dll,gdi32.dll,advapi32.dll..."
4,1,VirusShare_04ce876f84647d16e6ef35a6deae629f,"kernel32.dll,advapi32.dll,msvcrt.dll,user32.dl..."
...,...,...,...
16284,1,VirusShare_46212ffc4b81c5acd0d727d0343d586d,kernel32.dll
16285,1,VirusShare_0602df3a416a0fd1342f5e88fc6ecc46,"kernel32.dll,user32.dll,advapi32.dll,oleaut32...."
16286,0,bitsadmin.exe,"msvcrt.dll,api-ms-win-downlevel-kernel32-l1-1-..."
16288,1,VirusShare_20ad408d92588f2871f5b2ffb0de819c,msvbvm60.dll


In [1149]:
# Функция для конвертации строки в список и сотрировки по алфавиту
def encode_libraries(lib_str):
    libraries = lib_str.split(',')
    sorted_libraries = sorted(libraries)
    return sorted_libraries

In [1150]:
df_train['libs'] = df_train['libs'].apply(encode_libraries)
df_train

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_train['libs'] = df_train['libs'].apply(encode_libraries)


Unnamed: 0,is_virus,filename,libs
0,1,VirusShare_44a578f74f71c566b2ea3702a581bf0d,[kernel32.dll]
1,1,VirusShare_852c5ccb274e5cac09cbaa0bf6ee9d76,"[advapi32.dll, comctl32.dll, comdlg32.dll, gdi..."
2,1,VirusShare_d7e9e47398dd04225d21fa3afc3acc5a,"[advapi32.dll, comctl32.dll, gdi32.dll, gdiplu..."
3,1,VirusShare_04dee410f3aef6ff2fbbf58cc6a38fbf,"[advapi32.dll, atl.dll, gdi32.dll, kernel32.dl..."
4,1,VirusShare_04ce876f84647d16e6ef35a6deae629f,"[advapi32.dll, kernel32.dll, msvcrt.dll, user3..."
...,...,...,...
16284,1,VirusShare_46212ffc4b81c5acd0d727d0343d586d,[kernel32.dll]
16285,1,VirusShare_0602df3a416a0fd1342f5e88fc6ecc46,"[advapi32.dll, advapi32.dll, advapi32.dll, ker..."
16286,0,bitsadmin.exe,"[api-ms-win-core-delayload-l1-1-0.dll, api-ms-..."
16288,1,VirusShare_20ad408d92588f2871f5b2ffb0de819c,[msvbvm60.dll]


## Используем MultiLabelBinarizer для бинарного кодирования по библиотекам

In [1151]:
from sklearn.preprocessing import MultiLabelBinarizer

mlb = MultiLabelBinarizer()
encoded_libraries = pd.DataFrame(mlb.fit_transform(df_train['libs']), columns=mlb.classes_, index=df_train.index)

# Объединение закодированных значений с исходным DataFrame
df_train = pd.concat([df_train, encoded_libraries], axis=1)

# Удаление столбца с исходными библиотеками
df_train = df_train.drop('libs', axis=1)

df_train

Unnamed: 0,is_virus,filename,.rsrc,3cxuicontrols.dll,3cxvoipphone.dll,ab.dll,abview.dll,acad.exe,acadaptersinfo.dll,acdb17.dll,...,xmllite.dll,xolehlp.dll,xpcom.dll,xpsservices.dll,xtp9601lib.dll,zlib.dll,zlib1.dll,zlibwapi.dll,znzin.dll,zsikdy.dll
0,1,VirusShare_44a578f74f71c566b2ea3702a581bf0d,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1,VirusShare_852c5ccb274e5cac09cbaa0bf6ee9d76,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1,VirusShare_d7e9e47398dd04225d21fa3afc3acc5a,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,1,VirusShare_04dee410f3aef6ff2fbbf58cc6a38fbf,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,1,VirusShare_04ce876f84647d16e6ef35a6deae629f,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16284,1,VirusShare_46212ffc4b81c5acd0d727d0343d586d,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
16285,1,VirusShare_0602df3a416a0fd1342f5e88fc6ecc46,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
16286,0,bitsadmin.exe,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
16288,1,VirusShare_20ad408d92588f2871f5b2ffb0de819c,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


## Удаляем столбик filename, так как его всё равно нет в df_test

In [1152]:
df_train = df_train.drop(columns='filename')
is_virus_train = df_train['is_virus']
df_train

Unnamed: 0,is_virus,.rsrc,3cxuicontrols.dll,3cxvoipphone.dll,ab.dll,abview.dll,acad.exe,acadaptersinfo.dll,acdb17.dll,ace.dll,...,xmllite.dll,xolehlp.dll,xpcom.dll,xpsservices.dll,xtp9601lib.dll,zlib.dll,zlib1.dll,zlibwapi.dll,znzin.dll,zsikdy.dll
0,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16284,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
16285,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
16286,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
16288,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


## Работаем с df_test

In [1153]:
# df_test['libs'] = df_test['libs'].apply(sort_libraries)
df_test['libs'] = df_test['libs'].apply(encode_libraries)

In [1154]:
# Инициализация MultiLabelBinarizer и применение кодирования
mlb = MultiLabelBinarizer()
encoded_libraries = pd.DataFrame(mlb.fit_transform(df_test['libs']), columns=mlb.classes_, index=df_val.index)

# Объединение закодированных значений с исходным DataFrame
df_test = pd.concat([df_test, encoded_libraries], axis=1)

# Удаление столбца с исходными библиотеками
df_test = df_test.drop('libs', axis=1)

df_test

Unnamed: 0,ace.dll,addressbook-vc140-mt-32.dll,advapi32.dll,apdadrv.dll,api-ms-win-appmodel-runtime-l1-1-0.dll,api-ms-win-core-apiquery-l1-1-0.dll,api-ms-win-core-atoms-l1-1-0.dll,api-ms-win-core-com-l1-1-0.dll,api-ms-win-core-com-l1-1-1.dll,api-ms-win-core-console-l1-1-0.dll,...,wldap32.dll,ws2_32.dll,wsnmp32.dll,wsock32.dll,wtsapi32.dll,xerces-vc140-mt-32-3_2_1.dll,xinput1_4.dll,xmllite.dll,zf_cef.dll,zlib1.dll
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,1,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1195,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1196,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1197,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1198,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


## Делаем те же самые колонки в df_train, которые есть в df_test

In [1155]:
# Оставляем только те колонки в df_train, которые есть в df_test
common_columns = df_test.columns.intersection(df_train.columns)
df_train = df_train[common_columns]

# Находим колонки, которые есть в df_test, но отсутствуют в df_train
missing_columns = df_test.columns.difference(df_train.columns)

# Добавляем отсутствующие колонки в df_train с пустыми значениями
df_train = pd.concat([df_train, pd.DataFrame(columns=missing_columns)], axis=1)

# Добавляем колонку is virus
df_train['is_virus'] = is_virus_train
# Вывод результата
df_train

Unnamed: 0,ace.dll,addressbook-vc140-mt-32.dll,advapi32.dll,api-ms-win-appmodel-runtime-l1-1-0.dll,api-ms-win-core-apiquery-l1-1-0.dll,api-ms-win-core-atoms-l1-1-0.dll,api-ms-win-core-com-l1-1-0.dll,api-ms-win-core-com-l1-1-1.dll,api-ms-win-core-console-l1-1-0.dll,api-ms-win-core-console-l2-1-0.dll,...,twinapi.appcore.dll,ufat.dll,urlmon,vix.dll,vjsc.dll,vmsif.dll,xerces-vc140-mt-32-3_2_1.dll,xinput1_4.dll,zf_cef.dll,is_virus
0,0,0,0,0,0,0,0,0,0,0,...,,,,,,,,,,1
1,0,0,1,0,0,0,0,0,0,0,...,,,,,,,,,,1
2,0,0,1,0,0,0,0,0,0,0,...,,,,,,,,,,1
3,0,0,1,0,0,0,0,0,0,0,...,,,,,,,,,,1
4,0,0,1,0,0,0,0,0,0,0,...,,,,,,,,,,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16284,0,0,0,0,0,0,0,0,0,0,...,,,,,,,,,,1
16285,0,0,1,0,0,0,0,0,0,0,...,,,,,,,,,,1
16286,0,0,0,0,0,0,0,0,0,0,...,,,,,,,,,,0
16288,0,0,0,0,0,0,0,0,0,0,...,,,,,,,,,,1


## Приступаем к работе с df_val

In [1156]:
df_val = df_val.drop(columns='filename')

In [1157]:
df_val

Unnamed: 0,is_virus,libs
0,1,"kernel32.dll,advapi32.dll,comctl32.dll,gdi32.d..."
1,0,"msys-intl-8.dll,msys-2.0.dll,kernel32.dll"
2,1,kernel32.dll
3,1,kernel32.dll
4,1,msvbvm60.dll
...,...,...
1195,1,mscoree.dll
1196,0,"version.dll,kernel32.dll,user32.dll,advapi32.d..."
1197,1,"kernel32.dll,gdi32.dll,msvcrt.dll,user32.dll"
1198,0,"msvcrt.dll,api-ms-win-eventing-classicprovider..."


In [1158]:
df_val['libs'] = df_val['libs'].apply(encode_libraries)

In [1159]:
# Инициализация MultiLabelBinarizer и применение кодирования
mlb = MultiLabelBinarizer()
encoded_libraries = pd.DataFrame(mlb.fit_transform(df_val['libs']), columns=mlb.classes_, index=df_val.index)

# Объединение закодированных значений с исходным DataFrame
df_val = pd.concat([df_val, encoded_libraries], axis=1)

# Удаление столбца с исходными библиотеками
df_val = df_val.drop('libs', axis=1)

df_val

Unnamed: 0,is_virus,activeds.dll,adbwinapi.dll,advapi32.dll,api-ms-win-appmodel-runtime-l1-1-0.dll,api-ms-win-appmodel-unlock-l1-1-0.dll,api-ms-win-core-apiquery-l1-1-0.dll,api-ms-win-core-com-l1-1-0.dll,api-ms-win-core-com-l1-1-1.dll,api-ms-win-core-console-l1-1-0.dll,...,wsmsvc.dll,wsnmp32.dll,wsock32.dll,wtsapi32.dll,wttlog.dll,xgraphic32.dll,xinput1_3.dll,xmlide.dll,xmllite.dll,zlib1.dll
0,1,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1195,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1196,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1197,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1198,0,0,0,0,0,0,1,1,0,0,...,1,0,0,0,0,0,0,0,0,0


In [1160]:
# Оставляем только те колонки в df_val, которые есть в df_train
common_columns_val = df_train.columns.intersection(df_val.columns)
df_val = df_val[common_columns_val]

# Находим колонки, которые есть в df_train, но отсутствуют в df_val
missing_columns_val = df_train.columns.difference(df_val.columns)

# Добавляем отсутствующие колонки в df_val с пустыми значениями
df_val = pd.concat([df_val, pd.DataFrame(columns=missing_columns_val)], axis=1)

# Вывод результата
df_val

Unnamed: 0,advapi32.dll,api-ms-win-appmodel-runtime-l1-1-0.dll,api-ms-win-core-apiquery-l1-1-0.dll,api-ms-win-core-com-l1-1-0.dll,api-ms-win-core-com-l1-1-1.dll,api-ms-win-core-console-l1-1-0.dll,api-ms-win-core-console-l2-1-0.dll,api-ms-win-core-debug-l1-1-0.dll,api-ms-win-core-delayload-l1-1-0.dll,api-ms-win-core-delayload-l1-1-1.dll,...,vmomi.dll,vnetlib.dll,vulkan-1.dll,wbemcomn.dll,winbrand.dll,wkscli.dll,wlanapi.dll,xerces-vc140-mt-32-3_2_1.dll,xinput1_4.dll,zf_cef.dll
0,1,0,0,0,0,0,0,0,0,0,...,,,,,,,,,,
1,0,0,0,0,0,0,0,0,0,0,...,,,,,,,,,,
2,0,0,0,0,0,0,0,0,0,0,...,,,,,,,,,,
3,0,0,0,0,0,0,0,0,0,0,...,,,,,,,,,,
4,0,0,0,0,0,0,0,0,0,0,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1195,0,0,0,0,0,0,0,0,0,0,...,,,,,,,,,,
1196,1,0,0,0,0,0,0,0,0,0,...,,,,,,,,,,
1197,0,0,0,0,0,0,0,0,0,0,...,,,,,,,,,,
1198,0,0,1,1,0,0,0,1,1,1,...,,,,,,,,,,


##  Убираем все NaN значения, заменяем на нули

In [1161]:
# Заменить все NaN на 0 в df_train
df_train = df_train.fillna(0)

# Заменить все NaN на 0 в df_val
df_val = df_val.fillna(0)

# Заменить все NaN на 0 в df_test
df_test = df_test.fillna(0)

## Приводим в алфавитный порядок колонки в каждой таблице для дальшейшего обучения

In [1162]:
# Сортировка столбцов по алфавиту в df_train
df_train = df_train.sort_index(axis=1)

# Сортировка столбцов по алфавиту в df_val
df_val = df_val.sort_index(axis=1)

# Сортировка столбцов по алфавиту в df_test
df_test = df_test.sort_index(axis=1)


In [1163]:
X_train = df_train.drop(columns=['is_virus'])
y_train = df_train['is_virus']

X_val = df_val.drop(columns=['is_virus'])
y_val = df_val['is_virus']

X_test = df_test


In [1164]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score
from sklearn.metrics import classification_report

# Создание и обучение модели Logistic Regression
model = LogisticRegression()
model.fit(X_train, y_train)

# Предсказания на валидационном наборе
pred = model.predict(X_val)

# model = LogisticRegression(random_state=1)
# model.fit(X_train,y_train)
# pred = model.predict(X_test)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [1165]:
f1_score(y_val, pred)

0.9061224489795918

In [1166]:
from sklearn.metrics import confusion_matrix

print(classification_report(y_val, pred, digits=4))
tn, fp, fn, tp = confusion_matrix(pred, y_val).ravel()
print(f'true negative: {tn}')
print(f'false positive: {fp}')
print(f'false negative: {fn}')
print(f'true positive: {tp}')


              precision    recall  f1-score   support

           0     0.9193    0.6550    0.7650       400
           1     0.8492    0.9712    0.9061       800

    accuracy                         0.8658      1200
   macro avg     0.8842    0.8131    0.8355      1200
weighted avg     0.8726    0.8658    0.8591      1200

true negative: 262
false positive: 23
false negative: 138
true positive: 777


## Сохраняем все метрики и параметры в 'validation.txt' по ТЗ

In [1167]:
# Расчет confusion matrix и вывод classification report
tn, fp, fn, tp = confusion_matrix(y_val, pred).ravel()
accuracy = (tp + tn) / (tp + tn + fp + fn)
precision = tp / (tp + fp) if (tp + fp) > 0 else 0
recall = tp / (tp + fn) if (tp + fn) > 0 else 0
f1 = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0

# Сохранение результатов в файл 'validation.txt'
with open('validation.txt', 'w') as f:
    f.write(f'True positive: {tp}\n')
    f.write(f'False positive: {fp}\n')
    f.write(f'False negative: {fn}\n')
    f.write(f'True negative: {tn}\n')
    f.write(f'Accuracy: {accuracy:.4f}\n')
    f.write(f'Precision: {precision:.4f}\n')
    f.write(f'Recall: {recall:.4f}\n')
    f.write(f'F1: {f1:.4f}\n')

In [1168]:
df_test

Unnamed: 0,ace.dll,addressbook-vc140-mt-32.dll,advapi32.dll,apdadrv.dll,api-ms-win-appmodel-runtime-l1-1-0.dll,api-ms-win-core-apiquery-l1-1-0.dll,api-ms-win-core-atoms-l1-1-0.dll,api-ms-win-core-com-l1-1-0.dll,api-ms-win-core-com-l1-1-1.dll,api-ms-win-core-console-l1-1-0.dll,...,wldap32.dll,ws2_32.dll,wsnmp32.dll,wsock32.dll,wtsapi32.dll,xerces-vc140-mt-32-3_2_1.dll,xinput1_4.dll,xmllite.dll,zf_cef.dll,zlib1.dll
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,1,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1195,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1196,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1197,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1198,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


## Итог. Предсказываем вирус или нет и сохраняем в prediction.txt

In [1169]:
X_test = df_test
pred = model.predict(X_test)

In [1170]:
with open('prediction.txt', 'w') as f:
    f.write('prediction\n')
    np.savetxt(f, pred, fmt='%d')