In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
from torch.utils.data import DataLoader, Dataset

In [2]:
# Define the binary neural network model
class BinaryNetwork(nn.Module):
    def __init__(self, input_size):
        super(BinaryNetwork, self).__init__()
        self.conv1 = nn.Conv1d(input_size, 128, kernel_size=9, padding=0)
        self.maxpool1 = nn.MaxPool1d(kernel_size=2)
        self.conv2 = nn.Conv1d(128, 128, kernel_size=9, padding=0)
        self.maxpool2 = nn.MaxPool1d(kernel_size=2)
        self.conv3 = nn.Conv1d(128, 128, kernel_size=9, padding=0)
        self.maxpool3 = nn.MaxPool1d(kernel_size=2)
        self.fc = nn.Linear(128, 1)

    def forward(self, x):
        x = self.conv1(x)
        x = self.maxpool1(x)
        x = torch.sign(x)
        x = self.conv2(x)
        x = self.maxpool2(x)
        x = torch.sign(x)
        x = self.conv3(x)
        x = self.maxpool3(x)
        x = torch.sign(x)
        x = torch.flatten(x, 1)
        x = self.fc(x)
        x = torch.sign(x)
        return x

In [3]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print('Device:', torch.cuda.get_device_name(torch.cuda.current_device()))

Device: NVIDIA GeForce RTX 3090


In [4]:
import numpy as np
import os
import pandas as pd
from sklearn.model_selection import train_test_split

# Set the path to the directory containing the .npy files
data_dir = "/home/vadim/Documents/ctci-python/yandex/yandex_43412_ml/ml-intern-binary-biometry-contest/binary_train_data/binary_train"

# Set the path to the TSV file containing the labels
labels_file = "/home/vadim/Documents/ctci-python/yandex/yandex_43412_ml/ml-intern-binary-biometry-contest/binary_train.tsv"

# Load the filenames from the .npy files
file_list = []

for file_name in os.listdir(data_dir):
    if file_name.endswith(".npy"):
        file_path = os.path.join(data_dir, file_name)
        file_list.append(file_path)

# Load the labels from the TSV file
labels_df = pd.read_csv(labels_file, header=None, usecols=[0,1], names=['id', 'label'], delimiter="\t")
labels_df.head()

Unnamed: 0,id,label
0,0,1
1,1,1
2,2,0
3,3,1
4,4,1


In [16]:
class GenderDataset(Dataset):
    def __init__(self, file_list, label_list):
        self.file_list = file_list
        self.label_list = label_list

    def __len__(self):
        return len(self.file_list)

    def __getitem__(self, idx):
        file_path = self.file_list[idx]
        label = self.label_list[idx]
        # Load the features from the file and return as a tensor
        features = torch.from_numpy(np.load(file_path))
        return features, label

# Function to quantize and save the model
def quantize_and_save_model(model, save_path):
    with open(save_path, 'w') as fout:
        for weights in model.parameters():
            for param in torch.where(torch.flatten(weights) > 0, 1, -1):
                fout.write(str(param.item()) + '\n')

In [6]:
labels = labels_df["label"].values

# Split the dataset into training and test sets
train_files, test_files, train_labels, test_labels = train_test_split(
    file_list, labels, test_size=0.2)

# Save the split datasets as .npy files
np.save("train_files.npy", train_files)
np.save("test_files.npy", test_files)
np.save("train_labels.npy", train_labels)
np.save("test_labels.npy", test_labels)

In [7]:


# Prepare the training and test data
train_files = np.load("train_files.npy")
train_labels = np.load("train_labels.npy")
# split npy file into train and test

test_files = np.load("test_files.npy")
test_labels = np.load("test_labels.npy")


train_dataset = GenderDataset(train_files, train_labels)
test_dataset = GenderDataset(test_files, test_labels)


import torch.nn.functional as F

 # Find the maximum length of feature tensors in both datasets
max_length = max(len(features) for features, _ in train_dataset + test_dataset)

 # Function to pad the feature tensors
def pad_features(features, length):
    padded_features = F.pad(features, pad=(0, 0, 0, max_length - len(features)))
    return padded_features

# Apply padding to the feature tensors in the datasets
train_dataset = [(pad_features(features, max_length), label) for features, label in train_dataset]
test_dataset = [(pad_features(features, max_length), label) for features, label in test_dataset]

for features, label in train_dataset:
    print(features.shape)
    break


# Set the batch size and number of epochs
batch_size = 32
num_epochs = 10

# Create data loaders
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)


input_size = train_dataset[0][0].shape[0]  # Get the input size from the first feature tensor
print('Input size:', input_size)
model = BinaryNetwork(input_size)
#model = BinaryNetwork().to(device)

# Define the loss function and optimizer
criterion = nn.BCEWithLogitsLoss()
optimizer = optim.Adam(model.parameters())



torch.Size([500, 64])
Input size: 500


In [9]:
# Training loop
for epoch in range(num_epochs):
    model.train()
    for batch_features, batch_labels in train_loader:
        #batch_labels = batch_labels.type(torch.ByteTensor)
        optimizer.zero_grad()
        outputs = model(batch_features.type(torch.FloatTensor))
        loss = criterion(outputs, batch_labels.float().view(-1, 1))
        loss.backward()
        optimizer.step()

    # Evaluate on the test set
    model.eval()
    total = 0
    correct = 0
    with torch.no_grad():
        for batch_features, batch_labels in test_loader:
            outputs = model(batch_features.type(torch.FloatTensor))
            predicted = torch.round(torch.sigmoid(outputs)).long()
            total += batch_labels.size(0)
            correct += (predicted == batch_labels).sum().item()
    accuracy = correct / total
    print('Epoch [{}/{}], Accuracy: {:.4f}'.format(epoch+1, num_epochs, accuracy))



Epoch [1/10], Accuracy: 16.0022
Epoch [2/10], Accuracy: 16.0022
Epoch [3/10], Accuracy: 16.0022
Epoch [4/10], Accuracy: 16.0022
Epoch [5/10], Accuracy: 16.0022
Epoch [6/10], Accuracy: 16.0022
Epoch [7/10], Accuracy: 16.0022
Epoch [8/10], Accuracy: 16.0022
Epoch [9/10], Accuracy: 16.0022
Epoch [10/10], Accuracy: 16.0022


In [17]:
quantize_and_save_model(model, 'model_weights.txt')

In [18]:
quantize_and_save_model(model, 'data.csv')

In [19]:
import csv
import tempfile
import shutil

def remove_csv_header(input_file, output_file):
    # Create a temporary file for writing the modified CSV data
    temp_file = tempfile.NamedTemporaryFile(mode='w', delete=False)

    with open(input_file, 'r') as f_in, temp_file:
        reader = csv.reader(f_in)
        writer = csv.writer(temp_file)

        # Skip the header row
        next(reader)

        # Write the remaining rows to the temporary file
        for row in reader:
            writer.writerow(row)

    # Replace the input file with the temporary file
    shutil.move(temp_file.name, output_file)

# Usage example
input_file = 'data.csv'
output_file = 'output.csv'
remove_csv_header(input_file, output_file)

In [14]:
# Quantize and save the model
quantize_and_save_model(model, 'data.csv')

In [20]:
quantize_and_save_model(model, 'data.npy')

In [23]:
def quantize_and_save_model(model, save_path):
    with open(save_path, 'w') as fout:
        for layer in model.children():
            if isinstance(layer, nn.Conv1d):
                weights = layer.weight.detach().cpu().numpy()
                bias = layer.bias.detach().cpu().numpy()
                weights = np.where(weights > 0, 1, -1)
                bias = np.where(bias > 0, 1, -1)
                weights = weights.flatten()
                bias = bias.flatten()
                fout.write('\n'.join(map(str, weights)))
                fout.write('\n')
                fout.write('\n'.join(map(str, bias)))
                fout.write('\n')
            elif isinstance(layer, nn.Linear):
                weights = layer.weight.detach().cpu().numpy()
                bias = layer.bias.detach().cpu().numpy()
                weights = np.where(weights > 0, 1, -1)
                bias = np.where(bias > 0, 1, -1)
                weights = weights.flatten()
                bias = bias.flatten()
                fout.write('\n'.join(map(str, weights)))
                fout.write('\n')
                fout.write('\n'.join(map(str, bias)))
                fout.write('\n')
quantize_and_save_model(model, 'data.csv')

In [15]:
import gzip

def compress_csv(input_file, output_file):
    with open(input_file, 'rt') as f_in:
        with gzip.open(output_file, 'wt') as f_out:
            f_out.writelines(f_in)

# Usage example
input_file = 'data.csv'
output_file = 'data.csv.gz'
compress_csv(input_file, output_file)

In [22]:
import csv
import math

def cut_csv_by_half(input_file, output_file1, output_file2):
    # Read the input CSV file
    with open(input_file, 'r') as file:
        reader = csv.reader(file)
        rows = list(reader)

    # Determine the midpoint to divide the rows
    midpoint = math.ceil(len(rows) / 2)

    # Split the rows into two halves
    rows1 = rows[:midpoint]
    rows2 = rows[midpoint:]

    # Write the first half to the first output CSV file
    with open(output_file1, 'w', newline='') as file1:
        writer1 = csv.writer(file1)
        writer1.writerows(rows1)

    # Write the second half to the second output CSV file
    with open(output_file2, 'w', newline='') as file2:
        writer2 = csv.writer(file2)
        writer2.writerows(rows2)

# Usage example
input_file = 'output1.csv'
output_file1 = 'output3.csv'
output_file2 = 'output4.csv'
cut_csv_by_half(input_file, output_file1, output_file2)

In [None]:
import pandas as pd

def cut_csv_by_half(input_file, output_file1, output_file2):
    # Read the input CSV file
    df = pd.read_csv(input_file)

    # Split the dataframe into two halves
    midpoint = len(df) // 2
    df1 = df.iloc[:midpoint-100000]
    df2 = df.iloc[midpoint:]

    # Write the first half to the first output CSV file
    df1.to_csv(output_file1, index=False)

    # Write the second half to the second output CSV file
    df2.to_csv(output_file2, index=False)

# Usage example
input_file = 'data.csv'
output_file1 = 'output1.csv'
output_file2 = 'output2.csv'
cut_csv_by_half(input_file, output_file1, output_file2)

In [25]:
import pandas as pd
import os

def split_csv_by_size(input_file, output_dir, chunk_size=1):
    # Read the input CSV file
    df = pd.read_csv(input_file)

    # Calculate the number of rows per chunk (approximately 1MB)
    target_size = 1 * 1024 * 1024  # 1MB in bytes
    num_rows = len(df)
    rows_per_chunk = int(target_size / num_rows)

    # Create the output directory if it doesn't exist
    os.makedirs(output_dir, exist_ok=True)

    # Split the dataframe into smaller chunks
    chunks = [df[i:i + rows_per_chunk] for i in range(0, num_rows, rows_per_chunk)]

    # Write each chunk to a separate CSV file
    for i, chunk in enumerate(chunks):
        output_file = os.path.join(output_dir, f"chunk_{i+1}.csv")
        chunk.to_csv(output_file, index=False)

# Usage example
input_file = 'data.csv'
output_dir = 'output_chunks'
split_csv_by_size(input_file, output_dir)

KeyboardInterrupt: 