In [32]:
import pandas as pd
import numpy as np
pd.options.mode.chained_assignment = None

In [35]:
dataset = pd.read_parquet("../data/train_pairs.parquet")

In [36]:
dataset.shape

(306540, 3)

In [111]:
def remove_symmetric_rows(dataset: pd.DataFrame, col1: str="variantid1", 
                          col2: str="variantid2") -> pd.DataFrame:
    m = pd.DataFrame(np.sort(dataset[[col1, col2]], axis=1), index=dataset.index).duplicated()
    return dataset[~m]

## Merge data with same variant1 

In [5]:
dataset_v1 = dataset.merge(dataset, how="inner", left_on="variantid1", right_on="variantid1")

### A=B, B=C => A=C; A/=B, B=C => A/=C

In [82]:
negative_part_v1 = dataset_v1[(dataset_v1["variantid2_x"] != dataset_v1["variantid2_y"]) & 
         (dataset_v1["target_x"] != dataset_v1["target_y"])]
negative_part_v1.loc[:, "target"] = 0.0
positive_part_v1 = dataset_v1[(dataset_v1["variantid2_x"] != dataset_v1["variantid2_y"]) & 
         (dataset_v1["target_x"] == 1.0) &
         (dataset_v1["target_y"] == 1.0)]
positive_part_v1.loc[:, "target"] = 1.0
v1_data = pd.concat([positive_part_v1, negative_part_v1], ignore_index=True)
# drop duplicates if appear 
v1_data = v1_data.drop_duplicates(subset=["variantid2_x", "variantid2_y", "target"])
v1_data.shape

(135668, 6)

In [83]:
# Remove errors from initial datasets (A=B, B=C => A=C, but A/=D, D=C => A/=C)
drop_part_v1 = v1_data[v1_data.duplicated(subset=["variantid2_x", "variantid2_y"], keep=False)]
v1_data = v1_data.drop_duplicates(subset=["variantid2_x", "variantid2_y"], keep=False)

In [86]:
# Prepare for merging 
v1_data = v1_data[["variantid2_x", "variantid2_y", "target"]]
v1_data = v1_data.rename(columns={"variantid2_x": "variantid1",  "variantid2_y":"variantid2"})

In [87]:
v1_data = remove_symmetric_rows(v1_data, "variantid1", "variantid2")
v1_data.shape

(67578, 3)

## Merge data with same variant2

In [56]:
dataset_v2 = dataset.merge(dataset, how="inner", left_on="variantid2", right_on="variantid2")

In [57]:
negative_part_v2 = dataset_v2[(dataset_v2["variantid1_x"] != dataset_v2["variantid1_y"]) & 
         (dataset_v2["target_x"] != dataset_v2["target_y"])]
negative_part_v2.loc[:, "target"] = 0.0
positive_part_v2 = dataset_v2[(dataset_v2["variantid1_x"] != dataset_v2["variantid1_y"]) & 
         (dataset_v2["target_x"] == 1.0) &
         (dataset_v2["target_y"] == 1.0)]
positive_part_v2.loc[:, "target"] = 1.0
v2_data = pd.concat([positive_part_v2, negative_part_v2], ignore_index=True)
# drop duplicates if appear 
v2_data = v2_data.drop_duplicates(subset=["variantid1_x", "variantid1_y", "target"])
v2_data.head()

Unnamed: 0,target_x,variantid1_x,variantid2,target_y,variantid1_y,target
0,1.0,74501929,344615391,1.0,85779423,1.0
1,1.0,85779423,344615391,1.0,74501929,1.0
2,1.0,77505734,412098811,1.0,237707849,1.0
3,1.0,237707849,412098811,1.0,77505734,1.0
4,1.0,79077690,723024492,1.0,541044232,1.0


In [58]:
# Remove errors from initial datasets (A=B, B=C => A=C, but A/=D, D=C => A/=C)
drop_part_v2 = v2_data[v2_data.duplicated(subset=["variantid1_x", "variantid1_y"], keep=False)]
v2_data = v2_data.drop_duplicates(subset=["variantid1_x", "variantid1_y"], keep=False)

In [59]:
# Prepare for merging 
v2_data = v2_data[["variantid1_x", "variantid1_y", "target"]]
v2_data = v2_data.rename(columns={"variantid1_x": "variantid1",  "variantid1_y":"variantid2"})

In [60]:
v2_data = remove_symmetric_rows(v2_data)
v2_data.shape

(66662, 3)

In [61]:
drop_part_v2.head()

Unnamed: 0,target_x,variantid1_x,variantid2,target_y,variantid1_y,target
2100,1.0,750249395,777928013,1.0,760205897,1.0
2105,1.0,760205897,777928013,1.0,750249395,1.0
3384,1.0,750245645,786958564,1.0,770482416,1.0
3387,1.0,770482416,786958564,1.0,750245645,1.0
3397,1.0,750245669,780669727,1.0,770483286,1.0


## Merge data with same variant1 variant2

In [62]:
dataset_v12 = dataset.merge(dataset, how='inner', left_on='variantid1', right_on='variantid2')

In [63]:
negative_part_v12 = dataset_v12[(dataset_v12["target_x"] != dataset_v12["target_y"])]
negative_part_v12.loc[:, "target"] = 0.0
positive_part_v12 = dataset_v12[(dataset_v12['target_x'] == dataset_v12['target_y']) & (dataset_v12["target_x"] == 1.0)]
positive_part_v12.loc[:, "target"] = 1.0

In [66]:
v12_data = pd.concat([positive_part_v12, negative_part_v12], ignore_index=True)
# drop duplicates if appear 
v12_data = v12_data.drop_duplicates(subset=["variantid2_x", "variantid1_y", "target"])
v12_data.head()

Unnamed: 0,target_x,variantid1_x,variantid2_x,target_y,variantid1_y,variantid2_y,target
0,1.0,286895417,386361695,1.0,179321148,286895417,1.0
1,1.0,334181351,400540798,1.0,329453947,334181351,1.0
2,1.0,359351545,724621388,1.0,194832646,359351545,1.0
3,1.0,359351545,488419155,1.0,194832646,359351545,1.0
4,1.0,458764892,766280499,1.0,141577463,458764892,1.0


In [67]:
drop_part_v12 = v12_data[v12_data.duplicated(subset=["variantid2_x", "variantid1_y"], keep=False)]
v12_data = v12_data.drop_duplicates(subset=["variantid2_x", "variantid1_y"], keep=False)

In [68]:
# Prepare for merging 
v12_data = v12_data[["variantid2_x", "variantid1_y", "target"]]
v12_data = v12_data.rename(columns={"variantid2_x": "variantid2",  "variantid1_y":"variantid1"})

In [69]:
v12_data = remove_symmetric_rows(v12_data)
v12_data.shape

(66593, 3)

## Drop errors from initial dataset

In [88]:
drop_part_v1 = pd.concat([drop_part_v1[["variantid1", "variantid2_x"]].rename(columns={"variantid2_x": "variantid2"}) , 
                          drop_part_v1[["variantid1", "variantid2_y"]].rename(columns={"variantid2_y": "variantid2"})], ignore_index=True)

In [91]:
drop_part_v2 = pd.concat([drop_part_v2[["variantid1_x", "variantid2"]].rename(columns={"variantid1_x": "variantid1"}) , 
                          drop_part_v2[["variantid1_y", "variantid2"]].rename(columns={"variantid1_y": "variantid1"})], ignore_index=True)

In [93]:
drop_part_v12 = pd.concat([drop_part_v12[["variantid1_x", "variantid2_x"]].rename(columns={"variantid1_x": "variantid1", "variantid2_x": "variantid2"}) , 
                          drop_part_v12[["variantid1_y", "variantid2_y"]].rename(columns={"variantid1_y": "variantid1", "variantid2_y": "variantid2"})], ignore_index=True)

In [94]:
drop_part = pd.concat([drop_part_v1, drop_part_v2, drop_part_v12], ignore_index=True)

In [102]:
dataset = dataset.set_index(["variantid1", "variantid2"])
dataset = dataset.loc[~dataset.index.isin(drop_part.values.tolist())]
dataset = dataset.reset_index()

## Merge full data 

In [107]:
full_dataset = pd.concat([v1_data, v2_data, v12_data, dataset], ignore_index=True)

In [108]:
full_dataset = full_dataset.drop_duplicates()

In [112]:
full_dataset = remove_symmetric_rows(full_dataset)

In [114]:
full_dataset = full_dataset.drop_duplicates(subset=["variantid1", "variantid2"], keep=False)

In [117]:
full_dataset["target"].value_counts()

target
1.0    280702
0.0    223149
Name: count, dtype: int64

In [120]:
full_dataset.to_parquet("../data/full_dataset.parquet", index=False)