In [1]:
from nltk.tag.stanford import StanfordNERTagger
from itertools import compress
import pandas as pd
import numpy as np
import math
import re
import nltk

In [2]:
def data2features(data):
    # reorder columns dataframe
    data = data[["label","title_x","title_y","authors_x","authors_y", "venue_x", "venue_y", "year_x", "year_y"]]

    # define venue transformation
    venue_transform = {'acm sigmod record': "sigmod rec",
                       'sigmod record' :    "sigmod rec",
                       'sigmod':"sigmod rec",
                       'record' : "sigmod rec",
                       "conference" : "sigmod conf",
                       'international conference on management of data': "sigmod conf", 
                       'sigmod conference' : "sigmod conf",
                       'acm trans . database syst .': "trans",
                       "tods" : "trans",
                       'acm transactions on database systems ( tods )': "trans",
                       "transactions on database systems" : "trans",
                       'the vldb journal -- the international journal on very large data bases': "vldb",
                       'very large data bases' : "vldb",
                       'vldb': "vldb", 
                       'vldb j.' : "vldb",
                       "nan" : "NAN"}

    # normalize venue_x
    data["venue_x"] = [venue_transform[str(x)] for x in data["venue_x"]]
    data["venue_y"] = [venue_transform[str(x)] for x in data["venue_y"]]


    # Extract venue from title
    ## X
    long_venue = ['acm sigmod record', 'acm trans . database syst .',
           'acm transactions on database systems ( tods )',
            "transactions on database systems",
           'international conference on management of data',
           'sigmod conference', 'sigmod record',
           'the vldb journal -- the international journal on very large data bases',
           'very large data bases',]
    short_venue = ["tods",'vldb', 'record','conference', "sigmod"]

    for venue in long_venue:
        long_idx = data.title_x.str.find(venue)
        data.title_x = data.title_x.str.replace(venue, "")
        data.loc[long_idx>0, "venue_x"] = [venue_transform[venue]]*sum(long_idx>0)

    for venue in short_venue:
        long_idx = data.title_x.str.find(venue)
        data.title_x = data.title_x.str.replace(venue, "")
        print(sum((long_idx>0) & (data["venue_x"]=="NAN")))

        data.loc[(long_idx>0) & (data["venue_x"]=="NAN"), "venue_x"] = [venue_transform[venue]]*sum((long_idx>0) & (data
                                                                                                                    ["venue_x"]=="NAN"))

    ## Y
    for venue in long_venue:
        long_idx = data.title_y.str.find(venue)
        data.title_y = data.title_y.str.replace(venue, "")
        data.loc[long_idx>0, "venue_y"] = [venue_transform[venue]]*sum(long_idx>0)

    for venue in short_venue:
        long_idx = data.title_y.str.find(venue)
        data.title_y = data.title_y.str.replace(venue, "")
        print(sum((long_idx>0) & (data["venue_y"]=="NAN")))
        data.loc[(long_idx>0) & (data["venue_y"]=="NAN"), "venue_y"] = [venue_transform[venue]]*sum((long_idx>0) & (data["venue_y"]=="NAN"))

    def extract_year(string_list):
        return [re.findall('(\d{4})', text) for text in string_list]

    def extract_names(string_list):
        st = StanfordNERTagger('stanford-ner/classifiers/english.all.3class.distsim.crf.ser.gz',
                           'stanford-ner/stanford-ner.jar')
        all_names = []
        for text in string_list:
            names = []
            for sent in nltk.sent_tokenize(text):
                tokens = nltk.tokenize.word_tokenize(sent)
                tags = st.tag(tokens)
                for tag in tags:
                    if tag[1] in ["PERSON"]:
                        names.append(tag[0])
            all_names.append(names)
        return all_names

    # extract year from title
    ## X
    year_from_title = extract_year(data.title_x)
    year_from_title_idx = [len(year) for year in year_from_title]
    data.loc[year_from_title_idx & (pd.isnull(data.year_x)), "year_x"] = [x[0] for x in compress(year_from_title, year_from_title_idx& (pd.isnull(data.year_x)))]
    data.title_x = [re.sub('(\d{4})',"", text) for text in data.title_x]
    
    ## Y
    year_from_title = extract_year(data.title_y)
    year_from_title_idx = [len(year) for year in year_from_title]
    data.loc[year_from_title_idx & (pd.isnull(data.year_y)), "year_y"] = [x[0] for x in compress(year_from_title, year_from_title_idx & (pd.isnull(data.year_y)))]
    data.title_y = [re.sub('(\d{4})',"", text) for text in data.title_y]

    # cut the longer title to fit the lenght of the shorter
    for i in range(data.shape[0]):
        row = data.iloc[i]
        title_x_len = len(row.title_x)
        title_y_len = len(row.title_y)
        min_title_len = min(title_x_len,title_y_len)
        if title_x_len<title_y_len:
            sub_title_y = row.title_y[:min_title_len]
            authors_y = row.title_y[min_title_len:]
            if not pd.isnull(row.authors_y):
                authors_y += row.authors_y
            sub_title_x = row.title_x
            authors_x = row.authors_x
        elif title_x_len>title_y_len:
            sub_title_x = row.title_x[:min_title_len]
            authors_x = row.title_x[min_title_len:]
            if not pd.isnull(row.authors_x):
                authors_x += row.authors_x
            sub_title_y = row.title_y
            authors_y = row.authors_y
        else:
            sub_title_x = row.title_x
            sub_title_y = row.title_y
            authors_y = row.authors_y
            authors_x = row.authors_x

        data.at[i,"title_x"] = sub_title_x
        data.at[i,"title_y"] = sub_title_y
        data.at[i,"authors_x"] = authors_x
        data.at[i,"authors_y"] = authors_y

    #------------------------
    # Create features
    #--------------------------
    
    # create equal year and neq year feature
    year_diff = np.abs(data.year_x.astype(float) - data.year_y.astype(float))
    eq_yaer  = (year_diff == 0 )
    neq_year   = (year_diff > 0)

    # is venue of X equal venue of Y?
    venue_diff = ( data.venue_x == data.venue_y )

    # string distanse for Title and authors
    from hermetrics.jaccard import Jaccard
    from hermetrics.damerau_levenshtein import DamerauLevenshtein
    from hermetrics.jaro_winkler import JaroWinkler
    from strsimpy.qgram import QGram

    jaw = JaroWinkler()
    dam = DamerauLevenshtein()
    jac = Jaccard()
    qgram = QGram(3)

    Jaro_title = [jaw.distance(str(x), str(y)) for x,y in zip(data.title_x, data.title_y)]
    Damerau_title = [dam.normalized_distance(str(x), str(y)) for x,y in zip(data.title_x, data.title_y)]
    Jaccard_title = [jac.distance(str(x), str(y)) for x,y in zip(data.title_x, data.title_y)]
    Qgram_title = [qgram.distance(str(x), str(y))/(max(len(str(x)), len(str(y)))-2) for x,y in zip(data.title_x, data.title_y)]

    Jaro_authors = [jaw.distance(str(x), str(y)) for x,y in zip(data.authors_x, data.authors_y)]
    Damerau_authors = [dam.normalized_distance(str(x), str(y)) for x,y in zip(data.authors_x, data.authors_y)]
    Jaccard_authors = [jac.distance(str(x), str(y)) for x,y in zip(data.authors_x, data.authors_y)]
    Qgram_authors = [qgram.distance(str(x), str(y))/(max(len(str(x)), len(str(y)))-2) for x,y in zip(data.authors_x, data.authors_y)]

    # create the feature matrix
    data_feature = {"Jaro_title":Jaro_title,
                    "Damerau_title":Damerau_title,
                    "Jaccard_title": Jaccard_title,
                    "Qgram_title": Qgram_title,
                    "Jaro_authors": Jaro_authors,
                    "Damerau_authors": Damerau_authors,
                    "Jaccard_authors": Jaccard_authors,
                    "Qgram_authors": Qgram_authors,
                    "eq_yaer": eq_yaer,
                    "neq_year": neq_year,
                    "venue_diff": venue_diff}
    data_features = pd.DataFrame(data_feature)
    return data_features

In [None]:
tableA_df = pd.read_csv("data/tableA.csv")
tableB_df = pd.read_csv("data/tableB.csv")
train_df = pd.read_csv("data/train.csv")
train_df["order"] = list(range(train_df.shape[0]))
validation_df = pd.read_csv("data/valid.csv")
validation_df["order"] = list(range(validation_df.shape[0]))
# merge train.val df with tableA and tableB
train_df_merged = train_df.merge(tableA_df,left_on="ltable_id", right_on="id", sort=False).merge(tableB_df,left_on="rtable_id", right_on="id", sort=False)
train_df_merged = train_df_merged.sort_values("order").reset_index()
train_df_merged = train_df_merged.drop("order", axis=1)

validation_df_merged = validation_df.merge(tableA_df,left_on="ltable_id", right_on="id", sort=False).merge(tableB_df,left_on="rtable_id", right_on="id", sort=False)
validation_df_merged = validation_df_merged.sort_values("order").reset_index()
validation_df_merged = validation_df_merged.drop("order", axis=1)

train_df_features = data2features(train_df_merged)
validation_df_features = data2features(validation_df_merged)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data["venue_x"] = [venue_transform[str(x)] for x in data["venue_x"]]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data["venue_y"] = [venue_transform[str(x)] for x in data["venue_y"]]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[name] = value
A value is trying to be set on a copy of a slice

0
1559
0
0
0
0
0
0
0
0


## Cross-validation 

In [63]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
import plotly.graph_objects as go
from plotly.subplots import make_subplots
n_splits = 10
cv = StratifiedKFold(n_splits=n_splits).split(train_df_features, train_df.label)
threshold = 0.7

results_train_acc = []
results_train_f1 = []
results_test_acc = []
results_test_f1 = []

for train_id, test_idx in cv:
    y_true_train = train_df.label[train_id]
    y_true_test = train_df.label[test_idx]
    
    model = LogisticRegression().fit(X = train_df_features.iloc[train_id], y= train_df.label[train_id])
    y_pred_train = model.predict(train_df_features.iloc[train_id])
    y_pred_test =  model.predict(train_df_features.iloc[test_idx])
    
    results_train_acc.append(accuracy_score(y_true_train, y_pred_train))
    results_train_f1.append(f1_score(y_true_train, y_pred_train))
    results_test_acc.append(accuracy_score(y_true_test, y_pred_test))
    results_test_f1.append(f1_score(y_true_test, y_pred_test))

x = np.arange(n_splits)

fig = make_subplots(rows=1, cols=2, shared_yaxes=True,
                    subplot_titles=['Accuracy', 'F1 Score'])

fig.add_trace(
    go.Scatter(
        x=x,
        y=results_train_acc,
        mode='lines',
        name='Accuracy train',
        marker=dict(
            color='blue',
            line=dict(
                color='blue'
            )
        )
    ),
    row=1,
    col=1
)

fig.add_trace(
    go.Scatter(
        x=x,
        y=results_test_acc,
        mode='lines+markers',
        name='Accuracy test',
        marker=dict(
            color='red',
            line=dict(
                color='red'
            )
        )
    ),
    row=1,
    col=1
)

fig.add_trace(
    go.Scatter(
        x=x,
        y=results_train_f1,
        mode='lines',
        name='F1 train',
        marker=dict(
            color='blue',
            line=dict(
                color='blue'
            )
        )
    ),
    row=1,
    col=2
)

fig.add_trace(
    go.Scatter(
        x=x,
        y=results_test_f1,
        mode='lines+markers',
        name='F1 test',
        marker=dict(
            color='red',
            line=dict(
                color='red'
            )
        )
    ),
    row=1,
    col=2
)

fig.update_layout(yaxis_range=[0, 1])

fig.write_html("results.html")

In [64]:
np.mean(results_test_acc)

0.9917757019544509

# Save predictions

In [66]:
model = LogisticRegression(random_state=0).fit(train_df_features, train_df.label)
print(accuracy_score(model.predict(train_df_features), train_df.label))
VALIDATION_PRED = model.predict(validation_df_features)
validation_df.label = VALIDATION_PRED
validation_df.to_csv("LogReg_final_pred.csv")

0.9919104759336659


## AutoML

In [67]:
from tpot import TPOTClassifier

tpot = TPOTClassifier(cv=10, generations=20, population_size=50, verbosity=2, random_state=42, n_jobs=-1)

In [68]:
tpot.fit(train_df_features, train_df.label)

Version 0.11.6 of tpot is outdated. Version 0.11.7 was released Wednesday January 06, 2021.


HBox(children=(HTML(value='Optimization Progress'), FloatProgress(value=0.0, max=1050.0), HTML(value='')))


Generation 1 - Current best internal CV score: 0.9924504657871094

Generation 2 - Current best internal CV score: 0.9927196438119974

Generation 3 - Current best internal CV score: 0.9927196438119974

Generation 4 - Current best internal CV score: 0.9927196438119974

Generation 5 - Current best internal CV score: 0.9927196438119974

Generation 6 - Current best internal CV score: 0.9928549603326168

Generation 7 - Current best internal CV score: 0.9928549603326168

Generation 8 - Current best internal CV score: 0.9928549603326168

Generation 9 - Current best internal CV score: 0.9929895493450609

Generation 10 - Current best internal CV score: 0.993123956480461

Generation 11 - Current best internal CV score: 0.993123956480461

Generation 12 - Current best internal CV score: 0.9932589092469927

Generation 13 - Current best internal CV score: 0.9932589092469927

Generation 14 - Current best internal CV score: 0.9932589092469927

Generation 15 - Current best internal CV score: 0.99325890

TPOTClassifier(cv=10, generations=20, n_jobs=-1, population_size=50,
               random_state=42, verbosity=2)

# Save predictions

In [71]:
print(accuracy_score(train_df.label, tpot.predict(train_df_features)))
VALIDATION_PRED = tpot.predict(validation_df_features)
validation_df.label = VALIDATION_PRED
validation_df.to_csv("TPoT_final_pred.csv")

0.9987865713900499
