In [1]:
seed = 123

In [2]:
import os
import pandas as pd
import numpy as np
import re 
import seaborn as sns
sns.set()

import matplotlib.pyplot as plt
%matplotlib inline
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

def load_data(dataset, DATA_DIR='./data/'):
    read_args = {
        'node_information':{
            'names': ['id', 'publication_date', 'title', 'authors', 'journal', 'abstract'],
            'index_col': 'id'
        },
        'train': {
            'names': ['source_id', 'target_id','category'],
            'sep': ' ',
        },
        'test':{
            'names':['source_id', 'target_id'],
            'sep': ' ',
        }
    }
    filenames = {
        'node_information': 'node_information.csv',
        'train': 'training_set.txt',
        'test': 'testing_set.txt'
    }
    if dataset in read_args:
        file_path = os.path.join(DATA_DIR, filenames[dataset])
        return pd.read_csv(file_path, header=None, **read_args[dataset])

In [161]:
DATA_DIR='../data/'

In [11]:
def merge_node_information(dataset, node_information):
    for column in node_information.columns:
        for origin in ['source', 'target']:
            serie = node_information[column].rename(f'{origin}_{column}').to_frame()
            dataset = dataset.merge(serie, left_on=f'{origin}_id', right_index=True)
    return dataset

In [12]:
train = load_data('train')
node_information = load_data('node_information')
train = merge_node_information(train, node_information)

In [7]:
stem.stem('he has')

'he ha'

In [31]:
stem = PorterStemmer()
english_stopwords = set(stopwords.words('english'))
def clean_string(string):
    string = stem.stem(string)
    words = []
    for word in string.split():
        if word not in english_stopwords:
            words.append(word)
    return ' '.join(words)

In [23]:
english_stopwords = set(stopwords.words('english'))
stem = PorterStemmer()

def tokenize(string):
    string = stem.stem(string)
    words = []
    for word in string.split():
        if word not in english_stopwords:
            words.append(word)
    return words

string = 'an infinite number of potentials surrounding 2d black hole'
tokenize(string)

def overlap(string1, string2):
    return len(set(string1.split()).intersection(set(string1.split())))

In [11]:
def extract_authors(authors):
    stopwords = ['alex', 't', 'latex', 'pages', 'jr', 's']
    if isinstance(authors, str):
        
        # Remove string between parenthesis
        string = re.sub(r'\([^\)]+[\)$]','',authors)
        # Remove bad parenthesis
        string = re.sub(r'\([^\)]*$','', string)
        # Remove multiple spaces
        string = re.sub(r'\s', '', string)
        # Extract authors and sanityze strings
        
        authors = []
        for author in string.split(','):
            author = author.lower()
            author = re.sub(r'[\W\d]','', author).lower()
            if len(author) >= 2 and not author in stopwords:
                authors.append(author)
        return authors

def common_authors(authors1, authors2):
    if isinstance(authors1, str) and isinstance(authors2, str):
        return len(set(extract_authors(authors1)).intersection(set(extract_authors(authors2))))
    

In [128]:
def extract_features(dataset):
    train['source_title'] = dataset.source_title.apply(lambda x : clean_string(x))
    train['target_title'] = dataset.source_title.apply(lambda x : clean_string(x))
    dataset['title_overlap'] = dataset.apply(lambda x : len(set(x.source_title.split()).intersection(set(x.target_title.split()))), axis='columns')

    dataset['common_authors'] = dataset.apply(lambda x: common_authors(x.source_authors, x.target_authors), axis='columns')
    dataset['publication_date_diff'] = dataset.source_publication_date - dataset.target_publication_date
    
    selected_columns = ['title_overlap', 'common_authors', 'publication_date_diff']
    x = dataset[selected_columns].fillna(0).values
    return x

In [43]:
for column in selected_columns:
    print(train[column].isna().value_counts())

False    615512
Name: title_overlap, dtype: int64
False    453316
True     162196
Name: common_authors, dtype: int64
False    615512
Name: publication_date_diff, dtype: int64


In [130]:
x = extract_features(train)

In [131]:
y = train.category.values

In [132]:
from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import f1_score, accuracy_score

In [133]:
y_pred = np.zeros(y.shape)

In [158]:
f1_list = []
accuracy_list = []

from sklearn.model_selection import KFold
kf = KFold(n_splits=5, random_state=seed, shuffle=True)

i = 1
for train_index, test_index in kf.split(x):

    X_train, X_test = x[train_index], x[test_index]
    y_train, y_test = y[train_index], y[test_index]
    
    random_forest = RandomForestClassifier(n_estimators=100, max_depth=3, random_state=seed)
    random_forest.fit(X_train, y_train)
    
    y_pred[test_index] = random_forest.predict(X_test)
    
    f1_list.append(f1_score(y_test, y_pred[test_index]))
    accuracy_list.append(accuracy_score(y_test, y_pred[test_index]))
    print(f'Epoch {i}')
    print(f'f1: {f1_list[i-1]}')
    print(f'accuracy :{f1_list[i-1]}')
    i+=1

Epoch 1
f1: 0.8101044024238885
accuracy :0.8101044024238885
Epoch 2
f1: 0.8117340938313131
accuracy :0.8117340938313131
Epoch 3
f1: 0.8108351814393293
accuracy :0.8108351814393293
Epoch 4
f1: 0.8119430241051863
accuracy :0.8119430241051863
Epoch 5
f1: 0.8124730372647787
accuracy :0.8124730372647787


In [135]:
print(f'Mean f1 {np.mean(f1_list)}')
print(f'Mean accuracy {np.mean(accuracy_list)}')


Mean f1 0.8114179478128992
Mean accuracy 0.7483200996124862


In [136]:
random_forest = RandomForestClassifier(n_estimators=100, max_depth=3, random_state=seed)
random_forest.fit(x, y)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=3, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=None,
            oob_score=False, random_state=123, verbose=0, warm_start=False)

In [None]:
test = load_data('test')

In [140]:
test = merge_node_information(test, node_information)

In [141]:
x_submisson = extract_features(test)

In [164]:
y_submission = random_forest.predict(x_submisson)

In [150]:
y_submission = pd.Series(data=y_submission)

In [152]:
y_submission = y_submission.rename('category')

y_submission = y_submission.rename_axis('id', axis='index')

y_submission.to_csv(os.path.join(DATA_DIR,'test_submission_random_forest.csv'), header=True)

In [168]:
#zgenerate_submission(y_submission, 'random_forest_dummy_features.csv')

0.72041 sur le leadboard vs 0.8114179478128992 with cross validation