In [1]:
import numpy as np
import pandas as pd
import nltk
from nltk.corpus import stopwords
import re

In [2]:
# Number of leading rows of the training file that the notebook works on.
ldata_train = 5000
df_train = pd.read_csv('input/train.csv')
# Take two independent copies of the first `ldata_train` rows:
#   * df_train_orig keeps the questions exactly as read from the file,
#   * df_train is the working frame whose question columns are reassigned.
# Without .copy() both names are slices of the same parent frame, and
# assigning columns on such a slice triggers pandas' SettingWithCopyWarning.
df_train_orig = df_train[:ldata_train].copy()
df_train = df_train[:ldata_train].copy()

# 1-Préprocessing



In [3]:
# Replace every [math]...[/math] span (non-greedy match) with the literal token "math".
math_span = re.compile(r"\[math\].*?\[/math\]")
for column in ('question1', 'question2'):
    df_train[column] = [math_span.sub("math", text) for text in df_train[column]]

In [4]:
# Delete every character whose code point lies outside the ranges listed in the
# negated class below (U+0000-U+024F and U+1E00-U+1EFF).
non_latin = re.compile(r"[^\x00-\x7F\x80-\xFF\u0100-\u017F\u0180-\u024F\u1E00-\u1EFF]")
for column in ('question1', 'question2'):
    df_train[column] = [non_latin.sub(u"", text) for text in df_train[column]]

In [5]:
# Lower-case each question, then drop every character that is neither a word
# character (\w, which includes the underscore) nor whitespace.
not_word_or_space = re.compile(r'[^\w\s]')
for column in ('question1', 'question2'):
    df_train[column] = [not_word_or_space.sub('', text.lower()) for text in df_train[column]]

In [6]:
# Turn each question string into a list of tokens.
for column in ('question1', 'question2'):
    df_train[column] = [nltk.word_tokenize(text) for text in df_train[column]]
# Keep a separate deep copy of the tokenised frame; the POS-tag feature is
# computed from this copy rather than from df_train.
df_train_for_pos_features = df_train.copy(deep=True)

In [7]:
# Remove English stop words from both token lists.
# The stop-word list is fetched once and stored in a set: the previous form
# evaluated stopwords.words('english') for every single token and then scanned
# the resulting list linearly, which made this cell needlessly slow.
english_stopwords = set(stopwords.words('english'))
for column in ('question1', 'question2'):
    df_train[column] = [
        [word for word in tokens if word not in english_stopwords]
        for tokens in df_train[column]
    ]

# 2-Construction des features

In [8]:
def word_match_share(row):
    """Return the share of distinct tokens that the two questions of a row have in common.

    Args:
        row: mapping-like object indexable with 'question1' and 'question2',
            each giving an iterable of hashable tokens.

    Returns:
        0 when either question has no token; otherwise twice the number of
        distinct shared tokens divided by the sum of the distinct token counts
        of both questions.
    """
    first = set(row['question1'])
    second = set(row['question2'])
    if not first or not second:
        return 0
    common = first & second
    # The shared tokens are counted once from each side, hence the sum.
    return (len(common) + len(common)) / (len(first) + len(second))

In [9]:
# Compute the word-overlap feature for every row of the preprocessed dataset.
# raw=True is deliberately not passed: it would hand each row to the function
# as a bare ndarray, which cannot be indexed by column name the way
# word_match_share does (row['question1']).
train_word_match = df_train.apply(word_match_share, axis=1)

In [10]:
# Compute the same word-overlap feature after Porter stemming of every token.
# Note: this overwrites the question columns of df_train with their stemmed form.
porter = nltk.PorterStemmer()
for column in ('question1', 'question2'):
    df_train[column] = [[porter.stem(word) for word in tokens] for tokens in df_train[column]]
# raw=True is deliberately not passed: the function indexes the row by column
# name, which a raw ndarray row does not support.
train_word_match_stemmed = df_train.apply(word_match_share, axis=1)

In [11]:
def sentence_length_diff(row):
    """Return the absolute difference between the lengths of a row's two questions.

    Args:
        row: mapping-like object indexable with 'question1' and 'question2',
            each giving a value that supports len().

    Returns:
        Non-negative integer |len(question1) - len(question2)|.
    """
    first_length = len(row['question1'])
    second_length = len(row['question2'])
    return abs(first_length - second_length)

In [12]:
# Compute the length-difference feature on the original dataset
# (df_train_orig, i.e. the copy that the preprocessing cells do not reassign).
# raw=True is deliberately not passed: the function indexes the row by column
# name, which a raw ndarray row does not support.
train_diff_length = df_train_orig.apply(sentence_length_diff, axis=1)

In [13]:
def postags_share(row):
    """Return the share of distinct POS tags that the two questions of a row have in common.

    Args:
        row: mapping-like object indexable with 'question1' and 'question2',
            each giving a token list accepted by nltk.pos_tag.

    Returns:
        0 when either question yields no tag; otherwise twice the number of
        distinct shared tags divided by the sum of the distinct tag counts of
        both questions.
    """
    # nltk.pos_tag yields (token, tag) pairs; only the tag (index 1) is kept.
    first_tags = {tagged[1] for tagged in nltk.pos_tag(row['question1'])}
    second_tags = {tagged[1] for tagged in nltk.pos_tag(row['question2'])}
    if not first_tags or not second_tags:
        return 0
    common = first_tags & second_tags
    # The shared tags are counted once from each side, hence the sum.
    return (len(common) + len(common)) / (len(first_tags) + len(second_tags))

In [None]:
# Compute the POS-tag overlap feature on the tokenised copy of the dataset.
# raw=True is deliberately not passed: postags_share indexes the row by column
# name, which a raw ndarray row does not support.
train_pos_match = df_train_for_pos_features.apply(postags_share, axis=1)

In [None]:
# Assemble the model inputs: one column per engineered feature, plus the
# target vector taken from the 'is_duplicate' column.
x_train = pd.DataFrame({
    'word_match': train_word_match,
    'word_match_stemmed': train_word_match_stemmed,
    'diff_length': train_diff_length,
    'pos_match': train_pos_match,
})
y_train = df_train['is_duplicate'].values

In [None]:
# train_test_split is imported from sklearn.model_selection: the former
# sklearn.cross_validation module no longer exists in current scikit-learn.
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for validation; the fixed random_state makes the
# split repeatable.
x_train, x_valid, y_train, y_valid = train_test_split(x_train, y_train, test_size=0.2, random_state=4242)

# 3-Construction du modèle

In [None]:
from sklearn.ensemble import (RandomTreesEmbedding, RandomForestClassifier,
                              GradientBoostingClassifier)

# Gradient-boosted trees with default hyper-parameters. The seed is fixed
# (same value as the train/validation split) so that repeated runs of the
# notebook fit the same model.
grd = GradientBoostingClassifier(random_state=4242)
grd.fit(x_train, y_train)

In [None]:
from sklearn.metrics import zero_one_loss

# Zero-one loss of the fitted classifier on the validation split.
valid_predictions = grd.predict(x_valid)
zero_one_loss(y_valid, valid_predictions, sample_weight=None)