# Etude 

Ce notebook a pour objectif de tester les perfomances de pandas sur un dataset de questions posées sur quora. 
Le but est de dédupliquer les questions.



In [1]:
import pandas as pd
import numpy as np
import time
import nltk

In [2]:
import matplotlib.pyplot as plt
%matplotlib inline

## 1. Importation des données

Pour tester les perfomances de pandas à la lecture de données nous allons simplement charger le dataframe en memoire


In [3]:
%%time
df_full = pd.read_csv('./data/train.csv')

CPU times: user 1.03 s, sys: 125 ms, total: 1.15 s
Wall time: 1.15 s


In [32]:
df_full.shape

(404290, 6)

In [4]:
df_dict = {
    'full': df_full,
    '1000': df_full[:1000].copy(),
    '10000': df_full[:10000].copy(),
    '100000': df_full[:100000].copy(),
}

## 2. Séparation des données en un ensemble d'apprentissage et un ensemble de validation.


In [5]:
from sklearn.model_selection import train_test_split
taux_sep = 0.7
def split_dataset(df, taux_sep):
    return train_test_split(df, train_size=taux_sep)

## 3. Nettoyage des données

Pour le nettoyage des données nous allons:
- supprimer les questions vides
- retirer les stopwords
- passer le text en minuscule
- tokenizer les questions

In [6]:
from nltk.corpus import stopwords


english_stopwords = set(stopwords.words("english"))

def clean_string(string: str) -> str:
    """
    Remove stopwords and stem the string
    """
    if isinstance(string, str):
        string = string.lower()
        words = []
        for word in string.split():
            if word not in english_stopwords:
                words.append(word)
        return words

def clean_dataframe(df):
    # Questions en minuscule
    for column in ['question1', 'question2']:
        df[column] = df[column].apply(clean_string)
    # Suppression des NaN
    df = df.dropna()
    return df

In [7]:
cleaning_time = {}
df_cleaned = {}
for key, df in df_dict.items():
    st = time.time()
    df_cleaned[key] = clean_dataframe(df)
    cleaning_time[key] = time.time() - st

In [31]:
cleaning_time

{'full': 3.7794349193573,
 '1000': 0.00979471206665039,
 '10000': 0.07298994064331055,
 '100000': 0.9923090934753418}

## 3.  Représentation des données.

Pour la representation des données nous avons:
- Compter le nombre de mots communs entre les questions
- Compter le nombre de mots communs entre les questions pondérés par l'idf

In [8]:
from collections import defaultdict

In [9]:
def get_weight(count, eps=10000, min_count=2):
    if count < min_count:
        return 0
    else:
        return 1 / (count + eps)

In [10]:
def compute_idf(df):
    questions = df[['qid1','question1']].rename(columns={'qid1': 'qid','question1':'question'})\
    .append(df[['qid2','question2']].rename(columns={'qid2': 'qid','question2':'question'})).drop_duplicates('qid')\
    .question.values

    counts = defaultdict(lambda : 0)

    for question in questions:
        if question:
            for word in question:
                counts[word] += 1

    idf = {word: get_weight(count) for word, count in counts.items()}

    return idf



In [11]:
idf_time = {}
idf_dict = {}
for key, df in df_dict.items():
    st = time.time()
    idf_dict[key] = compute_idf(df)
    idf_time[key] = time.time() - st

In [12]:
idf_time

{'full': 1.245344877243042,
 '1000': 0.007905006408691406,
 '10000': 0.04187798500061035,
 '100000': 0.33655309677124023}

In [13]:
from nltk.corpus import stopwords

def common_words(df):
    def common_words(row):
        question1 = row['question1']
        question2 = row['question2']
        common_words = 0
        number_of_words = len(question1) + len(question2)
        for word in question1:
            if word in question2:
                common_words += 1
        return 2 * common_words / number_of_words
    df['common_words'] = df.apply(common_words, axis='columns')
    return df

In [14]:
df_common_words = {}
common_words_time = {}
for key, df in df_cleaned.items():
    st = time.time()
    df_common_words[key] = common_words(df)
    common_words_time[key] = time.time() - st

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  del sys.path[0]


In [15]:
common_words_time

{'full': 14.060517072677612,
 '1000': 0.03775477409362793,
 '10000': 0.34238195419311523,
 '100000': 3.386591911315918}

In [16]:
def common_words_idf_weighted(df, idf):
    def common_words_idf_weighted(row):
        question1 = row['question1']
        question2 = row['question2']
        common_words_weighted = 0
        questions_weights = 0
        for word in question1:
            idf_weight = idf[word]
            if word in question2:
                common_words_weighted += idf_weight
            questions_weights += idf_weight
        for word in question2:
            questions_weights += idf[word]
        if questions_weights > 0:
            return 2 * common_words_weighted / questions_weights
        else:
            return 0
    df['common_words_idf_weighted'] = df.apply(common_words_idf_weighted, axis='columns')
    return df

In [17]:
df_common_words_idf = {}
common_words_idf_time = {}
for key, df in df_common_words.items():
    idf = idf_dict[key]
    st = time.time()
    df_common_words_idf[key] = common_words_idf_weighted(df, idf)
    common_words_idf_time[key] = time.time() - st

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [18]:
common_words_idf_time

{'full': 14.994139909744263,
 '1000': 0.03665304183959961,
 '10000': 0.3632466793060303,
 '100000': 3.593071937561035}

###### Processing pour les models

In [19]:
train_dict = {}
test_dict = {}
x_train_dict = {}
x_test_dict = {}
y_train_dict = {}
y_test_dict = {}

selected_columns = ['common_words', 'common_words_idf_weighted']

for key, df in df_common_words_idf.items():
    train, test = split_dataset(df, taux_sep)
    train_dict[key] = train.dropna()
    test_dict[key] = test.dropna()

    x_train_dict[key] = train_dict[key][selected_columns]

    x_test_dict[key] = test_dict[key][selected_columns]

    y_train_dict[key] = train_dict[key].is_duplicate
    y_test_dict[key] = test_dict[key].is_duplicate



## 4.  Apprentissage et performance

Pour tester les perfomances des modeles nous allons entrainer une regression logistic, un random forest et un arbre de decision

In [20]:
from sklearn.metrics import accuracy_score

### Regression Logistique 

In [21]:
from sklearn.linear_model import LogisticRegression



In [22]:
lr_dict = {}

lr_time = {}

for key in train_dict:
    st = time.time()
    lr_dict[key] = LogisticRegression() 
    lr_dict[key].fit(x_train_dict[key], y_train_dict[key])
    lr_time[key] = time.time() - st



In [23]:
lr_time

{'full': 0.29720115661621094,
 '1000': 0.0024499893188476562,
 '10000': 0.00648808479309082,
 '100000': 0.061994075775146484}

In [24]:
for key in train_dict:
    y_pred = lr_dict[key].predict(x_test_dict[key])
    print(key, accuracy_score(y_pred, y_test_dict[key]))

full 0.6707643853009804
1000 0.7133333333333334
10000 0.6756666666666666
100000 0.6802333333333334


### Arbres de décision

In [25]:
from sklearn import tree


dt_dict = {}

dt_time = {}

for key in train_dict:
    st = time.time()
    dt_dict[key] = tree.DecisionTreeClassifier()
    dt_dict[key].fit(x_train_dict[key], y_train_dict[key])
    dt_time[key] = time.time() - st

In [26]:
dt_time

{'full': 1.061352014541626,
 '1000': 0.0016529560089111328,
 '10000': 0.012163639068603516,
 '100000': 0.19474315643310547}

In [27]:
for key in train_dict:
    y_pred = dt_dict[key].predict(x_test_dict[key])
    print(key, accuracy_score(y_pred, y_test_dict[key]))

full 0.7066627091114464
1000 0.6733333333333333
10000 0.688
100000 0.7007666666666666


### Random Forests


In [28]:
from sklearn.ensemble import RandomForestClassifier

rf_dict = {}

rf_time = {}

for key in train_dict:
    st = time.time()
    rf_dict[key] = RandomForestClassifier()
    rf_dict[key].fit(x_train_dict[key], y_train_dict[key])
    rf_time[key] = time.time() - st



In [29]:
rf_time

{'full': 5.3011391162872314,
 '1000': 0.011728763580322266,
 '10000': 0.05720186233520508,
 '100000': 0.835341215133667}

In [30]:
for key in train_dict:
    y_pred = rf_dict[key].predict(x_test_dict[key])
    print(key, accuracy_score(y_pred, y_test_dict[key]))

full 0.7080313636251205
1000 0.6966666666666667
10000 0.6876666666666666
100000 0.7030333333333333
