# Etude 

Ce notebook a pour objectif de tester les perfomances de pandas sur un dataset de questions posées sur quora. 
Le but est de dédupliquer les questions.



In [2]:
import pandas as pd
import numpy as np
import time
import nltk

In [3]:
import matplotlib.pyplot as plt
%matplotlib inline

In [4]:
DATA_PATH = './data/train.csv'

In [6]:
10**5.25

177827.94100389228

## 1. Importation des données

Pour tester les perfomances de pandas à la lecture de données nous allons simplement charger le dataframe en memoire


In [5]:
%%time
df_full = pd.read_csv(DATA_PATH)

CPU times: user 1.01 s, sys: 139 ms, total: 1.15 s
Wall time: 1.16 s


In [4]:
df_full.shape

(404290, 6)

In [10]:
df_dict = {
    1000: df_full[:1000].copy(),
    10000: df_full[:10000].copy(),
    100000: df_full[:100000].copy(),
    400000: df_full,
    800000: df_full.append(df_full),
    1200000: df_full.append(df_full).append(df_full)
}

## 2. Séparation des données en un ensemble d'apprentissage et un ensemble de validation.


In [6]:
from sklearn.model_selection import train_test_split
taux_sep = 0.7
def split_dataset(df, taux_sep):
    return train_test_split(df, train_size=taux_sep)

## 3. Nettoyage des données

Pour le nettoyage des données nous allons:
- supprimer les questions vides
- retirer les stopwords
- passer le text en minuscule
- tokenizer les questions

In [16]:
from nltk.corpus import stopwords


english_stopwords = set(stopwords.words("english"))

def clean_string(string: str) -> str:
    """
    Remove stopwords and stem the string
    """
    if isinstance(string, str):
        string = string.lower()
        words = []
        for word in string.split():
            if word not in english_stopwords:
                words.append(word)
        return words

def clean_dataframe(df):
    # Questions en minuscule
    for column in ['question1', 'question2']:
        df[column] = df[column].apply(clean_string)
    # Suppression des NaN
    df = df.dropna()
    return df

In [8]:
cleaning_time = {}
df_cleaned = {}
for key, df in df_dict.items():
    st = time.time()
    df_cleaned[key] = clean_dataframe(df)
    cleaning_time[key] = time.time() - st

In [9]:
cleaning_time

{400000: 3.833127975463867,
 1000: 0.01153707504272461,
 10000: 0.08499813079833984,
 100000: 1.0665977001190186,
 800000: 8.105329990386963,
 1200000: 11.493905782699585}

## 3.  Représentation des données.

Pour la representation des données nous avons:
- Compter le nombre de mots communs entre les questions
- Compter le nombre de mots communs entre les questions pondérés par l'idf

In [11]:
from collections import defaultdict

In [12]:
def get_weight(count, eps=10000, min_count=2):
    if count < min_count:
        return 0
    else:
        return 1 / (count + eps)

In [13]:
def compute_idf(df):
    questions = df[['qid1','question1']].rename(columns={'qid1': 'qid','question1':'question'})\
    .append(df[['qid2','question2']].rename(columns={'qid2': 'qid','question2':'question'})).drop_duplicates('qid')\
    .question.values

    counts = defaultdict(lambda : 0)

    for question in questions:
        if question:
            for word in question:
                counts[word] += 1

    idf = {word: get_weight(count) for word, count in counts.items()}

    return idf



In [14]:
idf_time = {}
idf_dict = {}
for key, df in df_dict.items():
    st = time.time()
    idf_dict[key] = compute_idf(df)
    idf_time[key] = time.time() - st

In [15]:
idf_time

{1000: 0.015150070190429688,
 10000: 0.04743003845214844,
 100000: 0.3630099296569824,
 400000: 1.2766520977020264,
 800000: 1.420454740524292,
 1200000: 1.794395923614502}

In [17]:
from nltk.corpus import stopwords

def common_words(df):
    def common_words(row):
        question1 = row['question1']
        question2 = row['question2']
        common_words = 0
        number_of_words = len(question1) + len(question2)
        for word in question1:
            if word in question2:
                common_words += 1
        return 2 * common_words / number_of_words
    df['common_words'] = df.apply(common_words, axis='columns')
    return df

In [18]:
df_common_words = {}
common_words_time = {}
for key, df in df_cleaned.items():
    st = time.time()
    df_common_words[key] = common_words(df)
    common_words_time[key] = time.time() - st

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  del sys.path[0]


In [19]:
common_words_time

{400000: 15.429856061935425,
 1000: 0.035256147384643555,
 10000: 0.351970911026001,
 100000: 3.493546962738037,
 800000: 30.880029916763306,
 1200000: 50.30730128288269}

In [20]:
def common_words_idf_weighted(df, idf):
    def common_words_idf_weighted(row):
        question1 = row['question1']
        question2 = row['question2']
        common_words_weighted = 0
        questions_weights = 0
        for word in question1:
            idf_weight = idf[word]
            if word in question2:
                common_words_weighted += idf_weight
            questions_weights += idf_weight
        for word in question2:
            questions_weights += idf[word]
        if questions_weights > 0:
            return 2 * common_words_weighted / questions_weights
        else:
            return 0
    df['common_words_idf_weighted'] = df.apply(common_words_idf_weighted, axis='columns')
    return df

In [21]:
df_common_words_idf = {}
common_words_idf_time = {}
for key, df in df_common_words.items():
    idf = idf_dict[key]
    st = time.time()
    df_common_words_idf[key] = common_words_idf_weighted(df, idf)
    common_words_idf_time[key] = time.time() - st

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [22]:
common_words_idf_time

{400000: 25.642945051193237,
 1000: 0.052175045013427734,
 10000: 0.43926310539245605,
 100000: 3.947237014770508,
 800000: 32.78772020339966,
 1200000: 49.21293520927429}

###### Processing pour les models

In [23]:
train_dict = {}
test_dict = {}
x_train_dict = {}
x_test_dict = {}
y_train_dict = {}
y_test_dict = {}

selected_columns = ['common_words', 'common_words_idf_weighted']

for key, df in df_common_words_idf.items():
    train, test = split_dataset(df, taux_sep)
    train_dict[key] = train.dropna()
    test_dict[key] = test.dropna()

    x_train_dict[key] = train_dict[key][selected_columns]

    x_test_dict[key] = test_dict[key][selected_columns]

    y_train_dict[key] = train_dict[key].is_duplicate
    y_test_dict[key] = test_dict[key].is_duplicate



## 4.  Apprentissage et performance

Pour tester les perfomances des modeles nous allons entrainer une regression logistic, un random forest et un arbre de decision

In [24]:
from sklearn.metrics import accuracy_score

### Regression Logistique 

In [25]:
from sklearn.linear_model import LogisticRegression

lr_dict = {}
lr_time = {}

for key in train_dict:
    st = time.time()
    lr_dict[key] = LogisticRegression() 
    lr_dict[key].fit(x_train_dict[key], y_train_dict[key])
    lr_time[key] = time.time() - st



In [26]:
lr_time

{400000: 0.33588671684265137,
 1000: 0.0021741390228271484,
 10000: 0.006098031997680664,
 100000: 0.06316709518432617,
 800000: 0.61788010597229,
 1200000: 0.9612510204315186}

In [24]:
for key in train_dict:
    y_pred = lr_dict[key].predict(x_test_dict[key])
    print(key, accuracy_score(y_pred, y_test_dict[key]))

full 0.6707643853009804
1000 0.7133333333333334
10000 0.6756666666666666
100000 0.6802333333333334


### Arbres de décision

In [27]:
from sklearn import tree


dt_dict = {}

dt_time = {}

for key in train_dict:
    st = time.time()
    dt_dict[key] = tree.DecisionTreeClassifier()
    dt_dict[key].fit(x_train_dict[key], y_train_dict[key])
    dt_time[key] = time.time() - st

In [28]:
dt_time

{400000: 1.048037052154541,
 1000: 0.0037539005279541016,
 10000: 0.016834020614624023,
 100000: 0.19215893745422363,
 800000: 2.2114639282226562,
 1200000: 3.7533140182495117}

In [27]:
for key in train_dict:
    y_pred = dt_dict[key].predict(x_test_dict[key])
    print(key, accuracy_score(y_pred, y_test_dict[key]))

full 0.7066627091114464
1000 0.6733333333333333
10000 0.688
100000 0.7007666666666666


### Random Forests


In [29]:
from sklearn.ensemble import RandomForestClassifier

rf_dict = {}

rf_time = {}

for key in train_dict:
    st = time.time()
    rf_dict[key] = RandomForestClassifier()
    rf_dict[key].fit(x_train_dict[key], y_train_dict[key])
    rf_time[key] = time.time() - st



In [30]:
rf_time

{400000: 5.257158279418945,
 1000: 0.012971639633178711,
 10000: 0.06613922119140625,
 100000: 0.8663570880889893,
 800000: 14.837876796722412,
 1200000: 25.868048191070557}

In [30]:
for key in train_dict:
    y_pred = rf_dict[key].predict(x_test_dict[key])
    print(key, accuracy_score(y_pred, y_test_dict[key]))

full 0.7080313636251205
1000 0.6966666666666667
10000 0.6876666666666666
100000 0.7030333333333333
