In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
from collections import Counter
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.model_selection import ParameterGrid
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report, precision_recall_fscore_support
import ast
LABEL_SHORT = "short"
LABEL_LONG = "long"
THRESHOLD = 21
SEED = 42

## Auxiliar Functions

In [2]:
def plot_cm(cm, filename):
    '''
        Takes in a confusion matrix and saves it as a PNG image.
    '''
    plt.imshow(cm, cmap=plt.cm.Reds)
    plt.tight_layout()
    plt.ylabel('True label', fontsize=6)
    plt.xlabel('Predicted label', fontsize=6)
    plt.xticks(np.arange(len(Config.LABELS)), Config.LABELS, fontsize=8)
    plt.yticks(np.arange(len(Config.LABELS)), Config.LABELS, fontsize=8)

    thresh = cm.max() * 0.75
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, format(cm[i, j], 'd'),
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")

    plt.savefig(filename, dpi=300)
    plt.close()

def get_response_time_label(time):
    if time < THRESHOLD:
        return LABEL_SHORT
    else:
        return LABEL_LONG

def read_dataset(filename):
    dtypes = {"response_time_sec": np.int32, "session_id": np.int32}
    converters = {"question": ast.literal_eval, "response": ast.literal_eval}
    path = filename
    data = pd.read_csv(path, sep=",", header=0, dtype=dtypes, converters=converters)
    return data

def dummy_tokenizer(tokens):
    return tokens

# Clarification Request Classifier

## Load dataset

In [3]:
dataset_path = "cs224u-project/data/dataset_qr/"

In [4]:
train_df = read_dataset(dataset_path + 'train_question_text_and_response_text_dataset.csv')
test_df = read_dataset(dataset_path + 'test_question_text_and_response_text_dataset.csv')
dev_df = read_dataset(dataset_path + 'dev_question_text_and_response_text_dataset.csv')
tiny_df = read_dataset(dataset_path + 'tiny_question_text_and_response_text_dataset.csv')

In [5]:
data = pd.DataFrame()
for file in os.listdir(dataset_path):
    df = read_dataset(dataset_path+file)
    data = data.append(df)

In [6]:
data = data.reset_index(drop=True)

In [7]:
data['cr'] = data.apply(lambda x: 1 if '?' in x.response else 0, axis=1)

In [8]:
cr_df = data[data.cr == 1]

In [9]:
not_cr_df = data[data.cr == 0].sample(n=cr_df.shape[0])

In [10]:
cr_df[cr_df.cr == 1].sort_values('response_time_sec', ascending=False)

Unnamed: 0,question,response,response_time_sec,session_id,cr
112709,"[Awesome, !, So, from, that, what, is, angle, ...","[Hi, are, u, still, there, ?]",2356,318083,1
163314,"[Okay, ., No, worries, !, How, far, you, have,...","[hello, ?]",832,324841,1
25827,"[<url>, Did, you, receive, the, image, ?]",[?],708,305314,1
157329,"[Do, n't, worry, ;, I, will, guide, you, ., Wh...","[Multiply, ?]",682,324068,1
47566,"[In, problem, number, 9, ,, did, you, first, d...","[, Hey, ,, are, you, still, here, ?, I, got, d...",635,309737,1
325153,"[This, means, that, when, we, simplify, ,, we,...","[?, ?]",439,318102,1
398443,"[Very, good, !, , Now, ,, what, about, "", a, ""...","[7, and, 9, ?]",422,325028,1
112510,"[Yes, ., That, 's, correct, ., We, need, to, s...","[hm, .., would, nt, it, be, r, =, sqrt((l^2/pi...",408,318067,1
161067,"[Yes, ,, we, add, the, downpayment, +, ten, in...","[part, a, is, the, same, formula, as, b, ?]",377,324522,1
431478,"[Not, a, problem, !, We, got, vertical, asympt...","[wym, as, y, tends, to, large, value, ?]",370,342965,1


In [11]:
cr_df['target'] = cr_df.response_time_sec.apply(get_response_time_label)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [12]:
cr_df = cr_df.append(not_cr_df, ignore_index=True)
Counter(cr_df[cr_df.cr == 1]['target'])

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  sort=sort)


Counter({'long': 29882, 'short': 16499})

In [13]:
X_train, X_test, y_train, y_test = train_test_split(cr_df.question, cr_df.cr, test_size=0.33, random_state=SEED)

In [14]:
models = {}
best_f1 = 0
best_grid = {}
report = []
cm = []

params = dict([
    #('clf__C', [0.001, 0.01, 0.1, 0.5, 1]),
    ('clf__C', [0.5]),
    #('clf__ngram_range', [(1,1), (1,2), (1,3)]),
    #('clf__penalty', ['l2', 'l1']),
    ('clf__penalty', ['l2']),
])

pipe = Pipeline([
    ('vect', CountVectorizer(tokenizer=dummy_tokenizer, lowercase=False, stop_words='english', ngram_range=(1,3))),
    ('tfidf', TfidfTransformer()),
    ('clf', LogisticRegression(class_weight='balanced', random_state=SEED, solver='saga', n_jobs=4))
])

for g in ParameterGrid(params):
    pipe.set_params(**g)
    pipe.fit(X_train, y_train)
    preds = pipe.predict(X_test)
    p, r, f, s = precision_recall_fscore_support(y_test, preds, average='weighted')
    print(g)
    print(f)
    if f > best_f1:
        best_f1 = f
        best_grid = g
        report = classification_report(y_test, preds)
        cm = confusion_matrix(y_test, preds)
print("Logistic Regression: ")
print(best_grid)
print(report)
models['Logistic Regression'] = best_grid

  'stop_words.' % sorted(inconsistent))


{'clf__C': 0.5, 'clf__penalty': 'l2'}
0.6127324625009728
Logistic Regression: 
{'clf__C': 0.5, 'clf__penalty': 'l2'}
              precision    recall  f1-score   support

           0       0.63      0.55      0.59     15241
           1       0.60      0.68      0.64     15371

   micro avg       0.61      0.61      0.61     30612
   macro avg       0.62      0.61      0.61     30612
weighted avg       0.62      0.61      0.61     30612



In [15]:
params = dict([
    ('clf__C', [0.01, 1]),
    #('clf__loss', ['hinge', 'squared_hinge']),
])

pipe2 = Pipeline([
        ('vect', CountVectorizer(tokenizer=dummy_tokenizer, lowercase=False, ngram_range=(1,3))),
        ('tfidf', TfidfTransformer()),
        ('clf', LinearSVC(class_weight='balanced', random_state=SEED))
])

best_f1 = 0
best_grid = {}
report = []
cm = []
for g in ParameterGrid(params):
    pipe2.set_params(**g)
    pipe2.fit(X_train, y_train)
    preds = pipe2.predict(X_test)
    p, r, f, s = precision_recall_fscore_support(y_test, preds, average='weighted')
    print(g)
    print(f)
    if f > best_f1:
        best_f1 = f
        best_grid = g
        report = classification_report(y_test, preds)
        cm = confusion_matrix(y_test, preds)

print("Linear SVM: ")
print(best_grid)
print(report)

models['Linear SVM'] = best_grid

{'clf__C': 0.01}
0.6190285532119968
{'clf__C': 1}
0.5963270827321863
Linear SVM: 
{'clf__C': 0.01}
              precision    recall  f1-score   support

           0       0.64      0.54      0.59     15241
           1       0.61      0.70      0.65     15371

   micro avg       0.62      0.62      0.62     30612
   macro avg       0.62      0.62      0.62     30612
weighted avg       0.62      0.62      0.62     30612



In [16]:
data['cr_predicted'] = pipe2.predict(data.question)

## Tag dataset

In [17]:
def annotate_dataframes(data, pipe):
    data['cr_predicted'] = pipe.predict(data.question)
    data['target'] = data.response_time_sec.apply(get_response_time_label)
    data['len'] = data.question.apply(len)
    return data

In [18]:
train_df = annotate_dataframes(train_df, pipe)
test_df = annotate_dataframes(test_df, pipe)
dev_df = annotate_dataframes(dev_df, pipe)

In [19]:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import FeatureUnion
class SklearnModel(object):
    def __init__(self, name, pipe, params_range):
        self.name = name
        self.pipe = pipe
        self.params_range = params_range

def text_and_scalar_pipe(scalar, clf):
    return Pipeline([
        ('union', FeatureUnion(
            transformer_list=[
                ('text', text_selector_pipe('question')),
                ('scalar', scalar_selector_pipe(scalar))
                ])),
        ('clf', clf)
        ])

def text_and_scalars_pipe(scalars, clf):
    transformer_list = [ ('question', text_selector_pipe('question'))]
    for scalar in scalars:
        transformer_list.append((scalar, scalar_selector_pipe(scalar)))
    return Pipeline([
        ('union', FeatureUnion(
            transformer_list = transformer_list
            )),
        ('clf', clf)
        ])

def text_selector_pipe(key):
    return Pipeline([
             ('select', ItemSelector(key=key)),
             ('vect', CountVectorizer(tokenizer=dummy_tokenizer, lowercase=False, ngram_range=(1,3))),
             ('tfidf', TfidfTransformer())
             ])

class ItemSelector(BaseEstimator, TransformerMixin):
    def __init__(self, key):
        self.key = key

    def fit(self, x, y=None):
        return self

    def transform(self, data_dict):
        return data_dict[self.key]

def scalar_selector_pipe(key):
    return Pipeline([
                    ('select', ItemSelector(key=key)),
                    ('reshape', Reshape()),
                    ])

class Reshape(BaseEstimator, TransformerMixin):
    def fit(self, x, y=None):
        return self

    def transform(self, data):
        data = np.array(data)
        assert len(data.shape) == 1
        return data.reshape((-1, 1))

# SVM for time response classifier

In [20]:
svm_params = {'clf__C': np.logspace(-3,1,5), 'clf__loss': ['squared_hinge']}
SVMWithScalars = SklearnModel("svm", text_and_scalars_pipe(['cr_predicted'], LinearSVC(class_weight='balanced', random_state=SEED)), svm_params)

In [21]:
SVMWithScalars.pipe.fit(train_df[['question', 'cr_predicted']], train_df['target'])

Pipeline(memory=None,
     steps=[('union', FeatureUnion(n_jobs=None,
       transformer_list=[('question', Pipeline(memory=None,
     steps=[('select', ItemSelector(key='question')), ('vect', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='co... max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=42, tol=0.0001,
     verbose=0))])

In [22]:
pred_test = SVMWithScalars.pipe.predict(test_df[['question', 'cr_predicted']])

In [23]:
report_test = classification_report(test_df['target'], pred_test)
print(report_test)

              precision    recall  f1-score   support

        long       0.62      0.66      0.64     33594
       short       0.61      0.57      0.59     30942

   micro avg       0.62      0.62      0.62     64536
   macro avg       0.62      0.61      0.61     64536
weighted avg       0.62      0.62      0.62     64536



In [24]:
pred_dev = SVMWithScalars.pipe.predict(dev_df[['question', 'cr_predicted']])
report_dev = classification_report(dev_df['target'], pred_dev)
print(report_dev)

              precision    recall  f1-score   support

        long       0.62      0.66      0.64     33439
       short       0.60      0.56      0.58     31237

   micro avg       0.61      0.61      0.61     64676
   macro avg       0.61      0.61      0.61     64676
weighted avg       0.61      0.61      0.61     64676



# Len + CR


In [25]:
SVMWithScalars2 = SklearnModel("svm", text_and_scalars_pipe(['cr_predicted', 'len'], LinearSVC(class_weight='balanced', random_state=SEED)), svm_params)

In [26]:
SVMWithScalars2.pipe.fit(train_df[['question', 'cr_predicted', 'len']], train_df['target'])



Pipeline(memory=None,
     steps=[('union', FeatureUnion(n_jobs=None,
       transformer_list=[('question', Pipeline(memory=None,
     steps=[('select', ItemSelector(key='question')), ('vect', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='co... max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=42, tol=0.0001,
     verbose=0))])

In [27]:
pred_dev2 = SVMWithScalars2.pipe.predict(dev_df[['question', 'cr_predicted', 'len']])
report_dev2 = classification_report(dev_df['target'], pred_dev2)
print(report_dev2)

              precision    recall  f1-score   support

        long       0.57      0.82      0.68     33439
       short       0.64      0.35      0.45     31237

   micro avg       0.59      0.59      0.59     64676
   macro avg       0.61      0.58      0.56     64676
weighted avg       0.61      0.59      0.57     64676



In [28]:
pred_test2 = SVMWithScalars2.pipe.predict(test_df[['question', 'cr_predicted', 'len']])
report_test2 = classification_report(test_df['target'], pred_test2)
print(report_test2)

              precision    recall  f1-score   support

        long       0.58      0.83      0.68     33594
       short       0.66      0.36      0.46     30942

   micro avg       0.60      0.60      0.60     64536
   macro avg       0.62      0.59      0.57     64536
weighted avg       0.62      0.60      0.58     64536



In [29]:
log_params = {'clf__C': np.logspace(-3,0,3), 'clf__penalty': ['l2']}
LogisticWithScalars = SklearnModel("logistic", text_and_scalars_pipe(['cr_predicted'], LogisticRegression(class_weight='balanced', random_state=SEED)), log_params)

# LR for time response classifier

In [30]:
LogisticWithScalars.pipe.fit(train_df[['question', 'cr_predicted']], train_df['target'])



Pipeline(memory=None,
     steps=[('union', FeatureUnion(n_jobs=None,
       transformer_list=[('question', Pipeline(memory=None,
     steps=[('select', ItemSelector(key='question')), ('vect', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='co..., penalty='l2', random_state=42,
          solver='warn', tol=0.0001, verbose=0, warm_start=False))])

In [31]:
pred_test = LogisticWithScalars.pipe.predict(test_df[['question', 'cr_predicted']])

In [32]:
report_test = classification_report(test_df['target'], pred_test)
print(report_test)

              precision    recall  f1-score   support

        long       0.64      0.70      0.67     33594
       short       0.63      0.57      0.60     30942

   micro avg       0.64      0.64      0.64     64536
   macro avg       0.64      0.63      0.63     64536
weighted avg       0.64      0.64      0.63     64536



In [33]:
pred_dev = LogisticWithScalars.pipe.predict(dev_df[['question', 'cr_predicted']])
report_dev = classification_report(dev_df['target'], pred_dev)
print(report_dev)

              precision    recall  f1-score   support

        long       0.63      0.69      0.66     33439
       short       0.63      0.56      0.59     31237

   micro avg       0.63      0.63      0.63     64676
   macro avg       0.63      0.63      0.63     64676
weighted avg       0.63      0.63      0.63     64676



# LR + CR + LEN

In [34]:
LogisticWithScalars2 = SklearnModel("logistic", text_and_scalars_pipe(['cr_predicted', 'len'], LogisticRegression(class_weight='balanced', random_state=SEED)), log_params)

In [35]:
LogisticWithScalars2.pipe.fit(train_df[['question', 'cr_predicted', 'len']], train_df['target'])

Pipeline(memory=None,
     steps=[('union', FeatureUnion(n_jobs=None,
       transformer_list=[('question', Pipeline(memory=None,
     steps=[('select', ItemSelector(key='question')), ('vect', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='co..., penalty='l2', random_state=42,
          solver='warn', tol=0.0001, verbose=0, warm_start=False))])

In [36]:
pred_test2 = LogisticWithScalars2.pipe.predict(test_df[['question', 'cr_predicted', 'len']])

In [37]:
report_test2 = classification_report(test_df['target'], pred_test2)
print(report_test2)

              precision    recall  f1-score   support

        long       0.65      0.66      0.65     33594
       short       0.62      0.61      0.62     30942

   micro avg       0.64      0.64      0.64     64536
   macro avg       0.64      0.64      0.64     64536
weighted avg       0.64      0.64      0.64     64536



In [38]:
pred_dev2 = LogisticWithScalars2.pipe.predict(dev_df[['question', 'cr_predicted', 'len']])
report_dev2 = classification_report(dev_df['target'], pred_dev2)
print(report_dev2)

              precision    recall  f1-score   support

        long       0.64      0.65      0.65     33439
       short       0.62      0.61      0.61     31237

   micro avg       0.63      0.63      0.63     64676
   macro avg       0.63      0.63      0.63     64676
weighted avg       0.63      0.63      0.63     64676



# Dataset Analysis

In [40]:
from collections import Counter
Counter(cr_df[cr_df.cr == 1]['target'])

Counter({'long': 29882, 'short': 16499})

In [41]:
Counter(cr_df.cr)

Counter({1: 46381, 0: 46381})

In [42]:
Counter(train_df['target'])

Counter({'long': 156793, 'short': 146023})

In [43]:
Counter(test_df['target'])

Counter({'short': 30942, 'long': 33594})

In [44]:
Counter(dev_df['target'])

Counter({'short': 31237, 'long': 33439})