In [1]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.tree import DecisionTreeRegressor, ExtraTreeRegressor
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor
from sklearn.metrics import mean_absolute_error, accuracy_score
from scipy.sparse import hstack
from sklearn.model_selection import KFold
from sklearn.feature_extraction.text import TfidfVectorizer
import re
from sklearn.decomposition import TruncatedSVD

  from numpy.core.umath_tests import inner1d


In [24]:
def get_sum(text):
    nums = [int(x) for x in re.findall('\d+', text)]
    if nums:
        return float(sum(nums))
    else:
        return 0

def get_product(text):
    nums = [int(x) for x in re.findall('\d+', text)]
    if nums:
        prod = 1
        for num in nums:
            prod *= num
        return float(prod)
    else:
        return 0    

def get_aminusb(text):
    nums = [int(x) for x in re.findall('\d+', text)]
    if len(nums) == 2:
        res = float(nums[0] - nums[1])
        if res:
            return res
        else:
            return 0.
    else:
        return 0.
    
def get_bminusa(text):
    nums = [int(x) for x in re.findall('\d+', text)]
    if len(nums) == 2:
        res = float(nums[1] - nums[0])
        if res:
            return res
        else:
            return 0.
    else:
        return 0.
    
def get_adevideb(text):
    nums = [int(x) for x in re.findall('\d+', text)]
    if len(nums) == 2:
        res = round(nums[0]/nums[1])
        if res:
            return res
        else:
            return 0.
    else:
        return 0.
    
def get_bdevidea(text):
    nums = [int(x) for x in re.findall('\d+', text)]
    if len(nums) == 2:
        res = round(nums[1]/nums[0])
        if res:
            return res
        else:
            return 0.
    else:
        return 0.



In [4]:
pd.set_option('display.max_colwidth', 1000)

In [17]:
data_train = pd.read_csv('augmented_sents_train.csv', sep='\t', quotechar="'", 
                         names=['context', 'question', 'answer'])
data_test = pd.read_csv('augmented_sents_test.csv', sep='\t', quotechar="'", 
                        names=['context', 'question', 'answer'])

In [18]:
data_train.dropna(inplace=True)
data_test.dropna(inplace=True)

In [172]:
data_train.head(5)

Unnamed: 0,context,question,answer,text
0,Витя и Катя взяли из пакетика 40 конфетки.,"Сколько взяла Катя, если Витя взял 34?",6,"Витя и Катя взяли из пакетика 40 конфетки.Сколько взяла Катя, если Витя взял 34?"
1,Витя и Катя взяли из пакетика 49 конфетки.,"Сколько взяла Катя, если Витя взял 40?",9,"Витя и Катя взяли из пакетика 49 конфетки.Сколько взяла Катя, если Витя взял 40?"
2,Витя и Катя взяли из пакетика 48 конфетки.,"Сколько взяла Катя, если Витя взял 33?",15,"Витя и Катя взяли из пакетика 48 конфетки.Сколько взяла Катя, если Витя взял 33?"
3,Витя и Катя взяли из пакетика 38 конфетки.,"Сколько взяла Катя, если Витя взял 18?",20,"Витя и Катя взяли из пакетика 38 конфетки.Сколько взяла Катя, если Витя взял 18?"
4,Витя и Катя взяли из пакетика 28 конфетки.,"Сколько взяла Катя, если Витя взял 10?",18,"Витя и Катя взяли из пакетика 28 конфетки.Сколько взяла Катя, если Витя взял 10?"


In [20]:
tfidf = TfidfVectorizer(max_features=10000)
X_context_train = tfidf.fit_transform(data_train['context'])
X_question_train = tfidf.transform(data_train['question'])

X_context_test = tfidf.transform(data_test['context'])
X_question_test = tfidf.transform(data_test['question'])

In [21]:
X_train = hstack([X_context_train, X_question_train])
X_train = X_train.tocsr()

X_test = hstack([X_context_test, X_question_test])
X_test = X_test.tocsr()

# 1. Сумма всех чисел в тексте задачи

In [34]:
sum_all_train = data_train.text.apply(get_sum)
sum_all_test = data_test.text.apply(get_sum)
print(accuracy_score(y_train, sum_all_train))
print(accuracy_score(y_test, sum_all_test))

0.17899603698811095
0.0821917808219178


# 2. Произведение всех чисел в тексте задачи

In [35]:
product_all_train = data_train.text.apply(get_product)
product_all_test = data_test.text.apply(get_product)
print(accuracy_score(y_train, product_all_train))
print(accuracy_score(y_test, product_all_test))

0.10766182298546896
0.1113013698630137


## 3. Текстовые признаки + признаки на извлеченных числах

In [37]:
data_train['text'] = data_train['context'] + data_train['question']

sums = data_train.text.apply(get_sum).values.reshape(-1, 1)
product = data_train.text.apply(get_product).values.reshape(-1, 1)
aminusb = data_train.text.apply(get_aminusb).values.reshape(-1, 1)
bminusa = data_train.text.apply(get_bminusa).values.reshape(-1, 1)

adevideb = data_train.text.apply(get_adevideb).values.reshape(-1, 1)
bdevidea = data_train.text.apply(get_bdevidea).values.reshape(-1, 1)

sums_context = data_train.context.apply(get_sum).values.reshape(-1, 1)
product_context = data_train.context.apply(get_product).values.reshape(-1, 1)
aminusb_context = data_train.context.apply(get_aminusb).values.reshape(-1, 1)
bminusa_context = data_train.context.apply(get_bminusa).values.reshape(-1, 1)

adevideb_context = data_train.context.apply(get_adevideb).values.reshape(-1, 1)
bdevidea_context = data_train.context.apply(get_bdevidea).values.reshape(-1, 1)

sums_question = data_train.question.apply(get_sum).values.reshape(-1, 1)
product_question = data_train.question.apply(get_product).values.reshape(-1, 1)
aminusb_question = data_train.question.apply(get_aminusb).values.reshape(-1, 1)
bminusa_question = data_train.question.apply(get_bminusa).values.reshape(-1, 1)

adevideb_question = data_train.question.apply(get_adevideb).values.reshape(-1, 1)
bdevidea_question = data_train.question.apply(get_bdevidea).values.reshape(-1, 1)

X_train_with_manual = hstack([X_train, sums, aminusb, bminusa, 
                        product, adevideb, bdevidea,
                       sums_context, aminusb_context, bminusa_context, 
                        product_context, adevideb_context, bdevidea_context,
                       sums_question, aminusb_question, bminusa_question, 
                        product_question, adevideb_question, bdevidea_question])

In [129]:
data_test['text'] = data_test['context'] + data_test['question']

sums = data_test.text.apply(get_sum).values.reshape(-1, 1)
product = data_test.text.apply(get_product).values.reshape(-1, 1)
aminusb = data_test.text.apply(get_aminusb).values.reshape(-1, 1)
bminusa = data_test.text.apply(get_bminusa).values.reshape(-1, 1)

adevideb = data_test.text.apply(get_adevideb).values.reshape(-1, 1)
bdevidea = data_test.text.apply(get_bdevidea).values.reshape(-1, 1)

sums_context = data_test.context.apply(get_sum).values.reshape(-1, 1)
product_context = data_test.context.apply(get_product).values.reshape(-1, 1)
aminusb_context = data_test.context.apply(get_aminusb).values.reshape(-1, 1)
bminusa_context = data_test.context.apply(get_bminusa).values.reshape(-1, 1)

adevideb_context = data_test.context.apply(get_adevideb).values.reshape(-1, 1)
bdevidea_context = data_test.context.apply(get_bdevidea).values.reshape(-1, 1)

sums_question = data_test.question.apply(get_sum).values.reshape(-1, 1)
product_question = data_test.question.apply(get_product).values.reshape(-1, 1)
aminusb_question = data_test.question.apply(get_aminusb).values.reshape(-1, 1)
bminusa_question = data_test.question.apply(get_bminusa).values.reshape(-1, 1)

adevideb_question = data_test.question.apply(get_adevideb).values.reshape(-1, 1)
bdevidea_question = data_test.question.apply(get_bdevidea).values.reshape(-1, 1)

X_test_with_manual = hstack([X_test, sums, aminusb, bminusa, 
                        product, adevideb, bdevidea,
                       sums_context, aminusb_context, bminusa_context, 
                        product_context, adevideb_context, bdevidea_context,
                       sums_question, aminusb_question, bminusa_question, 
                        product_question, adevideb_question, bdevidea_question])

In [130]:
X_train_with_manual = X_train_with_manual.tocsr()
X_test_with_manual = X_test_with_manual.tocsr()

In [131]:
y_train = data_train['answer'].values
y_test = data_test['answer'].values

y_train_bin = data_train['answer'].apply(lambda x: 1 if x == -1 else 0).values
y_test_bin = data_test['answer'].apply(lambda x: 1 if x == -1  else 0).values

In [166]:
svd = TruncatedSVD(30)
X_train_with_manual_svd = svd.fit_transform(X_train_with_manual)
X_test_with_manual_svd = svd.transform(X_test_with_manual)


### 3.1 Определение некорректных вопросов

In [168]:
from sklearn.linear_model import LogisticRegressionCV, RidgeCV
from sklearn.tree import DecisionTreeRegressor, DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor

from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report

In [184]:
clf = LogisticRegressionCV(class_weight='balanced', random_state=42)
clf.fit(X_train_with_manual, y_train_bin)
preds = clf.predict(X_test_with_manual)
print(classification_report(y_test_bin, preds))

clf = LogisticRegressionCV(class_weight='balanced', random_state=42)
clf.fit(X_train_with_manual_svd, y_train_bin)
preds = clf.predict(X_test_with_manual_svd)
print(classification_report(y_test_bin, preds))

             precision    recall  f1-score   support

          0       0.77      1.00      0.87       433
          1       0.96      0.17      0.28       151

avg / total       0.82      0.78      0.72       584

             precision    recall  f1-score   support

          0       0.79      0.80      0.80       433
          1       0.41      0.41      0.41       151

avg / total       0.70      0.70      0.70       584



In [185]:
clf = DecisionTreeClassifier(max_depth=14, class_weight='balanced', random_state=42)
clf.fit(X_train_with_manual, y_train_bin)
preds = clf.predict(X_test_with_manual)
print(classification_report(y_test_bin, preds))

clf = DecisionTreeClassifier(max_depth=14, class_weight='balanced', random_state=42)
clf.fit(X_train_with_manual_svd, y_train_bin)
preds = clf.predict(X_test_with_manual_svd)
print(classification_report(y_test_bin, preds))


             precision    recall  f1-score   support

          0       0.82      0.89      0.85       433
          1       0.59      0.43      0.50       151

avg / total       0.76      0.77      0.76       584

             precision    recall  f1-score   support

          0       0.75      0.89      0.81       433
          1       0.29      0.13      0.18       151

avg / total       0.63      0.70      0.65       584



In [186]:
clf = RandomForestClassifier(max_depth=4, n_estimators=100, class_weight='balanced', random_state=42)
clf.fit(X_train_with_manual, y_train_bin)
preds = clf.predict(X_test_with_manual)
print(classification_report(y_test_bin, preds))

clf = RandomForestClassifier(max_depth=4, n_estimators=100, class_weight='balanced', random_state=42)
clf.fit(X_train_with_manual_svd, y_train_bin)
preds = clf.predict(X_test_with_manual_svd)
print(classification_report(y_test_bin, preds))

             precision    recall  f1-score   support

          0       0.81      0.99      0.89       433
          1       0.91      0.32      0.48       151

avg / total       0.83      0.82      0.78       584

             precision    recall  f1-score   support

          0       0.76      0.84      0.80       433
          1       0.36      0.26      0.30       151

avg / total       0.66      0.69      0.67       584



In [192]:
clf = KNeighborsClassifier(n_neighbors=10)
clf.fit(X_train_with_manual, y_train_bin)
preds_bin = clf.predict(X_test_with_manual)
print(classification_report(y_test_bin, preds_bin))

clf = KNeighborsClassifier(n_neighbors=10)
clf.fit(X_train_with_manual_svd, y_train_bin)
preds_bin = clf.predict(X_test_with_manual_svd)
print(classification_report(y_test_bin, preds_bin))

             precision    recall  f1-score   support

          0       0.77      1.00      0.87       433
          1       0.96      0.15      0.25       151

avg / total       0.82      0.78      0.71       584

             precision    recall  f1-score   support

          0       0.77      1.00      0.87       433
          1       0.92      0.15      0.25       151

avg / total       0.81      0.78      0.71       584



### 3.2 Предсказание точного ответа

In [193]:
# возьмем предсказание лучшей модели определения некорректных вопросов
clf = RandomForestClassifier(max_depth=4, n_estimators=100, class_weight='balanced', random_state=42)
clf.fit(X_train_with_manual, y_train_bin)
preds_bin = clf.predict(X_test_with_manual)

In [194]:
# удалим из обучающей выборки некорректные вопросы
only_good_train_X = X_train_with_manual[y_train != -1]
only_good_train_y = y_train[y_train != -1]

# удалим из обучающей выборки некорректные вопросы
only_good_train_X_svd = X_train_with_manual_svd[y_train != -1]

In [200]:
# обучим регрессию на корректных вопросах 
regr = RidgeCV()
scaler = StandardScaler(with_mean=False)
scaler.fit(only_good_train_X, )

regr.fit(scaler.transform(only_good_train_X), only_good_train_y)
# округлим предсказания
preds = (regr.predict(scaler.transform(X_test_with_manual))).astype(int)
# добавим предсказания некорректных вопросов
preds[preds_bin == 1] = -1
print(accuracy_score(y_test, preds))


# обучим регрессию на корректных вопросах 
regr = RidgeCV()
scaler = StandardScaler(with_mean=False)
scaler.fit(only_good_train_X_svd, )

regr.fit(scaler.transform(only_good_train_X_svd), only_good_train_y)
# округлим предсказания
preds = (regr.predict(scaler.transform(X_test_with_manual_svd))).astype(int)
# добавим предсказания некорректных вопросов
preds[preds_bin == 1] = -1
print(accuracy_score(y_test, preds))

0.0839041095890411
0.08904109589041095


In [203]:
# data_test.iloc[preds==y_test]

In [204]:
# обучим регрессию на корректных вопросах 
regr = RandomForestRegressor(random_state=42)
scaler = StandardScaler(with_mean=False)
scaler.fit(only_good_train_X, )

regr.fit(scaler.transform(only_good_train_X), only_good_train_y)
# округлим предсказания
preds = (regr.predict(scaler.transform(X_test_with_manual))).astype(int)
# добавим предсказания некорректных вопросов
preds[preds_bin == 1] = -1
print(accuracy_score(y_test, preds))


# обучим регрессию на корректных вопросах 
regr = RandomForestRegressor(random_state=42)
scaler = StandardScaler(with_mean=False)
scaler.fit(only_good_train_X_svd, )

regr.fit(scaler.transform(only_good_train_X_svd), only_good_train_y)
# округлим предсказания
preds = (regr.predict(scaler.transform(X_test_with_manual_svd))).astype(int)
# добавим предсказания некорректных вопросов
preds[preds_bin == 1] = -1
print(accuracy_score(y_test, preds))

0.09075342465753425
0.0839041095890411


In [206]:
# data_test.iloc[preds==y_test]

In [207]:
# обучим регрессию на корректных вопросах 
regr = KNeighborsRegressor(n_neighbors=3)
scaler = StandardScaler(with_mean=False)
scaler.fit(only_good_train_X, )

regr.fit(scaler.transform(only_good_train_X), only_good_train_y)
# округлим предсказания
preds = (regr.predict(scaler.transform(X_test_with_manual))).astype(int)
# добавим предсказания некорректных вопросов
preds[preds_bin == 1] = -1
print(accuracy_score(y_test, preds))


# обучим регрессию на корректных вопросах 
regr = KNeighborsRegressor(n_neighbors=3)
scaler = StandardScaler(with_mean=False)
scaler.fit(only_good_train_X_svd, )

regr.fit(scaler.transform(only_good_train_X_svd), only_good_train_y)
# округлим предсказания
preds = (regr.predict(scaler.transform(X_test_with_manual_svd))).astype(int)
# добавим предсказания некорректных вопросов
preds[preds_bin == 1] = -1
print(accuracy_score(y_test, preds))

0.08904109589041095
0.0839041095890411


In [209]:
# data_test.iloc[preds==y_test]