In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
from collections import Counter
from sklearn.tree import DecisionTreeRegressor, plot_tree
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, GradientBoostingClassifier
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectPercentile, chi2
from sklearn.linear_model import LinearRegression, RidgeCV, ElasticNet, Ridge, Lasso
from helper import *
import shap
shap.initjs()

In [None]:
def get_score(model, feature, target):
    pred = model.predict(feature)
    score = pd.DataFrame(pred, columns=['Prediction'])
    score['GroundTruth'] = np.array(target)
    score['AbsoluteError'] = abs(score['GroundTruth'] - score['Prediction'])
    return score


def get_doubtful_values(score):
    return score[score['AbsoluteError'] > 1.5]

def get_truthful_values(score):
    return score[score['AbsoluteError'] < 0.5]


In [None]:
data = pd.read_csv("data/data_cleaned.csv")
data = data.drop(["Unnamed: 0"], axis=1)


In [None]:
y = data["OverallQual"]
X = data.drop(["OverallQual", 'SalePrice'], axis=1)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1,random_state=100, stratify=y )

ridge = RidgeCV()
ridge.fit(X_train, y_train)
print(ridge.score(X_test, y_test))
y_pred = ridge.predict(X)
y_score = get_score(ridge, X, y)
y_correct = get_truthful_values(y_score)

In [None]:
'''
Experiment:
- Original dataset, get doubtful value
- Corrupt this attrubute (the non-doubtful)
- See if the model discover the pattern.5
'''

explainer = shap.Explainer(ridge, X, algorithm="linear", seed=100)
shap_values = explainer(X)


In [None]:
data_correct = data.iloc[y_correct.index]
data_correct['data_index'] = data_correct.index
data_correct = data_correct.reset_index(drop=True)
data_correct['index'] = data_correct.index
data_correct.set_index(['index', 'data_index'], inplace=True)

In [None]:
corruption_list = []
old_new_value = []
OverallQual_sample_fraction = 0.03
for i in data_correct.sample(frac = OverallQual_sample_fraction).index:
    old = data_correct.loc[i, 'OverallQual']
    new = np.random.randint(1, 11, 1)[0]
    while abs(old - new) < 3:
        new = np.random.randint(1, 11, 1)[0]
    data_correct.loc[i, 'OverallQual'] = new
    corruption_list.append(i)
    old_new_value.append((old, new))


In [None]:
y_corrupted = data_correct["OverallQual"]
X_corrupted = data_correct.drop(["OverallQual", 'SalePrice'], axis=1)

X_train, X_test, y_train, y_test = train_test_split(X_corrupted, y_corrupted, test_size=0.1,random_state=1)

ridge = RidgeCV()
ridge.fit(X_train, y_train)
print(ridge.score(X_test, y_test))



In [22]:
y_corrupted_score = get_score(ridge, X_corrupted, y_corrupted)
y_corrupted_doubt = get_doubtful_values(y_corrupted_score)
y_corrupted_doubt['OrgGroundTruth'] = y[y_corrupted_doubt.index]
y_corrupted_doubt['Corrupted'] = y_corrupted_doubt['OrgGroundTruth'] != y_corrupted_doubt['GroundTruth']
y_corrupted_doubt['OrgPredict'] = y_score.loc[y_corrupted_doubt.index]['Prediction']

y_corrupted_doubt


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


Unnamed: 0,Prediction,GroundTruth,AbsoluteError,OrgGroundTruth,Corrupted,OrgPredict
13,7.661861,4,3.661861,7,True,7.602317
57,7.808176,5,2.808176,7,True,6.917212
74,5.259113,2,3.259113,3,True,4.793183
82,7.716001,4,3.716001,8,True,7.155262
97,6.751688,1,5.751688,4,True,5.320496
122,7.351806,10,2.648194,6,True,5.056919
132,5.025599,8,2.974401,5,True,5.460865
150,6.460713,10,3.539287,5,True,5.087732
155,5.651058,2,3.651058,6,True,4.353812
192,7.098166,10,2.901834,7,True,6.913467


In [23]:
y_corrupted_doubt[y_corrupted_doubt['Corrupted'] == False]

Unnamed: 0,Prediction,GroundTruth,AbsoluteError,OrgGroundTruth,Corrupted,OrgPredict
194,8.855968,5,3.855968,5,False,4.982176
981,5.317158,8,2.682842,8,False,8.09335
1373,7.190756,10,2.809244,10,False,9.297857


In [None]:
explainer_corruption = shap.Explainer(ridge, X_corrupted, algorithm="linear", seed=100)
shap_values_corruption = explainer_corruption(X_corrupted)
