In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, confusion_matrix
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.feature_selection import SelectKBest, f_classif, mutual_info_classif, chi2, RFE, SelectFromModel, SequentialFeatureSelector
from sklearn.model_selection import train_test_split
from sklearn import svm
from sklearn import tree
from sklearn import ensemble
from sklearn.impute import SimpleImputer
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler
import ada

In [2]:
# Key of the classifier used by every experiment cell; see model_creator.
model_to_use = "xgboost"


def model_creator(model_to_use, random_state=42):
    """Build a fresh, unfitted scikit-learn classifier for the given key.

    Parameters
    ----------
    model_to_use : str
        "cart"          -> tree.DecisionTreeClassifier
        "random_forest" -> ensemble.RandomForestClassifier
        "xgboost"       -> ensemble.GradientBoostingClassifier (scikit-learn's
                           gradient boosting, not the XGBoost library)
        "nonlinearSVM"  -> svm.SVC with an RBF kernel
        any other value -> svm.SVC with a linear kernel
    random_state : int, default 42
        Seed passed to every estimator so repeated runs give the same model.
        Previously only the linear SVC was seeded.

    Returns
    -------
    A new estimator instance; nothing is fitted here.
    """
    if model_to_use == "cart":
        return tree.DecisionTreeClassifier(random_state=random_state)
    elif model_to_use == "random_forest":
        return ensemble.RandomForestClassifier(random_state=random_state)
    elif model_to_use == "xgboost":
        return ensemble.GradientBoostingClassifier(random_state=random_state)
    elif model_to_use == "nonlinearSVM":
        return svm.SVC(kernel="rbf", random_state=random_state)
    else:
        # Unknown keys deliberately fall back to the linear SVM.
        return svm.SVC(kernel='linear', random_state=random_state)

In [3]:
# Baseline experiment: use the main table only, without any joined tables.
base = pd.read_csv("steel-plate-fault/steel_plate_fault.csv", index_col=0)
# One-hot encode non-numeric columns, then coerce whatever is left to numbers
# (values that cannot be parsed become NaN and are imputed below).
base = pd.get_dummies(base).apply(pd.to_numeric, errors='coerce')
keys = base.keys()

# Split BEFORE imputing: the fill values must come from training rows only,
# otherwise test-set statistics leak into the training data. The target
# column is kept out of the imputer so labels are never replaced by a mean.
X_train, X_test, y_train, y_test = train_test_split(
    base.drop(columns=['class']), base['class'], test_size=0.3, random_state=42
)
imp = SimpleImputer(missing_values=np.nan, strategy='mean')
imp.fit(X_train)
X_train = pd.DataFrame(imp.transform(X_train), columns=X_train.columns, index=X_train.index)
X_test = pd.DataFrame(imp.transform(X_test), columns=X_test.columns, index=X_test.index)

In [4]:
%%time
# Standardise with statistics learned from the training split, then fit the
# classifier selected by `model_to_use` and report accuracy on both splits.
scaler = StandardScaler().fit(X_train)
train_scaled = scaler.transform(X_train)
test_scaled = scaler.transform(X_test)

model = model_creator(model_to_use)
model.fit(train_scaled, y_train)

train_accuracy = accuracy_score(y_train, model.predict(train_scaled))
print('Accuracy train: %.3f' % train_accuracy)
test_accuracy = accuracy_score(y_test, model.predict(test_scaled))
print('Accuracy test: %.3f' % test_accuracy)

Accuracy train: 0.856
Accuracy test: 0.789
Wall time: 249 ms


In [5]:
%%time
# Full-join experiment: merge every table listed in connections.csv onto the
# main table, then encode and impute the combined frame.
connections = pd.read_csv("steel-plate-fault/connections.csv")
b = pd.read_csv("steel-plate-fault/steel_plate_fault.csv")
for to_table, from_key, to_key in zip(
    connections["to_table"], connections["from_key"], connections["to_key"]
):
    related = pd.read_csv(f"steel-plate-fault/{to_table}")
    b = b.merge(related, left_on=from_key, right_on=to_key, suffixes=('', 'y'))

# Drop join keys AND the "Unnamed: ..." columns pandas creates for the CSVs'
# saved row index. These files are read without index_col=0 here, so the row
# numbers would otherwise be fed to the model as features.
non_feature_columns = list(b.filter(regex=r'_id|^Unnamed'))
b = b.drop(columns=non_feature_columns)

b = pd.get_dummies(b)
keys = b.keys()
# Unparseable values become NaN and are mean-imputed below.
b = b.apply(pd.to_numeric, errors='coerce')
# NOTE(review): the imputer is fit on every row (and on 'class') before the
# train/test split performed in the next cell; move it after the split if
# strict separation of train and test statistics is required.
imp = SimpleImputer(missing_values=np.nan, strategy='mean')
imp.fit(b)
b = pd.DataFrame(imp.transform(b), columns=keys)

Wall time: 84.8 ms


In [6]:
%%time
# Hold out 30% of the joined table, standardise using the training split,
# fit the selected classifier and report accuracy on both splits.
joined_features = b.drop(columns=['class'])
joined_target = b["class"]
X_train, X_test, y_train, y_test = train_test_split(
    joined_features, joined_target, test_size=0.3, random_state=42
)

scaler = StandardScaler().fit(X_train)
train_scaled = scaler.transform(X_train)
test_scaled = scaler.transform(X_test)

clf = model_creator(model_to_use)
clf.fit(train_scaled, y_train)

train_accuracy = accuracy_score(y_train, clf.predict(train_scaled))
print('Accuracy train: %.3f' % train_accuracy)
test_accuracy = accuracy_score(y_test, clf.predict(test_scaled))
print('Accuracy test: %.3f' % test_accuracy)

Accuracy train: 1.000
Accuracy test: 1.000
Wall time: 1.34 s


In [7]:
%%time
# Univariate feature selection on the joined table, followed by the usual
# scale / fit / score steps.
all_features = b.drop(columns=['class'])
X_train, X_test, y_train, y_test = train_test_split(
    all_features, b["class"], test_size=0.3, random_state=42
)

# Fit the selector on the training split only: scoring features against the
# labels of held-out rows would leak test information into the model.
# k is capped so the cell still runs when fewer than 25 features exist.
selector = SelectKBest(f_classif, k=min(25, all_features.shape[1]))
selector.fit(X_train, y_train)
X_new = selector.transform(all_features)  # selected columns for every row
X_train = selector.transform(X_train)
X_test = selector.transform(X_test)

scaler = StandardScaler()
scaler.fit(X_train)
model = model_creator(model_to_use)
model.fit(scaler.transform(X_train), y_train)
print('Accuracy train: %.3f' % accuracy_score(y_train, model.predict(scaler.transform(X_train))))
print('Accuracy test: %.3f' % accuracy_score(y_test, model.predict(scaler.transform(X_test))))

Accuracy train: 1.000
Accuracy test: 1.000
Wall time: 1.01 s


In [8]:
# Prepare the inputs for ada.ada: the raw main table, the related tables and
# the share of the largest class among the two largest classes.
# Silences NumPy divide-by-zero / invalid-value warnings for the rest of the session.
np.seterr(divide='ignore', invalid='ignore')
base = pd.read_csv("steel-plate-fault/steel_plate_fault.csv", index_col=0)
connections = pd.read_csv("steel-plate-fault/connections.csv")

# bfill(axis=1) fills each missing cell from the next column in the same row;
# it replaces fillna(method='backfill'), which is deprecated in pandas 2.1.
tables = [
    pd.read_csv(f"steel-plate-fault/{table_name}").bfill(axis=1)
    for table_name in connections['to_table']
]

# Rows per class, largest first. size() counts rows directly instead of
# counting non-null 'v1' values, which would undercount if 'v1' had gaps.
distribution = base.groupby("class").size().sort_values(ascending=False)
proportion = distribution.iloc[0] / (distribution.iloc[0] + distribution.iloc[1])
tables[0]

Unnamed: 0,type_of_fault_id,v9,v10,v11,v12,v13,v14,steel_plate_fault_id
0,1053,1,0,0,0,0,0,629535
1,7880,1,0,0,0,0,0,114895
2,12765,1,0,0,0,0,0,53197
3,12226,1,0,0,0,0,0,764094
4,12838,1,0,0,0,0,0,575797
...,...,...,...,...,...,...,...,...
1936,11357,0,0,0,0,0,0,369828
1937,15616,0,0,0,0,0,0,806310
1938,15047,0,0,0,0,0,0,241219
1939,8680,0,0,0,0,0,0,556248


In [9]:
%%time
# Build the feature table with ada.ada from the main table, the related
# tables and the class proportion, then run the usual scale / fit / score.
table = ada.ada(base, tables, connections, proportion)
base = pd.get_dummies(table)
# Fill each missing cell from the next column in the same row. bfill(axis=1)
# replaces fillna(method='backfill'), which is deprecated in pandas 2.1.
base = base.bfill(axis=1)
X_train, X_test, y_train, y_test = train_test_split(base.drop(columns=["class"]), base["class"], test_size=0.3, random_state=42)
scaler = StandardScaler()
scaler.fit(X_train)
model = model_creator(model_to_use)
model.fit(scaler.transform(X_train), y_train)
print('Accuracy train: %.3f' % accuracy_score(y_train, model.predict(scaler.transform(X_train))))
print('Accuracy test: %.3f' % accuracy_score(y_test, model.predict(scaler.transform(X_test))))

Accuracy train: 0.995
Accuracy test: 0.985
Wall time: 2.12 s
