In [None]:
import pickle

import pandas as pd
from numpy import mean, absolute
from sklearn import metrics
from sklearn import svm
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import cross_val_score, KFold
from sklearn.model_selection import cross_val_score, KFold, train_test_split
from sklearn.model_selection import cross_validate
from sklearn.svm import SVC

In [None]:
# Load the pickled modelling DataFrame.
# NOTE(review): pickle.load can execute arbitrary code while deserializing;
# only load files from a trusted source.
# The context manager closes the file even if unpickling raises.
with open("../data_solution_probability_model.pkl", "rb") as infile:
    df = pickle.load(infile)

In [None]:
# Prepare features: every column except the target "Erfolg" is a predictor.
feature_cols = list(df.columns)
feature_cols.remove("Erfolg")

X = df[feature_cols]
y = df["Erfolg"].astype("int")

# Hold out 30 % of the rows as a test set (fixed seed for reproducibility).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1)

# Unshuffled k-fold splitter for cross-validation on the training part.
k = 5
cv = KFold(n_splits=k, random_state=None)

# Fit a linear SVM; probability=True is required for predict_proba below.
# NOTE(review): gamma is only used by the "rbf", "poly" and "sigmoid" kernels,
# so gamma=1 has no effect with kernel="linear" — kept to leave the stored
# estimator parameters unchanged.
clf = SVC(kernel="linear", gamma=1, probability=True)
clf = clf.fit(X_train, y_train)

# Cross-validate once and score all four metrics on the same fitted folds.
# This yields the same per-fold score arrays as four separate
# cross_val_score calls, but needs k fits instead of 4 * k.
# cross_validate works on clones of the estimator, so the fitted `clf`
# above is not modified.
cv_results = cross_validate(
    clf,
    X_train,
    y_train,
    scoring=["accuracy", "precision", "recall", "f1"],
    cv=cv,
    n_jobs=-1,
)

scores_a = cv_results["test_accuracy"]
a = mean(scores_a)

scores_p = cv_results["test_precision"]
p = mean(scores_p)

scores_r = cv_results["test_recall"]
r = mean(scores_r)

scores_f1 = cv_results["test_f1"]
f1_cv = mean(scores_f1)

# Evaluate the fitted model on the held-out test set.
y_pred = clf.predict(X_test)

# Test-set metrics; the cross-validated means are in a, p, r and f1_cv.
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

# Class probabilities for the test set, one column per entry of clf.classes_.
probs = clf.predict_proba(X_test)

In [None]:
# Show the confusion matrix of the test-set predictions
# (rows: true labels, columns: predicted labels).
metrics.confusion_matrix(y_true=y_test, y_pred=y_pred)

In [None]:
# Keep only the first predict_proba column and wrap it in a one-column DataFrame.
# NOTE(review): column 0 belongs to clf.classes_[0]; if "Erfolg" is coded 0/1
# this is presumably the probability of class 0, not of success — confirm
# that this is the intended column.
first_column = probs[:, 0:1]
prob_list = first_column.tolist()
data_probs = pd.DataFrame(data=prob_list)

In [None]:
# Persist the fitted model, the train/test split and the probabilities.
# Each file is written inside a context manager so its handle is flushed and
# closed deterministically, even if pickling raises.
artifacts = {
    "SVMmodel_3months.pkl": clf,
    "X_train_3months.pkl": X_train,
    "X_test_3months.pkl": X_test,
    "y_train_3months.pkl": y_train,
    "y_test_3months.pkl": y_test,
    "df_prob_3months.pkl": data_probs,
}
for filename, obj in artifacts.items():
    with open(filename, "wb") as outfile:
        pickle.dump(obj, outfile)