In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, confusion_matrix
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.feature_selection import SelectKBest, f_classif, mutual_info_classif, chi2, RFE, SelectFromModel, SequentialFeatureSelector
from sklearn.model_selection import train_test_split
from sklearn import svm
from sklearn import tree
from sklearn import ensemble
from sklearn.impute import SimpleImputer
from sklearn.model_selection import GridSearchCV
import ada

In [2]:
model_to_use = "svm"
def model_creator(model_to_use):
    if model_to_use == "cart":
        return tree.DecisionTreeClassifier()
    elif model_to_use == "random_forest":
        return ensemble.RandomForestClassifier()
    elif model_to_use == "xgboost":
        return ensemble.GradientBoostingClassifier()
    elif model_to_use == "nonlinearSVM":
        return svm.SVC(kernel="rbf")
    else:
        return svm.SVC(kernel= 'linear', random_state=42)

In [3]:
base = pd.read_csv("football/football.csv", index_col=0)
base.drop(columns=['date'], inplace=True)
base = pd.get_dummies(base)
base = base.apply(pd.to_numeric, errors='coerce')
imp = SimpleImputer(missing_values=np.nan, strategy='mean')
imp.fit(base)
base = imp.transform(base)
X_train, X_test, y_train, y_test = train_test_split(base[:, 1:], base[:, 0], test_size=0.3, random_state=42)

In [4]:
%%time
model = model_creator(model_to_use)
model.fit(X_train, y_train)
print('Accuracy train: %.3f' % accuracy_score(y_train, model.predict(X_train)))
print('Accuracy test: %.3f' % accuracy_score(y_test, model.predict(X_test)))

Accuracy train: 0.682
Accuracy test: 0.549
Wall time: 98.7 ms


In [5]:
%%time
connections = pd.read_csv("football/connections.csv")
b = pd.read_csv("football/football.csv")
for i in range(len(connections)):
    b = b.merge(pd.read_csv(f"football/{connections['to_table'][i]}"), left_on=connections["from_key"][i], right_on=connections["to_key"][i], suffixes=('', 'y'))
b = b[b.columns.drop(list(b.filter(regex='_id')))]
b.drop(columns=['date'], inplace=True)
b = pd.get_dummies(b)
keys = b.keys()
b = b.apply(pd.to_numeric, errors='coerce')
imp = SimpleImputer(missing_values=np.nan, strategy='mean')
imp.fit(b)
b = imp.transform(b)
b = pd.DataFrame(b, columns=keys)
X_train, X_test, y_train, y_test = train_test_split(b.drop(columns=['class']), b['class'], test_size=0.3, random_state=42)

Wall time: 239 ms


In [6]:
%%time
model = model_creator(model_to_use)
model.fit(X_train, y_train)
print('Accuracy train: %.3f' % accuracy_score(y_train, model.predict(X_train)))
print('Accuracy test: %.3f' % accuracy_score(y_test, model.predict(X_test)))

Accuracy train: 1.000
Accuracy test: 1.000
Wall time: 9.68 s


In [7]:
%%time
X_new = SelectKBest(f_classif, k=4).fit_transform(b.drop(columns=['class']), b['class'])
X_train, X_test, y_train, y_test = train_test_split(X_new, b['class'], test_size=0.3, random_state=42)
model = model_creator(model_to_use)
model.fit(X_train, y_train)
print('Accuracy train: %.3f' % accuracy_score(y_train, model.predict(X_train)))
print('Accuracy test: %.3f' % accuracy_score(y_test, model.predict(X_test)))

Accuracy train: 1.000
Accuracy test: 1.000
Wall time: 29.9 ms


In [8]:
%%time
base = pd.read_csv("football/football.csv", index_col=0)
base.drop(columns=['date'], inplace=True)
connections = pd.read_csv("football/connections.csv")
tables = []
for i in range(len(connections)):
    tables.append(pd.read_csv(f"football/{connections['to_table'][i]}"))
distribution = base.groupby("class").count()["v1"].sort_values(ascending=False)
proportion = distribution.iloc[0] / (distribution.iloc[0] + distribution.iloc[1])
table = ada.ada(base, tables, connections, proportion)
base = pd.get_dummies(table)
X_train, X_test, y_train, y_test = train_test_split(base.drop(columns=["class"]), base["class"], test_size=0.3, random_state=42)
model = model_creator(model_to_use)
model.fit(X_train, y_train)
print('Accuracy train: %.3f' % accuracy_score(y_train, model.predict(X_train)))
print('Accuracy test: %.3f' % accuracy_score(y_test, model.predict(X_test)))

Accuracy train: 1.000
Accuracy test: 1.000
Wall time: 8.67 s
