In [31]:
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectFromModel
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score

import warnings
warnings.filterwarnings("ignore")

In [32]:
train_data = pd.read_csv('train.csv')
train_target = pd.read_csv('train_target.csv')
test_data = pd.read_csv('test.csv')

In [33]:
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2470 entries, 0 to 2469
Columns: 234 entries, agent_1_feat_Possession% to agent_2_featboth_scored_mean
dtypes: float64(212), int64(22)
memory usage: 4.4 MB


In [34]:
cols_with_nan = train_data.columns[train_data.isna().any()].tolist()
print(cols_with_nan)


['agent_1_feat_total_scored_mean', 'agent_1_feat_total_xg_mean', 'agent_1_feat_both_scored_mean', 'agent_1_feat_total_scored_mean.1', 'agent_1_feat_total_xg_mean.1', 'agent_1_feat_both_scored_mean.1', 'agent_2_feat_total_scored_mean', 'agent_2_feat_total_xg_mean', 'agent_2_feat_both_scored_mean', 'agent_2_feattotal_scored_mean', 'agent_2_feattotal_xg_mean', 'agent_2_featboth_scored_mean']


In [35]:
y = train_target['category']
X = train_data

In [36]:
# Handle NaN values
imputer = SimpleImputer(strategy='median')
X_imputed = pd.DataFrame(imputer.fit_transform(X), columns=X.columns)
test_data_imputed = pd.DataFrame(imputer.transform(test_data), columns=test_data.columns)

In [37]:
# Scale the data
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_imputed)
test_scaled = scaler.transform(test_data_imputed)

In [38]:
# Feature selection using SVC with hyperparameter tuning
params = {
    'C': [0.001, 0.01, 0.1, 1, 10],
    'penalty': ['l1', 'l2']
}
lsvc = LinearSVC(dual=False)
grid_search = GridSearchCV(lsvc, params, cv=5)
grid_search.fit(X_scaled, y)
model = SelectFromModel(grid_search.best_estimator_, prefit=True)
X_selected = model.transform(X_scaled)
test_selected = model.transform(test_scaled)

In [39]:
# Split the data for training and validation
X_train, X_val, y_train, y_val = train_test_split(X_selected, y, test_size=0.2, random_state=42)

In [40]:
# Train a logistic regression model with hyperparameter tuning
params = {
    'C': [0.001, 0.01, 0.1, 1, 10, 100],
    'solver': ['newton-cg', 'lbfgs', 'sag', 'saga']
}
logreg = LogisticRegression(max_iter=5000)
grid_search_logreg = GridSearchCV(logreg, params, cv=5)
grid_search_logreg.fit(X_train, y_train)

In [41]:
# Evaluate the model
y_val_pred = grid_search_logreg.predict(X_val)
accuracy = accuracy_score(y_val, y_val_pred)
print(f"Validation Accuracy: {accuracy:.4f}")

Validation Accuracy: 0.5425


In [13]:
# Make predictions on the test dataset
test_predictions = grid_search_logreg.predict(test_selected)

In [42]:
# Save predictions to CSV
result_df = pd.DataFrame({
    'id': test_data.index,
    'category': test_predictions
})

result_df.to_csv('test_preds.csv', index=False)