In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.svm import SVC, SVR
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.metrics import accuracy_score, r2_score

def selectkbest(indep_X, dep_Y, n):
    test = SelectKBest(score_func=chi2, k=n)
    fit1 = test.fit(indep_X, dep_Y)
    selectk_features = fit1.transform(indep_X)
    return selectk_features

def split_scalar(indep_X, dep_Y):
    X_train, X_test, y_train, y_test = train_test_split(indep_X, dep_Y, test_size=0.25, random_state=0)
    sc = StandardScaler()
    X_train = sc.fit_transform(X_train)
    X_test = sc.transform(X_test)
    return X_train, X_test, y_train, y_test

def evaluate_classification_models(X_train, y_train, X_test, y_test):
    models = {
        'Logistic Regression': LogisticRegression(random_state=0),
        'Support Vector Machine (Linear)': SVC(kernel='linear', random_state=0),
        'Support Vector Machine (Non-Linear)': SVC(kernel='rbf', random_state=0),
        'K-Nearest Neighbors': KNeighborsClassifier(n_neighbors=5),
        'Naive Bayes': GaussianNB(),
        'Decision Tree': DecisionTreeClassifier(random_state=0),
        'Random Forest': RandomForestClassifier(n_estimators=10, random_state=0)
    }
    results = {}
    for name, model in models.items():
        if name == 'Random Forest':
            param_grid = {'n_estimators': [50, 100, 150]}  # Define hyperparameters for grid search
            grid_search = GridSearchCV(model, param_grid, cv=5, scoring='accuracy')
            grid_search.fit(X_train, y_train)
            best_model = grid_search.best_estimator_
            y_pred = best_model.predict(X_test)
            results[name] = accuracy_score(y_test, y_pred)
        else:
            model.fit(X_train, y_train)
            y_pred = model.predict(X_test)
            results[name] = accuracy_score(y_test, y_pred)
    return results

def evaluate_regression_models(X_train, y_train, X_test, y_test):
    models = {
        'Linear Regression': LinearRegression(),
        'Support Vector Machine (Linear)': SVR(kernel='linear'),
        'Support Vector Machine (Non-Linear)': SVR(kernel='rbf'),
        'Decision Tree': DecisionTreeRegressor(random_state=0),
        'Random Forest': RandomForestRegressor(n_estimators=10, random_state=0)
    }
    results = {}
    for name, model in models.items():
        if name == 'Random Forest':
            param_grid = {'n_estimators': [50, 100, 150]}  # Define hyperparameters for grid search
            grid_search = GridSearchCV(model, param_grid, cv=5, scoring='r2')
            grid_search.fit(X_train, y_train)
            best_model = grid_search.best_estimator_
            y_pred = best_model.predict(X_test)
            results[name] = r2_score(y_test, y_pred)
        else:
            model.fit(X_train, y_train)
            y_pred = model.predict(X_test)
            results[name] = r2_score(y_test, y_pred)
    return results

def selectk_evaluation(dataset_file, target_col, n_features):
    dataset = pd.read_csv(dataset_file)
    #df_encoded = pd.get_dummies(dataset, drop_first=True)
    
    indep_X = dataset.drop(target_col, axis=1)
    dep_Y = dataset[target_col]
    
    kbest_features = selectkbest(indep_X, dep_Y, n_features)
    X_train, X_test, y_train, y_test = split_scalar(kbest_features, dep_Y)
    
    classification_results = evaluate_classification_models(X_train, y_train, X_test, y_test)
    regression_results = evaluate_regression_models(X_train, y_train, X_test, y_test)
    
    return classification_results, regression_results

# Change these parameters as needed
dataset_file = "finaldata.csv"
target_col = "Possible"
n_features = 5

class_results, reg_results = selectk_evaluation(dataset_file, target_col, n_features)

# Display results in a table
classification_df = pd.DataFrame(class_results.items(), columns=['Model', 'Chi2'])
regression_df = pd.DataFrame(reg_results.items(), columns=['Model', 'R2'])

print("Classification Results:")
print(classification_df)

print("\nRegression Results:")
print(regression_df)


Classification Results:
                                 Model  Chi2
0                  Logistic Regression  0.96
1      Support Vector Machine (Linear)  0.95
2  Support Vector Machine (Non-Linear)  0.97
3                  K-Nearest Neighbors  0.89
4                          Naive Bayes  0.86
5                        Decision Tree  0.94
6                        Random Forest  0.98

Regression Results:
                                 Model        R2
0                    Linear Regression  0.600951
1      Support Vector Machine (Linear)  0.585310
2  Support Vector Machine (Non-Linear)  0.803288
3                        Decision Tree  0.739583
4                        Random Forest  0.893316
