In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import PolynomialFeatures

from sklearn.linear_model import Lasso

import matplotlib.pyplot as plt

from sklearn.linear_model import LogisticRegression

# Load the classification training data.
df = pd.read_csv('train_classification_v1.csv')

# Every column except "Target" is a feature; "Target" is the label to predict.
X = df.drop(columns="Target")
y = df['Target']

# Sanity-check the shapes of the feature matrix and the label vector.
print(f"Shape of X is {X.shape}, and shape of y is {y.shape}")

# Hold out a test set using train_test_split's default proportions;
# the fixed random_state keeps the split reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

# Baseline classifier: plain logistic regression on the raw features.
# max_iter is raised well above the default so the solver has room to converge.
logr = LogisticRegression(max_iter=10000)
logr.fit(X_train, y_train)

# For a classifier, .score() reports mean accuracy (not R-squared).
print(f"Score on training set: {logr.score(X_train, y_train):.3f}")
print(f"Score on test set: {logr.score(X_test, y_test):.3f}")

# Fitted coefficients are on the log-odds scale; exponentiating them gives
# odds ratios, which are easier to interpret feature by feature.
coefficient = logr.coef_
odds = np.exp(coefficient)

# Actually display both, so the fitted model can be inspected.
print("Coefficients:", coefficient)
print("Odds ratios:", odds)

# Cross-validation

# 10-fold cross-validated accuracy of the baseline classifier.
# Accuracy is used because R-squared is a regression metric: it is not
# meaningful for predicted class labels and would not be comparable with
# the values returned by logr.score().
simple_scores = cross_val_score(logr, X, y, cv=10, scoring='accuracy')
print(simple_scores)

# Summarise the per-fold scores: mean performance and fold-to-fold spread.
print("The average test score is:", simple_scores.mean())
print("The standard deviation of the test scores is:", simple_scores.std())

# Candidate polynomial degrees to compare; the key addresses the `degree`
# parameter of the pipeline step named 'poly_features'.
param_grid = {
    'poly_features__degree': [1, 2, 3, 4, 5],
}

# Pipeline: polynomial feature expansion followed by logistic regression.
# The classifier gets the same raised iteration cap as the baseline model,
# otherwise the solver may stop before converging and the comparison between
# degrees would be made on poorly fitted models.
model = Pipeline([
    ('poly_features', PolynomialFeatures()),
    ('log_reg', LogisticRegression(max_iter=10000)),
])

# 10-fold grid search scored by accuracy (a classification metric; R-squared
# only applies to regression). n_jobs=-1 runs the fits in parallel.
grid_search = GridSearchCV(model, param_grid, cv=10, scoring='accuracy', n_jobs=-1)

# Tune on the training split only, so the test split stays untouched
# for the final evaluation below.
grid_search.fit(X_train, y_train)

# Report the mean cross-validated score obtained for each candidate degree.
cv_results = grid_search.cv_results_
for mean_score, params in zip(cv_results["mean_test_score"], cv_results["params"]):
    print(f"Mean accuracy for {params}: {mean_score}")

# Best degree found, and the pipeline refitted with it on the training data.
best_degree = grid_search.best_params_['poly_features__degree']
best_model = grid_search.best_estimator_

print(f"Best Polynomial Degree: {best_degree}")

# Final check on the held-out test data; the pipeline ends in a classifier,
# so .score() returns accuracy.
test_score = best_model.score(X_test, y_test)
print(f"Test accuracy with best model: {test_score}")


## More complex polynomial regression model 
#poly = PolynomialFeatures(degree=2)
#X_train_poly = poly.fit_transform(X_train)
#X_test_poly = poly.fit_transform(X_test)

#poly_logr = LogisticRegression()
#poly_logr.fit(X_train_poly,y_train)

#print("Score on training set: {:.3f}".format(poly_logr.score(X_train_poly, y_train)))
#print("Score on test set: {:.3f}".format(poly_logr.score(X_test_poly, y_test)))

# Note: a polynomial model was attempted, but it is too heavy for this computer and never finishes running ("dead kernel").
# max_iter is set because convergence is not reached otherwise; the downside is that the program is slowed down noticeably.