In [8]:
#pip install xgboost

In [204]:
from preprocessing_utils import *
import warnings
import pandas as pd
# suppress warning messages
warnings.filterwarnings('ignore')
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from xgboost import XGBClassifier
from tqdm import tqdm
from time import sleep
# import gradient_boosting machine
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score,confusion_matrix, classification_report
from dataprocessor import DataPreprocessor
from sklearn.model_selection import GridSearchCV




survey_data_path = "/Volumes/DeepLearner/MADS/Milestone_data/surveydata.csv"
data_5k_path = "/Volumes/DeepLearner/MADS/Milestone_data/data_5k_cleaned.csv"
config_path = "/Volumes/DeepLearner/MADS/Milestone2_Party_prediction/milestone2_MADS/preprocessing_config.json"

In [270]:
columns_to_use = [
    'AGE' ,'CNS_MEDINC', 'CNSUS_PCTA', 'CNSUS_PCTB',
    'CNSUS_PCTH', 'CNSUS_PCTI', 'CNSUS_PCTM', 'CNSUS_PCTO', 'CNSUS_PCTP', 'CNSUS_PCTW',
    'COUNTY_TYPE', 'EDUCATION', 'ETHNIC_INFER',
    'HOMEMKTVAL', 'INCOMESTHH', 'NETWORTH',
    'PARTY_CODE', 'PARTY_MIX', 'PRESENCHLD', 'PRFL_POLITICAL_IDEOLOGY',
    'RELIGION', 'SEX', 'CENSUS_ST', 
    'TOD_PRES_D_2016', 'TOD_PRES_D_2016_PREC', 'TOD_PRES_D_2020_PREC', 'TOD_PRES_O_2016',
    'TOD_PRES_O_2016_PREC', 'TOD_PRES_O_2020_PREC', 'TOD_PRES_R_2016', 'TOD_PRES_R_2016_PREC',
    'TOD_PRES_R_2020_PREC', 'R_DONOR', 'D_DONOR', 'VOTED_R_ELECTION', 'VOTED_D_ELECTION',
    'STATE_COUNTY_FIPS', 'STATE_LOWER_HOUSE', 'STATE_UPPER_HOUSE', 'CENSUS_TRACT','Q1_Candidate',
    'TOD_PRES_DIFF_2016','TOD_PRES_DIFF_2016_PREC','TOD_PRES_DIFF_2020_PREC', 'AI_COUNTY_NAME',
    'STATE', "GENDER_MIX", "GENERATION", "CREDRATE",
    "PRFL_BORDER_SECURITY", "PRFL_CHOICELIFE", "PRFL_CONSERVATIVE_NEWS", "PRFL_TRUMP_SUPPORT",
    "PRFL_LIBERAL_NEWS", "PRFL_IMMIGRATION_REFORM" , "PRFL_BIDEN_SUPPORT", "PRFL_HEALTHCARE_REFORM"
    , "VOTER_CNT", "VOTER_TRLR", "TRAIL_CNT", "VP_GEN", "VP_PPP", "VP_PRI", 'DON_POLIT',"LANGUAGE",
    "HH_SIZE",'ZIP']

#COLS REMOVED
# "LIFESTAGE_CLUSTER"
# "LENGTH_RES"
# , "PRFL_2NDAMEND"
# "LANGUAGE"
# "PRFL_MINWAGE"
#  "VP_OTH"
# PRFL_MINWAGE
# 'COUNTY_ST'
# 'HOMEOWNRNT'
# , 'DON_POLIT'
# "LANGUAGE",
#"HH_SIZE"
# 'ZIP',
#  "CENSUS_TRK",
# STATE_CD',


In [272]:

surveydata = pd.read_csv(survey_data_path, low_memory=False)

preprocessor_survey = DataPreprocessor(surveydata, config_path=config_path)
survey_df = preprocessor_survey.run_preprocessing_pipeline(skip_preprocess_dataframe=True, use_frequency_encoding=True, drop_converted_cols=True)

Loading config from /Volumes/DeepLearner/MADS/Milestone2_Party_prediction/milestone2_MADS/preprocessing_config.json
DataPreprocessor initialized.
Bleep bloop...
Applying config...


Standardizing missing values...
Imputing missing values...
Imputing missing values...
Imputing missing values...
Mapping categorical values...
Engineering new features...
Removing columns over the threshold...


## Counting votes + Interaction Terms

In [273]:
from itertools import groupby


def create_interaction_terms(df, pairs, interaction_type='cat'):
    for pair in pairs:
        feature1, feature2 = pair
        interaction_term_name = f"{feature1}_{feature2}_interaction"
        
        if interaction_type == 'cat':
            # Create the categorical interaction term
            df[interaction_term_name] = df[feature1].astype(str) + "_" + df[feature2].astype(str)
            
        elif interaction_type == 'num':
            # Assuming 'Y' is coded as 1 and 'N' as 0, if they are already not coded this way, you should convert them
            df[interaction_term_name] = df[feature1] + df[feature2]
            # If you want a point for the event they are both 'Y' or 1
            df[interaction_term_name] = df[interaction_term_name].apply(lambda x: 1 if x == 2 else 0)
            
    return df

def feature_engineering_voting_data(survey_df, columns_to_use, interaction_type='cat'):

    # Define the vote types
    democrat_votes = ['D', 'M', 'Z']
    republican_votes = ['R', 'P', 'X']
    early_votes = ['E', 'M', 'P']
    absentee_votes = ['A', 'Z', 'X']
    
    
    interaction_pairs = [
    ('PRFL_LIBERAL_NEWS', 'PRFL_IMMIGRATION_REFORM'), ('PRFL_CONSERVATIVE_NEWS', 'PRFL_BORDER_SECURITY')
    # pairs tried  ('PRFL_CONSERVATIVE_NEWS', 'PRFL_BORDER_SECURITY')
        ]
    

    # function to count specific vote types
    def count_votes(vote_counts, vote_types):
        return sum(vote_counts.get(vote_type, 0) for vote_type in vote_types)

    # function to count longest streak for a party
    def longest_streak(votes, party_votes):
        streaks = [sum(1 for _ in g) for k, g in groupby(votes) if k in party_votes]
        return max(streaks) if streaks else 0

    # count early and absentee votes
    survey_df['count_Early'] = survey_df.filter(like='VTR_GEN').apply(lambda row: count_votes(row.value_counts(), early_votes), axis=1)
    survey_df['count_Absentee'] = survey_df.filter(like='VTR_GEN').apply(lambda row: count_votes(row.value_counts(), absentee_votes), axis=1)
    columns_to_use.extend(['count_Early', 'count_Absentee'])

    for prefix in ['VTR_GEN', 'VTR_PPP', 'VTR_PRI']:
        survey_df[f'count_D_{prefix}'] = survey_df.filter(like=prefix).apply(lambda row: count_votes(row.value_counts(), democrat_votes), axis=1)
        survey_df[f'count_R_{prefix}'] = survey_df.filter(like=prefix).apply(lambda row: count_votes(row.value_counts(), republican_votes), axis=1)
        columns_to_use.extend([f'count_D_{prefix}', f'count_R_{prefix}'])

        # Count longest streak of consistent voting for each party
       # survey_df[f'longest_streak_D_{prefix}'] = survey_df.filter(like=prefix).apply(lambda row: longest_streak(row.tolist(), democrat_votes), axis=1)
       # survey_df[f'longest_streak_R_{prefix}'] = survey_df.filter(like=prefix).apply(lambda row: longest_streak(row.tolist(), republican_votes), axis=1)
        #columns_to_use.extend([f'longest_streak_D_{prefix}', f'longest_streak_R_{prefix}'])

    survey_df['recent_party_2022'] = survey_df['VTR_GEN22'].apply(lambda x: 'D' if x in democrat_votes else ('R' if x in republican_votes else 'Other'))
    #survey_df['recent_party_2020'] = survey_df['VTR_GEN18'].apply(lambda x: 'D' if x in democrat_votes else ('R' if x in republican_votes else 'Other'))
    columns_to_use.extend(['recent_party_2022'])

    survey_df['total_votes'] = survey_df.filter(like='VTR_').apply(lambda row: sum(row != 'N'), axis=1)
    columns_to_use.append('total_votes')

    survey_df = create_interaction_terms(survey_df, interaction_pairs, interaction_type)
    
    # Make sure to add the new interaction term columns to your columns_to_use list
    new_columns = [f"{pair[0]}_{pair[1]}_interaction" for pair in interaction_pairs]
    columns_to_use.extend(new_columns)

    
    return survey_df, columns_to_use

survey_df, columns_to_use = feature_engineering_voting_data(survey_df, columns_to_use)

In [274]:
def target_mean_encoding(df, columns_to_encode, target_col, columns_to_use):
    """
    Perform target mean encoding on specified columns.

    Parameters:
    - df: DataFrame to be encoded
    - columns_to_encode: List of columns to apply target encoding
    - target_col: The target column for encoding

    Returns:
    - df_encoded: DataFrame with target mean encoded columns
    """
    df_encoded = df.copy()
    global_mean = df[target_col].mean()
    
    for column in columns_to_encode:
        mean_values = df.groupby(column)[target_col].mean()
        new_column_name = f"{column}_mean_encoded"
        df_encoded[new_column_name] = df[column].map(mean_values)
        columns_to_use.append(new_column_name)
        
    # Move fillna outside the loop to fill NaNs for all encoded columns at once
    df_encoded.fillna({f"{col}_mean_encoded": global_mean for col in columns_to_encode}, inplace=True)
        
    return df_encoded, columns_to_use

#label_encoder = LabelEncoder()
#survey_df['Q1_Candidate'] = label_encoder.fit_transform(survey_df['Q1_Candidate'].astype(str))

#target_col = 'Q1_Candidate'
#columns_to_encode = ["PRFL_CHOICELIFE", "CENSUS_TRK"]

#survey_df, columns_to_use = target_mean_encoding(survey_df, columns_to_encode, target_col, columns_to_use)

In [275]:
# check dtypes of total_votes
#print(survey_df.total_votes.dtypes)

def count_switches(row):
    switch_count = 0
    prev_vote = None
    for vote in row:
        if prev_vote in democrat_votes and vote in republican_votes:
            switch_count += 1
        elif prev_vote in republican_votes and vote in democrat_votes:
            switch_count += 1
        prev_vote = vote
    return switch_count

survey_df['count_switches'] = survey_df.filter(like='VTR_GEN').apply(count_switches, axis=1)


## Adding splitting out TOD

In [276]:
survey_df = survey_df[columns_to_use]

#survey_df['VP_OTH'] = survey_df['VP_OTH'].round().astype(int)
#survey_df['VP_OTH'] = survey_df['VP_OTH'].astype(str)


# convert LENGTH_RES to numeric
#survey_df['LENGTH_RES'] = survey_df['LENGTH_RES'].astype(float)


In [277]:
split_cols = ['TOD_PRES_DIFF_2016', 'TOD_PRES_DIFF_2016_PREC', 'TOD_PRES_DIFF_2020_PREC']
for col in split_cols:
    #print(survey_df[col].dtype)
    survey_df[col + '_num'] = survey_df[col].str.extract('(\d+)').astype('float')
    survey_df[col + '_party'] = survey_df[col].str.extract('([RD])')
    # append these new columsn to "features_to_use"
   # features_to_use.append(col + '_num')
   # features_to_use.append(col + '_party')
    columns_to_use.append(col + '_num')
    columns_to_use.append(col + '_party')

# convert split_cols to int


#survey_df[split_cols] = survey_df[split_cols].astype(int)

# drop the split_cols
#survey_df.drop(columns=split_cols, inplace=True)

freq_cols = ['ZIP', 'STATE', 'AI_COUNTY_NAME']

cols_tried = ["LIFESTAGE_CLUSTER", 'COUNTY_ST']

for col in freq_cols:
    freq_map = survey_df[col].value_counts(normalize=True)
    survey_df[col + '_freq'] = survey_df[col].map(freq_map)
    columns_to_use.append(col + '_freq')



In [278]:
#survey_df = survey_df[columns_to_use]

label_encoder = LabelEncoder()
survey_df['Q1_Candidate'] = label_encoder.fit_transform(survey_df['Q1_Candidate'].astype(str))

categorical_columns = survey_df.select_dtypes(include=['object', 'category']).columns.tolist()

label_encoders = {}
for categorical_col in categorical_columns:
    # Convert NaNs to 'missing' and ensure all data is of string type
    survey_df.loc[:, categorical_col] = survey_df[categorical_col].fillna('missing').astype(str)
    
    le = LabelEncoder()
    survey_df[categorical_col] = le.fit_transform(survey_df[categorical_col])
    label_encoders[categorical_col] = le


In [279]:
# checking dtype
#print(df_survey_processed.STATE_LOWER_HOUSE.dtypes), print(df_survey_processed.VOTED_D_ELECTION.dtypes), print(df_survey_processed.CENSUS_TRACT.dtypes)

In [280]:
#df_survey_processed["Q1_Candidate"].value_counts(), df_survey_processed.shape

In [283]:
import matplotlib.pyplot as plt
import numpy as np
from sklearn.base import BaseEstimator,TransformerMixin
from typing import List
from sklearn.cluster import KMeans

class TargetMeanEncoder(BaseEstimator, TransformerMixin):
    def __init__(self, columns):
        self.columns = columns
        self.mean_map = {}
        
    def fit(self, X, y):
        X_copy = X.copy()
        X_copy['target'] = y
        for col in self.columns:
            self.mean_map[col] = X_copy.groupby(col)['target'].mean()
        return self
    
    def transform(self, X):
        X_copy = X.copy()
        for col in self.columns:
            X_copy[col] = X_copy[col].map(self.mean_map[col])
        return X_copy

def evaluate_model(model, X_test, y_test):
    """
    Evaluate the model on various metrics and return a dictionary of results.
    """
    y_pred = model.predict(X_test)
    return {
        'Accuracy': accuracy_score(y_test, y_pred),
        'Precision': precision_score(y_test, y_pred, average='weighted'),
        'Recall': recall_score(y_test, y_pred, average='weighted'),
        'F1 Score': f1_score(y_test, y_pred, average='weighted')
    }

def plot_confusion_matrix(y_true, y_pred, title=''):
    cm = confusion_matrix(y_true, y_pred)
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
    plt.xlabel('Predicted Label')
    plt.ylabel('True Label')
    plt.title(title)
    plt.show()#

def plot_logistic_regression_coefficients(model: BaseEstimator, feature_names: List[str], 
                                                      title: str = 'Logistic Regression Coefficients'):
    """
    Plot the coefficients from a trained Logistic Regression model as an interactive vertical bar chart.
    
    Parameters:
        model (BaseEstimator): The trained model.
        feature_names (List[str]): List of feature names.
        title (str, optional): Title of the plot. Defaults to 'Logistic Regression Coefficients'.
    """
    
    # Extract coefficients from the logistic regression model
    coef = model.coef_[0]
    
    # Sort coefficients and feature names by the absolute values of coefficients
    sorted_indices = np.argsort(np.abs(coef))[::-1]
    sorted_coef = coef[sorted_indices]
    sorted_feature_names = [feature_names[i] for i in sorted_indices]
    
    # Create DataFrame for Plotly
    df = pd.DataFrame({
        'Feature': sorted_feature_names,
        'Coefficient': sorted_coef
    })
    
    # Create interactive vertical bar chart
    fig = px.bar(df, y='Feature', x='Coefficient', orientation='h', text='Coefficient')
    
    # Customize appearance
    fig.update_layout(
        title=title,
        xaxis_title='Coefficient Value',
        yaxis_title='Feature',
        yaxis_categoryorder='total ascending'
    )
    
    # Show the plot
    fig.show()

def plot_feature_importance(model: BaseEstimator, feature_names: List[str], 
                                        title: str = 'Feature Importances', n_features: int = None):
    """
    Plot the feature importances from a trained machine learning model as an interactive vertical bar chart.
    
    Parameters:
        model (BaseEstimator): The trained model.
        feature_names (List[str]): List of feature names.
        title (str, optional): Title of the plot. Defaults to 'Feature Importances'.
        n_features (int, optional): Number of top features to display. If None, all features are displayed. Defaults to None.
    """
    
    try:
        importances = model.feature_importances_
    except AttributeError:
        print(f"{title} does not support feature importances.")
        return
    
    sorted_indices = np.argsort(importances)[::-1]
    if n_features:
        sorted_indices = sorted_indices[:n_features]
    sorted_importances = importances[sorted_indices]
    sorted_feature_names = [feature_names[i] for i in sorted_indices]
    
    df = pd.DataFrame({
        'Feature': sorted_feature_names,
        'Importance': sorted_importances
    })
    
    fig = px.bar(df, y='Feature', x='Importance', orientation='h', text='Importance')
    
    fig.update_layout(
        title=title,
        xaxis_title='Importance Value',
        yaxis_title='Feature',
        yaxis_categoryorder='total ascending'
    )
    
    fig.show()

class KMeansTransformer(BaseEstimator, TransformerMixin):
    def __init__(self, n_clusters):
        self.n_clusters = n_clusters
        self.kmeans = KMeans(n_clusters=self.n_clusters)
    
    def fit(self, X, y=None):
        self.kmeans.fit(X)
        return self
    
    def transform(self, X):
        labels = self.kmeans.predict(X)
        return np.c_[X, labels]


from sklearn.decomposition import PCA
def prepare_and_fit_model(df, target_col, model_type='All', feature_importance=False, 
                          scale_data=False, evaluate_models=False, use_grid_search=False, 
                          feature_engineering=None):
    print("Starting data preparation...")

    param_grids = {
        'RF': {
            'classifier__n_estimators': [50, 100, 200, 300],
            'classifier__max_depth': [None, 10, 20, 30, 40],
            'classifier__min_samples_split': [2, 5, 10, 15],
            'classifier__min_samples_leaf': [1, 2, 5]
        },
        'XGBoost': {
            'classifier__n_estimators': [50, 100, 200, 300],
            'classifier__learning_rate': [0.001, 0.01, 0.1, 0.2],
            'classifier__max_depth': [3, 4, 5, 6],
            'classifier__subsample': [0.7, 0.8, 0.9, 1.0],
            'classifier__colsample_bytree': [0.6, 0.7, 0.8, 0.9, 1.0]
        },
        'GBC': {
            'classifier__n_estimators': [50, 100, 200, 300],
            'classifier__learning_rate': [0.001, 0.01, 0.1, 0.2],
            'classifier__max_depth': [3, 4, 5, 6],
            'classifier__min_samples_split': [2, 3, 4, 5],
            'classifier__min_samples_leaf': [1, 2, 3, 4]
        },
        'LogisticRegression': {
            'classifier__penalty': ['l1', 'l2', 'elasticnet', 'none'],
            'classifier__C': [0.001, 0.01, 0.1, 0.5, 1, 5, 10],
            'classifier__solver': ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga']
        },
        'SVM': {
            'classifier__C': [0.001, 0.01, 0.1, 1, 10, 100],
            'classifier__kernel': ['rbf', 'linear', 'poly', 'sigmoid'],
            'classifier__gamma': ['scale', 'auto']
        }
    }


    df = df.drop(columns=['Unnamed: 0', "RECORD_ID"], errors='ignore')
    # Identify the prefix of the chosen target column
    target_prefix = "_".join(target_col.split("_")[:-1])
    
    # Drop all other questionnaire columns
    other_q_cols = [col for col in df.columns if col.startswith("Q") and not col.startswith(target_prefix)]
    df = df.drop(columns=other_q_cols)
    
    if not np.issubdtype(df[target_col].dtype, np.number):
        le = LabelEncoder()
        df[target_col] = le.fit_transform(df[target_col])

    num_features = df.select_dtypes(include=['int64', 'float64']).columns.tolist()
    cat_features = df.select_dtypes(include=['object']).columns.tolist()

    if target_col in num_features: 
        num_features.remove(target_col)
    if target_col in cat_features: 
        cat_features.remove(target_col)

    num_transformer = Pipeline(steps=[('imputer', SimpleImputer(strategy='mean'))])
    cat_transformer = Pipeline(steps=[('imputer', SimpleImputer(strategy='most_frequent'))])

    
    kmeans_n_clusters = 3  # or any other number based on your specific case
    feature_engineering_steps = []

    if feature_engineering == 'KMeans':
        feature_engineering_steps.append(('kmeans', KMeansTransformer(n_clusters=kmeans_n_clusters)))
    elif feature_engineering == 'PCA':
        feature_engineering_steps.append(('pca', PCA(n_components=2)))  # You can change n_components based on your needs
    elif feature_engineering == 'BOTH':
        feature_engineering_steps.append(('kmeans', KMeansTransformer(n_clusters=kmeans_n_clusters)))
        feature_engineering_steps.append(('pca', PCA(n_components=2)))  # You can change n_components based on your needs

    # Update num_transformer pipeline to include feature engineering steps
    if scale_data:
        num_transformer = Pipeline(steps=[
            ('imputer', SimpleImputer(strategy='mean')),
            ('scaler', StandardScaler())
        ] + feature_engineering_steps)
    else:
        num_transformer = Pipeline(steps=[
            ('imputer', SimpleImputer(strategy='mean'))
        ] + feature_engineering_steps)

    # Your existing ColumnTransformer
    preprocessor = ColumnTransformer(
        transformers=[
            ('num', num_transformer, num_features),
            ('cat', cat_transformer, cat_features)
        ])

    models = {}
    model_types_to_fit = []

    # Adding more classifiers
    if model_type in ['RF', 'All']: 
        model_types_to_fit.append('RF')
    if model_type in ['XGBoost', 'All']: 
        model_types_to_fit.append('XGBoost')
    if model_type in ['LR', 'All']: 
        model_types_to_fit.append('LogisticRegression')
    if model_type in ['SVM', 'All']: 
        model_types_to_fit.append('SVM')
    if model_type in ['GBC', 'All']:  # Add this line for Gradient Boosting Classifier
        model_types_to_fit.append('GBC')

    print("Starting model training...")
    evaluation_results = {}

    X = df.drop(columns=[target_col])
    y = df[target_col]
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    #columns_to_encode = ["PRFL_CHOICELIFE", "CENSUS_TRK"]  # example columns
    #X_train, num_features = target_mean_encoding(X_train.join(y_train), columns_to_encode, target_col, num_features)

    # Apply mean encoding on the test set based on training set data
    #global_mean = y_train.mean()
    #for col in columns_to_encode:
    #    mean_values = X_train.groupby(col)[f"{col}_mean_encoded"].mean()
    #    X_test[f"{col}_mean_encoded"] = X_test[col].map(mean_values)
    #    X_test[f"{col}_mean_encoded"].fillna(global_mean, inplace=True)

    # Initialize an empty dictionary to store the trained models
    models = {}
    classification_reports_dict = {}
    # Loop through the specified models to fit
    for model_type in tqdm(model_types_to_fit):
        clf = None  # Reset classifier

        if model_type == 'RF':
            clf = Pipeline(steps=[('preprocessor', preprocessor),
                                ('classifier', RandomForestClassifier(random_state=42))])
        elif model_type == 'XGBoost':
            clf = Pipeline(steps=[('preprocessor', preprocessor),
                                ('classifier', XGBClassifier(random_state=42))])
        elif model_type == 'GBC':
            clf = Pipeline(steps=[('preprocessor', preprocessor),
                                ('classifier', GradientBoostingClassifier(random_state=42))])
        elif model_type == 'LogisticRegression':
            clf = Pipeline(steps=[('preprocessor', preprocessor),
                                ('classifier', LogisticRegression(penalty='l1', solver='liblinear', C=1.0, random_state=42))])
        elif model_type == 'SVM':
            clf = Pipeline(steps=[('preprocessor', preprocessor),
                                ('classifier',  SVC(C=1.0, kernel='rbf', random_state=42))])
        best_params_dict = {}

        if clf:
            if use_grid_search:
                grid_search = GridSearchCV(clf, param_grids[model_type], cv=5, verbose=3, n_jobs=-1)
                grid_search.fit(X_train, y_train)
                clf = grid_search.best_estimator_
                print(f"Best parameters for {model_type}: {grid_search.best_params_}")
                #if 'kmeans_cluster_label' not in num_features:
                  #  num_features.append('kmeans_cluster_label')

                best_params_dict[model_type] = grid_search.best_params_
                print(f"Best parameters for {model_type}: {grid_search.best_params_}")

                models[model_type] = clf  # the best estimator is already fitted
                print(f"{model_type} model trained.")
            else:
                clf.fit(X_train, y_train)
                models[model_type] = clf
                print(f"{model_type} model trained.")

            if evaluate_models:
                eval_metrics = evaluate_model(clf, X_test, y_test)
                evaluation_results[model_type] = eval_metrics
                print(f"Evaluation metrics for {model_type}: {eval_metrics}")
                #print(f"Classification Report for {model_type}:\n{report_str}")

                y_pred = clf.predict(X_test)
                report = classification_report(y_test, y_pred, output_dict=True)
                classification_reports_dict[model_type] = report
                report_str = classification_report(y_test, y_pred)
                print(f"Classification Report for {model_type}:\n{report}")

            if feature_importance:
                feature_names = num_features + cat_features
                feature_names.append('kmeans_cluster')  # Add the new KMeans feature name
                if model_type in ['RF', 'XGBoost', 'GBC']:
                    plot_feature_importance(clf.named_steps['classifier'], feature_names, title=f'{model_type} - Feature Importance')
                elif model_type == 'LogisticRegression':
                    plot_logistic_regression_coefficients(clf.named_steps['classifier'], feature_names, title=f'{model_type} - Coefficients')
                else:
                    print(f"Feature importance not available for {model_type}")


    print("Model training complete.")
    if evaluate_models:
        return models, evaluation_results, classification_reports_dict, best_params_dict
    else:
        return models

In [284]:
import os
import json
def get_next_experiment_id(log_file):
    if os.path.exists(log_file) and os.path.getsize(log_file) > 0:
        try:
            experiment_log = pd.read_csv(log_file)
            last_exp_id = experiment_log['exp_id'].iloc[-1]
            next_exp_id = f"experiment_{int(last_exp_id.split('_')[1]) + 1}"
            print(f"Next exp_id generated: {next_exp_id}")  # Debug print
        except pd.errors.EmptyDataError:
            next_exp_id = "experiment_1"
    else:
        next_exp_id = "experiment_1"
    return next_exp_id

def log_experiment_results_csv(log_file, exp_id=None, evaluation_results=None, notes={}):
    if exp_id is None:
        exp_id = get_next_experiment_id(log_file)
        
    if os.path.exists(log_file) and os.path.getsize(log_file) > 0:
        try:
            experiment_log = pd.read_csv(log_file)
        except pd.errors.EmptyDataError:
            experiment_log = pd.DataFrame(columns=['exp_id', 'evaluation_results', 'notes'])
    else:
        experiment_log = pd.DataFrame(columns=['exp_id', 'evaluation_results', 'notes'])
        
    new_row = {'exp_id': exp_id, 'evaluation_results': str(evaluation_results), 'notes': str(notes)}
    experiment_log = experiment_log.append(new_row, ignore_index=True)
    
    experiment_log.to_csv(log_file, index=False)

log_file = "experiment_logs.csv"
notes = {"description": "Added back TODO transfrom and freq encoding (STATE, COUNTY) + added GENERMIX, GENERATION, HH_SIZE, CREDRATE, LANGUAGE, 2ndAMEND, BORDER, CHOICELIFE, CONSERVNEWS, LIBNEWS. Scaled", "target_col": "Q1_Candidate"}

#df_survey_processed = preprocessor_survey.run_preprocessing_pipeline(skip_preprocess_dataframe=False, drop_converted_cols=True, use_frequency_encoding=True)
models, evaluation_results, class_report, best_params = prepare_and_fit_model(
    survey_df, 'Q1_Candidate', model_type='All', feature_importance=True, 
    scale_data=True, evaluate_models=True, use_grid_search=True
)

class_report_str = json.dumps(class_report, indent=4)

best_params_str = json.dumps(best_params, indent=4)


# Update your notes dictionary
notes = {
    "description": "Your experiment description here",
    "target_col": "Q1_Candidate",
    "columns_used": ", ".join(columns_to_use),
    "classification_report": class_report_str,
    "best_params": best_params_str  # Add the best parameters here
}
log_experiment_results_csv(log_file,None, evaluation_results, notes)

Starting data preparation...
Starting model training...


  0%|          | 0/5 [00:00<?, ?it/s]

Fitting 5 folds for each of 240 candidates, totalling 1200 fits
[CV 1/5] END classifier__max_depth=None, classifier__min_samples_leaf=1, classifier__min_samples_split=2, classifier__n_estimators=50;, score=0.523 total time=   0.2s
[CV 4/5] END classifier__max_depth=None, classifier__min_samples_leaf=1, classifier__min_samples_split=2, classifier__n_estimators=50;, score=0.560 total time=   0.2s
[CV 5/5] END classifier__max_depth=None, classifier__min_samples_leaf=1, classifier__min_samples_split=2, classifier__n_estimators=50;, score=0.577 total time=   0.2s
[CV 2/5] END classifier__max_depth=None, classifier__min_samples_leaf=1, classifier__min_samples_split=2, classifier__n_estimators=50;, score=0.551 total time=   0.2s
[CV 3/5] END classifier__max_depth=None, classifier__min_samples_leaf=1, classifier__min_samples_split=2, classifier__n_estimators=50;, score=0.551 total time=   0.2s
[CV 3/5] END classifier__max_depth=None, classifier__min_samples_leaf=1, classifier__min_samples_spli

 20%|██        | 1/5 [01:23<05:33, 83.45s/it]

Fitting 5 folds for each of 1280 candidates, totalling 6400 fits
[CV 3/5] END classifier__colsample_bytree=0.6, classifier__learning_rate=0.001, classifier__max_depth=3, classifier__n_estimators=50, classifier__subsample=0.7;, score=0.606 total time=   0.1s
[CV 2/5] END classifier__colsample_bytree=0.6, classifier__learning_rate=0.001, classifier__max_depth=3, classifier__n_estimators=50, classifier__subsample=0.7;, score=0.551 total time=   0.1s
[CV 1/5] END classifier__colsample_bytree=0.6, classifier__learning_rate=0.001, classifier__max_depth=3, classifier__n_estimators=50, classifier__subsample=0.7;, score=0.528 total time=   0.1s
[CV 4/5] END classifier__colsample_bytree=0.6, classifier__learning_rate=0.001, classifier__max_depth=3, classifier__n_estimators=50, classifier__subsample=0.7;, score=0.588 total time=   0.1s
[CV 5/5] END classifier__colsample_bytree=0.6, classifier__learning_rate=0.001, classifier__max_depth=3, classifier__n_estimators=50, classifier__subsample=0.7;, s

 40%|████      | 2/5 [18:40<32:13, 644.34s/it]

Fitting 5 folds for each of 140 candidates, totalling 700 fits
[CV 1/5] END classifier__C=0.001, classifier__penalty=l1, classifier__solver=newton-cg;, score=nan total time=   0.0s
[CV 2/5] END classifier__C=0.001, classifier__penalty=l1, classifier__solver=newton-cg;, score=nan total time=   0.0s
[CV 3/5] END classifier__C=0.001, classifier__penalty=l1, classifier__solver=newton-cg;, score=nan total time=   0.0s
[CV 5/5] END classifier__C=0.001, classifier__penalty=l1, classifier__solver=newton-cg;, score=nan total time=   0.0s
[CV 1/5] END classifier__C=0.001, classifier__penalty=l1, classifier__solver=lbfgs;, score=nan total time=   0.0s
[CV 4/5] END classifier__C=0.001, classifier__penalty=l1, classifier__solver=newton-cg;, score=nan total time=   0.0s
[CV 2/5] END classifier__C=0.001, classifier__penalty=l1, classifier__solver=lbfgs;, score=nan total time=   0.0s
[CV 3/5] END classifier__C=0.001, classifier__penalty=l1, classifier__solver=lbfgs;, score=nan total time=   0.0s
[CV 4

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

[CV 5/5] END classifier__C=0.01, classifier__penalty=l1, classifier__solver=liblinear;, score=0.553 total time=   0.0s
[CV 1/5] END classifier__C=0.01, classifier__penalty=l1, classifier__solver=sag;, score=nan total time=   0.0s
[CV 2/5] END classifier__C=0.01, classifier__penalty=l1, classifier__solver=sag;, score=nan total time=   0.0s
[CV 1/5] END classifier__C=0.001, classifier__penalty=none, classifier__solver=sag;, score=0.481 total time=   0.2s
[CV 3/5] END classifier__C=0.01, classifier__penalty=l1, classifier__solver=sag;, score=nan total time=   0.0s
[CV 3/5] END classifier__C=0.001, classifier__penalty=none, classifier__solver=sag;, score=0.532 total time=   0.2s
[CV 4/5] END classifier__C=0.01, classifier__penalty=l1, classifier__solver=sag;, score=nan total time=   0.0s
[CV 5/5] END classifier__C=0.01, classifier__penalty=l1, classifier__solver=sag;, score=nan total time=   0.0s
[CV 3/5] END classifier__C=0.001, classifier__penalty=none, classifier__solver=lbfgs;, score=0

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


[CV 5/5] END classifier__C=0.01, classifier__penalty=l2, classifier__solver=lbfgs;, score=0.530 total time=   0.0s
[CV 3/5] END classifier__C=0.01, classifier__penalty=l2, classifier__solver=liblinear;, score=0.569 total time=   0.0s
[CV 1/5] END classifier__C=0.01, classifier__penalty=l1, classifier__solver=saga;, score=0.528 total time=   0.2s
[CV 5/5] END classifier__C=0.01, classifier__penalty=l2, classifier__solver=sag;, score=0.530 total time=   0.0s
[CV 1/5] END classifier__C=0.01, classifier__penalty=l2, classifier__solver=liblinear;, score=0.514 total time=   0.0s
[CV 4/5] END classifier__C=0.01, classifier__penalty=l2, classifier__solver=liblinear;, score=0.593 total time=   0.0s
[CV 3/5] END classifier__C=0.001, classifier__penalty=none, classifier__solver=saga;, score=0.532 total time=   0.2s
[CV 3/5] END classifier__C=0.01, classifier__penalty=elasticnet, classifier__solver=newton-cg;, score=nan total time=   0.0s
[CV 4/5] END classifier__C=0.01, classifier__penalty=elasti



[CV 3/5] END classifier__C=0.01, classifier__penalty=l2, classifier__solver=saga;, score=0.569 total time=   0.1s
[CV 4/5] END classifier__C=0.01, classifier__penalty=elasticnet, classifier__solver=sag;, score=nan total time=   0.0s
[CV 5/5] END classifier__C=0.01, classifier__penalty=elasticnet, classifier__solver=sag;, score=nan total time=   0.0s
[CV 3/5] END classifier__C=0.01, classifier__penalty=l2, classifier__solver=sag;, score=0.569 total time=   0.1s
[CV 1/5] END classifier__C=0.01, classifier__penalty=elasticnet, classifier__solver=saga;, score=nan total time=   0.0s
[CV 2/5] END classifier__C=0.01, classifier__penalty=elasticnet, classifier__solver=saga;, score=nan total time=   0.0s
[CV 3/5] END classifier__C=0.01, classifier__penalty=elasticnet, classifier__solver=saga;, score=nan total time=   0.0s
[CV 4/5] END classifier__C=0.001, classifier__penalty=none, classifier__solver=saga;, score=0.565 total time=   0.2s
[CV 4/5] END classifier__C=0.01, classifier__penalty=elast

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


[CV 5/5] END classifier__C=0.001, classifier__penalty=none, classifier__solver=saga;, score=0.516 total time=   0.2s
[CV 1/5] END classifier__C=0.1, classifier__penalty=l1, classifier__solver=newton-cg;, score=nan total time=   0.0s
[CV 2/5] END classifier__C=0.1, classifier__penalty=l1, classifier__solver=newton-cg;, score=nan total time=   0.0s
[CV 3/5] END classifier__C=0.1, classifier__penalty=l1, classifier__solver=newton-cg;, score=nan total time=   0.0s
[CV 4/5] END classifier__C=0.1, classifier__penalty=l1, classifier__solver=newton-cg;, score=nan total time=   0.0s
[CV 4/5] END classifier__C=0.01, classifier__penalty=l1, classifier__solver=saga;, score=0.602 total time=   0.2s
[CV 5/5] END classifier__C=0.1, classifier__penalty=l1, classifier__solver=newton-cg;, score=nan total time=   0.0s
[CV 1/5] END classifier__C=0.1, classifier__penalty=l1, classifier__solver=lbfgs;, score=nan total time=   0.0s
[CV 2/5] END classifier__C=0.1, classifier__penalty=l1, classifier__solver=lb



[CV 5/5] END classifier__C=0.01, classifier__penalty=l1, classifier__solver=saga;, score=0.553 total time=   0.2s
[CV 3/5] END classifier__C=0.1, classifier__penalty=l1, classifier__solver=liblinear;, score=0.593 total time=   0.1s
[CV 2/5] END classifier__C=0.01, classifier__penalty=none, classifier__solver=sag;, score=0.481 total time=   0.2s
[CV 4/5] END classifier__C=0.01, classifier__penalty=none, classifier__solver=sag;, score=0.560 total time=   0.2s
[CV 1/5] END classifier__C=0.01, classifier__penalty=l2, classifier__solver=newton-cg;, score=0.495 total time=   0.0s
[CV 4/5] END classifier__C=0.1, classifier__penalty=l1, classifier__solver=liblinear;, score=0.593 total time=   0.0s
[CV 2/5] END classifier__C=0.1, classifier__penalty=l1, classifier__solver=sag;, score=nan total time=   0.0s
[CV 3/5] END classifier__C=0.1, classifier__penalty=l1, classifier__solver=sag;, score=nan total time=   0.0s
[CV 4/5] END classifier__C=0.1, classifier__penalty=l1, classifier__solver=sag;, 



[CV 1/5] END classifier__C=0.01, classifier__penalty=none, classifier__solver=saga;, score=0.468 total time=   0.2s
[CV 3/5] END classifier__C=0.1, classifier__penalty=l2, classifier__solver=liblinear;, score=0.569 total time=   0.1s
[CV 5/5] END classifier__C=0.1, classifier__penalty=l1, classifier__solver=saga;, score=0.553 total time=   0.3s
[CV 4/5] END classifier__C=0.1, classifier__penalty=l2, classifier__solver=liblinear;, score=0.597 total time=   0.0s
[CV 1/5] END classifier__C=0.1, classifier__penalty=l2, classifier__solver=newton-cg;, score=0.481 total time=   0.1s
[CV 2/5] END classifier__C=0.1, classifier__penalty=l1, classifier__solver=saga;, score=0.565 total time=   0.2s[CV 5/5] END classifier__C=0.1, classifier__penalty=l2, classifier__solver=liblinear;, score=0.526 total time=   0.1s

[CV 2/5] END classifier__C=0.1, classifier__penalty=l2, classifier__solver=newton-cg;, score=0.481 total time=   0.1s
[CV 3/5] END classifier__C=0.1, classifier__penalty=l2, classifier__



[CV 5/5] END classifier__C=0.1, classifier__penalty=l2, classifier__solver=newton-cg;, score=0.507 total time=   0.1s
[CV 1/5] END classifier__C=0.1, classifier__penalty=l2, classifier__solver=lbfgs;, score=0.481 total time=   0.0s
[CV 2/5] END classifier__C=0.1, classifier__penalty=l2, classifier__solver=sag;, score=0.481 total time=   0.1s
[CV 3/5] END classifier__C=0.1, classifier__penalty=l1, classifier__solver=saga;, score=0.593 total time=   0.3s
[CV 2/5] END classifier__C=0.1, classifier__penalty=l2, classifier__solver=lbfgs;, score=0.481 total time=   0.0s
[CV 3/5] END classifier__C=0.01, classifier__penalty=none, classifier__solver=saga;, score=0.532 total time=   0.2s
[CV 3/5] END classifier__C=0.1, classifier__penalty=l2, classifier__solver=sag;, score=0.542 total time=   0.2s
[CV 4/5] END classifier__C=0.01, classifier__penalty=none, classifier__solver=saga;, score=0.565 total time=   0.2s
[CV 4/5] END classifier__C=0.1, classifier__penalty=l2, classifier__solver=saga;, sco



[CV 5/5] END classifier__C=0.1, classifier__penalty=elasticnet, classifier__solver=liblinear;, score=nan total time=   0.0s
[CV 1/5] END classifier__C=0.1, classifier__penalty=elasticnet, classifier__solver=sag;, score=nan total time=   0.0s
[CV 2/5] END classifier__C=0.1, classifier__penalty=elasticnet, classifier__solver=sag;, score=nan total time=   0.0s
[CV 3/5] END classifier__C=0.1, classifier__penalty=elasticnet, classifier__solver=sag;, score=nan total time=   0.0s
[CV 4/5] END classifier__C=0.1, classifier__penalty=elasticnet, classifier__solver=sag;, score=nan total time=   0.0s
[CV 5/5] END classifier__C=0.1, classifier__penalty=elasticnet, classifier__solver=sag;, score=nan total time=   0.0s
[CV 5/5] END classifier__C=0.01, classifier__penalty=none, classifier__solver=saga;, score=0.516 total time=   0.2s
[CV 1/5] END classifier__C=0.1, classifier__penalty=elasticnet, classifier__solver=saga;, score=nan total time=   0.0s
[CV 3/5] END classifier__C=0.1, classifier__penalty

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

[CV 3/5] END classifier__C=0.1, classifier__penalty=none, classifier__solver=lbfgs;, score=0.528 total time=   0.1s
[CV 1/5] END classifier__C=0.1, classifier__penalty=none, classifier__solver=sag;, score=0.481 total time=   0.1s
[CV 4/5] END classifier__C=0.1, classifier__penalty=none, classifier__solver=lbfgs;, score=0.560 total time=   0.1s
[CV 1/5] END classifier__C=0.1, classifier__penalty=l2, classifier__solver=saga;, score=0.477 total time=   0.2s
[CV 5/5] END classifier__C=0.1, classifier__penalty=none, classifier__solver=lbfgs;, score=0.516 total time=   0.1s
[CV 1/5] END classifier__C=0.1, classifier__penalty=none, classifier__solver=liblinear;, score=nan total time=   0.0s
[CV 2/5] END classifier__C=0.1, classifier__penalty=none, classifier__solver=liblinear;, score=nan total time=   0.0s
[CV 3/5] END classifier__C=0.1, classifier__penalty=none, classifier__solver=liblinear;, score=nan total time=   0.0s


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


[CV 2/5] END classifier__C=0.1, classifier__penalty=l2, classifier__solver=saga;, score=0.481 total time=   0.2s
[CV 2/5] END classifier__C=0.1, classifier__penalty=none, classifier__solver=sag;, score=0.481 total time=   0.3s
[CV 2/5] END classifier__C=0.1, classifier__penalty=none, classifier__solver=saga;, score=0.486 total time=   0.2s
[CV 3/5] END classifier__C=0.1, classifier__penalty=l2, classifier__solver=saga;, score=0.542 total time=   0.2s




[CV 3/5] END classifier__C=0.1, classifier__penalty=none, classifier__solver=sag;, score=0.532 total time=   0.2s
[CV 5/5] END classifier__C=0.5, classifier__penalty=l1, classifier__solver=newton-cg;, score=nan total time=   0.0s
[CV 1/5] END classifier__C=0.5, classifier__penalty=l1, classifier__solver=lbfgs;, score=nan total time=   0.0s
[CV 2/5] END classifier__C=0.5, classifier__penalty=l1, classifier__solver=lbfgs;, score=nan total time=   0.0s
[CV 3/5] END classifier__C=0.5, classifier__penalty=l1, classifier__solver=lbfgs;, score=nan total time=   0.0s
[CV 3/5] END classifier__C=0.1, classifier__penalty=none, classifier__solver=saga;, score=0.532 total time=   0.2s
[CV 4/5] END classifier__C=0.5, classifier__penalty=l1, classifier__solver=lbfgs;, score=nan total time=   0.0s
[CV 5/5] END classifier__C=0.5, classifier__penalty=l1, classifier__solver=lbfgs;, score=nan total time=   0.0s
[CV 4/5] END classifier__C=0.1, classifier__penalty=none, classifier__solver=sag;, score=0.560 



[CV 4/5] END classifier__C=0.1, classifier__penalty=none, classifier__solver=saga;, score=0.565 total time=   0.2s
[CV 1/5] END classifier__C=0.5, classifier__penalty=l1, classifier__solver=liblinear;, score=0.477 total time=   0.2s
[CV 5/5] END classifier__C=0.1, classifier__penalty=none, classifier__solver=sag;, score=0.516 total time=   0.2s
[CV 2/5] END classifier__C=0.5, classifier__penalty=l1, classifier__solver=liblinear;, score=0.500 total time=   0.1s
[CV 5/5] END classifier__C=0.1, classifier__penalty=none, classifier__solver=saga;, score=0.516 total time=   0.2s
[CV 1/5] END classifier__C=0.5, classifier__penalty=l1, classifier__solver=newton-cg;, score=nan total time=   0.0s
[CV 2/5] END classifier__C=0.5, classifier__penalty=l1, classifier__solver=newton-cg;, score=nan total time=   0.0s
[CV 3/5] END classifier__C=0.5, classifier__penalty=l1, classifier__solver=newton-cg;, score=nan total time=   0.0s
[CV 4/5] END classifier__C=0.5, classifier__penalty=l1, classifier__solv



[CV 2/5] END classifier__C=0.001, classifier__penalty=none, classifier__solver=newton-cg;, score=0.468 total time=   3.4s
[CV 3/5] END classifier__C=0.5, classifier__penalty=l1, classifier__solver=liblinear;, score=0.560 total time=   0.2s
[CV 1/5] END classifier__C=0.1, classifier__penalty=none, classifier__solver=saga;, score=0.468 total time=   0.3s




[CV 4/5] END classifier__C=0.5, classifier__penalty=l1, classifier__solver=liblinear;, score=0.583 total time=   0.1s
[CV 4/5] END classifier__C=0.5, classifier__penalty=l2, classifier__solver=newton-cg;, score=0.565 total time=   0.1s
[CV 1/5] END classifier__C=0.001, classifier__penalty=none, classifier__solver=newton-cg;, score=0.472 total time=   3.7s
[CV 5/5] END classifier__C=0.5, classifier__penalty=l2, classifier__solver=newton-cg;, score=0.516 total time=   0.1s
[CV 1/5] END classifier__C=0.5, classifier__penalty=l1, classifier__solver=saga;, score=0.463 total time=   0.4s
[CV 2/5] END classifier__C=0.5, classifier__penalty=l2, classifier__solver=liblinear;, score=0.491 total time=   0.1s
[CV 1/5] END classifier__C=0.5, classifier__penalty=l2, classifier__solver=lbfgs;, score=0.477 total time=   0.1s
[CV 5/5] END classifier__C=0.5, classifier__penalty=l1, classifier__solver=liblinear;, score=0.521 total time=   0.3s
[CV 1/5] END classifier__C=0.5, classifier__penalty=l1, class

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

[CV 3/5] END classifier__C=0.5, classifier__penalty=l2, classifier__solver=lbfgs;, score=0.532 total time=   0.1s
[CV 4/5] END classifier__C=0.5, classifier__penalty=l2, classifier__solver=liblinear;, score=0.569 total time=   0.1s
[CV 4/5] END classifier__C=0.5, classifier__penalty=l2, classifier__solver=lbfgs;, score=0.565 total time=   0.1s
[CV 5/5] END classifier__C=0.5, classifier__penalty=l2, classifier__solver=liblinear;, score=0.512 total time=   0.1s
[CV 5/5] END classifier__C=0.5, classifier__penalty=l2, classifier__solver=sag;, score=0.516 total time=   0.2s
[CV 5/5] END classifier__C=0.5, classifier__penalty=l2, classifier__solver=lbfgs;, score=0.516 total time=   0.1s
[CV 2/5] END classifier__C=0.5, classifier__penalty=l1, classifier__solver=saga;, score=0.505 total time=   0.3s


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


[CV 1/5] END classifier__C=0.5, classifier__penalty=l2, classifier__solver=liblinear;, score=0.481 total time=   0.1s
[CV 3/5] END classifier__C=0.5, classifier__penalty=elasticnet, classifier__solver=newton-cg;, score=nan total time=   0.0s
[CV 4/5] END classifier__C=0.5, classifier__penalty=elasticnet, classifier__solver=newton-cg;, score=nan total time=   0.0s
[CV 5/5] END classifier__C=0.5, classifier__penalty=elasticnet, classifier__solver=newton-cg;, score=nan total time=   0.0s
[CV 1/5] END classifier__C=0.5, classifier__penalty=elasticnet, classifier__solver=lbfgs;, score=nan total time=   0.0s
[CV 2/5] END classifier__C=0.5, classifier__penalty=elasticnet, classifier__solver=lbfgs;, score=nan total time=   0.0s
[CV 3/5] END classifier__C=0.5, classifier__penalty=elasticnet, classifier__solver=lbfgs;, score=nan total time=   0.0s
[CV 1/5] END classifier__C=0.5, classifier__penalty=l2, classifier__solver=sag;, score=0.481 total time=   0.2s
[CV 4/5] END classifier__C=0.5, classi



[CV 5/5] END classifier__C=0.5, classifier__penalty=elasticnet, classifier__solver=sag;, score=nan total time=   0.0s
[CV 1/5] END classifier__C=0.5, classifier__penalty=elasticnet, classifier__solver=saga;, score=nan total time=   0.0s
[CV 2/5] END classifier__C=0.5, classifier__penalty=elasticnet, classifier__solver=saga;, score=nan total time=   0.0s
[CV 3/5] END classifier__C=0.5, classifier__penalty=l1, classifier__solver=saga;, score=0.546 total time=   0.3s
[CV 3/5] END classifier__C=0.5, classifier__penalty=elasticnet, classifier__solver=saga;, score=nan total time=   0.0s
[CV 2/5] END classifier__C=0.5, classifier__penalty=l2, classifier__solver=sag;, score=0.481 total time=   0.2s
[CV 4/5] END classifier__C=0.5, classifier__penalty=elasticnet, classifier__solver=saga;, score=nan total time=   0.0s
[CV 5/5] END classifier__C=0.5, classifier__penalty=elasticnet, classifier__solver=saga;, score=nan total time=   0.0s
[CV 2/5] END classifier__C=0.01, classifier__penalty=none, cla



[CV 3/5] END classifier__C=0.5, classifier__penalty=l2, classifier__solver=sag;, score=0.532 total time=   0.2s
[CV 1/5] END classifier__C=0.01, classifier__penalty=none, classifier__solver=newton-cg;, score=0.472 total time=   4.0s
[CV 4/5] END classifier__C=0.5, classifier__penalty=l1, classifier__solver=saga;, score=0.574 total time=   0.3s
[CV 3/5] END classifier__C=0.5, classifier__penalty=l2, classifier__solver=saga;, score=0.528 total time=   0.2s


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


[CV 4/5] END classifier__C=0.5, classifier__penalty=l2, classifier__solver=sag;, score=0.560 total time=   0.2s
[CV 5/5] END classifier__C=0.5, classifier__penalty=none, classifier__solver=lbfgs;, score=0.516 total time=   0.0s
[CV 1/5] END classifier__C=0.5, classifier__penalty=none, classifier__solver=liblinear;, score=nan total time=   0.0s
[CV 2/5] END classifier__C=0.5, classifier__penalty=none, classifier__solver=liblinear;, score=nan total time=   0.0s
[CV 3/5] END classifier__C=0.5, classifier__penalty=none, classifier__solver=liblinear;, score=nan total time=   0.0s
[CV 4/5] END classifier__C=0.5, classifier__penalty=none, classifier__solver=liblinear;, score=nan total time=   0.0s
[CV 5/5] END classifier__C=0.5, classifier__penalty=none, classifier__solver=liblinear;, score=nan total time=   0.0s
[CV 4/5] END classifier__C=0.5, classifier__penalty=l2, classifier__solver=saga;, score=0.556 total time=   0.2s
[CV 5/5] END classifier__C=0.5, classifier__penalty=l1, classifier__s



[CV 1/5] END classifier__C=0.5, classifier__penalty=l2, classifier__solver=newton-cg;, score=0.477 total time=   0.1s
[CV 1/5] END classifier__C=0.5, classifier__penalty=none, classifier__solver=sag;, score=0.481 total time=   0.2s
[CV 5/5] END classifier__C=0.5, classifier__penalty=l2, classifier__solver=saga;, score=0.512 total time=   0.2s
[CV 1/5] END classifier__C=0.5, classifier__penalty=elasticnet, classifier__solver=newton-cg;, score=nan total time=   0.0s
[CV 2/5] END classifier__C=0.5, classifier__penalty=elasticnet, classifier__solver=newton-cg;, score=nan total time=   0.0s
[CV 2/5] END classifier__C=0.5, classifier__penalty=none, classifier__solver=sag;, score=0.481 total time=   0.2s
[CV 1/5] END classifier__C=1, classifier__penalty=l1, classifier__solver=newton-cg;, score=nan total time=   0.0s
[CV 2/5] END classifier__C=1, classifier__penalty=l1, classifier__solver=newton-cg;, score=nan total time=   0.0s
[CV 3/5] END classifier__C=1, classifier__penalty=l1, classifier_



[CV 5/5] END classifier__C=1, classifier__penalty=l1, classifier__solver=newton-cg;, score=nan total time=   0.0s
[CV 3/5] END classifier__C=0.5, classifier__penalty=none, classifier__solver=sag;, score=0.532 total time=   0.1s
[CV 2/5] END classifier__C=0.5, classifier__penalty=l2, classifier__solver=newton-cg;, score=0.481 total time=   0.2s
[CV 1/5] END classifier__C=1, classifier__penalty=l1, classifier__solver=lbfgs;, score=nan total time=   0.0s
[CV 2/5] END classifier__C=1, classifier__penalty=l1, classifier__solver=lbfgs;, score=nan total time=   0.0s
[CV 3/5] END classifier__C=1, classifier__penalty=l1, classifier__solver=lbfgs;, score=nan total time=   0.0s
[CV 4/5] END classifier__C=1, classifier__penalty=l1, classifier__solver=lbfgs;, score=nan total time=   0.0s
[CV 5/5] END classifier__C=1, classifier__penalty=l1, classifier__solver=lbfgs;, score=nan total time=   0.0s
[CV 3/5] END classifier__C=0.5, classifier__penalty=l2, classifier__solver=newton-cg;, score=0.532 total



[CV 5/5] END classifier__C=1, classifier__penalty=l1, classifier__solver=sag;, score=nan total time=   0.0s
[CV 5/5] END classifier__C=0.5, classifier__penalty=none, classifier__solver=sag;, score=0.516 total time=   0.2s




[CV 1/5] END classifier__C=1, classifier__penalty=l1, classifier__solver=saga;, score=0.477 total time=   0.3s
[CV 1/5] END classifier__C=1, classifier__penalty=l1, classifier__solver=liblinear;, score=0.491 total time=   0.5s
[CV 1/5] END classifier__C=0.5, classifier__penalty=none, classifier__solver=saga;, score=0.468 total time=   0.2s
[CV 2/5] END classifier__C=1, classifier__penalty=l1, classifier__solver=liblinear;, score=0.481 total time=   0.2s
[CV 2/5] END classifier__C=1, classifier__penalty=l1, classifier__solver=saga;, score=0.491 total time=   0.3s
[CV 2/5] END classifier__C=0.5, classifier__penalty=none, classifier__solver=saga;, score=0.486 total time=   0.2s




[CV 3/5] END classifier__C=1, classifier__penalty=l1, classifier__solver=saga;, score=0.546 total time=   0.2s
[CV 1/5] END classifier__C=0.1, classifier__penalty=none, classifier__solver=newton-cg;, score=0.472 total time=   4.1s
[CV 3/5] END classifier__C=0.5, classifier__penalty=none, classifier__solver=saga;, score=0.532 total time=   0.3s
[CV 3/5] END classifier__C=1, classifier__penalty=l1, classifier__solver=liblinear;, score=0.551 total time=   0.3s




[CV 4/5] END classifier__C=0.5, classifier__penalty=none, classifier__solver=saga;, score=0.565 total time=   0.2s
[CV 4/5] END classifier__C=1, classifier__penalty=l1, classifier__solver=liblinear;, score=0.583 total time=   0.3s
[CV 4/5] END classifier__C=1, classifier__penalty=l1, classifier__solver=saga;, score=0.565 total time=   0.5s
[CV 5/5] END classifier__C=0.5, classifier__penalty=none, classifier__solver=saga;, score=0.516 total time=   0.2s


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

[CV 3/5] END classifier__C=1, classifier__penalty=l2, classifier__solver=lbfgs;, score=0.532 total time=   0.1s
[CV 4/5] END classifier__C=1, classifier__penalty=l2, classifier__solver=lbfgs;, score=0.565 total time=   0.0s
[CV 3/5] END classifier__C=0.001, classifier__penalty=none, classifier__solver=newton-cg;, score=0.537 total time=   3.7s
[CV 5/5] END classifier__C=1, classifier__penalty=l2, classifier__solver=lbfgs;, score=0.521 total time=   0.1s
[CV 5/5] END classifier__C=1, classifier__penalty=l1, classifier__solver=saga;, score=0.512 total time=   0.3s[CV 1/5] END classifier__C=1, classifier__penalty=l2, classifier__solver=liblinear;, score=0.500 total time=   0.1s

[CV 1/5] END classifier__C=1, classifier__penalty=l2, classifier__solver=newton-cg;, score=0.491 total time=   0.1s
[CV 2/5] END classifier__C=1, classifier__penalty=l2, classifier__solver=liblinear;, score=0.491 total time=   0.1s
[CV 5/5] END classifier__C=1, classifier__penalty=l1, classifier__solver=liblinear;

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


[CV 1/5] END classifier__C=1, classifier__penalty=l2, classifier__solver=sag;, score=0.481 total time=   0.2s
[CV 5/5] END classifier__C=1, classifier__penalty=l2, classifier__solver=liblinear;, score=0.512 total time=   0.1s
[CV 4/5] END classifier__C=1, classifier__penalty=l2, classifier__solver=newton-cg;, score=0.565 total time=   0.1s
[CV 5/5] END classifier__C=1, classifier__penalty=l2, classifier__solver=newton-cg;, score=0.521 total time=   0.1s
[CV 2/5] END classifier__C=1, classifier__penalty=l2, classifier__solver=sag;, score=0.481 total time=   0.2s
[CV 1/5] END classifier__C=1, classifier__penalty=l2, classifier__solver=lbfgs;, score=0.491 total time=   0.1s
[CV 4/5] END classifier__C=1, classifier__penalty=l2, classifier__solver=saga;, score=0.551 total time=   0.2s


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


[CV 2/5] END classifier__C=1, classifier__penalty=l2, classifier__solver=lbfgs;, score=0.481 total time=   0.0s
[CV 2/5] END classifier__C=1, classifier__penalty=elasticnet, classifier__solver=lbfgs;, score=nan total time=   0.0s
[CV 3/5] END classifier__C=1, classifier__penalty=elasticnet, classifier__solver=lbfgs;, score=nan total time=   0.0s
[CV 4/5] END classifier__C=1, classifier__penalty=elasticnet, classifier__solver=lbfgs;, score=nan total time=   0.0s
[CV 5/5] END classifier__C=1, classifier__penalty=elasticnet, classifier__solver=lbfgs;, score=nan total time=   0.0s
[CV 1/5] END classifier__C=1, classifier__penalty=elasticnet, classifier__solver=liblinear;, score=nan total time=   0.0s
[CV 2/5] END classifier__C=1, classifier__penalty=elasticnet, classifier__solver=liblinear;, score=nan total time=   0.0s
[CV 3/5] END classifier__C=1, classifier__penalty=elasticnet, classifier__solver=liblinear;, score=nan total time=   0.0s
[CV 4/5] END classifier__C=1, classifier__penalty=

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

[CV 1/5] END classifier__C=1, classifier__penalty=none, classifier__solver=lbfgs;, score=0.468 total time=   0.1s
[CV 2/5] END classifier__C=1, classifier__penalty=none, classifier__solver=lbfgs;, score=0.477 total time=   0.1s
[CV 4/5] END classifier__C=1, classifier__penalty=l2, classifier__solver=sag;, score=0.569 total time=   0.2s
[CV 3/5] END classifier__C=1, classifier__penalty=none, classifier__solver=lbfgs;, score=0.528 total time=   0.1s
[CV 4/5] END classifier__C=1, classifier__penalty=none, classifier__solver=lbfgs;, score=0.560 total time=   0.1s


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


[CV 5/5] END classifier__C=1, classifier__penalty=none, classifier__solver=lbfgs;, score=0.516 total time=   0.1s
[CV 1/5] END classifier__C=1, classifier__penalty=none, classifier__solver=liblinear;, score=nan total time=   0.0s
[CV 2/5] END classifier__C=1, classifier__penalty=none, classifier__solver=liblinear;, score=nan total time=   0.0s
[CV 5/5] END classifier__C=1, classifier__penalty=l2, classifier__solver=sag;, score=0.516 total time=   0.2s
[CV 3/5] END classifier__C=1, classifier__penalty=none, classifier__solver=liblinear;, score=nan total time=   0.0s
[CV 4/5] END classifier__C=1, classifier__penalty=none, classifier__solver=liblinear;, score=nan total time=   0.0s
[CV 5/5] END classifier__C=1, classifier__penalty=none, classifier__solver=liblinear;, score=nan total time=   0.0s
[CV 2/5] END classifier__C=0.5, classifier__penalty=none, classifier__solver=newton-cg;, score=0.468 total time=   3.6s
[CV 1/5] END classifier__C=1, classifier__penalty=l2, classifier__solver=sag



[CV 1/5] END classifier__C=1, classifier__penalty=none, classifier__solver=sag;, score=0.481 total time=   0.2s
[CV 2/5] END classifier__C=1, classifier__penalty=none, classifier__solver=sag;, score=0.481 total time=   0.2s
[CV 2/5] END classifier__C=1, classifier__penalty=l2, classifier__solver=saga;, score=0.481 total time=   0.2s




[CV 1/5] END classifier__C=0.5, classifier__penalty=none, classifier__solver=newton-cg;, score=0.472 total time=   4.1s
[CV 3/5] END classifier__C=1, classifier__penalty=none, classifier__solver=sag;, score=0.532 total time=   0.1s
[CV 3/5] END classifier__C=1, classifier__penalty=l2, classifier__solver=saga;, score=0.528 total time=   0.2s
[CV 4/5] END classifier__C=1, classifier__penalty=none, classifier__solver=sag;, score=0.560 total time=   0.1s
[CV 2/5] END classifier__C=1, classifier__penalty=none, classifier__solver=saga;, score=0.486 total time=   0.2s




[CV 5/5] END classifier__C=1, classifier__penalty=none, classifier__solver=sag;, score=0.516 total time=   0.1s
[CV 3/5] END classifier__C=1, classifier__penalty=none, classifier__solver=saga;, score=0.532 total time=   0.2s




[CV 1/5] END classifier__C=1, classifier__penalty=none, classifier__solver=saga;, score=0.468 total time=   0.2s
[CV 1/5] END classifier__C=5, classifier__penalty=l1, classifier__solver=newton-cg;, score=nan total time=   0.0s
[CV 2/5] END classifier__C=5, classifier__penalty=l1, classifier__solver=newton-cg;, score=nan total time=   0.0s
[CV 3/5] END classifier__C=5, classifier__penalty=l1, classifier__solver=newton-cg;, score=nan total time=   0.0s
[CV 5/5] END classifier__C=1, classifier__penalty=none, classifier__solver=saga;, score=0.516 total time=   0.3s
[CV 4/5] END classifier__C=1, classifier__penalty=none, classifier__solver=saga;, score=0.565 total time=   0.4s
[CV 4/5] END classifier__C=5, classifier__penalty=l1, classifier__solver=newton-cg;, score=nan total time=   0.0s
[CV 1/5] END classifier__C=5, classifier__penalty=l1, classifier__solver=lbfgs;, score=nan total time=   0.0s
[CV 2/5] END classifier__C=5, classifier__penalty=l1, classifier__solver=lbfgs;, score=nan tota



[CV 4/5] END classifier__C=0.001, classifier__penalty=none, classifier__solver=newton-cg;, score=0.574 total time=   3.1s




[CV 2/5] END classifier__C=0.1, classifier__penalty=none, classifier__solver=newton-cg;, score=0.468 total time=   4.0s
[CV 1/5] END classifier__C=5, classifier__penalty=l1, classifier__solver=liblinear;, score=0.500 total time=   1.2s
[CV 2/5] END classifier__C=5, classifier__penalty=l1, classifier__solver=liblinear;, score=0.491 total time=   1.8s
[CV 4/5] END classifier__C=0.01, classifier__penalty=none, classifier__solver=newton-cg;, score=0.574 total time=   3.3s




[CV 3/5] END classifier__C=5, classifier__penalty=l1, classifier__solver=liblinear;, score=0.551 total time=   2.1s
[CV 1/5] END classifier__C=5, classifier__penalty=l1, classifier__solver=sag;, score=nan total time=   0.0s
[CV 2/5] END classifier__C=5, classifier__penalty=l1, classifier__solver=sag;, score=nan total time=   0.0s
[CV 3/5] END classifier__C=5, classifier__penalty=l1, classifier__solver=sag;, score=nan total time=   0.0s
[CV 4/5] END classifier__C=5, classifier__penalty=l1, classifier__solver=sag;, score=nan total time=   0.0s
[CV 5/5] END classifier__C=5, classifier__penalty=l1, classifier__solver=sag;, score=nan total time=   0.0s




[CV 1/5] END classifier__C=5, classifier__penalty=l1, classifier__solver=saga;, score=0.472 total time=   0.3s
[CV 4/5] END classifier__C=5, classifier__penalty=l1, classifier__solver=liblinear;, score=0.583 total time=   1.4s
[CV 3/5] END classifier__C=0.5, classifier__penalty=none, classifier__solver=newton-cg;, score=0.537 total time=   3.7s




[CV 1/5] END classifier__C=1, classifier__penalty=none, classifier__solver=newton-cg;, score=0.472 total time=   4.2s
[CV 2/5] END classifier__C=5, classifier__penalty=l1, classifier__solver=saga;, score=0.486 total time=   0.3s
[CV 3/5] END classifier__C=5, classifier__penalty=l1, classifier__solver=saga;, score=0.523 total time=   0.3s
[CV 5/5] END classifier__C=5, classifier__penalty=l1, classifier__solver=liblinear;, score=0.516 total time=   1.3s
[CV 4/5] END classifier__C=5, classifier__penalty=l1, classifier__solver=saga;, score=0.551 total time=   0.3s
[CV 1/5] END classifier__C=5, classifier__penalty=l2, classifier__solver=newton-cg;, score=0.468 total time=   0.2s
[CV 2/5] END classifier__C=5, classifier__penalty=l2, classifier__solver=newton-cg;, score=0.477 total time=   0.2s


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

[CV 5/5] END classifier__C=5, classifier__penalty=l1, classifier__solver=saga;, score=0.507 total time=   0.4s
[CV 3/5] END classifier__C=5, classifier__penalty=l2, classifier__solver=newton-cg;, score=0.532 total time=   0.1s
[CV 4/5] END classifier__C=5, classifier__penalty=l2, classifier__solver=newton-cg;, score=0.560 total time=   0.1s
[CV 1/5] END classifier__C=5, classifier__penalty=l2, classifier__solver=lbfgs;, score=0.468 total time=   0.0s
[CV 5/5] END classifier__C=5, classifier__penalty=l2, classifier__solver=newton-cg;, score=0.516 total time=   0.2s
[CV 2/5] END classifier__C=5, classifier__penalty=l2, classifier__solver=lbfgs;, score=0.481 total time=   0.1s
[CV 3/5] END classifier__C=5, classifier__penalty=l2, classifier__solver=lbfgs;, score=0.532 total time=   0.0s


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


[CV 4/5] END classifier__C=5, classifier__penalty=l2, classifier__solver=lbfgs;, score=0.569 total time=   0.1s
[CV 5/5] END classifier__C=5, classifier__penalty=l2, classifier__solver=lbfgs;, score=0.516 total time=   0.1s
[CV 1/5] END classifier__C=5, classifier__penalty=l2, classifier__solver=liblinear;, score=0.491 total time=   0.1s
[CV 2/5] END classifier__C=5, classifier__penalty=l2, classifier__solver=liblinear;, score=0.491 total time=   0.1s
[CV 3/5] END classifier__C=5, classifier__penalty=l2, classifier__solver=liblinear;, score=0.551 total time=   0.1s
[CV 4/5] END classifier__C=5, classifier__penalty=l2, classifier__solver=liblinear;, score=0.565 total time=   0.1s
[CV 5/5] END classifier__C=5, classifier__penalty=l2, classifier__solver=liblinear;, score=0.526 total time=   0.1s
[CV 1/5] END classifier__C=5, classifier__penalty=l2, classifier__solver=sag;, score=0.477 total time=   0.2s




[CV 2/5] END classifier__C=5, classifier__penalty=l2, classifier__solver=sag;, score=0.481 total time=   0.2s
[CV 3/5] END classifier__C=5, classifier__penalty=l2, classifier__solver=sag;, score=0.532 total time=   0.2s
[CV 5/5] END classifier__C=5, classifier__penalty=l2, classifier__solver=sag;, score=0.516 total time=   0.1s




[CV 4/5] END classifier__C=5, classifier__penalty=l2, classifier__solver=sag;, score=0.565 total time=   0.3s
[CV 1/5] END classifier__C=5, classifier__penalty=l2, classifier__solver=saga;, score=0.468 total time=   0.2s
[CV 2/5] END classifier__C=5, classifier__penalty=l2, classifier__solver=saga;, score=0.486 total time=   0.2s




In [18]:
models

{'LogisticRegression': Pipeline(steps=[('preprocessor',
                  ColumnTransformer(transformers=[('num',
                                                   Pipeline(steps=[('imputer',
                                                                    SimpleImputer()),
                                                                   ('scaler',
                                                                    StandardScaler())]),
                                                   ['AGE', 'CNS_MEDINC',
                                                    'CNSUS_PCTA', 'CNSUS_PCTB',
                                                    'CNSUS_PCTH', 'CNSUS_PCTI',
                                                    'CNSUS_PCTM', 'CNSUS_PCTO',
                                                    'CNSUS_PCTP', 'CNSUS_PCTW',
                                                    'COUNTY_TYPE', 'DON_POLIT',
                                                    'EDUCATION', 'ETHNIC_INFER',


In [19]:
import pandas as pd

def extract_feature_importance_from_lr_model(models, model_key, feature_names):
    """
    Extracts and sorts feature importances for a given trained Logistic Regression model.
    
    Parameters:
        models (dict): Dictionary containing the trained models.
        model_key (str): Key for the Logistic Regression model in the models dictionary.
        feature_names (list): List of feature names used in the model training.
        
    Returns:
        sorted_feature_importances (pd.Series): Sorted feature importances.
    """
    # Extract the Logistic Regression model
    lr_model = models[model_key].named_steps['classifier']
    
    # Check if the model has 'coef_' attribute
    if not hasattr(lr_model, 'coef_'):
        raise AttributeError("The model does not have coefficients. Make sure it's a fitted Logistic Regression model.")
    
    # Extract and sort the feature importances
    feature_importances = pd.Series(lr_model.coef_[0], index=feature_names)
    sorted_feature_importances = feature_importances.abs().sort_values(ascending=False)
    
    return sorted_feature_importances

# Feature names used for training the model. Replace with your actual list.
feature_names = columns_to_use

# Extract and print sorted feature importances
sorted_feature_importances = extract_feature_importance_from_lr_model(models, 'LogisticRegression', feature_names)
print(sorted_feature_importances)

ValueError: Length of values (86) does not match length of index (87)

In [1]:
def iterative_feature_removal(df, target_col, initial_model_type='RF', num_iterations=3, num_features_to_remove=1):
    all_features = df.columns.tolist()
    all_features.remove(target_col)
    
    global_best_score = 0
    global_best_features = []
    
    for iteration in range(num_iterations):
        print(f"Starting iteration {iteration + 1}")
        best_score = 0
        best_features = []
        remaining_features = all_features.copy()

        while len(remaining_features) > 0:
            models, evaluation_results = prepare_and_fit_model(
                df[remaining_features + [target_col]], 
                target_col, 
                model_type=initial_model_type, 
                feature_importance=True, 
                evaluate_models=True
            )
            
            # Get the trained model
            clf = models[initial_model_type]
            
            # Get feature importance
            importances = clf.named_steps['classifier'].feature_importances_
            
            # Evaluate model
            X_test = df[remaining_features]
            y_test = df[target_col]
            y_pred = clf.predict(X_test)
            score = accuracy_score(y_test, y_pred)
            print(f"Score: {score} in iteration {iteration + 1}")
            
            if score > best_score:
                best_score = score
                best_features = remaining_features.copy()
                
            # Sort features by importance
            feature_importance = dict(zip(remaining_features, importances))
            sorted_features = sorted(feature_importance.items(), key=lambda x: x[1])
            
            # Remove the least important features
            features_to_remove = [f[0] for f in sorted_features[:num_features_to_remove]]
            for feature in features_to_remove:
                remaining_features.remove(feature)
            
            print(f"Removed {features_to_remove}, new best score: {best_score}, remaining features: {len(remaining_features)}")

        if best_score > global_best_score:
            global_best_score = best_score
            global_best_features = best_features

        # Reset all_features to the best from this iteration for the next cycle
        all_features = best_features
        

    return global_best_features

best_features = iterative_feature_removal(df_survey_processed, 'Q1_Candidate', initial_model_type='RF', num_iterations=10, num_features_to_remove=100)

NameError: name 'df_survey_processed' is not defined