In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score, classification_report

## Predicting who is planning on voting Republican vs Democrat... 
### with all features:

In [33]:

# Load the dataset again
data = pd.read_csv('data/surveydata.csv')


In [19]:


# Filter out potential data leakage columns
filtered_data = data.drop(columns=['Q2_Support', 'Q3_Party', 'Q4_LikelyVoter', 'Q5_TrumpSupport', 'SURVEY_TYPE', 'RECORD_ID'], errors='ignore')

# Create the binary target variable
filtered_data['target'] = filtered_data['Q1_Candidate'].apply(lambda x: 0 if x in ['President Joe Biden','Marianne Williamson','Robert F. Kennedy Jr.'] else 1)

# Split the data into training (80%) and testing (20%) sets
X = filtered_data.drop(columns=['target', 'Q1_Candidate'])  # Features (excluding 'target' and 'Q1_Candidate')
y = filtered_data['target']  # Target

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Identify and convert categorical columns to string type:
# This line uses a list comprehension to identify columns in the X_train dataset where the data type is object, which typically 
# means the column contains string or categorical data. The resulting list of column names is stored in categorical_cols.
categorical_cols = [col for col in X_train.columns if X_train[col].dtype == 'object']

# These lines ensure that all values in the identified categorical columns are of string type. 
# This is done to ensure consistency when applying transformations like encoding.
X_train[categorical_cols] = X_train[categorical_cols].astype(str)
X_test[categorical_cols] = X_test[categorical_cols].astype(str)

# Preprocessing:
# These preprocessing steps ensure that the data is clean, consistent, and in the right format for the Gradient Boosting Machine (GBM) or any other machine learning model.

# Here, we define a Pipeline for preprocessing categorical data:
categorical_transformer = Pipeline(steps=[
    
# The 'imputer' step uses the SimpleImputer to fill any missing values in the categorical columns. The strategy 'most_frequent' 
# fills missing values with the most frequent value in the column.
    ('imputer', SimpleImputer(strategy='most_frequent')),
    
# The 'encoder' step applies OneHotEncoder to convert categorical values into a format suitable for modeling. It creates binary columns 
# for each category and indicates the presence of the category with a 1 or 0. The handle_unknown='ignore' parameter ensures that if the 
# model encounters an unknown category in the test set (i.e., a category not seen during training), it will ignore it.
    
    ('encoder', OneHotEncoder(handle_unknown='ignore'))
])

# Define the numerical transformer:
#This step uses the SimpleImputer to fill any missing values in the numerical columns. The strategy 'mean' fills missing values with the mean of the column.
numerical_transformer = SimpleImputer(strategy='mean')

# Combining the Transformers:
# The ColumnTransformer allows different columns or column subsets of the input data to be transformed separately. Here, we apply the 
# numerical_transformer to the numerical columns and the categorical_transformer to the categorical columns. This way, we can preprocess 
# numerical and categorical data differently within the same transformer.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, X_train.select_dtypes(exclude=['object']).columns),
        ('cat', categorical_transformer, categorical_cols)
    ])

# Applying the Preprocessor:
# The fit_transform method is used on the training set to learn any parameters (like mean values for imputation or categories for one-hot encoding)
# and apply the transformations. The transform method is then used on the test set to apply the same transformations using the parameters learned from the training set.
X_train_preprocessed = preprocessor.fit_transform(X_train)
X_test_preprocessed = preprocessor.transform(X_test)

# GBM Classifier
gbm_classifier = GradientBoostingClassifier(random_state=42)
gbm_classifier.fit(X_train_preprocessed, y_train)

# Predictions and Evaluation
y_pred = gbm_classifier.predict(X_test_preprocessed)
accuracy_after_filtering = accuracy_score(y_test, y_pred)
classification_rep_after_filtering = classification_report(y_test, y_pred)

print(accuracy_after_filtering)
print(classification_rep_after_filtering)

0.7724358974358975
              precision    recall  f1-score   support

           0       0.73      0.60      0.66       114
           1       0.79      0.87      0.83       198

    accuracy                           0.77       312
   macro avg       0.76      0.74      0.74       312
weighted avg       0.77      0.77      0.77       312



In [20]:
# Get feature importances from the GBM model
feature_importances = gbm_classifier.feature_importances_

# Get the one-hot encoded feature names from the preprocessor
ohe_feature_names = preprocessor.named_transformers_['cat']['encoder'].get_feature_names_out(categorical_cols)

# Combine the original numerical column names with the one-hot encoded feature names
all_feature_names = list(X_train.select_dtypes(exclude=['object']).columns) + list(ohe_feature_names)

# Create a DataFrame to map feature names to their importances
feature_importance_df = pd.DataFrame({
    'Feature': all_feature_names,
    'Importance': feature_importances
})

# Sort the DataFrame by importance and get the top 20 features
top_features = feature_importance_df.sort_values(by='Importance', ascending=False).head(20)

top_features

Unnamed: 0,Feature,Importance
1691,PRFL_IMMIGRATION_REFORM_,0.130365
1692,PRFL_IMMIGRATION_REFORM_Y,0.054376
13,PRFL_CHOICELIFE,0.050129
1711,PRFL_POLITICAL_IDEOLOGY_C,0.049499
1700,PRFL_LIBERAL_NEWS_nan,0.028351
1678,PRFL_CONSERVATIVE_NEWS_nan,0.027855
2931,VTR_PPP20_,0.027759
29,VP_GEN,0.023986
1641,PARTY_CODE_R,0.023372
2,CENSUS_TRK,0.02198


### with selected features (columns that begin with "PRF", "DON", "VTR", and "TOD"):

In [21]:

# Select columns based on the specified prefixes
selected_columns = [col for col in data.columns if col.startswith(("PRF", "DON", "VTR", "TOD",'Q1_'))]

# Extract the selected columns along with the 'target' column
filtered_data = data[selected_columns]

# Create the binary target variable
filtered_data['target'] = filtered_data['Q1_Candidate'].apply(lambda x: 0 if x in ['President Joe Biden','Marianne Williamson','Robert F. Kennedy Jr.'] else 1)

# Split the data into training (80%) and testing (20%) sets
X = filtered_data.drop(columns=['target', 'Q1_Candidate'])  # Features (excluding 'target' and 'Q1_Candidate')
y = filtered_data['target']                                # Target

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Identify and convert categorical columns to string type
categorical_cols = [col for col in X_train.columns if X_train[col].dtype == 'object']
X_train[categorical_cols] = X_train[categorical_cols].astype(str)
X_test[categorical_cols] = X_test[categorical_cols].astype(str)

# Preprocessing
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OneHotEncoder(handle_unknown='ignore'))
])
numerical_transformer = SimpleImputer(strategy='mean')
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, X_train.select_dtypes(exclude=['object']).columns),
        ('cat', categorical_transformer, categorical_cols)
    ])
X_train_preprocessed = preprocessor.fit_transform(X_train)
X_test_preprocessed = preprocessor.transform(X_test)

# GBM Classifier
gbm_classifier = GradientBoostingClassifier(random_state=42)
gbm_classifier.fit(X_train_preprocessed, y_train)

# Predictions and Evaluation
y_pred = gbm_classifier.predict(X_test_preprocessed)
accuracy_after_filtering = accuracy_score(y_test, y_pred)
classification_rep_after_filtering = classification_report(y_test, y_pred)

print(accuracy_after_filtering)
print(classification_rep_after_filtering)





A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_data['target'] = filtered_data['Q1_Candidate'].apply(lambda x: 0 if x in ['President Joe Biden','Marianne Williamson','Robert F. Kennedy Jr.'] else 1)


0.7724358974358975
              precision    recall  f1-score   support

           0       0.75      0.57      0.65       114
           1       0.78      0.89      0.83       198

    accuracy                           0.77       312
   macro avg       0.76      0.73      0.74       312
weighted avg       0.77      0.77      0.76       312



In [22]:
# Get feature importances from the GBM model
feature_importances = gbm_classifier.feature_importances_

# Get the one-hot encoded feature names from the preprocessor
ohe_feature_names = preprocessor.named_transformers_['cat']['encoder'].get_feature_names_out(categorical_cols)

# Combine the original numerical column names with the one-hot encoded feature names
all_feature_names = list(X_train.select_dtypes(exclude=['object']).columns) + list(ohe_feature_names)

# Create a DataFrame to map feature names to their importances
feature_importance_df = pd.DataFrame({
    'Feature': all_feature_names,
    'Importance': feature_importances
})

# Sort the DataFrame by importance and get the top 20 features
top_features = feature_importance_df.sort_values(by='Importance', ascending=False).head(20)

top_features

Unnamed: 0,Feature,Importance
75,PRFL_IMMIGRATION_REFORM_Y,0.119809
74,PRFL_IMMIGRATION_REFORM_,0.085944
94,PRFL_POLITICAL_IDEOLOGY_C,0.065242
0,PRFL_CHOICELIFE,0.052726
826,VTR_PPP20_,0.03216
83,PRFL_LIBERAL_NEWS_nan,0.031418
9,TOD_PRES_O_2016_PREC,0.030075
830,VTR_PPP20_M,0.024635
690,VTR_GEN22_,0.021873
987,VTR_PRI22_,0.021587


### with only voter data (columns that begin with "VTR", and "TOD"):

In [27]:

# Select columns based on the specified prefixes
selected_columns = [col for col in data.columns if col.startswith(("VTR", "TOD",'Q1_'))]

# Extract the selected columns along with the 'target' column
filtered_data = data[selected_columns]

# Create the binary target variable
filtered_data['target'] = filtered_data['Q1_Candidate'].apply(lambda x: 0 if x in ['President Joe Biden','Marianne Williamson','Robert F. Kennedy Jr.'] else 1)

# Split the data into training (80%) and testing (20%) sets
X = filtered_data.drop(columns=['target', 'Q1_Candidate'])  # Features (excluding 'target' and 'Q1_Candidate')
y = filtered_data['target']                                # Target

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Identify and convert categorical columns to string type
categorical_cols = [col for col in X_train.columns if X_train[col].dtype == 'object']
X_train[categorical_cols] = X_train[categorical_cols].astype(str)
X_test[categorical_cols] = X_test[categorical_cols].astype(str)

# Preprocessing
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OneHotEncoder(handle_unknown='ignore'))
])
numerical_transformer = SimpleImputer(strategy='mean')
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, X_train.select_dtypes(exclude=['object']).columns),
        ('cat', categorical_transformer, categorical_cols)
    ])
X_train_preprocessed = preprocessor.fit_transform(X_train)
X_test_preprocessed = preprocessor.transform(X_test)

# GBM Classifier
gbm_classifier = GradientBoostingClassifier(random_state=42)
gbm_classifier.fit(X_train_preprocessed, y_train)

# Predictions and Evaluation
y_pred = gbm_classifier.predict(X_test_preprocessed)
accuracy_after_filtering = accuracy_score(y_test, y_pred)
classification_rep_after_filtering = classification_report(y_test, y_pred)

print(accuracy_after_filtering)
print(classification_rep_after_filtering)



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_data['target'] = filtered_data['Q1_Candidate'].apply(lambda x: 0 if x in ['President Joe Biden','Marianne Williamson','Robert F. Kennedy Jr.'] else 1)


0.7307692307692307
              precision    recall  f1-score   support

           0       0.72      0.43      0.54       114
           1       0.73      0.90      0.81       198

    accuracy                           0.73       312
   macro avg       0.73      0.67      0.67       312
weighted avg       0.73      0.73      0.71       312



In [28]:
# Get feature importances from the GBM model
feature_importances = gbm_classifier.feature_importances_

# Get the one-hot encoded feature names from the preprocessor
ohe_feature_names = preprocessor.named_transformers_['cat']['encoder'].get_feature_names_out(categorical_cols)

# Combine the original numerical column names with the one-hot encoded feature names
all_feature_names = list(X_train.select_dtypes(exclude=['object']).columns) + list(ohe_feature_names)

# Create a DataFrame to map feature names to their importances
feature_importance_df = pd.DataFrame({
    'Feature': all_feature_names,
    'Importance': feature_importances
})

# Sort the DataFrame by importance and get the top 20 features
top_features = feature_importance_df.sort_values(by='Importance', ascending=False).head(20)

top_features

Unnamed: 0,Feature,Importance
728,VTR_PPP20_M,0.067716
7,TOD_PRES_R_2016_PREC,0.059066
2,TOD_PRES_D_2020_PREC,0.047259
720,VTR_PPP16_R,0.040123
891,VTR_PRI22_R,0.03435
1,TOD_PRES_D_2016_PREC,0.034179
726,VTR_PPP20_D,0.03408
576,VTR_GEN20_A,0.032336
733,VTR_PPP20_Z,0.029489
6,TOD_PRES_R_2016,0.027735


## Predicting which Republican supporters support Trump vs Other Republican... 
### with all features:

In [23]:

# Filter out respondents who said they would vote for ‘President Joe Biden’, ‘Robert F. Kennedy Jr.’, or ‘Marianne Williamson’
filtered_data = data[~data['Q1_Candidate'].isin(['President Joe Biden', 'Robert F. Kennedy Jr.', 'Marianne Williamson'])]

# Filter out potential data leakage columns
filtered_data = filtered_data.drop(columns=['Q2_Support', 'Q3_Party', 'Q4_LikelyVoter', 'Q5_TrumpSupport', 'SURVEY_TYPE', 'RECORD_ID'], errors='ignore')

# Create the binary target variable
filtered_data['target'] = filtered_data['Q1_Candidate'].apply(lambda x: 0 if x == 'President Donald Trump' else 1)

# Split the data into training (80%) and testing (20%) sets
X = filtered_data.drop(columns=['target', 'Q1_Candidate'])  # Features (excluding 'target' and 'Q1_Candidate')
y = filtered_data['target']                                # Target

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Identify and convert categorical columns to string type
categorical_cols = [col for col in X_train.columns if X_train[col].dtype == 'object']
X_train[categorical_cols] = X_train[categorical_cols].astype(str)
X_test[categorical_cols] = X_test[categorical_cols].astype(str)

# Preprocessing
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OneHotEncoder(handle_unknown='ignore'))
])
numerical_transformer = SimpleImputer(strategy='mean')
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, X_train.select_dtypes(exclude=['object']).columns),
        ('cat', categorical_transformer, categorical_cols)
    ])
X_train_preprocessed = preprocessor.fit_transform(X_train)
X_test_preprocessed = preprocessor.transform(X_test)

# GBM Classifier
gbm_classifier = GradientBoostingClassifier(random_state=42)
gbm_classifier.fit(X_train_preprocessed, y_train)

# Predictions and Evaluation
y_pred = gbm_classifier.predict(X_test_preprocessed)
accuracy_after_filtering = accuracy_score(y_test, y_pred)
classification_rep_after_filtering = classification_report(y_test, y_pred)

print(accuracy_after_filtering) 
print(classification_rep_after_filtering)

0.555
              precision    recall  f1-score   support

           0       0.57      0.54      0.56       103
           1       0.54      0.57      0.55        97

    accuracy                           0.56       200
   macro avg       0.56      0.56      0.55       200
weighted avg       0.56      0.56      0.56       200



In [24]:
# Get feature importances from the GBM model
feature_importances = gbm_classifier.feature_importances_

# Get the one-hot encoded feature names from the preprocessor
ohe_feature_names = preprocessor.named_transformers_['cat']['encoder'].get_feature_names_out(categorical_cols)

# Combine the original numerical column names with the one-hot encoded feature names
all_feature_names = list(X_train.select_dtypes(exclude=['object']).columns) + list(ohe_feature_names)

# Create a DataFrame to map feature names to their importances
feature_importance_df = pd.DataFrame({
    'Feature': all_feature_names,
    'Importance': feature_importances
})

# Sort the DataFrame by importance and get the top 20 features
top_features = feature_importance_df.sort_values(by='Importance', ascending=False).head(20)

top_features

Unnamed: 0,Feature,Importance
2,CENSUS_TRK,0.039683
23,TOD_PRES_O_2020_PREC,0.036358
22,TOD_PRES_O_2016_PREC,0.030787
1124,HH_SIZE_6,0.026457
25,TOD_PRES_R_2016_PREC,0.025009
1407,PRFL_LIBERAL_NEWS_Y,0.024191
11,COUNTY_ST,0.023232
2476,VTR_OTH13_,0.021771
7,CNSUS_PCTM,0.020247
2626,VTR_PRI13_,0.017542


### with selected features (columns that begin with "PRF", "DON", "VTR", and "TOD"):

In [25]:

# Load the dataset again
data = pd.read_csv('data/surveydata.csv')

# Select columns based on the specified prefixes
selected_columns = [col for col in data.columns if col.startswith(("PRF", "DON", "VTR", "TOD",'Q1_'))]

# Extract the selected columns along with the 'target' column
data = data[selected_columns]
data

# Filter out respondents who said they would vote for ‘President Joe Biden’, ‘Robert F. Kennedy Jr.’, or ‘Marianne Williamson’
filtered_data = data[~data['Q1_Candidate'].isin(['President Joe Biden', 'Robert F. Kennedy Jr.', 'Marianne Williamson'])]

# Create the binary target variable
filtered_data['target'] = filtered_data['Q1_Candidate'].apply(lambda x: 0 if x == 'President Donald Trump' else 1)

# Split the data into training (80%) and testing (20%) sets
X = filtered_data.drop(columns=['target', 'Q1_Candidate'])  # Features (excluding 'target' and 'Q1_Candidate')
y = filtered_data['target']                                # Target

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Identify categorical columns
categorical_cols = [col for col in X_train.columns if X_train[col].dtype == 'object']

# Convert all categorical columns to string type
X_train[categorical_cols] = X_train[categorical_cols].astype(str)
X_test[categorical_cols] = X_test[categorical_cols].astype(str)

# Create transformers
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OneHotEncoder(handle_unknown='ignore'))
])
numerical_transformer = SimpleImputer(strategy='mean')
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, X_train.select_dtypes(exclude=['object']).columns),
        ('cat', categorical_transformer, categorical_cols)
    ])

# Preprocess training and testing data
X_train_preprocessed = preprocessor.fit_transform(X_train)
X_test_preprocessed = preprocessor.transform(X_test)

# Initialize and train the GBM classifier
gbm_classifier = GradientBoostingClassifier(random_state=42)
gbm_classifier.fit(X_train_preprocessed, y_train)

# Predict on the test set
y_pred = gbm_classifier.predict(X_test_preprocessed)

# Evaluate the model's performance
accuracy_all_columns = accuracy_score(y_test, y_pred)
classification_rep_all_columns = classification_report(y_test, y_pred)

print("""Predicting Trump vs other Repub candidates with "PRF", "DON", "VTR", and "TOD" columns""")
print(accuracy_all_columns) 
print(classification_rep_all_columns)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_data['target'] = filtered_data['Q1_Candidate'].apply(lambda x: 0 if x == 'President Donald Trump' else 1)


Predicting Trump vs other Repub candidates with "PRF", "DON", "VTR", and "TOD" columns
0.59
              precision    recall  f1-score   support

           0       0.61      0.55      0.58       103
           1       0.57      0.63      0.60        97

    accuracy                           0.59       200
   macro avg       0.59      0.59      0.59       200
weighted avg       0.59      0.59      0.59       200



In [26]:
# Get feature importances from the GBM model
feature_importances = gbm_classifier.feature_importances_

# Get the one-hot encoded feature names from the preprocessor
ohe_feature_names = preprocessor.named_transformers_['cat']['encoder'].get_feature_names_out(categorical_cols)

# Combine the original numerical column names with the one-hot encoded feature names
all_feature_names = list(X_train.select_dtypes(exclude=['object']).columns) + list(ohe_feature_names)

# Create a DataFrame to map feature names to their importances
feature_importance_df = pd.DataFrame({
    'Feature': all_feature_names,
    'Importance': feature_importances
})

# Sort the DataFrame by importance and get the top 20 features
top_features = feature_importance_df.sort_values(by='Importance', ascending=False).head(20)

top_features

Unnamed: 0,Feature,Importance
9,TOD_PRES_O_2016_PREC,0.073974
10,TOD_PRES_O_2020_PREC,0.065441
12,TOD_PRES_R_2016_PREC,0.040869
11,TOD_PRES_R_2016,0.036261
700,VTR_OTH13_,0.030118
850,VTR_PRI13_,0.027019
81,PRFL_LIBERAL_NEWS_Y,0.024982
427,TOD_PRES_DIFF_2020_PREC_12R,0.018054
6,TOD_PRES_D_2016_PREC,0.01772
60,PRFL_CONSERVATIVE_NEWS_Y,0.016548


### with only voter data (columns that begin with "VTR", and "TOD"):

In [34]:
# Select columns based on the specified prefixes
selected_columns = [col for col in data.columns if col.startswith(("VTR", "TOD",'Q1_'))]

# Extract the selected columns along with the 'target' column
filtered_data = data[selected_columns]

# Filter out respondents who said they would vote for ‘President Joe Biden’, ‘Robert F. Kennedy Jr.’, or ‘Marianne Williamson’
filtered_data = filtered_data[~filtered_data['Q1_Candidate'].isin(['President Joe Biden', 'Robert F. Kennedy Jr.', 'Marianne Williamson'])]

# Create the binary target variable
filtered_data['target'] = filtered_data['Q1_Candidate'].apply(lambda x: 0 if x == 'President Donald Trump' else 1)

# Split the data into training (80%) and testing (20%) sets
X = filtered_data.drop(columns=['target', 'Q1_Candidate'])  # Features (excluding 'target' and 'Q1_Candidate')
y = filtered_data['target']                                # Target

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Identify and convert categorical columns to string type
categorical_cols = [col for col in X_train.columns if X_train[col].dtype == 'object']
X_train[categorical_cols] = X_train[categorical_cols].astype(str)
X_test[categorical_cols] = X_test[categorical_cols].astype(str)

# Preprocessing
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OneHotEncoder(handle_unknown='ignore'))
])
numerical_transformer = SimpleImputer(strategy='mean')
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, X_train.select_dtypes(exclude=['object']).columns),
        ('cat', categorical_transformer, categorical_cols)
    ])
X_train_preprocessed = preprocessor.fit_transform(X_train)
X_test_preprocessed = preprocessor.transform(X_test)

# GBM Classifier
gbm_classifier = GradientBoostingClassifier(random_state=42)
gbm_classifier.fit(X_train_preprocessed, y_train)

# Predictions and Evaluation
y_pred = gbm_classifier.predict(X_test_preprocessed)
accuracy_after_filtering = accuracy_score(y_test, y_pred)
classification_rep_after_filtering = classification_report(y_test, y_pred)

print(accuracy_after_filtering)
print(classification_rep_after_filtering)


0.59
              precision    recall  f1-score   support

           0       0.61      0.58      0.59       103
           1       0.57      0.60      0.59        97

    accuracy                           0.59       200
   macro avg       0.59      0.59      0.59       200
weighted avg       0.59      0.59      0.59       200



In [35]:
# Get feature importances from the GBM model
feature_importances = gbm_classifier.feature_importances_

# Get the one-hot encoded feature names from the preprocessor
ohe_feature_names = preprocessor.named_transformers_['cat']['encoder'].get_feature_names_out(categorical_cols)

# Combine the original numerical column names with the one-hot encoded feature names
all_feature_names = list(X_train.select_dtypes(exclude=['object']).columns) + list(ohe_feature_names)

# Create a DataFrame to map feature names to their importances
feature_importance_df = pd.DataFrame({
    'Feature': all_feature_names,
    'Importance': feature_importances
})

# Sort the DataFrame by importance and get the top 20 features
top_features = feature_importance_df.sort_values(by='Importance', ascending=False).head(20)

top_features

Unnamed: 0,Feature,Importance
4,TOD_PRES_O_2016_PREC,0.074298
5,TOD_PRES_O_2020_PREC,0.05461
6,TOD_PRES_R_2016,0.053594
7,TOD_PRES_R_2016_PREC,0.047318
600,VTR_OTH13_,0.035806
1,TOD_PRES_D_2016_PREC,0.033817
3,TOD_PRES_O_2016,0.024384
750,VTR_PRI13_,0.023137
8,TOD_PRES_R_2020_PREC,0.022452
527,VTR_GEN18_E,0.019196
