In [14]:
import pandas as pd

In [3]:

# Load the survey data; the relative path is resolved against the kernel's working directory.
df = pd.read_csv("vote_gun_demo_opinions.csv")


In [4]:
# Preview the first rows to sanity-check the columns that were loaded.
df.head()

Unnamed: 0,16Vote,16StrongVote,16Closer,16VoteSum,16GunHarder,16GunImportance,16GunHowMany,20Vote,20StrongVote,20Closer,...,20Conservatives,20Gay,20Congress,20Muslims,20Jews,20Christ,20Police,20Transgender,20Scientist,20Blm
0,Republican,Strong,Inapplicable,Strong Republican,Same as now,Important,10.0,Republican,Strong,Inapplicable,...,85.0,60.0,50.0,30.0,70.0,100.0,100.0,50.0,100.0,15.0
1,Republican,Weak,Inapplicable,Not very strong Republican,Same as now,Most important,0.0,Independent,No answer,Neither,...,100.0,50.0,50.0,50.0,50.0,50.0,70.0,50.0,100.0,0.0
2,No answer,No answer,Republican,Independent-Republican,Same as now,Most important,,Other,No answer,Republican,...,50.0,0.0,60.0,50.0,50.0,90.0,80.0,0.0,50.0,0.0
3,Independent,No answer,Democrat,Independent-Democrat,More strict,Most important,0.0,Independent,No answer,Neither,...,50.0,100.0,60.0,100.0,100.0,50.0,60.0,100.0,85.0,85.0
4,Democrat,Strong,Inapplicable,Strong Democrat,More strict,Most important,0.0,Democrat,Strong,Inapplicable,...,50.0,85.0,70.0,85.0,85.0,85.0,70.0,50.0,15.0,100.0


## 2016 vote + 2020 variables (predicting the 2020 vote)

In [5]:
import numpy as np
import pandas as pd

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.preprocessing import StandardScaler

# ========================
# 1) Read the CSV file
# (Assumes an earlier cell has already loaded the survey data into `df`)
# ========================

# --------------------------------------------------------------
# 2) Prepare columns: the 2020 numeric features + 16VoteSum
# --------------------------------------------------------------
twenty_features = [
    '20HandleHealth', '20GunHowMany', '20HandleImmig', '20SocMed', '20Age',
    '20Fundamentalist', '20Feminist', '20Liberal', '20Union', '20BigBusiness',
    '20Conservatives', '20Gay', '20Congress', '20Muslims', '20Jews', '20Christ',
    '20Police', '20Transgender', '20Scientist', '20Blm'
]

# 16VoteSum is included as a predictor alongside the 2020 features
predictors = twenty_features + ["16VoteSum"]

# --------------------------------------------------------------
# 3) Ensure 16VoteSum and 20VoteSum are numeric (7-point scale, 1..7)
#    Labels that are not in the mapping become NaN and are dropped in step 4.
# --------------------------------------------------------------
mapping_16 = {
    "Strong Democrat": 1,
    "Not very strong Democrat": 2,
    "Independent-Democrat": 3,
    "Independent": 4,
    "Independent-Republican": 5,
    "Not very strong Republican": 6,
    "Strong Republican": 7
}

# Map only while a column still holds the text labels. Mapping an already
# numeric column would turn every value into NaN, so re-running this cell
# would otherwise leave no rows to train on.
for vote_col in ("16VoteSum", "20VoteSum"):
    if not pd.api.types.is_numeric_dtype(df[vote_col]):
        df[vote_col] = df[vote_col].map(mapping_16)

# --------------------------------------------------------------
# 4) Drop rows where any needed column is missing
# --------------------------------------------------------------
df_clean = df.dropna(subset=predictors + ["20VoteSum"]).copy()

# --------------------------------------------------------------
# 5) Create a new binary target: 0=Dem, 1=Rep
#    Respondents coded as 4 (Independent) in 2020 are excluded.
# --------------------------------------------------------------
# Keep only rows with 20VoteSum <= 3 or >= 5.
# .copy() makes the filtered frame independent, so adding the target column
# below does not write into a slice of the previous frame.
df_clean = df_clean[(df_clean["20VoteSum"] <= 3) | (df_clean["20VoteSum"] >= 5)].copy()

# Map to 0=Dem (1,2,3), 1=Rep (5,6,7)
df_clean["20VoteBinary"] = np.where(df_clean["20VoteSum"] <= 3, 0, 1)

# --------------------------------------------------------------
# 6) Define X (predictors) and y (binary target)
# --------------------------------------------------------------
X = df_clean[predictors]
y = df_clean["20VoteBinary"]

# --------------------------------------------------------------
# 7) Train/Test Split (80/20, stratified on the target)
# --------------------------------------------------------------
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    random_state=42,
    stratify=y  # helps preserve class proportions
)

# --------------------------------------------------------------
# 8) Scale numeric features to help with convergence.
#    The scaler is fitted on the training split only and then applied
#    to the test split.
# --------------------------------------------------------------
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled  = scaler.transform(X_test)

# --------------------------------------------------------------
# 9) Logistic Regression (binary)
# --------------------------------------------------------------
model = LogisticRegression(
    solver='lbfgs',
    penalty='l2',
    C=1.0,
    max_iter=500,
    random_state=42
)

model.fit(X_train_scaled, y_train)

# --------------------------------------------------------------
# 10) Evaluate Predictions on the held-out test split
# --------------------------------------------------------------
y_pred = model.predict(X_test_scaled)

print("=== Confusion Matrix ===")
print(confusion_matrix(y_test, y_pred))

print("\n=== Classification Report ===")
print(classification_report(y_test, y_pred, target_names=["Democrat","Republican"]))

# --------------------------------------------------------------
# 11) Examine Model Coefficients
#     A binary model has a single row of coefficients. They were fitted on
#     the standardized features, so each value is per one standard deviation
#     of the corresponding predictor.
# --------------------------------------------------------------
coefs = model.coef_[0]
intercept = model.intercept_[0]

print("\n=== Logistic Regression Coefficients (Binary) ===")
print(f"Intercept: {intercept:.4f}")
for feature_name, coef_val in zip(predictors, coefs):
    print(f"{feature_name}: {coef_val:.4f}")

=== Confusion Matrix ===
[[210  11]
 [ 14 168]]

=== Classification Report ===
              precision    recall  f1-score   support

    Democrat       0.94      0.95      0.94       221
  Republican       0.94      0.92      0.93       182

    accuracy                           0.94       403
   macro avg       0.94      0.94      0.94       403
weighted avg       0.94      0.94      0.94       403


=== Logistic Regression Coefficients (Binary) ===
Intercept: -0.2111
20HandleHealth: -1.0647
20GunHowMany: 0.0212
20HandleImmig: -0.5463
20SocMed: -0.3382
20Age: 0.0314
20Fundamentalist: 0.0024
20Feminist: -0.3086
20Liberal: -0.3714
20Union: -0.1734
20BigBusiness: -0.0633
20Conservatives: 1.1138
20Gay: 0.0174
20Congress: -0.0386
20Muslims: -0.1852
20Jews: -0.1291
20Christ: -0.1374
20Police: 0.2442
20Transgender: -0.1076
20Scientist: -0.1913
20Blm: -0.4207
16VoteSum: 2.1387


## 2020 variables and their change since 2016

In [17]:
import pandas as pd
df = pd.read_csv("vote_gun_demo_opinions.csv")

# 2020 survey columns that are differenced against their 2016 counterparts
cols_2020 = [
    "20HandleHealth", "20GunHowMany", "20HandleImmig", "20SocMed", "20Age",
    "20Fundamentalist", "20Feminist", "20Liberal", "20Union", "20BigBusiness",
    "20Conservatives", "20Gay", "20Congress", "20Muslims", "20Jews",
    "20Christ", "20Police", "20Transgender", "20Scientist", "20Blm"
]

# Pair every 2020 column with its 2016 counterpart by swapping the leading
# year prefix, e.g. "20HandleHealth" -> "16HandleHealth".
cols_2016 = [col_20.replace("20", "16", 1) for col_20 in cols_2020]

# Check that every column exists before adding anything, so a missing column
# is reported by name and df is never left with only some diff_* columns.
# (Non-numeric columns are not checked here; they fail in the subtraction.)
missing_columns = [col for col in cols_2020 + cols_2016 if col not in df.columns]
if missing_columns:
    raise KeyError(f"Columns missing from the CSV: {missing_columns}")

difference_columns = []

for col_20, col_16 in zip(cols_2020, cols_2016):
    # Name the new column after the variable without its year prefix,
    # e.g. "20HandleHealth" -> "diff_HandleHealth".
    diff_col = "diff_" + col_20[2:]

    # Change between the two waves: 2020 value minus 2016 value
    df[diff_col] = df[col_20] - df[col_16]

    # Keep track of the new column name
    difference_columns.append(diff_col)

# 'difference_columns' now lists every diff_* column that was added.
print("Difference columns created:", difference_columns)

Difference columns created: ['diff_HandleHealth', 'diff_GunHowMany', 'diff_HandleImmig', 'diff_SocMed', 'diff_Age', 'diff_Fundamentalist', 'diff_Feminist', 'diff_Liberal', 'diff_Union', 'diff_BigBusiness', 'diff_Conservatives', 'diff_Gay', 'diff_Congress', 'diff_Muslims', 'diff_Jews', 'diff_Christ', 'diff_Police', 'diff_Transgender', 'diff_Scientist', 'diff_Blm']


In [141]:
import pandas as pd
# NOTE(review): this cell repeats the difference-column computation of the
# cell directly above it; it reloads the CSV and rebuilds the same columns.
df = pd.read_csv("vote_gun_demo_opinions.csv")

# 2020 survey columns, one name per whitespace-separated token
cols_2020 = (
    "20HandleHealth 20GunHowMany 20HandleImmig 20SocMed 20Age "
    "20Fundamentalist 20Feminist 20Liberal 20Union 20BigBusiness "
    "20Conservatives 20Gay 20Congress 20Muslims 20Jews "
    "20Christ 20Police 20Transgender 20Scientist 20Blm"
).split()

difference_columns = []

for col_20 in cols_2020:
    # Variable name without the two-character year prefix, e.g. "HandleHealth"
    suffix = col_20[2:]

    # Matching 2016 column and the name of the new difference column
    col_16, diff_col = "16" + suffix, "diff_" + suffix

    # 2020 value minus 2016 value, stored as a new column on df
    df[diff_col] = df[col_20].sub(df[col_16])
    difference_columns.append(diff_col)

# Report the names of all difference columns that were added
print("Difference columns created:", difference_columns)

Difference columns created: ['diff_HandleHealth', 'diff_GunHowMany', 'diff_HandleImmig', 'diff_SocMed', 'diff_Age', 'diff_Fundamentalist', 'diff_Feminist', 'diff_Liberal', 'diff_Union', 'diff_BigBusiness', 'diff_Conservatives', 'diff_Gay', 'diff_Congress', 'diff_Muslims', 'diff_Jews', 'diff_Christ', 'diff_Police', 'diff_Transgender', 'diff_Scientist', 'diff_Blm']


In [24]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import StandardScaler


df = pd.read_csv("vote_gun_demo_opinions.csv")

# =======================================================
# 0) Build "diff_<name>" columns: 2020 value minus 2016 value
# =======================================================
cols_2020 = [
    "20HandleHealth", "20GunHowMany", "20HandleImmig", "20SocMed", "20Age",
    "20Fundamentalist", "20Feminist", "20Liberal", "20Union", "20BigBusiness",
    "20Conservatives", "20Gay", "20Congress", "20Muslims", "20Jews",
    "20Christ", "20Police", "20Transgender", "20Scientist", "20Blm"
]

difference_columns = []

for col_20 in cols_2020:
    # Matching 2016 column, e.g. "20HandleHealth" -> "16HandleHealth"
    col_16 = col_20.replace("20", "16", 1)

    # New column name, e.g. "20HandleHealth" -> "diff_HandleHealth"
    diff_col = "diff_" + col_20[2:]

    df[diff_col] = df[col_20] - df[col_16]
    difference_columns.append(diff_col)

# Feature groups compared by the scenarios below. The 2016 and difference
# lists are derived from the 2020 list so the three always cover the same
# variables in the same order.
Features_20 = [
    '20Gay', '20Congress',
    '20Muslims', '20Jews', '20Christ', '20Police', '20Transgender',
    '20Scientist', '20Blm'
]
Features_16 = ["16" + col[2:] for col in Features_20]
Differences = ["diff_" + col[2:] for col in Features_20]

# =======================================================
# 1) Map the 7-point party identification labels to 1..7.
#    Labels that are not in the mapping become NaN.
# =======================================================
mapping_7pt = {
    "Strong Democrat": 1,
    "Not very strong Democrat": 2,
    "Independent-Democrat": 3,
    "Independent": 4,
    "Independent-Republican": 5,
    "Not very strong Republican": 6,
    "Strong Republican": 7
}

df["16VoteSum"] = df["16VoteSum"].map(mapping_7pt)
df["20VoteSum"] = df["20VoteSum"].map(mapping_7pt)

# =======================================================
# 2) Create binary columns for the 16 and 20 votes.
#    Only Democrat or Republican leaners are kept; rows coded 4
#    (Independent) or NaN in either year are excluded.
# =======================================================
df = df[ df["16VoteSum"].isin([1,2,3,5,6,7]) &
         df["20VoteSum"].isin([1,2,3,5,6,7]) ].copy()

df["16VoteBinary"] = np.where(df["16VoteSum"] <= 3, 0, 1)  # 0=Dem, 1=Rep
df["20VoteBinary"] = np.where(df["20VoteSum"] <= 3, 0, 1)  # 0=Dem, 1=Rep

# =======================================================
# 3) Name the vote columns used by the scenarios:
#    the 2016 binary vote is an optional feature, the 2020 binary
#    vote is always the target.
# =======================================================
Vote_16_col = "16VoteBinary"
Vote_20_col = "20VoteBinary"

# =======================================================
# 4) Create a list of scenarios (scenario_id, list_of_features)
#    Every scenario predicts 20VoteBinary.
# =======================================================
scenarios = [
    (0, Features_16),
    (1, Features_16 + [Vote_16_col]),
    (2, Features_20),
    (3, Features_20 + [Vote_16_col]),
    (4, Differences),
    (5, Differences + Features_20),
    (6, Differences + [Vote_16_col]),
    (7, Differences + Features_20 + [Vote_16_col]),
]

# =======================================================
# 5) Drop rows with missing data in any column used by any scenario,
#    so that all scenarios are fitted and scored on the same rows.
# =======================================================
all_feature_cols = set(Features_16 + Features_20 + Differences + [Vote_16_col])

# dropna expects a column label or a sequence of labels, so hand it an
# ordered list rather than the set itself.
df_clean = df.dropna(subset=sorted(all_feature_cols | {Vote_20_col})).copy()

if df_clean.empty:
    raise ValueError(
        "No rows left after excluding Independents and rows with missing "
        "values; cannot fit any scenario."
    )

# =======================================================
# 6) Prepare a structure to save results
# =======================================================
results = []

# =======================================================
# 7) Loop over each scenario
# =======================================================
for scenario_id, feature_cols in scenarios:
    # 7a) Define X and y
    X = df_clean[feature_cols]
    y = df_clean[Vote_20_col]  # 0=Dem, 1=Rep

    # 7b) Train/Test split; the fixed seed and identical rows mean every
    #     scenario is evaluated on the same test respondents
    X_train, X_test, y_train, y_test = train_test_split(
        X, y,
        test_size=0.2,
        random_state=42,
        stratify=y  # helps preserve class distribution
    )

    # 7c) Scale features; the scaler is fitted on the training split only
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled  = scaler.transform(X_test)

    # 7d) Fit logistic regression for binary classification
    model = LogisticRegression(
        solver='lbfgs',
        penalty='l2',
        C=1.0,
        max_iter=500,
        random_state=42
    )
    model.fit(X_train_scaled, y_train)

    # 7e) Make predictions and evaluate
    y_pred = model.predict(X_test_scaled)
    acc = accuracy_score(y_test, y_pred)
    report_dict = classification_report(y_test, y_pred, output_dict=True)

    # 7f) Store results
    results.append({
        "Scenario": scenario_id,
        "Feature_Count": len(feature_cols),
        "Feature_List": feature_cols,
        "Accuracy": acc,
        "Report": report_dict
    })

    # 7g) Print per-scenario results
    print(f"=== Scenario {scenario_id} ===")
    print(f"Features: {feature_cols}")
    print(f"Accuracy: {acc:.3f}")
    print(classification_report(y_test, y_pred, target_names=['Dem','Rep']))
    print("--------------------------------\n")

# =======================================================
# 8) Convert results to a DataFrame and save to file
# =======================================================
results_df = pd.DataFrame(results)

# Quick overview: scenario id + accuracy
print("=== Final Summary ===")
print(results_df[["Scenario","Accuracy"]])

# Persist the full results table next to the notebook
results_df.to_csv("scenario_results_binary.csv", index=False)


=== Scenario 0 ===
Features: ['16Gay', '16Congress', '16Muslims', '16Jews', '16Christ', '16Police', '16Transgender', '16Scientist', '16Blm']
Accuracy: 0.833
              precision    recall  f1-score   support

         Dem       0.84      0.84      0.84       213
         Rep       0.82      0.82      0.82       187

    accuracy                           0.83       400
   macro avg       0.83      0.83      0.83       400
weighted avg       0.83      0.83      0.83       400

--------------------------------

=== Scenario 1 ===
Features: ['16Gay', '16Congress', '16Muslims', '16Jews', '16Christ', '16Police', '16Transgender', '16Scientist', '16Blm', '16VoteBinary']
Accuracy: 0.902
              precision    recall  f1-score   support

         Dem       0.90      0.92      0.91       213
         Rep       0.91      0.88      0.89       187

    accuracy                           0.90       400
   macro avg       0.90      0.90      0.90       400
weighted avg       0.90      0.90    