In [3]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix

# ----------------------------------------------------------------
# 1. Load data
#    The survey responses are read from a CSV (path relative to the
#    working directory) into `df`; any `df` already in memory is replaced.
# ----------------------------------------------------------------

df = pd.read_csv("vote_gun_demo_opinions.csv")

# ----------------------------------------------------------------
# 2. Create a binary outcome for 2020 Vote
#    For instance, let's define:
#    1 if 20Vote == "Democrat"
#    0 otherwise (Republican, Independent, Other, No answer, etc.)
# ----------------------------------------------------------------
def make_binary_vote(v):
    """Encode a 2020 vote response as a binary Democrat indicator.

    Args:
        v: One raw value from the ``20Vote`` column.

    Returns:
        1 if ``v == "Democrat"``, 0 for any other recorded answer, and NaN
        when the response is missing (NaN/None). Propagating NaN lets the
        later ``dropna`` on the outcome remove unanswered rows instead of
        silently counting them as non-Democrat votes.
    """
    if pd.isna(v):
        return float("nan")
    return 1 if v == "Democrat" else 0

# Recode every raw 2020 vote response into the binary model outcome.
df["vote2020_binary"] = df["20Vote"].map(make_binary_vote)

# ----------------------------------------------------------------
# 3. Define a variable for 2016 voting behavior (predictor)
#    Similarly, we can do a simple binary: 1 if 'Democrat', else 0
# ----------------------------------------------------------------
def make_binary_vote_16(v):
    """Encode a 2016 vote response as a binary Democrat indicator.

    Args:
        v: One raw value from the ``16Vote`` column.

    Returns:
        1 if ``v == "Democrat"``, 0 for any other recorded answer, and NaN
        when the response is missing (NaN/None), so rows without a 2016
        answer can be dropped later rather than being treated as
        non-Democrat voters.
    """
    if pd.isna(v):
        return float("nan")
    return 1 if v == "Democrat" else 0

# Recode every raw 2016 vote response into the binary predictor.
df["vote2016_binary"] = df["16Vote"].map(make_binary_vote_16)

# ----------------------------------------------------------------
# 4. Select your 2020 features
#    For illustration, let's pick some numeric columns from 2020.
#    You can expand or refine this list as you wish.
# ----------------------------------------------------------------
predictor_cols_20 = [
    # Left out of this run: "20GunHarder", "20GunImportance"
    "20GunHowMany",
    "20Age",
    "20Income",
    "20SocMed",
    "20HandleHealth",
    "20HandleImmig",
    # The three items below were annotated as 0-100 scales in the original notes
    "20Feminist",
    "20Liberal",
    "20Blm",
]

# --- 5. Full predictor list: the 2016 vote indicator first, then the 2020 features
predictors = ["vote2016_binary", *predictor_cols_20]

# --- 6. Keep only rows that are complete on every predictor and on the outcome
model_data = df.dropna(subset=[*predictors, "vote2020_binary"]).copy()

# --- 7. Target vector and design matrix
y = model_data["vote2020_binary"]
X = model_data[predictors]

# --- 8. Stratified 70/30 hold-out split with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    random_state=42,
    test_size=0.3,
)

# --- 9. Logistic regression
# Only the solver and the iteration cap are set here. Every other option,
# including the penalty and C, stays at the scikit-learn default, so the
# fit IS regularised. The features are not standardised in this cell, so
# coefficient sizes depend on each column's units.
model = LogisticRegression(max_iter=1000, solver="lbfgs").fit(X_train, y_train)

# --- 10. Evaluate on the held-out rows
y_pred = model.predict(X_test)

print("=== Confusion Matrix ===")
print(confusion_matrix(y_test, y_pred))

print("\n=== Classification Report ===")
print(classification_report(y_test, y_pred))

# --- 11. Coefficients (one per predictor), largest first
coeffs = pd.DataFrame(
    {"Predictor": X.columns, "Coefficient": model.coef_[0]}
).sort_values(by="Coefficient", ascending=False)

print("\n=== Logistic Regression Coefficients ===")
print(coeffs)

# exp(coefficient) is the multiplicative change in the odds of class 1
# for a one-unit increase in that predictor.
coeffs = coeffs.assign(OddsRatio=np.exp(coeffs["Coefficient"]))
print("\n=== With Odds Ratios ===")
print(coeffs)

# You might find interesting which variables have the largest positive
# or negative effect on the probability of voting Democrat (in 2020).

=== Confusion Matrix ===
[[405  27]
 [ 57 205]]

=== Classification Report ===
              precision    recall  f1-score   support

           0       0.88      0.94      0.91       432
           1       0.88      0.78      0.83       262

    accuracy                           0.88       694
   macro avg       0.88      0.86      0.87       694
weighted avg       0.88      0.88      0.88       694


=== Logistic Regression Coefficients ===
         Predictor  Coefficient
0  vote2016_binary     2.619750
6    20HandleImmig     0.436856
5   20HandleHealth     0.311764
4         20SocMed     0.098029
8        20Liberal     0.014091
7       20Feminist     0.008799
9            20Blm     0.008110
2            20Age     0.003327
3         20Income    -0.017408
1     20GunHowMany    -0.055867

=== With Odds Ratios ===
         Predictor  Coefficient  OddsRatio
0  vote2016_binary     2.619750  13.732289
6    20HandleImmig     0.436856   1.547833
5   20HandleHealth     0.311764   1.365833
4 

In [53]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns of df.
df.describe()

Unnamed: 0,16GunHowMany,20GunHowMany,16Age,20Age,16Income,20Income,16Marriage,20Marriage,16SocMed,20SocMed,...,20Conservatives,20Gay,20Congress,20Muslims,20Jews,20Christ,20Police,20Transgender,20Scientist,20Blm
count,2686.0,2624.0,2759.0,2747.0,2724.0,2661.0,2822.0,2826.0,2834.0,2652.0,...,2619.0,2630.0,2644.0,2624.0,2623.0,2638.0,2650.0,2626.0,2645.0,2637.0
mean,1.432986,1.548399,49.455962,52.859847,16.157122,11.993611,2.874557,2.722222,2.190896,2.147436,...,54.787323,64.9,44.213691,57.616616,72.822722,72.02464,71.772075,58.744478,78.875236,51.380736
std,4.170547,5.312135,17.034521,16.566437,7.847302,6.635598,2.116728,2.053527,0.805848,0.813724,...,28.225116,26.343841,21.820635,24.295803,21.864978,24.799181,24.478697,27.14619,20.098893,35.409933
min,0.0,0.0,18.0,19.0,1.0,1.0,1.0,1.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,35.0,39.0,11.0,6.0,1.0,1.0,2.0,2.0,...,40.0,50.0,30.0,50.0,50.0,50.0,60.0,50.0,70.0,15.0
50%,0.0,0.0,50.0,54.0,17.0,12.0,1.0,1.0,2.0,2.0,...,50.0,60.0,50.0,50.0,70.0,75.0,75.0,50.0,85.0,60.0
75%,1.0,1.0,63.0,67.0,23.0,18.0,5.0,4.0,3.0,3.0,...,80.0,85.0,60.0,70.0,90.0,100.0,88.75,85.0,100.0,85.0
max,99.0,99.0,90.0,80.0,28.0,22.0,6.0,6.0,4.0,4.0,...,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0


In [123]:
# Reload the survey CSV into df, discarding any columns earlier cells added to df.
df = pd.read_csv("vote_gun_demo_opinions.csv")


0                Strong Republican
1       Not very strong Republican
2           Independent-Republican
3             Independent-Democrat
4                  Strong Democrat
                   ...            
2834             Strong Republican
2835               Strong Democrat
2836      Not very strong Democrat
2837               Strong Democrat
2838               Strong Democrat
Name: 16VoteSum, Length: 2839, dtype: object

In [129]:
import numpy as np
import pandas as pd

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report

# 1) Read the CSV file
df = pd.read_csv("vote_gun_demo_opinions.csv")

# --------------------------------------------------------------
# 2) Predictors: twenty 2020 survey columns plus the 2016 summary
#    column (16VoteSum). Adjust the list to match your actual columns.
# --------------------------------------------------------------
twenty_features = [
    '20HandleHealth', '20GunHowMany', '20HandleImmig', '20SocMed', '20Age',
    '20Fundamentalist', '20Feminist', '20Liberal',
    '20Union', '20BigBusiness', '20Conservatives', '20Gay', '20Congress',
    '20Muslims', '20Jews', '20Christ', '20Police', '20Transgender',
    '20Scientist', '20Blm',
]

predictors = twenty_features + ["16VoteSum"]

# --------------------------------------------------------------
# 3) Encode 16VoteSum and 20VoteSum as ordered integers
#    (1 = Strong Democrat ... 7 = Strong Republican).
#    Despite its name, mapping_16 is applied to BOTH years.
#    Series.map turns any label that is not a key below into NaN;
#    step 4 then drops those rows.
# --------------------------------------------------------------
mapping_16 = {
    "Strong Democrat": 1,
    "Not very strong Democrat": 2,
    "Independent-Democrat": 3,
    "Independent": 4,
    "Independent-Republican": 5,
    "Not very strong Republican": 6,
    "Strong Republican": 7
}

df["16VoteSum"] = df["16VoteSum"].map(mapping_16)
df["20VoteSum"] = df["20VoteSum"].map(mapping_16)
print(df["20VoteSum"])

# --------------------------------------------------------------
# 4) Drop rows where any needed column is missing
# --------------------------------------------------------------
df_clean = df.dropna(subset=predictors + ["20VoteSum"]).copy()

# --------------------------------------------------------------
# 5) Create X (predictors) and y (target)
# --------------------------------------------------------------
X = df_clean[predictors]
y = df_clean["20VoteSum"]  # multiclass target using the 1..7 codes assigned above

# --------------------------------------------------------------
# 6) Train/Test Split
# --------------------------------------------------------------
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    random_state=42,
    stratify=y  # helps preserve class proportions
)

# --------------------------------------------------------------
# 7) Standardise the features to help with convergence.
#    The scaler is fitted on the training rows only and then
#    applied unchanged to the test rows.
# --------------------------------------------------------------
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled  = scaler.transform(X_test)

# --------------------------------------------------------------
# 8) Multinomial Logistic Regression
#    `multi_class` is intentionally not passed: the argument is
#    deprecated since scikit-learn 1.5, and with more than two
#    classes the lbfgs solver already fits the multinomial model.
# --------------------------------------------------------------
model = LogisticRegression(
    solver='lbfgs',
    penalty='l2',
    C=1.0,         # inverse of regularization strength
    max_iter=500,  # increase if you see convergence warnings
    random_state=42
)

model.fit(X_train_scaled, y_train)

# --------------------------------------------------------------
# 9) Evaluate Predictions
# --------------------------------------------------------------
y_pred = model.predict(X_test_scaled)

print("=== Confusion Matrix ===")
print(confusion_matrix(y_test, y_pred))

print("\n=== Classification Report ===")
print(classification_report(y_test, y_pred))

# --------------------------------------------------------------
# 10) Look at Coefficients
#     model.coef_ is shape [n_classes, n_features]; the values are
#     on the standardised feature scale produced in step 7.
# --------------------------------------------------------------
classes_ = model.classes_  # the distinct 20VoteSum codes seen in training
coefs = model.coef_
intercepts = model.intercept_

print("\n=== Multinomial Regression Coefficients ===")
for i, cls in enumerate(classes_):
    print(f"\nClass '{cls}' vs. others:")
    print("  Intercept:", intercepts[i])
    for j, col in enumerate(predictors):
        print(f"  {col}: {coefs[i][j]:.4f}")

# --------------------------------------------------------------
# 11) (Optional) Calculate an Ordinal Metric
#     e.g., the average absolute difference |y_pred - y_true|
# --------------------------------------------------------------
def mean_ordinal_error(y_true, y_pred):
    """Mean absolute difference between true and predicted ordinal codes.

    Args:
        y_true: Sequence of true numeric class codes (e.g. 1..7).
        y_pred: Sequence of predicted numeric class codes, in the same order.

    Returns:
        The mean of ``|y_true - y_pred|``; 0 means every prediction is exact.

    Both inputs are converted to plain arrays first so the comparison is
    always positional. Without that, two pandas Series with different
    indexes would be subtracted by label, and plain lists would raise.
    """
    true_codes = np.asarray(y_true, dtype=float)
    pred_codes = np.asarray(y_pred, dtype=float)
    return np.mean(np.abs(true_codes - pred_codes))

# Average gap, in scale points of the 1-7 coding, between the predicted and
# the true 2020 code on the held-out rows.
moe = mean_ordinal_error(y_test, y_pred)
print(f"\nMean Ordinal Error (0=perfect): {moe:.2f}")

# Another approach: mean squared error (unweighted), which penalises
# predictions that land far from the true code more heavily.
def mean_squared_ordinal_error(y_true, y_pred):
    """Mean squared difference between true and predicted ordinal codes.

    Args:
        y_true: Sequence of true numeric class codes (e.g. 1..7).
        y_pred: Sequence of predicted numeric class codes, in the same order.

    Returns:
        The mean of ``(y_true - y_pred) ** 2``.

    Inputs are converted to plain arrays so the comparison is positional
    rather than aligned on pandas index labels, and so lists are accepted.
    """
    true_codes = np.asarray(y_true, dtype=float)
    pred_codes = np.asarray(y_pred, dtype=float)
    return np.mean((true_codes - pred_codes) ** 2)

# Squared version of the same held-out gap; large misses count more.
mse = mean_squared_ordinal_error(y_test, y_pred)
print(f"Mean Squared Ordinal Error: {mse:.2f}")

0       7.0
1       4.0
2       5.0
3       4.0
4       1.0
       ... 
2834    1.0
2835    1.0
2836    1.0
2837    2.0
2838    7.0
Name: 20VoteSum, Length: 2839, dtype: float64
=== Confusion Matrix ===
[[102   5   7   5   1   0   0]
 [ 21  14   4   8   0   3   0]
 [ 23   2  20   4   2   0   0]
 [  5   3  11   9   6   7   3]
 [  3   0   2   4   9   3  22]
 [  2   0   0   4   7   7  23]
 [  0   1   0   4   4   4  83]]

=== Classification Report ===
              precision    recall  f1-score   support

         1.0       0.65      0.85      0.74       120
         2.0       0.56      0.28      0.37        50
         3.0       0.45      0.39      0.42        51
         4.0       0.24      0.20      0.22        44
         5.0       0.31      0.21      0.25        43
         6.0       0.29      0.16      0.21        43
         7.0       0.63      0.86      0.73        96

    accuracy                           0.55       447
   macro avg       0.45      0.42      0.42       447
weight

In [130]:
import numpy as np
import pandas as pd

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report

# 1) Read the CSV file
df = pd.read_csv("vote_gun_demo_opinions.csv")

# --------------------------------------------------------------
# 2) Predictors: twenty 2020 survey columns.
#    Only the 2020 features are used in this run; 16VoteSum is
#    NOT among the predictors.
# --------------------------------------------------------------
twenty_features = [
    '20HandleHealth', '20GunHowMany', '20HandleImmig', '20SocMed', '20Age',
    '20Fundamentalist', '20Feminist', '20Liberal',
    '20Union', '20BigBusiness', '20Conservatives', '20Gay', '20Congress',
    '20Muslims', '20Jews', '20Christ', '20Police', '20Transgender',
    '20Scientist', '20Blm',
]

predictors = twenty_features

# --------------------------------------------------------------
# 3) Encode the 20VoteSum target as ordered integers
#    (1 = Strong Democrat ... 7 = Strong Republican).
#    Series.map turns any label that is not a key below into NaN;
#    step 4 then drops those rows.
# --------------------------------------------------------------
mapping_16 = {
    "Strong Democrat": 1,
    "Not very strong Democrat": 2,
    "Independent-Democrat": 3,
    "Independent": 4,
    "Independent-Republican": 5,
    "Not very strong Republican": 6,
    "Strong Republican": 7
}

df["20VoteSum"] = df["20VoteSum"].map(mapping_16)

# --------------------------------------------------------------
# 4) Drop rows where any needed column is missing
# --------------------------------------------------------------
df_clean = df.dropna(subset=predictors + ["20VoteSum"]).copy()

# --------------------------------------------------------------
# 5) Create X (predictors) and y (target)
# --------------------------------------------------------------
X = df_clean[predictors]
y = df_clean["20VoteSum"]  # multiclass target using the 1..7 codes assigned above

# --------------------------------------------------------------
# 6) Train/Test Split
# --------------------------------------------------------------
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    random_state=42,
    stratify=y  # helps preserve class proportions
)

# --------------------------------------------------------------
# 7) Standardise the features to help with convergence.
#    The scaler is fitted on the training rows only and then
#    applied unchanged to the test rows.
# --------------------------------------------------------------
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled  = scaler.transform(X_test)

# --------------------------------------------------------------
# 8) Multinomial Logistic Regression
#    `multi_class` is intentionally not passed: the argument is
#    deprecated since scikit-learn 1.5, and with more than two
#    classes the lbfgs solver already fits the multinomial model.
# --------------------------------------------------------------
model = LogisticRegression(
    solver='lbfgs',
    penalty='l2',
    C=1.0,         # inverse of regularization strength
    max_iter=500,  # increase if you see convergence warnings
    random_state=42
)

model.fit(X_train_scaled, y_train)

# --------------------------------------------------------------
# 9) Evaluate Predictions
# --------------------------------------------------------------
y_pred = model.predict(X_test_scaled)

print("=== Confusion Matrix ===")
print(confusion_matrix(y_test, y_pred))

print("\n=== Classification Report ===")
print(classification_report(y_test, y_pred))

# --------------------------------------------------------------
# 10) Look at Coefficients
#     model.coef_ is shape [n_classes, n_features]; the values are
#     on the standardised feature scale produced in step 7.
# --------------------------------------------------------------
classes_ = model.classes_  # the distinct 20VoteSum codes seen in training
coefs = model.coef_
intercepts = model.intercept_

print("\n=== Multinomial Regression Coefficients ===")
for i, cls in enumerate(classes_):
    print(f"\nClass '{cls}' vs. others:")
    print("  Intercept:", intercepts[i])
    for j, col in enumerate(predictors):
        print(f"  {col}: {coefs[i][j]:.4f}")

# --------------------------------------------------------------
# 11) (Optional) Calculate an Ordinal Metric
#     e.g., the average absolute difference |y_pred - y_true|
# --------------------------------------------------------------
def mean_ordinal_error(y_true, y_pred):
    """Mean absolute difference between true and predicted ordinal codes.

    Args:
        y_true: Sequence of true numeric class codes (e.g. 1..7).
        y_pred: Sequence of predicted numeric class codes, in the same order.

    Returns:
        The mean of ``|y_true - y_pred|``; 0 means every prediction is exact.

    Both inputs are converted to plain arrays first so the comparison is
    always positional. Without that, two pandas Series with different
    indexes would be subtracted by label, and plain lists would raise.
    """
    true_codes = np.asarray(y_true, dtype=float)
    pred_codes = np.asarray(y_pred, dtype=float)
    return np.mean(np.abs(true_codes - pred_codes))

# Average gap, in scale points of the 1-7 coding, between the predicted and
# the true 2020 code on the held-out rows.
moe = mean_ordinal_error(y_test, y_pred)
print(f"\nMean Ordinal Error (0=perfect): {moe:.2f}")

# Another approach: mean squared error (unweighted), which penalises
# predictions that land far from the true code more heavily.
def mean_squared_ordinal_error(y_true, y_pred):
    """Mean squared difference between true and predicted ordinal codes.

    Args:
        y_true: Sequence of true numeric class codes (e.g. 1..7).
        y_pred: Sequence of predicted numeric class codes, in the same order.

    Returns:
        The mean of ``(y_true - y_pred) ** 2``.

    Inputs are converted to plain arrays so the comparison is positional
    rather than aligned on pandas index labels, and so lists are accepted.
    """
    true_codes = np.asarray(y_true, dtype=float)
    pred_codes = np.asarray(y_pred, dtype=float)
    return np.mean((true_codes - pred_codes) ** 2)

# Squared version of the same held-out gap; large misses count more.
mse = mean_squared_ordinal_error(y_test, y_pred)
print(f"Mean Squared Ordinal Error: {mse:.2f}")

=== Confusion Matrix ===
[[95  8  5  6  1  1  5]
 [26 10  4  6  0  4  0]
 [34  4  7  4  1  1  0]
 [10  7  5 10  0  6  7]
 [ 5  2  0  2  6  3 25]
 [ 2  3  1  4  4  4 25]
 [ 0  0  2  3  3  4 84]]

=== Classification Report ===
              precision    recall  f1-score   support

         1.0       0.55      0.79      0.65       121
         2.0       0.29      0.20      0.24        50
         3.0       0.29      0.14      0.19        51
         4.0       0.29      0.22      0.25        45
         5.0       0.40      0.14      0.21        43
         6.0       0.17      0.09      0.12        43
         7.0       0.58      0.88      0.69        96

    accuracy                           0.48       449
   macro avg       0.37      0.35      0.34       449
weighted avg       0.42      0.48      0.43       449


=== Multinomial Regression Coefficients ===

Class '1.0' vs. others:
  Intercept: -0.23435662099650925
  20HandleHealth: 0.8213
  20GunHowMany: -0.1603
  20HandleImmig: 0.5949
 

In [135]:
import pandas as pd
df = pd.read_csv("vote_gun_demo_opinions.csv")

# 2020 survey columns; each is expected to have a "16"-prefixed
# counterpart with the same suffix in the file.
cols_2020 = [
    "20HandleHealth",
    "20GunHowMany",
    "20HandleImmig",
    "20SocMed",
    "20Age",
    "20Fundamentalist",
    "20Feminist",
    "20Liberal",
    "20Union",
    "20BigBusiness",
    "20Conservatives",
    "20Gay",
    "20Congress",
    "20Muslims",
    "20Jews",
    "20Christ",
    "20Police",
    "20Transgender",
    "20Scientist",
    "20Blm",
]

# For every 2020 column "20X", store (20X - 16X) in a new column "diff_X"
# and record the new names in creation order.
difference_columns = []

for col_20 in cols_2020:
    # Swap only the first "20" for "16": "20HandleHealth" -> "16HandleHealth"
    col_16 = col_20.replace("20", "16", 1)
    # Drop the two-character year prefix: "20HandleHealth" -> "diff_HandleHealth"
    diff_col = "diff_" + col_20[2:]

    # A missing column raises KeyError here; rows missing either year give NaN.
    df[diff_col] = df[col_20].sub(df[col_16])
    difference_columns.append(diff_col)

print("Difference columns created:", difference_columns)

Difference columns created: ['diff_HandleHealth', 'diff_GunHowMany', 'diff_HandleImmig', 'diff_SocMed', 'diff_Age', 'diff_Fundamentalist', 'diff_Feminist', 'diff_Liberal', 'diff_Union', 'diff_BigBusiness', 'diff_Conservatives', 'diff_Gay', 'diff_Congress', 'diff_Muslims', 'diff_Jews', 'diff_Christ', 'diff_Police', 'diff_Transgender', 'diff_Scientist', 'diff_Blm']


In [136]:
import numpy as np
import pandas as pd

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report

# 1) No data is loaded in this cell: it expects `df` to already hold the
#    diff_* columns built by the difference-column cell run just before it.

# --------------------------------------------------------------
# 2) Predictors: the 2020-minus-2016 difference columns.
#    Only the differences are used in this run; 16VoteSum is
#    NOT among the predictors.
# --------------------------------------------------------------
twenty_features = ['diff_HandleHealth', 'diff_GunHowMany', 'diff_HandleImmig', 'diff_SocMed', 'diff_Age', 'diff_Fundamentalist', 'diff_Feminist', 'diff_Liberal', 'diff_Union', 'diff_BigBusiness', 'diff_Conservatives', 'diff_Gay', 'diff_Congress', 'diff_Muslims', 'diff_Jews', 'diff_Christ', 'diff_Police', 'diff_Transgender', 'diff_Scientist', 'diff_Blm']

predictors = twenty_features

# --------------------------------------------------------------
# 3) Encode the 20VoteSum target as ordered integers
#    (1 = Strong Democrat ... 7 = Strong Republican).
#    Series.map turns any label that is not a key below into NaN;
#    step 4 then drops those rows.
# --------------------------------------------------------------
mapping_16 = {
    "Strong Democrat": 1,
    "Not very strong Democrat": 2,
    "Independent-Democrat": 3,
    "Independent": 4,
    "Independent-Republican": 5,
    "Not very strong Republican": 6,
    "Strong Republican": 7
}

# `df` is shared with other cells, so the mapping is guarded: running this
# cell a second time would otherwise push the already-numeric codes through
# the string-keyed mapping, turn every value into NaN and empty df_clean.
if not pd.api.types.is_numeric_dtype(df["20VoteSum"]):
    df["20VoteSum"] = df["20VoteSum"].map(mapping_16)

# --------------------------------------------------------------
# 4) Drop rows where any needed column is missing
# --------------------------------------------------------------
df_clean = df.dropna(subset=predictors + ["20VoteSum"]).copy()

# --------------------------------------------------------------
# 5) Create X (predictors) and y (target)
# --------------------------------------------------------------
X = df_clean[predictors]
y = df_clean["20VoteSum"]  # multiclass target using the 1..7 codes assigned above

# --------------------------------------------------------------
# 6) Train/Test Split
# --------------------------------------------------------------
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    random_state=42,
    stratify=y  # helps preserve class proportions
)

# --------------------------------------------------------------
# 7) Standardise the features to help with convergence.
#    The scaler is fitted on the training rows only and then
#    applied unchanged to the test rows.
# --------------------------------------------------------------
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled  = scaler.transform(X_test)

# --------------------------------------------------------------
# 8) Multinomial Logistic Regression
#    `multi_class` is intentionally not passed: the argument is
#    deprecated since scikit-learn 1.5, and with more than two
#    classes the lbfgs solver already fits the multinomial model.
# --------------------------------------------------------------
model = LogisticRegression(
    solver='lbfgs',
    penalty='l2',
    C=1.0,         # inverse of regularization strength
    max_iter=500,  # increase if you see convergence warnings
    random_state=42
)

model.fit(X_train_scaled, y_train)

# --------------------------------------------------------------
# 9) Evaluate Predictions
# --------------------------------------------------------------
y_pred = model.predict(X_test_scaled)

print("=== Confusion Matrix ===")
print(confusion_matrix(y_test, y_pred))

print("\n=== Classification Report ===")
print(classification_report(y_test, y_pred))

# --------------------------------------------------------------
# 10) Look at Coefficients
#     model.coef_ is shape [n_classes, n_features]; the values are
#     on the standardised feature scale produced in step 7.
# --------------------------------------------------------------
classes_ = model.classes_  # the distinct 20VoteSum codes seen in training
coefs = model.coef_
intercepts = model.intercept_

print("\n=== Multinomial Regression Coefficients ===")
for i, cls in enumerate(classes_):
    print(f"\nClass '{cls}' vs. others:")
    print("  Intercept:", intercepts[i])
    for j, col in enumerate(predictors):
        print(f"  {col}: {coefs[i][j]:.4f}")

# --------------------------------------------------------------
# 11) (Optional) Calculate an Ordinal Metric
#     e.g., the average absolute difference |y_pred - y_true|
# --------------------------------------------------------------
def mean_ordinal_error(y_true, y_pred):
    """Mean absolute difference between true and predicted ordinal codes.

    Args:
        y_true: Sequence of true numeric class codes (e.g. 1..7).
        y_pred: Sequence of predicted numeric class codes, in the same order.

    Returns:
        The mean of ``|y_true - y_pred|``; 0 means every prediction is exact.

    Both inputs are converted to plain arrays first so the comparison is
    always positional. Without that, two pandas Series with different
    indexes would be subtracted by label, and plain lists would raise.
    """
    true_codes = np.asarray(y_true, dtype=float)
    pred_codes = np.asarray(y_pred, dtype=float)
    return np.mean(np.abs(true_codes - pred_codes))

# Average gap, in scale points of the 1-7 coding, between the predicted and
# the true 2020 code on the held-out rows.
moe = mean_ordinal_error(y_test, y_pred)
print(f"\nMean Ordinal Error (0=perfect): {moe:.2f}")

# Another approach: mean squared error (unweighted), which penalises
# predictions that land far from the true code more heavily.
def mean_squared_ordinal_error(y_true, y_pred):
    """Mean squared difference between true and predicted ordinal codes.

    Args:
        y_true: Sequence of true numeric class codes (e.g. 1..7).
        y_pred: Sequence of predicted numeric class codes, in the same order.

    Returns:
        The mean of ``(y_true - y_pred) ** 2``.

    Inputs are converted to plain arrays so the comparison is positional
    rather than aligned on pandas index labels, and so lists are accepted.
    """
    true_codes = np.asarray(y_true, dtype=float)
    pred_codes = np.asarray(y_pred, dtype=float)
    return np.mean((true_codes - pred_codes) ** 2)

# Squared version of the same held-out gap; large misses count more.
mse = mean_squared_ordinal_error(y_test, y_pred)
print(f"Mean Squared Ordinal Error: {mse:.2f}")

=== Confusion Matrix ===
[[102   1   3   1   1   0   1]
 [ 21   4   2   9   0   5   3]
 [ 34   2   5   4   0   0   1]
 [ 11   2   2   5   4   5  10]
 [  1   0   0   4   3   4  26]
 [  6   3   1   0   7   2  19]
 [  0   0   0   1   3   7  73]]

=== Classification Report ===
              precision    recall  f1-score   support

         1.0       0.58      0.94      0.72       109
         2.0       0.33      0.09      0.14        44
         3.0       0.38      0.11      0.17        46
         4.0       0.21      0.13      0.16        39
         5.0       0.17      0.08      0.11        38
         6.0       0.09      0.05      0.07        38
         7.0       0.55      0.87      0.67        84

    accuracy                           0.49       398
   macro avg       0.33      0.32      0.29       398
weighted avg       0.40      0.49      0.41       398


=== Multinomial Regression Coefficients ===

Class '1.0' vs. others:
  Intercept: 0.1094863152112759
  diff_HandleHealth: 1.3839

In [139]:
import pandas as pd
df = pd.read_csv("vote_gun_demo_opinions.csv")

# 2020 columns to difference against their 2016 counterparts
cols_2020 = [
    "20HandleHealth", "20GunHowMany", "20HandleImmig", "20SocMed",
    "20Age", "20Fundamentalist", "20Feminist", "20Liberal",
    "20Union", "20BigBusiness", "20Conservatives", "20Gay",
    "20Congress", "20Muslims", "20Jews", "20Christ",
    "20Police", "20Transgender", "20Scientist", "20Blm",
]

# Names of the new columns: the two-character year prefix is replaced
# by "diff_", e.g. "20HandleHealth" -> "diff_HandleHealth".
difference_columns = ["diff_" + name_2020[2:] for name_2020 in cols_2020]

# Fill each diff_* column with (2020 value - 2016 value). The 2016 column
# name comes from swapping the first "20" in the 2020 name for "16".
for col_20, diff_col in zip(cols_2020, difference_columns):
    col_16 = col_20.replace("20", "16", 1)
    df[diff_col] = df[col_20] - df[col_16]

# Report the created column names, in creation order.
print("Difference columns created:", difference_columns)

Difference columns created: ['diff_HandleHealth', 'diff_GunHowMany', 'diff_HandleImmig', 'diff_SocMed', 'diff_Age', 'diff_Fundamentalist', 'diff_Feminist', 'diff_Liberal', 'diff_Union', 'diff_BigBusiness', 'diff_Conservatives', 'diff_Gay', 'diff_Congress', 'diff_Muslims', 'diff_Jews', 'diff_Christ', 'diff_Police', 'diff_Transgender', 'diff_Scientist', 'diff_Blm']


In [140]:
import numpy as np
import pandas as pd

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report

# 1) No data is loaded in this cell: it expects `df` to already hold the
#    diff_* columns built by the difference-column cell run just before it.

# --------------------------------------------------------------
# 2) Predictors: the 2020-minus-2016 difference columns plus the
#    2016 summary column (16VoteSum).
# --------------------------------------------------------------
twenty_features = ['diff_HandleHealth', 'diff_GunHowMany', 'diff_HandleImmig', 'diff_SocMed', 'diff_Age', 'diff_Fundamentalist', 'diff_Feminist', 'diff_Liberal', 'diff_Union', 'diff_BigBusiness', 'diff_Conservatives', 'diff_Gay', 'diff_Congress', 'diff_Muslims', 'diff_Jews', 'diff_Christ', 'diff_Police', 'diff_Transgender', 'diff_Scientist', 'diff_Blm']

predictors = twenty_features + ["16VoteSum"]

# --------------------------------------------------------------
# 3) Encode 16VoteSum and 20VoteSum as ordered integers
#    (1 = Strong Democrat ... 7 = Strong Republican).
#    Despite its name, mapping_16 is applied to BOTH years.
#    Series.map turns any label that is not a key below into NaN;
#    step 4 then drops those rows.
# --------------------------------------------------------------
mapping_16 = {
    "Strong Democrat": 1,
    "Not very strong Democrat": 2,
    "Independent-Democrat": 3,
    "Independent": 4,
    "Independent-Republican": 5,
    "Not very strong Republican": 6,
    "Strong Republican": 7
}

# `df` is shared with other cells, so each mapping is guarded: running this
# cell a second time would otherwise push the already-numeric codes through
# the string-keyed mapping, turn every value into NaN and empty df_clean.
for vote_col in ("16VoteSum", "20VoteSum"):
    if not pd.api.types.is_numeric_dtype(df[vote_col]):
        df[vote_col] = df[vote_col].map(mapping_16)
print(df["20VoteSum"])

# --------------------------------------------------------------
# 4) Drop rows where any needed column is missing
# --------------------------------------------------------------
df_clean = df.dropna(subset=predictors + ["20VoteSum"]).copy()

# --------------------------------------------------------------
# 5) Create X (predictors) and y (target)
# --------------------------------------------------------------
X = df_clean[predictors]
y = df_clean["20VoteSum"]  # multiclass target using the 1..7 codes assigned above

# --------------------------------------------------------------
# 6) Train/Test Split
# --------------------------------------------------------------
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    random_state=42,
    stratify=y  # helps preserve class proportions
)

# --------------------------------------------------------------
# 7) Standardise the features to help with convergence.
#    The scaler is fitted on the training rows only and then
#    applied unchanged to the test rows.
# --------------------------------------------------------------
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled  = scaler.transform(X_test)

# --------------------------------------------------------------
# 8) Multinomial Logistic Regression
#    `multi_class` is intentionally not passed: the argument is
#    deprecated since scikit-learn 1.5, and with more than two
#    classes the lbfgs solver already fits the multinomial model.
# --------------------------------------------------------------
model = LogisticRegression(
    solver='lbfgs',
    penalty='l2',
    C=1.0,         # inverse of regularization strength
    max_iter=500,  # increase if you see convergence warnings
    random_state=42
)

model.fit(X_train_scaled, y_train)

# --------------------------------------------------------------
# 9) Evaluate Predictions
# --------------------------------------------------------------
y_pred = model.predict(X_test_scaled)

print("=== Confusion Matrix ===")
print(confusion_matrix(y_test, y_pred))

print("\n=== Classification Report ===")
print(classification_report(y_test, y_pred))

# --------------------------------------------------------------
# 10) Look at Coefficients
#     model.coef_ is shape [n_classes, n_features]; the values are
#     on the standardised feature scale produced in step 7.
# --------------------------------------------------------------
classes_ = model.classes_  # the distinct 20VoteSum codes seen in training
coefs = model.coef_
intercepts = model.intercept_

print("\n=== Multinomial Regression Coefficients ===")
for i, cls in enumerate(classes_):
    print(f"\nClass '{cls}' vs. others:")
    print("  Intercept:", intercepts[i])
    for j, col in enumerate(predictors):
        print(f"  {col}: {coefs[i][j]:.4f}")

# --------------------------------------------------------------
# 11) (Optional) Calculate an Ordinal Metric
#     e.g., the average absolute difference |y_pred - y_true|
# --------------------------------------------------------------
def mean_ordinal_error(y_true, y_pred):
    """Return the mean absolute difference between true and predicted labels.

    Parameters
    ----------
    y_true, y_pred : array-like of numeric ordinal labels (e.g. 1..7)
        Compared element by element, by position.

    Returns
    -------
    float
        Average of |y_true - y_pred|; 0 means every prediction is exact.
    """
    # Coerce to plain arrays first: subtracting two pandas Series aligns them
    # on their index labels, which yields NaN when the indexes differ (for
    # example a shuffled test split against freshly indexed predictions).
    true_arr = np.asarray(y_true, dtype=float)
    pred_arr = np.asarray(y_pred, dtype=float)
    return np.mean(np.abs(true_arr - pred_arr))

# Average number of scale points between prediction and truth on the test split.
moe = mean_ordinal_error(y_true=y_test, y_pred=y_pred)
print(f"\nMean Ordinal Error (0=perfect): {moe:.2f}")

# Another approach: (unweighted) mean squared error, which penalizes
# predictions that land several scale points away more heavily.
def mean_squared_ordinal_error(y_true, y_pred):
    """Return the mean squared difference between true and predicted labels.

    Parameters
    ----------
    y_true, y_pred : array-like of numeric ordinal labels (e.g. 1..7)
        Compared element by element, by position.

    Returns
    -------
    float
        Average of (y_true - y_pred)**2; 0 means every prediction is exact.
    """
    # Coerce to plain arrays so two pandas Series are compared positionally
    # instead of being aligned on (possibly different) index labels.
    true_arr = np.asarray(y_true, dtype=float)
    pred_arr = np.asarray(y_pred, dtype=float)
    return np.mean((true_arr - pred_arr) ** 2)

# Squared-distance counterpart of the ordinal error, on the same test split.
mse = mean_squared_ordinal_error(y_true=y_test, y_pred=y_pred)
print(f"Mean Squared Ordinal Error: {mse:.2f}")

0       7.0
1       4.0
2       5.0
3       4.0
4       1.0
       ... 
2834    1.0
2835    1.0
2836    1.0
2837    2.0
2838    7.0
Name: 20VoteSum, Length: 2839, dtype: float64
=== Confusion Matrix ===
[[97  3  7  1  1  0  0]
 [22  4  4  8  1  3  1]
 [17  6 19  3  1  0  0]
 [ 4  3  7  6  9  6  4]
 [ 1  0  1  7  5  5 19]
 [ 1  0  3  4 10  3 17]
 [ 0  1  0  4  8  4 67]]

=== Classification Report ===
              precision    recall  f1-score   support

         1.0       0.68      0.89      0.77       109
         2.0       0.24      0.09      0.13        43
         3.0       0.46      0.41      0.44        46
         4.0       0.18      0.15      0.17        39
         5.0       0.14      0.13      0.14        38
         6.0       0.14      0.08      0.10        38
         7.0       0.62      0.80      0.70        84

    accuracy                           0.51       397
   macro avg       0.35      0.37      0.35       397
weighted avg       0.44      0.51      0.46       397



In [141]:
import pandas as pd
# Reload the raw survey file so this cell starts from unmodified columns.
df = pd.read_csv("vote_gun_demo_opinions.csv")

# 2020 survey items; each one has a 2016 counterpart whose name starts with
# "16" instead of "20".
cols_2020 = [
    "20HandleHealth", "20GunHowMany", "20HandleImmig", "20SocMed", "20Age",
    "20Fundamentalist", "20Feminist", "20Liberal", "20Union", "20BigBusiness",
    "20Conservatives", "20Gay", "20Congress", "20Muslims", "20Jews",
    "20Christ", "20Police", "20Transgender", "20Scientist", "20Blm"
]

# Names of the change-score columns, in the order they are added to df.
difference_columns = []

for col_20 in cols_2020:
    # Everything after the two-character year prefix, e.g. "HandleHealth".
    item_name = col_20[2:]
    col_16 = "16" + item_name
    diff_col = "diff_" + item_name

    # Change score = 2020 value minus 2016 value.
    # Both source columns must exist in df and be numeric.
    df[diff_col] = df[col_20].sub(df[col_16])
    difference_columns.append(diff_col)

# Report which columns were added.
print("Difference columns created:", difference_columns)

Difference columns created: ['diff_HandleHealth', 'diff_GunHowMany', 'diff_HandleImmig', 'diff_SocMed', 'diff_Age', 'diff_Fundamentalist', 'diff_Feminist', 'diff_Liberal', 'diff_Union', 'diff_BigBusiness', 'diff_Conservatives', 'diff_Gay', 'diff_Congress', 'diff_Muslims', 'diff_Jews', 'diff_Christ', 'diff_Police', 'diff_Transgender', 'diff_Scientist', 'diff_Blm']


In [142]:
import numpy as np
import pandas as pd

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report

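# 1) Data: no file is read here; this cell reuses the `df` from the previous cell (CSV loaded there, diff_* columns added)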

# --------------------------------------------------------------
# 2) Prepare columns: the diff_* change scores, the 2020 levels of the
#    same survey items, and 16VoteSum.
#    Edit `feature_suffixes` to add or drop survey items.
# --------------------------------------------------------------
feature_suffixes = [
    "HandleHealth", "GunHowMany", "HandleImmig", "SocMed", "Age",
    "Fundamentalist", "Feminist", "Liberal", "Union", "BigBusiness",
    "Conservatives", "Gay", "Congress", "Muslims", "Jews",
    "Christ", "Police", "Transgender", "Scientist", "Blm",
]

# All diff_* columns first, then the matching 20* columns, in the same item order.
twenty_features = (
    [f"diff_{suffix}" for suffix in feature_suffixes]
    + [f"20{suffix}" for suffix in feature_suffixes]
)

# 16VoteSum goes last, so it is the final predictor column.
predictors = [*twenty_features, "16VoteSum"]

# --------------------------------------------------------------
# 3) Ensure 16VoteSum and 20VoteSum are numeric
#    The text labels are recoded onto a 1..7 scale
#    (1 = Strong Democrat ... 7 = Strong Republican).
# --------------------------------------------------------------
mapping_16 = {
    "Strong Democrat": 1,
    "Not very strong Democrat": 2,
    "Independent-Democrat": 3,
    "Independent": 4,
    "Independent-Republican": 5,
    "Not very strong Republican": 6,
    "Strong Republican": 7
}

# Recode both the 2016 and the 2020 column with the same mapping; values that
# are not keys of the mapping become NaN.
# The columns are overwritten in place, so only recode while they still hold
# the text labels: mapping an already-numeric column a second time (e.g. when
# this cell is re-run) would turn every value into NaN.
for vote_col in ("16VoteSum", "20VoteSum"):
    if not pd.api.types.is_numeric_dtype(df[vote_col]):
        df[vote_col] = df[vote_col].map(mapping_16)
print(df["20VoteSum"])

# --------------------------------------------------------------
# 4) Complete cases only: drop every row that is missing a predictor
#    or the 20VoteSum target. `.copy()` gives an independent frame.
# --------------------------------------------------------------
required_cols = predictors + ["20VoteSum"]
df_clean = df.dropna(subset=required_cols).copy()

# --------------------------------------------------------------
# 5) Feature matrix X and multiclass target y
# --------------------------------------------------------------
X = df_clean[predictors]
y = df_clean["20VoteSum"]

# --------------------------------------------------------------
# 6) Hold out 20% of the rows for testing. Stratifying on y keeps the
#    class proportions similar in both splits; the fixed seed makes the
#    split repeatable.
# --------------------------------------------------------------
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# --------------------------------------------------------------
# 7) (Optional) Standardize the features to help the solver converge
# --------------------------------------------------------------
from sklearn.preprocessing import StandardScaler

# Learn the per-feature mean/std from the training rows only, then apply the
# same transformation to both splits so no test-set statistics are used.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# --------------------------------------------------------------
# 8) Multinomial Logistic Regression
#    With solver='lbfgs' and a target that has more than two classes,
#    scikit-learn fits a multinomial (softmax) model by default, so the
#    deprecated `multi_class` argument is intentionally not passed.
# --------------------------------------------------------------
model = LogisticRegression(
    solver='lbfgs',
    penalty='l2',
    C=1.0,         # inverse of regularization strength
    max_iter=500,  # increase if you see convergence warnings
    random_state=42
)

# Fit on the standardized training split only.
model.fit(X_train_scaled, y_train)

# --------------------------------------------------------------
# 9) Evaluate Predictions
#    Predict labels for the held-out split, then print the confusion
#    matrix followed by the per-class precision/recall/F1 report.
# --------------------------------------------------------------
y_pred = model.predict(X_test_scaled)

print("=== Confusion Matrix ===")
conf_mat = confusion_matrix(y_test, y_pred)
print(conf_mat)

print("\n=== Classification Report ===")
report_text = classification_report(y_test, y_pred)
print(report_text)

# --------------------------------------------------------------
# 10) Look at Coefficients
#     model.coef_ holds one row per class and one column per feature;
#     model.intercept_ holds one value per class.
#     NOTE(review): the printed label says "vs. others", but the model is
#     described above as multinomial, so these are presumably softmax
#     coefficients rather than one-vs-rest ones -- confirm the wording.
# --------------------------------------------------------------
classes_ = model.classes_
coefs = model.coef_
intercepts = model.intercept_

print("\n=== Multinomial Regression Coefficients ===")
for cls, class_intercept, class_weights in zip(classes_, intercepts, coefs):
    print(f"\nClass '{cls}' vs. others:")
    print("  Intercept:", class_intercept)
    # Feature names are listed in the same order as the columns of X.
    for col, weight in zip(predictors, class_weights):
        print(f"  {col}: {weight:.4f}")

# --------------------------------------------------------------
# 11) (Optional) Calculate an Ordinal Metric
#     e.g., the average absolute difference |y_pred - y_true|
# --------------------------------------------------------------
def mean_ordinal_error(y_true, y_pred):
    """Return the mean absolute difference between true and predicted labels.

    Parameters
    ----------
    y_true, y_pred : array-like of numeric ordinal labels (e.g. 1..7)
        Compared element by element, by position.

    Returns
    -------
    float
        Average of |y_true - y_pred|; 0 means every prediction is exact.
    """
    # Coerce to plain arrays first: subtracting two pandas Series aligns them
    # on their index labels, which yields NaN when the indexes differ (for
    # example a shuffled test split against freshly indexed predictions).
    true_arr = np.asarray(y_true, dtype=float)
    pred_arr = np.asarray(y_pred, dtype=float)
    return np.mean(np.abs(true_arr - pred_arr))

# Average number of scale points between prediction and truth on the test split.
moe = mean_ordinal_error(y_true=y_test, y_pred=y_pred)
print(f"\nMean Ordinal Error (0=perfect): {moe:.2f}")

# Another approach: (unweighted) mean squared error, which penalizes
# predictions that land several scale points away more heavily.
def mean_squared_ordinal_error(y_true, y_pred):
    """Return the mean squared difference between true and predicted labels.

    Parameters
    ----------
    y_true, y_pred : array-like of numeric ordinal labels (e.g. 1..7)
        Compared element by element, by position.

    Returns
    -------
    float
        Average of (y_true - y_pred)**2; 0 means every prediction is exact.
    """
    # Coerce to plain arrays so two pandas Series are compared positionally
    # instead of being aligned on (possibly different) index labels.
    true_arr = np.asarray(y_true, dtype=float)
    pred_arr = np.asarray(y_pred, dtype=float)
    return np.mean((true_arr - pred_arr) ** 2)

# Squared-distance counterpart of the ordinal error, on the same test split.
mse = mean_squared_ordinal_error(y_true=y_test, y_pred=y_pred)
print(f"Mean Squared Ordinal Error: {mse:.2f}")

0       7.0
1       4.0
2       5.0
3       4.0
4       1.0
       ... 
2834    1.0
2835    1.0
2836    1.0
2837    2.0
2838    7.0
Name: 20VoteSum, Length: 2839, dtype: float64
=== Confusion Matrix ===
[[98  4  5  0  1  1  0]
 [20  5  5  5  1  5  2]
 [19  4 17  5  1  0  0]
 [ 3  6  6  9  7  6  2]
 [ 1  0  1  5  8  8 15]
 [ 1  1  1  5 10  3 17]
 [ 0  1  0  3  7  5 68]]

=== Classification Report ===
              precision    recall  f1-score   support

         1.0       0.69      0.90      0.78       109
         2.0       0.24      0.12      0.16        43
         3.0       0.49      0.37      0.42        46
         4.0       0.28      0.23      0.25        39
         5.0       0.23      0.21      0.22        38
         6.0       0.11      0.08      0.09        38
         7.0       0.65      0.81      0.72        84

    accuracy                           0.52       397
   macro avg       0.38      0.39      0.38       397
weighted avg       0.47      0.52      0.49       397

