In [8]:
#!pip install pyreadstat
import pyreadstat
import pandas as pd

df_path = "anes_timeseries_2020_stata_20210324.dta"


[ 1. -1.  2. -9. -8.]


In [72]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy.stats import multivariate_normal, bernoulli, beta, norm
from scipy.special import expit as logistic_sigmoid
import statsmodels.api as sm
from sklearn.datasets import load_iris
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import log_loss
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score


In [75]:
# Read file
df, meta = pyreadstat.read_dta(df_path)
print(len(df['V200001'].unique())) # number of unique people who responded, should be equal to length of df before filtering

# Keep only observations from respondents who say they intend to vote
print("Shape before filtering:", df.shape)
df = df[df['V201032'].isin([1.])]
print("Shape after filtering:", df.shape)

# Define the outcome of interest as 1 if the voter will vote for the Republicans, 0 otherwise
df = df[(df['V201033'] != -9) & (df['V201033'] != -8)]
trump_votes_count = df['V201033'].value_counts().get(2.0, 0)
print("Number of people who prefer Trump:", trump_votes_count)

# Create binary outcome variable Y
Y = df['V201033'].apply(lambda x: 1 if x == 2.0 else 0)

# Filter out missing values from X_df based on indices of Y
X_df = pd.DataFrame({
    'Age': pd.to_numeric(df['V201507x'], errors='coerce'),
    'College_Degree_or_Higher': df['V201510'].apply(lambda x: 1 if x in ["6. Bachelor's degree (e.g. BA, AB, BS)", "7. Master's degree (e.g. MA, MS, MEng, MEd, MSW, MBA)", "4. Associate degree in college - occupational/vocational", "5. Associate degree in college - academic"] else 0),
    'Trust_in_Media': df['V201377'].apply(lambda x: 1 if (x == 5 or x == 4 or x == 3) else 0)
})

# Ensure consistent indices and drop missing values
X_df = X_df.dropna()
Y = Y.dropna()
Y = Y.loc[X_df.index]
print(f'size of Y: {len(Y)}')
print(f'size of X_df: {len(X_df)}')

# Create design matrix
X_with_intercept = sm.add_constant(X_df)
print(f'size of X_with_intercept: {len(X_with_intercept)}')

8280
Shape before filtering: (8280, 1381)
Shape after filtering: (7272, 1381)
Number of people who prefer Trump: 3016
size of Y: 7138
size of X_df: 7138
size of X_with_intercept: 7138


In [80]:
def logistic_regression(X, y, test_size=0.2, random_state=None):
    # Split data into train and test sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=random_state)

    # Initialize logistic regression model
    log_reg_model = LogisticRegression()

    # Fit the model on the training data
    log_reg_model.fit(X_train, y_train)

    # Predictions on the test data
    y_pred = log_reg_model.predict(X_test)

    # Evaluate the model
    classification_rep = classification_report(y_test, y_pred)
    conf_matrix = confusion_matrix(y_test, y_pred)
    accuracy = accuracy_score(y_test, y_pred)

    return classification_rep, conf_matrix, accuracy

classification_rep, conf_matrix, accuracy = logistic_regression(X_with_intercept, Y, test_size=0.3, random_state=420)
print("Classification Report:")
print(classification_rep)
print("\nConfusion Matrix:")
print(conf_matrix)
print(f'Accuracy:  {accuracy:.4f}')

Classification Report:
              precision    recall  f1-score   support

           0       0.83      0.75      0.79      1206
           1       0.71      0.81      0.76       936

    accuracy                           0.77      2142
   macro avg       0.77      0.78      0.77      2142
weighted avg       0.78      0.77      0.77      2142


Confusion Matrix:
[[899 307]
 [178 758]]
Accuracy:  0.7736
