In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt


In [2]:
import pyreadstat
# Read the .sav file
df, meta = pyreadstat.read_sav("2016_2020_mergedpanel.sav")
# Export to CSV
df.to_csv("output.csv", index=False)

In [30]:
file_path = 'output.csv'
    
print(f"Loading data from {file_path}...")
df = pd.read_csv(file_path)
print("Data successfully loaded!\n")

print("Initial DataFrame shape:", df.shape)

Loading data from output.csv...
Data successfully loaded!

Initial DataFrame shape: (2839, 3612)


  df = pd.read_csv(file_path)


In [31]:
df["V161361x"]

0       13.0
1       17.0
2       20.0
3        3.0
4       22.0
        ... 
2834     5.0
2835     6.0
2836    16.0
2837    21.0
2838    -9.0
Name: V161361x, Length: 2839, dtype: float64

In [4]:
# Display the first few rows
print("First 5 rows of the raw data:")
print(df.head())


First 5 rows of the raw data:
                   version2016  V160001  V160001_orig  V160101  V160101f  \
0  ANES2016TimeSeries_20190904      1.0      300001.0   0.8270    0.8877   
1  ANES2016TimeSeries_20190904      2.0      300002.0   1.0806    1.1605   
2  ANES2016TimeSeries_20190904      4.0      300004.0   0.3596    0.3852   
3  ANES2016TimeSeries_20190904      5.0      300006.0   0.6470    0.6931   
4  ANES2016TimeSeries_20190904      7.0      300008.0   3.9604    4.2512   

   V160101w  V160102  V160102f  V160102w  V160201  ...  \
0       0.0   0.8420    0.9271       0.0    121.0  ...   
1       0.0   1.0133    1.0841       0.0    123.0  ...   
2       0.0   0.3663    0.4183       0.0    118.0  ...   
3       0.0   0.6463    0.7262       0.0    113.0  ...   
4       0.0   4.6151    4.7902       0.0    105.0  ...   

                     V203518  V203519           V203520  V203521  V203522  \
0           -1. Inapplicable     -1.0  -1. Inapplicable     -1.0     -1.0   
1  Stephen

In [5]:
print("\nData info (column types, non-null counts):")
print(df.info())


Data info (column types, non-null counts):
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2839 entries, 0 to 2838
Columns: 3612 entries, version2016 to V203527
dtypes: float64(3549), int64(12), object(51)
memory usage: 78.2+ MB
None


In [6]:
threshold = 0.6  # 80% missing
columns_to_drop = [col for col in df.columns 
                   if df[col].isnull().mean() > threshold]
if columns_to_drop:
    print(f"\nDropping columns with > {threshold*100}% missing values:")
    print(columns_to_drop)
    df.drop(columns=columns_to_drop, axis=1, inplace=True)
    print("New DataFrame shape:", df.shape)


Dropping columns with > 60.0% missing values:
['V162084', 'V200012a', 'V200012b', 'V200012c', 'V200012d', 'V200013a', 'V200013b', 'V200013c', 'V200013d', 'V200014a', 'V200014b', 'V200014c', 'V200014d', 'V200015a', 'V200015b', 'V200015c', 'V200015d']
New DataFrame shape: (2839, 3595)


In [8]:
print("\nBasic statistical summary (numerical columns):")
print(df.describe())

# B. Distribution of Key Variables
# Adjust these variable names to match actual columns of interest
variables_of_interest = ['age', 'income', 'ideology', 'vote_choice', 'VCF0705', 'VCF0706']
existing_vars = [var for var in variables_of_interest if var in df.columns]
valid_values = {0, 1, 2, 3, 4, 7}

# Create a filtered DataFrame
#filtered_df = df[df[['VCF0705', 'VCF0706']].isin(valid_values).all(axis=1)]
filtered_df = df
print(filtered_df.head())

if existing_vars:
    print(f"\nPlotting histograms for the following variables: {existing_vars}")
    for var in existing_vars:
        plt.figure(figsize=(6, 4))
        sns.histplot(data=filtered_df, x=var, kde=True, color='blue')
        plt.title(f"Distribution of {var}")
        plt.tight_layout()
        plt.show()
else:
    print("\nNo matching variables found for distribution plotting. Check your column names.")


Basic statistical summary (numerical columns):
           V160001   V160001_orig      V160101     V160101f     V160101w  \
count  2839.000000    2839.000000  2839.000000  2839.000000  2839.000000   
mean   2828.636492  378605.299753     0.995603     0.257698     0.736337   
std    1503.975076   44271.572388     0.659564     0.589269     0.686505   
min       1.000000  300001.000000     0.099300     0.000000     0.000000   
25%    2011.000000  400026.500000     0.576450     0.000000     0.278400   
50%    3066.000000  402698.000000     0.826900     0.000000     0.673500   
75%    4089.500000  405259.500000     1.239600     0.000000     0.981850   
max    5087.000000  407796.000000     6.813900     7.308800     5.660000   

           V160102     V160102f     V160102w      V160201     V160201f  ...  \
count  2839.000000  2839.000000  2839.000000  2839.000000  2839.000000  ...   
mean      0.995416     0.254463     0.738672    67.734414     4.139486  ...   
std       0.665582     0.58129

In [32]:
import pandas as pd

# Assume df is already loaded (e.g., from a CSV or another source).

# --- 1. Define mapping dictionaries for each variable ---

# 16Vote
vote16_map = {
    1: "Democrat",
    2: "Republican",
    3: "Independent",
    5: "Other"
}

# 16strongvote
strongvote16_map = {
    1: "Strong",
    2: "Weak",
    3: "Inapplicable"
}

# 16closer
closer16_map = {
    1: "Republican",
    2: "Neither",
    3: "Democrat",
    -1: "Inapplicable"
}

# 16votesum
votesum16_map = {
    1: "Strong Democrat",
    2: "Not very strong Democrat",
    3: "Independent-Democrat",
    4: "Independent",
    5: "Independent-Republican",
    6: "Not very strong Republican",
    7: "Strong Republican"
}

# 20Vote
vote20_map = {
    1: "Democrat",
    2: "Republican",
    3: "Independent",
    5: "Other"
}

# 20strongvote
strongvote20_map = {
    1: "Strong",
    2: "Weak",
    3: "Inapplicable"
}

# 20closer
closer20_map = {
    1: "Republican",
    2: "Neither",
    3: "Democrat",
    -1: "Inapplicable"
}

# 20votesum
votesum20_map = {
    1: "Strong Democrat",
    2: "Not very strong Democrat",
    3: "Independent-Democrat",
    4: "Independent",
    5: "Independent-Republican",
    6: "Not very strong Republican",
    7: "Strong Republican"
}

# 16gun_harder
gun_harder16_map = {
    1: "More strict",
    2: "Less strict",
    3: "Same as now"
}

# 16gun_importance
gun_importance16_map = {
    1: "Most important",
    2: "Important",
    3: "Neutral",
    4: "Less important",
    5: "Not important"
}

# 20gun_harder
gun_harder20_map = {
    1: "More strict",
    2: "Less strict",
    3: "Same as now"
}

# 20gun_importance
gun_importance20_map = {
    1: "Most important",
    2: "Important",
    3: "Neutral",
    4: "Less important",
    5: "Not important"
}

# --- 2. Create new columns with mapped values ---

# 2016
df["16Vote"] = df["V161155"].apply(lambda x: vote16_map[x] if x in vote16_map else "No answer")
df["16StrongVote"] = df["V161156"].apply(lambda x: strongvote16_map[x] if x in strongvote16_map else "No answer")
df["16Closer"] = df["V161157"].apply(lambda x: closer16_map[x] if x in closer16_map else "No answer")
df["16VoteSum"] = df["V161158x"].apply(lambda x: votesum16_map[x] if x in votesum16_map else "No answer")

df["16GunHarder"] = df["V161187"].apply(lambda x: gun_harder16_map[x] if x in gun_harder16_map else "No answer")
df["16GunImportance"] = df["V161188"].apply(lambda x: gun_importance16_map[x] if x in gun_importance16_map else "No answer")

# For 16GunHowMany, treat non-numeric or negative as "No answer"
def parse_16_gunhowmany(x):
    try:
        val = float(x)
        return val if val >= 0 else None
    except:
        return None

df["16GunHowMany"] = df["V161496"].apply(parse_16_gunhowmany)

# 2020
df["20Vote"] = df["V201228"].apply(lambda x: vote20_map[x] if x in vote20_map else "No answer")
df["20StrongVote"] = df["V201229"].apply(lambda x: strongvote20_map[x] if x in strongvote20_map else "No answer")
df["20Closer"] = df["V201230"].apply(lambda x: closer20_map[x] if x in closer20_map else "No answer")
df["20VoteSum"] = df["V201231x"].apply(lambda x: votesum20_map[x] if x in votesum20_map else "No answer")

df["20GunHarder"] = df["V202337"].apply(lambda x: gun_harder20_map[x] if x in gun_harder20_map else "No answer")
df["20GunImportance"] = df["V202338"].apply(lambda x: gun_importance20_map[x] if x in gun_importance20_map else "No answer")

# For 20GunHowMany, treat non-numeric or negative as "No answer"
def parse_20_gunhowmany(x):
    try:
        val = float(x)
        return val if val >= 0 else None
    except:
        return None

df["20GunHowMany"] = df["V201628"].apply(parse_20_gunhowmany)

# --- 3. Subset the columns of interest ---
subset_columns = [
    "16Vote", "16StrongVote", "16Closer", "16VoteSum",
    "16GunHarder", "16GunImportance", "16GunHowMany",
    "20Vote", "20StrongVote", "20Closer", "20VoteSum",
    "20GunHarder", "20GunImportance", "20GunHowMany"
]

df_subset = df[subset_columns]

# --- 4. Save to CSV ---
df_subset.to_csv("vote_and_gun.csv", index=False)

In [38]:
# Check missingness
missing_summary = df_final_subset.isna().sum() / len(df) * 100
print("Percentage of missing values per column:\n", missing_summary.sort_values(ascending=False))

Percentage of missing values per column:
 20Fundamentalist    8.770694
20Conservatives     7.749207
20Union             7.608313
20Jews              7.608313
20GunHowMany        7.573089
20Muslims           7.573089
20Transgender       7.502642
20Liberal           7.467418
20Gay               7.361747
20BigBusiness       7.220852
20Feminist          7.220852
20Blm               7.115181
20Christ            7.079958
20Congress          6.868616
20Scientist         6.833392
20Police            6.657274
20SocMed            6.586826
20Income            6.269813
16GunHowMany        5.389222
16Income            4.050722
16Fundamentalist    3.416696
20Age               3.240578
16Age               2.817894
16Transgender       1.831631
16Feminist          1.796407
16Muslims           1.761184
16Jews              1.620289
16Liberal           1.479394
16Blm               1.479394
16Gay               1.338499
16Union             1.338499
16Conservatives     1.338499
16HandleImmig       1.232828
1

In [14]:
df=df_subset

In [17]:
# Crosstab example
pd.crosstab(df["16VoteSum"], df["20VoteSum"], dropna=False)

20VoteSum,Independent,Independent-Democrat,Independent-Republican,No answer,Not very strong Democrat,Not very strong Republican,Strong Democrat,Strong Republican
16VoteSum,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Independent,151,39,55,0,30,28,15,31
Independent-Democrat,44,141,23,0,42,20,62,6
Independent-Republican,46,15,137,1,4,57,8,66
No answer,3,2,1,0,0,1,4,3
Not very strong Democrat,30,47,12,0,137,17,114,23
Not very strong Republican,24,19,54,0,21,110,11,117
Strong Democrat,12,38,2,0,62,2,487,6
Strong Republican,6,0,19,0,1,54,3,376


In [22]:
pd.crosstab(df["16GunImportance"], df["16VoteSum"], dropna=False)

16VoteSum,Independent,Independent-Democrat,Independent-Republican,No answer,Not very strong Democrat,Not very strong Republican,Strong Democrat,Strong Republican
16GunImportance,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Important,111,122,109,4,120,100,192,163
Less important,32,26,17,0,33,44,25,22
Most important,92,82,107,4,106,89,263,164
Neutral,95,98,91,4,110,108,115,97
No answer,2,0,0,2,1,1,0,1
Not important,17,10,10,0,10,14,14,12


In [33]:
import pandas as pd
import numpy as np

# Assume df is already loaded (e.g., from a CSV or another source).

# ============================================================
# 1. Define helper functions or mappings for new columns
# ============================================================

def parse_age(x):
    """
    Returns age as integer if x is non-negative.
    Otherwise returns None (treated as NaN).
    """
    try:
        val = float(x)
        if val < 0:
            return None
        return int(val)
    except:
        return None

def parse_income(x):
    """
    Converts string or numeric income to an integer if non-negative.
    If negative or parse fails, returns None.
    Example: '01' -> 1, 12 -> 12, etc.
    """
    val = int(x)
    if val<0:
        return None
    else:
        return val

def parse_marriage_16_20(x):
    """
    1. Married: spouse present
    2. Married: spouse absent
    3. Widowed
    4. Divorced
    5. Separated
    6. Never married
    Any other: No answer
    """
    valid_codes = [1,2,3,4,5,6]
    return x if x in valid_codes else None

def parse_socmed_16_20(x):
    """
    Valid range is 1 to 4 inclusive.
    Otherwise, treat as no answer.
    """
    try:
        val = int(x)
        if 1 <= val <= 4:
            return val
        else:
            return None
    except:
        return None

def parse_handle_16_20(x):
    """
    Valid range is 1 to 4 inclusive.
    Otherwise, treat as no answer.
    """
    try:
        val = int(x)
        if 1 <= val <= 4:
            return val
        else:
            return None
    except:
        return None

# ============================================================
# 2. Map/Transform 2016 & 2020 variables for new categories
# ============================================================

# --- Age ---
df["16Age"] = df["V161267"].apply(parse_age)
df["20Age"] = df["V201507x"].apply(parse_age)

# --- Income ---
df["16Income"] = df["V161361x"].apply(parse_income)
df["20Income"] = df["V201617x"].apply(parse_income)

# --- Marriage ---
df["16Marriage"] = df["V161268"].apply(parse_marriage_16_20)
df["20Marriage"] = df["V201508"].apply(parse_marriage_16_20)

# --- Social Media usage (socmed) ---
df["16SocMed"] = df["V162257"].apply(parse_socmed_16_20)
df["20SocMed"] = df["V202407"].apply(parse_socmed_16_20)

# --- Handle Health Care ---
df["16HandleHealth"] = df["V161085x"].apply(parse_handle_16_20)
df["20HandleHealth"] = df["V201138x"].apply(parse_handle_16_20)

# --- Handle Immigration ---
df["16HandleImmig"] = df["V161084x"].apply(parse_handle_16_20)
df["20HandleImmig"] = df["V201141x"].apply(parse_handle_16_20)

# ============================================================
# 3. Add these to the previously featured subset
# ============================================================

# Example of the original subset columns from previous steps:
prev_subset_columns = [
    "16Vote", "16StrongVote", "16Closer", "16VoteSum",
    "16GunHarder", "16GunImportance", "16GunHowMany",
    "20Vote", "20StrongVote", "20Closer", "20VoteSum",
    "20GunHarder", "20GunImportance", "20GunHowMany"
]

# Now add the new columns to create an updated subset
new_columns = [
    "16Age", "20Age", 
    "16Income", "20Income", 
    "16Marriage", "20Marriage", 
    "16SocMed", "20SocMed",
    "16HandleHealth", "20HandleHealth",
    "16HandleImmig", "20HandleImmig"
]

combined_columns = prev_subset_columns + new_columns

df_extended_subset = df[combined_columns]

# ============================================================
# 4. Save the extended subset
# ============================================================
df_extended_subset.to_csv("vote_gun_with_demo.csv", index=False)

print("Saved extended subset to 'vote_gun_with_demo.csv'")

Saved extended subset to 'vote_gun_with_demo.csv'


In [34]:
df_extended_subset.head()

Unnamed: 0,16Vote,16StrongVote,16Closer,16VoteSum,16GunHarder,16GunImportance,16GunHowMany,20Vote,20StrongVote,20Closer,...,16Income,20Income,16Marriage,20Marriage,16SocMed,20SocMed,16HandleHealth,20HandleHealth,16HandleImmig,20HandleImmig
0,Republican,Strong,Inapplicable,Strong Republican,Same as now,Important,10.0,Republican,Strong,Inapplicable,...,13.0,13.0,1.0,1.0,2.0,3.0,4.0,1.0,4.0,1.0
1,Republican,Weak,Inapplicable,Not very strong Republican,Same as now,Most important,0.0,Independent,No answer,Neither,...,17.0,8.0,6.0,6.0,2.0,1.0,4.0,4.0,,1.0
2,No answer,No answer,Republican,Independent-Republican,Same as now,Most important,,Other,No answer,Republican,...,20.0,14.0,1.0,1.0,2.0,3.0,4.0,1.0,3.0,1.0
3,Independent,No answer,Democrat,Independent-Democrat,More strict,Most important,0.0,Independent,No answer,Neither,...,3.0,3.0,4.0,4.0,3.0,2.0,4.0,4.0,3.0,4.0
4,Democrat,Strong,Inapplicable,Strong Democrat,More strict,Most important,0.0,Democrat,Strong,Inapplicable,...,22.0,13.0,1.0,1.0,3.0,2.0,3.0,4.0,2.0,4.0


In [36]:
import pandas as pd
import numpy as np

# Assume df is already loaded (e.g., from a CSV or another source).

# =============================================================================
# 1. Define the parser for opinions in [0, 100]
# =============================================================================
def parse_opinion(x):
    """
    Expects values 0..100 (inclusive). Any other value -> None.
    Attempts to parse as integer; if parsing fails or out of range, return None.
    """
    try:
        val = int(x)
        if 0 <= val <= 100:
            return val
        else:
            return None
    except:
        return None

# =============================================================================
# 2. Apply to 2016 opinions
# =============================================================================

df["16Fundamentalist"] = df["V162095"].apply(parse_opinion)
df["16Feminist"]       = df["V162096"].apply(parse_opinion)
df["16Liberal"]        = df["V162097"].apply(parse_opinion)
df["16Union"]          = df["V162098"].apply(parse_opinion)
df["16BigBusiness"]    = df["V162100"].apply(parse_opinion)
df["16Conservatives"]  = df["V162101"].apply(parse_opinion)
df["16Gay"]            = df["V162103"].apply(parse_opinion)
df["16Congress"]       = df["V162104"].apply(parse_opinion)
df["16Muslims"]        = df["V162106"].apply(parse_opinion)
df["16Jews"]           = df["V162108"].apply(parse_opinion)
df["16Christ"]         = df["V162107"].apply(parse_opinion)
df["16Police"]         = df["V162110"].apply(parse_opinion)
df["16Transgender"]    = df["V162111"].apply(parse_opinion)
df["16Scientist"]      = df["V162112"].apply(parse_opinion)
df["16Blm"]            = df["V162113"].apply(parse_opinion)

# =============================================================================
# 3. Apply to 2020 opinions
# =============================================================================

df["20Fundamentalist"] = df["V202159"].apply(parse_opinion)
df["20Feminist"]       = df["V202160"].apply(parse_opinion)
df["20Liberal"]        = df["V202161"].apply(parse_opinion)
df["20Union"]          = df["V202162"].apply(parse_opinion)
df["20BigBusiness"]    = df["V202163"].apply(parse_opinion)
df["20Conservatives"]  = df["V202164"].apply(parse_opinion)
df["20Gay"]            = df["V202166"].apply(parse_opinion)
df["20Congress"]       = df["V202167"].apply(parse_opinion)
df["20Muslims"]        = df["V202168"].apply(parse_opinion)
df["20Jews"]           = df["V202170"].apply(parse_opinion)
df["20Christ"]         = df["V202169"].apply(parse_opinion)
df["20Police"]         = df["V202171"].apply(parse_opinion)
df["20Transgender"]    = df["V202172"].apply(parse_opinion)
df["20Scientist"]      = df["V202173"].apply(parse_opinion)
df["20Blm"]            = df["V202174"].apply(parse_opinion)

# =============================================================================
# 4. Combine new opinion columns with the existing subset
# =============================================================================

# Example: if you have previously stored the subset columns in `prev_subset_columns`
# and the newly added demographic/policy columns in `new_columns`,
# you can add these new "opinion" columns to create a final expanded subset.

prev_subset_columns = [
    # Previously used vote & gun columns...
    "16Vote", "16StrongVote", "16Closer", "16VoteSum",
    "16GunHarder", "16GunImportance", "16GunHowMany",
    "20Vote", "20StrongVote", "20Closer", "20VoteSum",
    "20GunHarder", "20GunImportance", "20GunHowMany"
]

# Example from before: newly added demographics/policy columns
new_columns = [
    "16Age", "20Age", 
    "16Income", "20Income", 
    "16Marriage", "20Marriage", 
    "16SocMed", "20SocMed",
    "16HandleHealth", "20HandleHealth",
    "16HandleImmig", "20HandleImmig"
]

# Now define the opinion columns:
opinion_columns_16_20 = [
    "16Fundamentalist", "16Feminist", "16Liberal", "16Union", "16BigBusiness",
    "16Conservatives", "16Gay", "16Congress", "16Muslims", "16Jews",
    "16Christ", "16Police", "16Transgender", "16Scientist", "16Blm",
    "20Fundamentalist", "20Feminist", "20Liberal", "20Union", "20BigBusiness",
    "20Conservatives", "20Gay", "20Congress", "20Muslims", "20Jews",
    "20Christ", "20Police", "20Transgender", "20Scientist", "20Blm"
]

# Combine them all
combined_columns = prev_subset_columns + new_columns + opinion_columns_16_20

# Build the final extended subset
df_final_subset = df[combined_columns]

# =============================================================================
# 5. Save the final subset
# =============================================================================
df_final_subset.to_csv("vote_gun_demo_opinions.csv", index=False)

print("Saved final extended subset to 'vote_gun_demo_opinions.csv'.")

Saved final extended subset to 'vote_gun_demo_opinions.csv'.


In [37]:
df_final_subset.head()

Unnamed: 0,16Vote,16StrongVote,16Closer,16VoteSum,16GunHarder,16GunImportance,16GunHowMany,20Vote,20StrongVote,20Closer,...,20Conservatives,20Gay,20Congress,20Muslims,20Jews,20Christ,20Police,20Transgender,20Scientist,20Blm
0,Republican,Strong,Inapplicable,Strong Republican,Same as now,Important,10.0,Republican,Strong,Inapplicable,...,85.0,60.0,50.0,30.0,70.0,100.0,100.0,50.0,100.0,15.0
1,Republican,Weak,Inapplicable,Not very strong Republican,Same as now,Most important,0.0,Independent,No answer,Neither,...,100.0,50.0,50.0,50.0,50.0,50.0,70.0,50.0,100.0,0.0
2,No answer,No answer,Republican,Independent-Republican,Same as now,Most important,,Other,No answer,Republican,...,50.0,0.0,60.0,50.0,50.0,90.0,80.0,0.0,50.0,0.0
3,Independent,No answer,Democrat,Independent-Democrat,More strict,Most important,0.0,Independent,No answer,Neither,...,50.0,100.0,60.0,100.0,100.0,50.0,60.0,100.0,85.0,85.0
4,Democrat,Strong,Inapplicable,Strong Democrat,More strict,Most important,0.0,Democrat,Strong,Inapplicable,...,50.0,85.0,70.0,85.0,85.0,85.0,70.0,50.0,15.0,100.0


In [41]:
df_final_subset.columns

Index(['16Vote', '16StrongVote', '16Closer', '16VoteSum', '16GunHarder',
       '16GunImportance', '16GunHowMany', '20Vote', '20StrongVote', '20Closer',
       '20VoteSum', '20GunHarder', '20GunImportance', '20GunHowMany', '16Age',
       '20Age', '16Income', '20Income', '16Marriage', '20Marriage', '16SocMed',
       '20SocMed', '16HandleHealth', '20HandleHealth', '16HandleImmig',
       '20HandleImmig', '16Fundamentalist', '16Feminist', '16Liberal',
       '16Union', '16BigBusiness', '16Conservatives', '16Gay', '16Congress',
       '16Muslims', '16Jews', '16Christ', '16Police', '16Transgender',
       '16Scientist', '16Blm', '20Fundamentalist', '20Feminist', '20Liberal',
       '20Union', '20BigBusiness', '20Conservatives', '20Gay', '20Congress',
       '20Muslims', '20Jews', '20Christ', '20Police', '20Transgender',
       '20Scientist', '20Blm'],
      dtype='object')

In [42]:
df_final_subset.describe()

Unnamed: 0,16GunHowMany,20GunHowMany,16Age,20Age,16Income,20Income,16Marriage,20Marriage,16SocMed,20SocMed,...,20Conservatives,20Gay,20Congress,20Muslims,20Jews,20Christ,20Police,20Transgender,20Scientist,20Blm
count,2686.0,2624.0,2759.0,2747.0,2724.0,2661.0,2822.0,2826.0,2834.0,2652.0,...,2619.0,2630.0,2644.0,2624.0,2623.0,2638.0,2650.0,2626.0,2645.0,2637.0
mean,1.432986,1.548399,49.455962,52.859847,16.157122,11.993611,2.874557,2.722222,2.190896,2.147436,...,54.787323,64.9,44.213691,57.616616,72.822722,72.02464,71.772075,58.744478,78.875236,51.380736
std,4.170547,5.312135,17.034521,16.566437,7.847302,6.635598,2.116728,2.053527,0.805848,0.813724,...,28.225116,26.343841,21.820635,24.295803,21.864978,24.799181,24.478697,27.14619,20.098893,35.409933
min,0.0,0.0,18.0,19.0,1.0,1.0,1.0,1.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,35.0,39.0,11.0,6.0,1.0,1.0,2.0,2.0,...,40.0,50.0,30.0,50.0,50.0,50.0,60.0,50.0,70.0,15.0
50%,0.0,0.0,50.0,54.0,17.0,12.0,1.0,1.0,2.0,2.0,...,50.0,60.0,50.0,50.0,70.0,75.0,75.0,50.0,85.0,60.0
75%,1.0,1.0,63.0,67.0,23.0,18.0,5.0,4.0,3.0,3.0,...,80.0,85.0,60.0,70.0,90.0,100.0,88.75,85.0,100.0,85.0
max,99.0,99.0,90.0,80.0,28.0,22.0,6.0,6.0,4.0,4.0,...,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0


In [44]:
deneme = df_final_subset.dropna()

In [45]:
deneme.describe()

Unnamed: 0,16GunHowMany,20GunHowMany,16Age,20Age,16Income,20Income,16Marriage,20Marriage,16SocMed,20SocMed,...,20Conservatives,20Gay,20Congress,20Muslims,20Jews,20Christ,20Police,20Transgender,20Scientist,20Blm
count,1928.0,1928.0,1928.0,1928.0,1928.0,1928.0,1928.0,1928.0,1928.0,1928.0,...,1928.0,1928.0,1928.0,1928.0,1928.0,1928.0,1928.0,1928.0,1928.0,1928.0
mean,1.356846,1.594398,48.282158,51.754149,16.612552,12.310685,2.947095,2.769191,2.149896,2.143154,...,52.839212,66.820539,44.179979,58.891598,73.17168,70.592324,70.221473,60.588174,79.850104,53.452801
std,3.546598,5.470723,16.746631,16.387964,7.70592,6.555218,2.150609,2.086143,0.792963,0.817076,...,28.58531,26.346558,21.686167,24.107314,21.929028,25.343756,24.996901,27.28052,19.941764,35.179012
min,0.0,0.0,18.0,19.0,1.0,1.0,1.0,1.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,34.0,38.0,11.0,7.0,1.0,1.0,2.0,2.0,...,30.0,50.0,30.0,50.0,50.0,50.0,60.0,50.0,70.0,15.0
50%,0.0,0.0,49.0,53.0,17.0,13.0,1.0,1.0,2.0,2.0,...,50.0,70.0,49.5,50.0,75.0,70.0,70.0,50.0,85.0,60.0
75%,1.0,2.0,62.0,65.0,23.0,18.0,6.0,4.0,3.0,3.0,...,70.0,86.25,60.0,75.0,90.0,95.0,85.0,85.0,100.0,85.0
max,49.0,99.0,90.0,80.0,28.0,22.0,6.0,6.0,4.0,4.0,...,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0


In [None]:
 NaN rate is at most 7% per column and zero for many of them, however if we drop all nan, data reduces down to 1.9k participant. So maybe a way to overcome this drop or at least making it smaller will be good. 

In [46]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Assume df is already loaded:
# df = pd.read_csv("your_data.csv")

# -------------------------------------------------------------------
# 1. Basic Overview
# -------------------------------------------------------------------
print("=== DataFrame Info ===")
print(df.info())

print("\n=== Basic Descriptive Statistics (all columns) ===")
print(df.describe(include="all"))

print("\n=== Missing Values by Column ===")
print(df.isna().sum())

# -------------------------------------------------------------------
# 2. Categorical Variables Analysis
#    We'll consider the "Vote" and "StrongVote" columns as categorical.
#    Feel free to add more columns if you know they are categorical.
# -------------------------------------------------------------------
categorical_cols = [
    "16Vote", "16StrongVote", "16Closer", "16VoteSum",
    "20Vote", "20StrongVote", "20Closer", "20VoteSum",
    "16Marriage", "20Marriage"
    # etc., add more if you treat them as categorical
]

for col in categorical_cols:
    # Value counts
    print(f"\n=== Value Counts: {col} ===")
    print(df[col].value_counts(dropna=False))
    
    # Bar plot
    plt.figure(figsize=(6,4))
    df[col].value_counts(dropna=False).plot(kind='bar', color='skyblue')
    plt.title(f"Distribution of {col}")
    plt.xlabel(col)
    plt.ylabel("Count")
    plt.tight_layout()
    plt.savefig(f"{col}_barplot.png")
    plt.close()

# -------------------------------------------------------------------
# 3. Numeric Variables Analysis
#    Many of these are 0-100 thermometers, or discrete counts, or ages/income.
#    Adjust this list depending on your knowledge of which are numeric.
# -------------------------------------------------------------------
numeric_cols = [
    "16GunHowMany", "20GunHowMany", 
    "16Age", "20Age", 
    "16Income", "20Income", 
    "16SocMed", "20SocMed",
    "16HandleHealth", "20HandleHealth",
    "16HandleImmig", "20HandleImmig",
    "16Fundamentalist", "16Feminist", "16Liberal", "16Union", "16BigBusiness",
    "16Conservatives", "16Gay", "16Congress", "16Muslims", "16Jews",
    "16Christ", "16Police", "16Transgender", "16Scientist", "16Blm",
    "20Fundamentalist", "20Feminist", "20Liberal", "20Union", "20BigBusiness",
    "20Conservatives", "20Gay", "20Congress", "20Muslims", "20Jews",
    "20Christ", "20Police", "20Transgender", "20Scientist", "20Blm"
]

# Describe numeric columns
print("\n=== Descriptive Statistics for Numeric Columns ===")
print(df[numeric_cols].describe())

# Histograms & Boxplots
for col in numeric_cols:
    # Histogram
    plt.figure(figsize=(6,4))
    df[col].dropna().hist(bins=20, color='skyblue', edgecolor='black')
    plt.title(f"Histogram of {col}")
    plt.xlabel(col)
    plt.ylabel("Frequency")
    plt.tight_layout()
    plt.savefig(f"{col}_hist.png")
    plt.close()
    
    # Boxplot
    plt.figure(figsize=(3,5))
    sns.boxplot(y=df[col], color='lightgreen')
    plt.title(f"Boxplot of {col}")
    plt.ylabel(col)
    plt.tight_layout()
    plt.savefig(f"{col}_box.png")
    plt.close()

# -------------------------------------------------------------------
# 4. Cross-Tab: 2016 Vote vs. 2020 Vote
#    This is very useful to see if there's vote switching.
# -------------------------------------------------------------------
print("\n=== Cross-Tab: 16Vote vs. 20Vote ===")
vote_crosstab = pd.crosstab(df["16Vote"], df["20Vote"], dropna=False)
print(vote_crosstab)

# Let's plot the cross-tab as a heatmap
plt.figure(figsize=(6,4))
sns.heatmap(vote_crosstab, annot=True, cmap="Blues", fmt="d")
plt.title("Cross-tab of 16Vote vs 20Vote")
plt.xlabel("20Vote")
plt.ylabel("16Vote")
plt.tight_layout()
plt.savefig("16vs20_vote_crosstab.png")
plt.close()

# -------------------------------------------------------------------
# 5. Correlation Matrix for Numeric Variables
#    This can reveal how attitudes might cluster together.
# -------------------------------------------------------------------
# Filter out columns that have at least some numeric data
df_numeric = df[numeric_cols].dropna(axis=0, how="all")  # rows that are entirely NaN won't help
corr_matrix = df_numeric.corr()

plt.figure(figsize=(10,8))
sns.heatmap(corr_matrix, cmap="coolwarm", center=0, square=True)
plt.title("Correlation Heatmap (Numeric Columns)")
plt.tight_layout()
plt.savefig("numeric_correlation_heatmap.png")
plt.close()

# -------------------------------------------------------------------
# 6. Example: Compare 16 vs. 20 versions of the same variable
#    For instance, Age is obvious (should be +4 if consistent),
#    but let's do a scatter plot or pair plot for a few variables.
# -------------------------------------------------------------------
compare_pairs = [("16Liberal", "20Liberal"), ("16GunHowMany", "20GunHowMany")]
for (col16, col20) in compare_pairs:
    plt.figure(figsize=(6,4))
    sns.scatterplot(x=df[col16], y=df[col20], alpha=0.5)
    plt.title(f"Scatter Plot of {col16} vs. {col20}")
    plt.xlabel(col16)
    plt.ylabel(col20)
    plt.tight_layout()
    plt.savefig(f"{col16}_vs_{col20}_scatter.png")
    plt.close()


=== DataFrame Info ===
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2839 entries, 0 to 2838
Columns: 3668 entries, version2016 to 20Blm
dtypes: float64(3593), int64(12), object(63)
memory usage: 79.4+ MB
None

=== Basic Descriptive Statistics (all columns) ===
                        version2016      V160001   V160001_orig      V160101  \
count                          2839  2839.000000    2839.000000  2839.000000   
unique                            1          NaN            NaN          NaN   
top     ANES2016TimeSeries_20190904          NaN            NaN          NaN   
freq                           2839          NaN            NaN          NaN   
mean                            NaN  2828.636492  378605.299753     0.995603   
std                             NaN  1503.975076   44271.572388     0.659564   
min                             NaN     1.000000  300001.000000     0.099300   
25%                             NaN  2011.000000  400026.500000     0.576450   
50%           

In [47]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Assume df is already loaded
# df = pd.read_csv("your_data.csv")

# -----------------------------------------------------------------------------
# 1. Compare Numeric Variables (16 vs 20) on the Same Figure
#    E.g. overlay histograms for "16GunHowMany" vs. "20GunHowMany", etc.
# -----------------------------------------------------------------------------

numeric_pairs = [
    ("16GunHowMany", "20GunHowMany"),
    ("16Age", "20Age"),
    ("16Income", "20Income"),
    ("16SocMed", "20SocMed"),
    ("16HandleHealth", "20HandleHealth"),
    ("16HandleImmig", "20HandleImmig"),
    # Attitude thermometers (0–100)
    ("16Liberal", "20Liberal"),
    ("16Conservatives", "20Conservatives"),
    ("16Police", "20Police"),
    ("16Blm", "20Blm"),
    # ...add more if you like
]

for (col_16, col_20) in numeric_pairs:
    # Overlay Histograms
    plt.figure(figsize=(6,4))
    df[col_16].dropna().hist(alpha=0.5, bins=20, color='blue', label=col_16)
    df[col_20].dropna().hist(alpha=0.5, bins=20, color='orange', label=col_20)
    plt.title(f"Histogram Comparison: {col_16} vs. {col_20}")
    plt.xlabel("Value")
    plt.ylabel("Frequency")
    plt.legend()
    plt.tight_layout()
    plt.savefig(f"hist_comparison_{col_16}_vs_{col_20}.png")
    plt.close()
    
    # Side-by-side Boxplots
    plt.figure(figsize=(4,5))
    data_for_box = pd.DataFrame({
        col_16: df[col_16],
        col_20: df[col_20]
    })
    sns.boxplot(data=data_for_box, palette=["blue", "orange"])
    plt.title(f"Boxplot Comparison: {col_16} & {col_20}")
    plt.ylabel("Value")
    plt.savefig(f"box_comparison_{col_16}_vs_{col_20}.png")
    plt.close()

# -----------------------------------------------------------------------------
# 2. Compare Categorical Variables (16 vs 20) on the Same Figure
#    E.g. "16Vote" vs "20Vote", "16Marriage" vs "20Marriage"
# -----------------------------------------------------------------------------

categorical_pairs = [
    ("16Vote", "20Vote"),
    ("16Marriage", "20Marriage"),
    ("16StrongVote", "20StrongVote"),
    # etc., add more pairs if appropriate
]

for (col_16, col_20) in categorical_pairs:
    # Count distributions for each year
    dist_16 = df[col_16].value_counts(dropna=False)
    dist_20 = df[col_20].value_counts(dropna=False)
    
    # Combine into a single DataFrame for side-by-side bar chart
    combined_df = pd.DataFrame({
        "2016": dist_16,
        "2020": dist_20
    }).fillna(0)
    
    # Sort indexes if you want consistent ordering
    # combined_df = combined_df.sort_index()
    
    plt.figure(figsize=(8,4))
    combined_df.plot(kind="bar", ax=plt.gca())
    plt.title(f"Comparison of {col_16} vs. {col_20}")
    plt.xlabel("Categories")
    plt.ylabel("Count")
    plt.xticks(rotation=45)
    plt.tight_layout()
    plt.savefig(f"bar_comparison_{col_16}_vs_{col_20}.png")
    plt.close()

# -----------------------------------------------------------------------------
# 3. Bonus: Directly Plot Differences (If Desired)
#    Example: "20Liberal - 16Liberal"
# -----------------------------------------------------------------------------
# We can create "difference" columns for certain numeric variables and plot them.

difference_pairs = [
    ("16Liberal", "20Liberal"),
    ("16Blm", "20Blm"),
    # Add more if you want difference plots
]

for (col_16, col_20) in difference_pairs:
    diff_col = f"diff_{col_20}"
    df[diff_col] = df[col_20] - df[col_16]
    
    plt.figure(figsize=(6,4))
    df[diff_col].dropna().hist(bins=20, color='purple', edgecolor='black')
    plt.title(f"Distribution of {diff_col} (i.e. {col_20} - {col_16})")
    plt.xlabel("Difference")
    plt.ylabel("Frequency")
    plt.tight_layout()
    plt.savefig(f"hist_{diff_col}.png")
    plt.close()

print("Comparison plots saved. Check your PNG files!")

Comparison plots saved. Check your PNG files!


In [48]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Assume df is already loaded.

# 1. Define all 2016-vs-2020 pairs
difference_pairs = [
    ('16Vote', '20Vote'),
    ('16StrongVote', '20StrongVote'),
    ('16Closer', '20Closer'),
    ('16VoteSum', '20VoteSum'),
    ('16GunHarder', '20GunHarder'),
    ('16GunImportance', '20GunImportance'),
    ('16GunHowMany', '20GunHowMany'),
    ('16Age', '20Age'),
    ('16Income', '20Income'),
    ('16Marriage', '20Marriage'),
    ('16SocMed', '20SocMed'),
    ('16HandleHealth', '20HandleHealth'),
    ('16HandleImmig', '20HandleImmig'),
    ('16Fundamentalist', '20Fundamentalist'),
    ('16Feminist', '20Feminist'),
    ('16Liberal', '20Liberal'),
    ('16Union', '20Union'),
    ('16BigBusiness', '20BigBusiness'),
    ('16Conservatives', '20Conservatives'),
    ('16Gay', '20Gay'),
    ('16Congress', '20Congress'),
    ('16Muslims', '20Muslims'),
    ('16Jews', '20Jews'),
    ('16Christ', '20Christ'),
    ('16Police', '20Police'),
    ('16Transgender', '20Transgender'),
    ('16Scientist', '20Scientist'),
    ('16Blm', '20Blm')
]

# 2. Loop over each pair, compute the difference, and plot histogram
for (col_16, col_20) in difference_pairs:
    diff_col = f"Diff_{col_20}_minus_{col_16}"
    
    # Create the difference: (2020) - (2016)
    # This only works meaningfully if both columns are numeric!
    # If they are strings or categories, you may need to encode them first.
    try:
        df[diff_col] = df[col_20].astype(float) - df[col_16].astype(float)
    except ValueError as e:
        print(f"Skipping difference for {col_16} vs. {col_20} because of non-numeric data.")
        continue
    
    # Drop missing or invalid values
    diff_data = df[diff_col].dropna()

    # If we have no valid data, skip
    if diff_data.empty:
        print(f"No valid numeric data for {col_16} vs. {col_20}; skipping.")
        continue
    
    # 3. Plot histogram
    plt.figure(figsize=(6,4))
    sns.histplot(diff_data, kde=False, bins=20, color='purple', edgecolor='black')
    plt.axvline(x=0, color='red', linestyle='--', label='No Change')
    plt.title(f"Histogram of {diff_col}\n(2020 minus 2016)")
    plt.xlabel("Difference")
    plt.ylabel("Count")
    plt.legend()
    plt.tight_layout()
    
    # 4. Save figure
    fname = f"{diff_col}_hist.png"
    plt.savefig(fname)
    plt.close()

print("Difference histograms attempted for all 2016-vs-2020 pairs. Check PNG files.")

Skipping difference for 16Vote vs. 20Vote because of non-numeric data.
Skipping difference for 16StrongVote vs. 20StrongVote because of non-numeric data.
Skipping difference for 16Closer vs. 20Closer because of non-numeric data.
Skipping difference for 16VoteSum vs. 20VoteSum because of non-numeric data.
Skipping difference for 16GunHarder vs. 20GunHarder because of non-numeric data.
Skipping difference for 16GunImportance vs. 20GunImportance because of non-numeric data.
Difference histograms attempted for all 2016-vs-2020 pairs. Check PNG files.
