In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt


In [None]:
file_path = 'anes.csv'
    
print(f"Loading data from {file_path}...")
df = pd.read_csv(file_path)
print("Data successfully loaded!\n")

print("Initial DataFrame shape:", df.shape)

In [None]:
# Display the first few rows
print("First 5 rows of the raw data:")
print(df.head())
print("\nData info (column types, non-null counts):")
print(df.info())

In [None]:
threshold = 0.7  # 80% missing
columns_to_drop = [col for col in df.columns 
                   if df[col].isnull().mean() > threshold]
if columns_to_drop:
    print(f"\nDropping columns with > {threshold*100}% missing values:")
    print(columns_to_drop)
    df.drop(columns=columns_to_drop, axis=1, inplace=True)
    print("New DataFrame shape:", df.shape)

In [None]:
print("\nBasic statistical summary (numerical columns):")
print(df.describe())

# B. Distribution of Key Variables
# Adjust these variable names to match actual columns of interest
variables_of_interest = ['age', 'income', 'ideology', 'vote_choice', 'VCF0705', 'VCF0706']
existing_vars = [var for var in variables_of_interest if var in df.columns]
valid_values = {0, 1, 2, 3, 4, 7}

# Create a filtered DataFrame
filtered_df = df[df[['VCF0705', 'VCF0706']].isin(valid_values).all(axis=1)]

print(filtered_df.head())

if existing_vars:
    print(f"\nPlotting histograms for the following variables: {existing_vars}")
    for var in existing_vars:
        plt.figure(figsize=(6, 4))
        sns.histplot(data=filtered_df, x=var, kde=True, color='blue')
        plt.title(f"Distribution of {var}")
        plt.tight_layout()
        plt.show()
else:
    print("\nNo matching variables found for distribution plotting. Check your column names.")

In [7]:
import pyreadstat
# Read the .sav file
df, meta = pyreadstat.read_sav("2016_2020_mergedpanel.sav")
# Export to CSV
df.to_csv("output.csv", index=False)

In [8]:
file_path = 'output.csv'
    
print(f"Loading data from {file_path}...")
df = pd.read_csv(file_path)
print("Data successfully loaded!\n")

print("Initial DataFrame shape:", df.shape)

Loading data from output.csv...
Data successfully loaded!

Initial DataFrame shape: (2839, 3612)


  df = pd.read_csv(file_path)


In [9]:
# Display the first few rows
print("First 5 rows of the raw data:")
print(df.head())


First 5 rows of the raw data:
                   version2016  V160001  V160001_orig  V160101  V160101f  \
0  ANES2016TimeSeries_20190904      1.0      300001.0   0.8270    0.8877   
1  ANES2016TimeSeries_20190904      2.0      300002.0   1.0806    1.1605   
2  ANES2016TimeSeries_20190904      4.0      300004.0   0.3596    0.3852   
3  ANES2016TimeSeries_20190904      5.0      300006.0   0.6470    0.6931   
4  ANES2016TimeSeries_20190904      7.0      300008.0   3.9604    4.2512   

   V160101w  V160102  V160102f  V160102w  V160201  ...  \
0       0.0   0.8420    0.9271       0.0    121.0  ...   
1       0.0   1.0133    1.0841       0.0    123.0  ...   
2       0.0   0.3663    0.4183       0.0    118.0  ...   
3       0.0   0.6463    0.7262       0.0    113.0  ...   
4       0.0   4.6151    4.7902       0.0    105.0  ...   

                     V203518  V203519           V203520  V203521  V203522  \
0           -1. Inapplicable     -1.0  -1. Inapplicable     -1.0     -1.0   
1  Stephen

In [10]:
print("\nData info (column types, non-null counts):")
print(df.info())


Data info (column types, non-null counts):
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2839 entries, 0 to 2838
Columns: 3612 entries, version2016 to V203527
dtypes: float64(3549), int64(12), object(51)
memory usage: 78.2+ MB
None


In [11]:
threshold = 0.7  # 80% missing
columns_to_drop = [col for col in df.columns 
                   if df[col].isnull().mean() > threshold]
if columns_to_drop:
    print(f"\nDropping columns with > {threshold*100}% missing values:")
    print(columns_to_drop)
    df.drop(columns=columns_to_drop, axis=1, inplace=True)
    print("New DataFrame shape:", df.shape)


Dropping columns with > 70.0% missing values:
['V162084', 'V200012a', 'V200012b', 'V200012c', 'V200012d', 'V200013a', 'V200013b', 'V200013c', 'V200013d', 'V200014a', 'V200014b', 'V200014c', 'V200014d', 'V200015a', 'V200015b', 'V200015c', 'V200015d']
New DataFrame shape: (2839, 3595)


In [13]:
print("\nBasic statistical summary (numerical columns):")
print(df.describe())

# B. Distribution of Key Variables
# Adjust these variable names to match actual columns of interest
variables_of_interest = ['age', 'income', 'ideology', 'vote_choice', 'VCF0705', 'VCF0706']
existing_vars = [var for var in variables_of_interest if var in df.columns]
valid_values = {0, 1, 2, 3, 4, 7}

# Create a filtered DataFrame
#filtered_df = df[df[['VCF0705', 'VCF0706']].isin(valid_values).all(axis=1)]
filtered_df = df
print(filtered_df.head())

if existing_vars:
    print(f"\nPlotting histograms for the following variables: {existing_vars}")
    for var in existing_vars:
        plt.figure(figsize=(6, 4))
        sns.histplot(data=filtered_df, x=var, kde=True, color='blue')
        plt.title(f"Distribution of {var}")
        plt.tight_layout()
        plt.show()
else:
    print("\nNo matching variables found for distribution plotting. Check your column names.")


Basic statistical summary (numerical columns):
           V160001   V160001_orig      V160101     V160101f     V160101w  \
count  2839.000000    2839.000000  2839.000000  2839.000000  2839.000000   
mean   2828.636492  378605.299753     0.995603     0.257698     0.736337   
std    1503.975076   44271.572388     0.659564     0.589269     0.686505   
min       1.000000  300001.000000     0.099300     0.000000     0.000000   
25%    2011.000000  400026.500000     0.576450     0.000000     0.278400   
50%    3066.000000  402698.000000     0.826900     0.000000     0.673500   
75%    4089.500000  405259.500000     1.239600     0.000000     0.981850   
max    5087.000000  407796.000000     6.813900     7.308800     5.660000   

           V160102     V160102f     V160102w      V160201     V160201f  ...  \
count  2839.000000  2839.000000  2839.000000  2839.000000  2839.000000  ...   
mean      0.995416     0.254463     0.738672    67.734414     4.139486  ...   
std       0.665582     0.58129

In [29]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# ------------------------------------------------------------------
# 1) Codebook-based dictionaries (Party ID)
#    Because your df has the columns renamed from the original VXXXXXX.
# ------------------------------------------------------------------
party_id_2016_map = {
    0: "No preference",
    1: "Democrat",
    2: "Republican",
    3: "Independent",
    5: "Other",
    -9: "Refused",
    -8: "Don’t know",
    -4: "Technical error"  # In case it appears, else it's ignored
}

party_id_2020_map = {
    0: "No preference",
    1: "Democrat",
    2: "Republican",
    3: "Independent",
    5: "Other",
    -9: "Refused",
    -8: "Don’t know",
    -4: "Technical error"
}

def decode_party_id(series, mapping):
    """
    Convert numeric codes in a Series into descriptive strings using 'mapping'.
    Missing/invalid codes (like -9, -8, -4) become NaN so they won't appear in the plot.
    """
    missing_codes = [-9, -8, -4]
    series = series.replace(missing_codes, np.nan)
    return series.map(mapping)

def step1_exploratory_analysis_with_decoding(df, figure_output_dir="figures"):
    """
    STEP 1 EDA for a Jupyter notebook:
      1) Print basic dataset info (shape, columns, df.info, describe).
      2) Decode the 2016 and 2020 Party ID columns into descriptive text.
      3) Plot & save bar charts for these decoded party ID columns.
      4) Plot & save histograms for numeric columns of your choice.

    :param df: DataFrame with descriptive column names (e.g., "2016 Party ID (basic)", 
               "2020 Party ID (basic)", "2020 Age (summary)", etc.).
    :param figure_output_dir: directory in which to save PNG plots.
    """

    # ------------------------------------------------------
    # (A) Print Basic Dataset Info
    # ------------------------------------------------------
    print("\n--- BASIC DATASET INFO ---")
    print(f"Shape of dataset: {df.shape}")
    print("\nColumns in dataset:\n", df.columns.tolist())
    print("\nData Types and Memory Info:")
    print(df.info())
    print("\nSummary Statistics (numeric + object):")
    print(df.describe(include='all'))

    # Ensure figure directory exists
    os.makedirs(figure_output_dir, exist_ok=True)

    # ------------------------------------------------------
    # (B) Decode Party ID columns
    #     Replace these with the EXACT column names in your df.
    # ------------------------------------------------------
    col_2016_party_id = "2016 Party ID (basic)"
    col_2020_party_id = "2020 Party ID (basic)"

    # Make new columns with decoded categories
    # So you don't overwrite the original numeric-coded columns
    if col_2016_party_id in df.columns:
        df["2016 Party ID (decoded)"] = decode_party_id(df[col_2016_party_id], party_id_2016_map)

    if col_2020_party_id in df.columns:
        df["2020 Party ID (decoded)"] = decode_party_id(df[col_2020_party_id], party_id_2020_map)

    # ------------------------------------------------------
    # (C) Bar charts for decoded Party ID
    # ------------------------------------------------------
    categorical_cols = [
        "2016 Party ID (decoded)",
        "2020 Party ID (decoded)"
    ]

    for col in categorical_cols:
        if col in df.columns:
            data_series = df[col].dropna()
            if len(data_series) == 0:
                print(f"No valid (non-missing) data in column: {col}")
                continue

            plt.figure(figsize=(7, 5))
            order = data_series.value_counts().index
            sns.countplot(x=col, data=df, order=order)
            plt.title(f"Count of {col}")
            plt.xlabel(col)
            plt.ylabel("Count")
            plt.xticks(rotation=45, ha="right")

            filename = f"{figure_output_dir}/Bar_{col.replace(' ', '_').replace('(', '').replace(')', '')}.png"
            plt.savefig(filename, dpi=100, bbox_inches="tight")
            plt.close()
            print(f"Bar chart saved for {col}: {filename}")
        else:
            print(f"Categorical column not found in df: {col}")

    # ------------------------------------------------------
    # (D) Histograms for Numeric Columns
    #     Modify this list to match numeric columns in your df.
    # ------------------------------------------------------
    numeric_cols = [
        "2020 Age (summary)",
        # "2016 Lib-Con 7pt scale",   # Uncomment if numeric
        # Add more numeric columns here...
    ]

    for col in numeric_cols:
        if col in df.columns:
            data_series = df[col].dropna()
            if len(data_series) == 0:
                print(f"No valid (non-missing) data in numeric column: {col}")
                continue

            plt.figure(figsize=(7, 5))
            sns.histplot(data_series, kde=False, bins=20)
            plt.title(f"Distribution of {col}")
            plt.xlabel(col)
            plt.ylabel("Frequency")

            filename = f"{figure_output_dir}/Hist_{col.replace(' ', '_').replace('(', '').replace(')', '')}.png"
            plt.savefig(filename, dpi=100, bbox_inches="tight")
            plt.close()
            print(f"Histogram saved for {col}: {filename}")
        else:
            print(f"Numeric column not found in df: {col}")

In [30]:
df = pd.read_csv("anes_merged_subset.csv")

In [31]:
step1_exploratory_analysis_with_decoding(df, figure_output_dir="figures")


--- BASIC DATASET INFO ---
Shape of dataset: (2839, 62)

Columns in dataset:
 ['2016 Immigration policy', '2020 For whom R voted (pre/early)', '2020 Media: TV mention', '2016 Feeling Therm: Rep Party', '2016 Death penalty (summary)', '2016 Voted in Nov election (post)', '2020 Media: Radio mention', '2016 Media: TV news mention', '2020 Party ID (basic)', '2020 Lib-Con 7pt scale', '2020 Media: Internet mention', '2020 Gun stance (difficulty buying)', '2016 Forced Lib or Con', 'Original (2016) Westat Case ID', '2016 Race summary', '2016 Media: Newspapers mention', '2016 Attention to news', '2016 For whom R voted (post)', '2016 ACA (2010 health care) favor/oppose', '2020 For whom R voted (post)', '2020 Feeling Therm: Dem Party', '2016 Post-election weight (full)', '2016 Attention to politics', '2020 Attention to politics', '2020 Where R is registered (pre)', '2016 Case ID', '2016 Feeling Therm: Dem Party', '2020 Voted in Nov election (post)', '2020 Media: Newspapers mention', '2016 Party 

In [32]:
# Check missingness
missing_summary = df.isna().sum() / len(df) * 100
print("Percentage of missing values per column:\n", missing_summary.sort_values(ascending=False))

Percentage of missing values per column:
 Panel post-election weight (2016-2020)    5.952800
2016 Party ID (decoded)                   0.563579
2020 Party ID (decoded)                   0.387460
2016 Immigration policy                   0.000000
2016 Death penalty (summary)              0.000000
                                            ...   
2020 Education level                      0.000000
2016 Age group                            0.000000
2020 Feeling Therm: Rep Party             0.000000
2016 Voted for President (pre)            0.000000
2016 Mode of interview                    0.000000
Length: 64, dtype: float64


In [33]:
# Crosstab example
pd.crosstab(df["2020 Party ID (decoded)"], df["2016 Party ID (decoded)"], dropna=False)

2016 Party ID (decoded),Democrat,Independent,No preference,Other,Republican,NaN
2020 Party ID (decoded),Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Democrat,801,140,4,16,36,4
Independent,128,534,9,39,112,4
Other,11,32,2,27,10,2
Republican,48,181,10,16,658,4
,2,5,1,1,0,2
