In [None]:
import numpy as np
import pandas as pd
import requests
import seaborn as sns
import warnings
# NOTE(review): with no category given this silences every warning for the
# rest of the notebook, including deprecation warnings from the libraries above.
warnings.filterwarnings('ignore')

from matplotlib import pyplot as plt
# Apply matplotlib's 'ggplot' style sheet to all figures drawn below.
plt.style.use('ggplot')


In [None]:
# Endpoint that returns the congressional voting dataset as JSON.
url = 'https://api.apispreadsheets.com/api/dataset/congressional-voting/'

# A timeout keeps the cell from hanging forever if the API is unreachable.
r = requests.get(url, timeout=30)
# Fail loudly on an HTTP error instead of trying to parse an error body as data.
r.raise_for_status()
data = r.json()
# The records are read from the 'data' key of the JSON payload.
df = pd.DataFrame.from_dict(data['data'], orient='columns')
# Fixed seed so the preview shows the same five rows on every run.
df.sample(5, random_state=42)

# Description

This data set includes votes for each of the U.S. House of Representatives Congressmen on the 16 key votes identified by the CQA in 1986. The CQA lists nine different types of votes: voted for, paired for, and announced for (these three simplified to y), voted against, paired against, and announced against (these three simplified to n), voted present, voted present to avoid conflict of interest, and did not vote or otherwise make a position known (these three simplified to an unknown disposition noted in the dataset as ?).

---
Missing values are filled in with '?' for nominal and -100000 for numerical attributes



In [None]:
# Summarise the frame: column names, non-null counts and dtypes.
df.info()

# What does Y look like?


In [None]:
# Colour assigned to each party in the plots.
palette = {'republican': 'red', 'democrat': 'blue'}

plt.figure(figsize=(20, 10))
ax = sns.countplot(data=df, x='political_party', hue='political_party', palette=palette)

# Annotate each non-empty bar with its share of all rows in df.
total_rows = len(df)
for bar in ax.patches:
  bar_height = bar.get_height()
  if not bar_height > 0:
    # Skips zero-height bars and bars whose height is NaN.
    continue
  label_x = bar.get_x() + bar.get_width() / 2.
  label_text = '{:1.2f}%'.format(bar_height / total_rows * 100)
  ax.text(label_x, bar_height + 3, label_text, ha="center")
plt.show()

It is immediately visible that Democrats outnumber Republicans. Party affiliation very likely affects how a representative votes, so after a deeper analysis we should be able to predict whether someone is a Democrat or a Republican by analysing their votes alone.

In [None]:
# Every column except the party label, i.e. the columns to plot against party.
cols = df.columns.drop('political_party')

def plot_countplots(columns, n_cols=3):
  """Draw one party-split countplot per column, laid out on a grid.

  Reads the notebook-level ``df`` and ``palette``. Every non-empty bar is
  labelled with its share of all rows in ``df``.

  Args:
    columns: Names of the ``df`` columns to plot. Each plot uses the fixed
      category order ``['y', 'n', '?']``.
    n_cols: Number of plots per grid row. Defaults to 3.

  Returns:
    None. The figure is displayed with ``plt.show()``.
  """
  columns = list(columns)
  if not columns:
    return  # nothing to draw

  # Ceiling division: a partially filled last row must still get its own row,
  # otherwise the trailing columns are silently left out of the figure.
  n_rows = -(-len(columns) // n_cols)

  # squeeze=False keeps the axes array 2-D even when the grid has a single row
  # or a single column. Height scales at 6 units per row (30 for five rows).
  fig, axes = plt.subplots(n_rows, n_cols, figsize=(20, 6 * n_rows), squeeze=False)
  flat_axes = axes.ravel()

  total_rows = len(df)
  for column, ax in zip(columns, flat_axes):
    sns.countplot(data=df, x=column, hue='political_party',
                  palette=palette, ax=ax, order=['y', 'n', '?'])
    for p in ax.patches:
      height = p.get_height()
      if height > 0:
        ax.text(p.get_x() + p.get_width() / 2.,
                height + 3,
                '{:1.2f}%'.format(height / total_rows * 100),
                ha="center")

  # Hide the grid cells left over when len(columns) is not a multiple of n_cols.
  for unused_ax in flat_axes[len(columns):]:
    unused_ax.set_visible(False)

  plt.show()

# Render the party-split countplots for the columns selected in `cols`.
plot_countplots(cols)

- `physician_fee_freeze` is the most divisive feature
- `water_project_cost_sharing` is a feature which has a very regular distribution

# **Correlation**

To check whether our assumptions about the most divisive feature are correct, we've decided to add a correlation matrix. It also shows which features share a common distribution, so that we don't need all of them in the data analysis.


We have to map our data, because it consists of strings and correlation cannot be computed on such data. That's why we use the dictionary `party`, which maps the democrat and republican parties to 2 and -2, respectively. All other values are mapped by the dictionary `val`, which replaces n with -1, ? with 0 and y with 1.

In [None]:
# Numeric codes for the vote answers and for the two parties.
val = {'n': -1, '?': 0, 'y': 1}
party = {'democrat': 2, 'republican': -2}

# Party label of every row, translated to its numeric code.
df_party = df['political_party'].map(party)

# Translate each remaining column through `val`, then put the encoded party
# column back as the last column.
df_num = pd.concat(
    [
        df.drop(columns='political_party').apply(lambda votes: votes.map(val)),
        df_party,
    ],
    axis=1,
)

df_num.head()

In [None]:
# Pairwise correlations over the first 18 columns of the encoded frame.
df_num_cor = df_num.iloc[:, :18]
c = df_num_cor.corr()

# Annotated heatmap of the correlation matrix on a wide figure.
heat_fig, heat_ax = plt.subplots(figsize=(26, 13))
sns.heatmap(c, cmap='PRGn', annot=True, ax=heat_ax)

***Summary***

- As we can clearly see, `physician_fee_freeze` is the feature most strongly correlated with `political_party`, which indicates that our initial assumption was right.
- We can also see that variables such as `adoption_of_the_budget_resolution`, `el_salvador_aid` and `education_spending` are also highly correlated with party.
- Conversely, we can exclude variables such as `water_project_cost_sharing` and `immigration`, because they are almost uncorrelated with the political party.