In [1]:
import os
import numpy as np
import pandas as pd
from congress import Congress
import datetime as DT
import networkx as nx
import matplotlib.pyplot as plt
import seaborn as sns

## Getting congress data

In [2]:
congress = Congress('rsDVQCxE9vexRoqau9VTUMnRgxRHkYyDzPX1I6QZ')
house = congress.members.filter('house') # or house
df_house = pd.DataFrame.from_dict(house[0]['members'])

In [3]:
df_house.columns

Index(['api_uri', 'at_large', 'contact_form', 'crp_id', 'cspan_id',
       'date_of_birth', 'district', 'dw_nominate', 'facebook_account', 'fax',
       'fec_candidate_id', 'first_name', 'gender', 'geoid', 'google_entity_id',
       'govtrack_id', 'icpsr_id', 'id', 'ideal_point', 'in_office',
       'last_name', 'leadership_role', 'middle_name', 'missed_votes',
       'missed_votes_pct', 'next_election', 'ocd_id', 'office', 'party',
       'phone', 'rss_url', 'seniority', 'short_title', 'state', 'suffix',
       'title', 'total_present', 'total_votes', 'twitter_account', 'url',
       'votes_with_party_pct', 'votesmart_id', 'youtube_account'],
      dtype='object')

## Data preperation

In [20]:
# Selecting columns
df_house['fullname'] = df_house.first_name + " " + df_house.last_name
df1 = df_house[['id','gender','date_of_birth','fullname','state','twitter_account']]

# Finding age
now = pd.Timestamp(DT.datetime.now())
df1['date_of_birth'] = pd.to_datetime(df1['date_of_birth'])    # 1
df1['date_of_birth'] = df1['date_of_birth'].where(df1['date_of_birth'] < now, df1['date_of_birth'] -  np.timedelta64(100, 'Y'))   # 2
df1['age'] = (now - df1['date_of_birth']).astype('<m8[Y]')
df1.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  import sys
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':


Unnamed: 0,id,gender,date_of_birth,fullname,state,twitter_account,age
0,A000374,M,1954-09-16,Ralph Abraham,LA,RepAbraham,63.0
1,A000370,F,1946-05-27,Alma Adams,NC,RepAdams,71.0
2,A000055,M,1965-07-22,Robert Aderholt,AL,Robert_Aderholt,52.0
3,A000371,M,1979-06-19,Pete Aguilar,CA,reppeteaguilar,38.0
4,A000372,M,1951-11-07,Rick Allen,GA,reprickallen,66.0


In [73]:
# Creating list of Representative with no twitter profile
df1.id[df1.twitter_account.isnull()]

5      A000367
32     B001245
68     C001049
76     C001108
148    G000584
228    L000588
238    L000589
262    M001198
316    P000258
342    R000409
361    S001177
Name: id, dtype: object

## Loading data for democrats and republicans

In [40]:
# Loading the files for democrats:
os.chdir('C:/Users/Asger/Dropbox/Økonomi/Topics in Social Data Science/Projekt/democrats/')

data_dem= []
path = r'C:/Users/Asger/Dropbox/Økonomi/Topics in Social Data Science/Projekt/democrats/'
for root, dirs, files  in os.walk(path, topdown=False):
    for i in files:
        data_dem.append(pd.read_csv(i))
    
df_dem = pd.concat(data_dem)
print(len(df_dem))
df_dem.head()

19306


Unnamed: 0.1,Unnamed: 0,agree_percent,common_votes,congress_nr,disagree_percent,disagree_votes,member1_id,member2_id
0,14554,95.79,832,115,4.21,35,L000560,L000557
1,14555,96.37,853,115,3.63,31,L000560,L000581
2,14556,93.67,806,115,6.33,51,L000560,L000586
3,14557,92.16,867,115,7.84,68,L000560,L000551
4,14558,95.56,879,115,4.44,39,L000560,L000263


In [27]:
# Loading the files for republicans:
os.chdir('C:/Users/Asger/Dropbox/Økonomi/Topics in Social Data Science/Projekt/republicans/')

data_rep= []
path = r'C:/Users/Asger/Dropbox/Økonomi/Topics in Social Data Science/Projekt/republicans/'
for root, dirs, files  in os.walk(path, topdown=False):
    for i in files:
        data_rep.append(pd.read_csv(i))
    
df_rep = pd.concat(data_rep)
print(len(df_rep))
df_rep.head()

27092


Unnamed: 0.1,Unnamed: 0,agree_percent,common_votes,congress_nr,disagree_percent,disagree_votes,member1_id,member2_id
0,18651,94.17,858,115,5.83,50,H001073,I000056
1,18652,95.1,836,115,4.9,41,H001073,J000297
2,18653,95.83,863,115,4.17,36,H001073,J000290
3,18654,92.78,872,115,7.22,63,H001073,J000299
4,18655,96.47,878,115,3.53,31,H001073,J000292


## Creating data file for prediction

In [85]:
os.chdir('C:/Users/Asger/Dropbox/Økonomi/Topics in Social Data Science/Projekt/')

def data_preb(df_data):
    df_data = df_data[['agree_percent','member1_id','member2_id']]

    # merging data for Representative 1
    df_data=df_data.merge(df1, left_on='member1_id', right_on='id', how='left')
    df_data= df_data.drop(['id','date_of_birth'], axis=1)
    # Renaming the variables
    df_data=df_data.rename(index=str, columns={"gender": "gender_1", "fullname": "fullname_1","state": "state_1", "age": "age_1", "twitter_account": "twitter_account_1"})

    # merging data for Representative 2
    df_data=df_data.merge(df1, left_on='member2_id', right_on='id', how='left')
    df_data= df_data.drop(['id','date_of_birth'], axis=1)
    df_data=df_data.rename(index=str, columns={"gender": "gender_2", "fullname": "fullname_2","state": "state_2", "age": "age_2", "twitter_account": "twitter_account_2"})
    df_data.head()

    # Finding variable for same gender
    df_data['same_gender'] = np.where(df_data['gender_1']==df_data['gender_2'], 1, 0)
    df_data= df_data.drop(['gender_1','gender_2'], axis=1)

    #Finding variable for different age
    df_data['same_age'] = np.where(np.absolute(df_data['age_1']-df_data['age_2'])<10, 1, 0)
    df_data= df_data.drop(['age_1','age_2'], axis=1)

    # Finding variable for same state
    df_data['same_state'] = np.where(df_data['state_1']==df_data['state_2'], 1, 0)

    # Variable for naighbor state
    df_nbs = pd.read_csv('neighbors-states.csv',delimiter=';')
    nbs_tuples = list(df_nbs[['StateCode', 'NeighborStateCode']].apply(tuple, axis=1))

    nbs = []
    for i in range(0,len(df_data)):
        tup = (df_data.state_1[i],df_data.state_2[i])
        if tup in nbs_tuples:
            nbs.append(1)
        else:
            nbs.append(0)

    df_data['neighborState']=nbs
    df_data= df_data.drop(['state_1','state_2'], axis=1)

    # Variables from twitter network
    df_tw = pd.read_csv('jaccard_adamic_df.csv',delimiter=',')
    df_tw['dyad_1'] = df_tw.pol_two + "_" + df_tw.pol_one

    df_data['tw_ids']=df_data.twitter_account_1 +"_"+df_data.twitter_account_2
    df_data['tw_ids_1']=df_data.twitter_account_2 +"_"+df_data.twitter_account_1

    # First merge
    df_merge1=df_data.merge(df_tw,left_on='tw_ids', right_on='dyad',how='left')

    missing_jc_1 = []
    for i in range(0,len(df_merge1)):
        if pd.isna(df_merge1.jaccard[i]) == True:
            missing_jc_1.append(1)
        else:
            missing_jc_1.append(0)

    df_merge1=df_merge1.dropna(subset=['jaccard'])

    # second merge
    df_merge2=df_data.merge(df_tw,left_on='tw_ids', right_on='dyad_1',how='left')

    missing_jc_2 = []
    for i in range(0,len(df_merge2)):
        if pd.isna(df_merge2.jaccard[i]) == True:
            missing_jc_2.append(1)
        else:
            missing_jc_2.append(0)

    df_merge2=df_merge2.dropna(subset=['jaccard'])

    df_woZero = pd.concat([df_merge1,df_merge2])
    df_woZero = df_woZero.drop(['tw_ids','tw_ids_1','Unnamed: 0','pol_one','pol_two','dyad_1','dyad'], axis=1)

    # Making data set where jaccard is set to zero for those with Nan
    df_data['missing_jc_1'] = missing_jc_1
    df_data['missing_jc_2'] = missing_jc_2

    df_jc_mis = df_data.loc[(df_data['missing_jc_1'] == 1) & (df_data['missing_jc_2'] == 1)]
    df_jc_mis = df_jc_mis.drop(['tw_ids','tw_ids_1','missing_jc_1','missing_jc_2'], axis=1)
    df_jc_mis['ademic'] = 0
    df_jc_mis['jaccard'] = 0

    df_wZero = pd.concat([df_woZero,df_jc_mis])
    
    # Selecting data for which is in the final network
    df_tw_in_graph = pd.read_csv('all_twitter_ids_final_network.csv',delimiter=',')
    list_twitter_ids=list(df_tw_in_graph.twitter_id)
    df_wZero = df_wZero.loc[df_wZero['twitter_account_1'].isin(list_twitter_ids) & df_wZero['twitter_account_2'].isin(list_twitter_ids)]

    df_wZero = df_wZero.loc[df_wZero['twitter_account_1'].isin(list_twitter_ids) & df_wZero['twitter_account_2'].isin(list_twitter_ids)]

    # Dropping if agree_persent is missing:
    df_woZero=df_woZero.dropna(subset=['agree_percent'])
    df_wZero= df_wZero.dropna(subset=['agree_percent'])

    return df_woZero, df_wZero

## Getting the final data sets

In [91]:
df_dem_final_woZero, df_dem_final_wZero, = data_preb(df_dem)
df_rep_final_woZero, df_rep_final_wZero, = data_preb(df_rep)

In [92]:
# Printing to csv
df_dem_final_woZero.to_csv('df_dem_final_woZero.csv')
df_dem_final_wZero.to_csv('df_dem_final_wZero.csv')
df_rep_final_woZero.to_csv('df_rep_final_woZero.csv')
df_rep_final_wZero.to_csv('df_rep_final_wZero.csv')