# ICPSR Dataset Archive

In [22]:
from IPython.display import display
from ipywidgets import interact, interactive, fixed, interact_manual, HBox, VBox, Label
import ipywidgets as widgets
import pandas as pd
import qgrid
pd.set_option('display.max_colwidth', -1)

datasets = pd.read_json('data/data_sets.json')

In [23]:
@interact

def showDatasets(By = ['Dataset Title', 'Dataset ID'], Search="ANES 1952"):
    column = {
        'Dataset Title': 'title',
        'Dataset ID': 'data_set_id'
    }
    slist = Search.split(',')
#     return slist
    try:
        if(By == 'Dataset Title'):
            return datasets[['data_set_id', 'unique_identifier','title', 'description']].loc[datasets[column[By]].str.contains('|'.join(slist))]
        else:
            tmp = datasets[['data_set_id', 'unique_identifier','title', 'description']].loc[datasets[column[By]].isin([int(n) for n in slist])]
            
            sorter = pd.DataFrame([int(n) for n in slist], columns=['data_set_id'])
            tmp = pd.merge(sorter, tmp, how='left', on='data_set_id')
            return tmp
    except Exception as e:
        pass
    

interactive(children=(Dropdown(description='By', options=('Dataset Title', 'Dataset ID'), value='Dataset Title…

In [24]:
import networkx as nx
import random
from networkx.readwrite import json_graph
import json

def read_json_file(filename):
    with open(filename, 'r') as f:
        js_graph = json.load(f)
    return json_graph.node_link_graph(js_graph)
def generatePairs(firstnode, allDatasets):
    return [(firstnode, i) for i in allDatasets]
def getJaccard(node, g):
    allDatasets = [i for i in g.nodes if(str(i).startswith('data'))]
    titleID = df_datasets.title_id[df_datasets.data_set_id == int(node.replace('data_', ''))]
    filterSim = list(df_datasets.data_set_id[df_datasets.title_id == titleID.iloc[0]])
    filterSim = ['data_'+str(i) for i in filterSim]
    
    filtered = [i for i in allDatasets if i not in filterSim]
    
    pairs = generatePairs(node, filtered)
    preds_jaccard = nx.jaccard_coefficient(g, pairs)
    res = []
    for u, v, p in preds_jaccard:
        if p > 0.0:
            res.append((u,v,p))
    return res

G = read_json_file('data/network_json.json')

titles = datasets.title.str.lower()
titles = titles.str.replace('[^a-zA-Z]', '')

df_titles = pd.DataFrame(set(titles)).reset_index()
df_titles.columns = ['title_id','title_unique']
df_titles.title_id = ['title_'+str(i) for i in df_titles.index]

df_datasets = datasets.copy()
df_datasets['title_unique'] = df_datasets['title'].str.lower()
df_datasets.title_unique = df_datasets.title_unique.str.replace('[^a-zA-Z]', '')
df_datasets = pd.merge(df_datasets, df_titles, on='title_unique', how='left')

In [25]:
btn = widgets.Button(description = "Generate")
display(Label('Press to generate random set of recommendations'), btn)
output = widgets.Output()

allDatasets = [i for i in G.nodes if(str(i).startswith('data'))]
N = 100

@output.capture()
def button_handler(btn):
    sampleDataset = random.sample(allDatasets, 1)
    for i in sampleDataset:
        tmp = getJaccard(i, G)
        print("Fetching dataset ID: %s \nCalculating similarity scores for %s/%s datasets"%(i, len(tmp),len(allDatasets)))
        if len(tmp)>0:
            res = getJaccard(i, G)
            df = pd.DataFrame(res, columns=['x', 'data_set_id', 'score']).sort_values(by=['score'], ascending=False).reset_index()
            df.data_set_id = [int(i.replace('data_', '')) for i in df.data_set_id]
            
            ids = df.data_set_id[:N]
            ids = df_datasets[df_datasets.data_set_id.isin(ids)]
            ids = ids.groupby('title_id').first().reset_index()
            
            res = pd.merge(ids, df, how='inner', on='data_set_id').sort_values(by=['score'], ascending=False).reset_index()
            display(res[['data_set_id', 'score']].iloc[:10])

            display(",".join(str(i) for i in res.data_set_id[:10]))

btn.on_click(button_handler)
display(output)
output.clear_output()

Label(value='Press to generate random set of recommendations')

Button(description='Generate', style=ButtonStyle())

Output()

In [20]:
df_datasets.head()

Unnamed: 0,additional_keywords,citation,coverages,data_set_id,date,description,family_identifier,identifier_list,mention_list,methodology,name,subjects,title,unique_identifier,title_unique,title_id
0,ICPSR,,,1,2016-09-20 00:00:00+00:00,"This study is part of a time-series collection of national surveys fielded continuously since 1948. The election studies are designed to present data on Americans' social backgrounds, enduring political predispositions, social and political values, perceptions and evaluations of groups and candidates, opinions on questions of public policy, and participation in political life. The 1952 National Election Study gauges political attitudes in general, along with attitudes and behaviors directly relevant to the 1952 presidential election. The interview schedule contained both closed and open-ended questions designed to collect data on a wide range of issues. Most respondents were interviewed both before and after the date of the election. The pre-election survey tapped attitudes toward political parties, candidates, and other specific issues, and inquired about the respondents' personal and political background. The post-election interview focused on the actual vote and voting-related behaviors. Additionally, a sub-sample of 585 respondents was administered a Form B re-interview obtaining further information about organizational affiliations, personal data, and non-political opinions and attitudes. A special emphasis was placed on the perception of group behavior, especially the perceived political preferences of family, friends, and associates.",,"[{'name': 'ICPSR data ID (dataId)', 'identifier': '10.3886/ICPSR07213'}]","[ANES study, ICPSR, SRC data, Surveys conducted by the Survey Research Center and the Center for Political Studies of the University, eight SRC-CPS presidential election surveys, eight SRC-CPS presidential election surveys con- ducted between 1952 and 1980, eight presidential election surveys conducted by the Survey Research Center and the Center for Political Studies (SRC-CPS), time series]",,ANES 1952 Time Series Study,"candidates,congressional elections,domestic policy,economic conditions,foreign policy,government performance,information sources,national elections,political affiliation,political attitudes,political campaigns,political efficacy,political issues,political participation,presidential elections,public approval,public opinion,special interest groups,Truman Administration (1945-1953),trust in government,voter expectations,voting behavior,United States,1952-09--1952-12",ANES 1952 Time Series Study,10.3886/ICPSR07213,anestimeseriesstudy,title_1012
1,ICPSR,,,2,2016-09-22 00:00:00+00:00,"This study is part of a time-series collection of national surveys fielded continuously since 1948. The election studies are designed to collect data on Americans' social backgrounds, enduring political predispositions, social and political values, perceptions and evaluations of groups and candidates, opinions on questions of public policy, and participation in political life. The questionnaires contained both closed and open-ended questions covering a wide range of topics. The study inquired about general political attitudes as well as the attitudes and behaviors pertinent to the 1956 presidential election. Each respondent was interviewed both before and after the election date. In the pre-election survey, respondents were asked about their attitudes toward political parties, candidates, and other specific issues, as well as personal data and some political history. The post-election interview focused on the actual vote and reasons for the vote. It also obtained further personal data and asked non-political attitudinal questions (Form C) of a sub-sample of 579 respondents.",,"[{'name': 'ICPSR data ID (dataId)', 'identifier': '10.3886/ICPSR07214'}]","[American national election studies, SRC-CPS congressional election surveys conducted be- tween 1958 and 1978, SRC-CPS presidential election studies, SRC-CPS presidential election surveys, SRC-CPS presidential election surveys con- ducted between 1952 and 1980, SRC-CPS surveys, Survey Research Center and the Center for Political Studies (SRC-CPS), Surveys conducted by the Survey Research Center and the Center for Political Studies of the University, surveys conducted by the Survey Research Center and the Center for Political Studies]",,ANES 1956 Time Series Study,"candidates,congressional elections,domestic policy,economic conditions,Eisenhower Administration (1953-1961),foreign policy,government performance,information sources,national elections,political affiliation,political attitudes,political campaigns,political efficacy,political issues,political participation,public approval,public opinion,special interest groups,trust in government,voter expectations,voting behavior,United States,1956-09--1957-01",ANES 1956 Time Series Study,10.3886/ICPSR07214,anestimeseriesstudy,title_1012
2,ICPSR,,,3,2016-09-22 00:00:00+00:00,"This study is part of a time-series collection of national surveys fielded continuously since 1948. The election studies are designed to present data on Americans' social backgrounds, enduring political predispositions, social and political values, perceptions and evaluations of groups and candidates, opinions on questions of public policy, and participation in political life. The 1958 study may be analyzed both on its own, as a cross-section survey representative of the U.S. population of voting age, and as the second wave of a panel study that started with the ANES 1956 Time Series Study (ICPSR 7214) and ended with the ANES 1960 Time Series Study (ICPSR 7216). Each respondent was interviewed only once, after the election. Respondents who had not been interviewed in 1956 were selected from dwelling units vacated by 1956 respondents (movers). The questionnaires contained both closed and open-ended questions covering a wide range of topics. In addition to general political attitudes, the study obtained information about the more specific attitudes and behaviors pertinent to the 1958 Congressional Election, like the respondents' actual vote and reasons for the vote, attitudes toward political parties and candidates, and the respondents' political history. Data were also collected on specific domestic and foreign policy issues such as government involvement in housing and public utilities, and United States aid to anti-Communist nations. The study also ascertained the financial situation of the family unit and other demographic information.",,"[{'name': 'ICPSR data ID (dataId)', 'identifier': '10.3886/ICPSR07215'}]",[],,ANES 1958 Time Series Study,"candidates,congressional elections,domestic policy,economic conditions,Eisenhower Administration (1953-1961),foreign policy,government performance,information sources,national elections,political affiliation,political attitudes,political campaigns,political efficacy,political issues,political participation,public approval,public opinion,special interest groups,trust in government,voter expectations,voter history,voting behavior,United States,1958-11--1958-12",ANES 1958 Time Series Study,10.3886/ICPSR07215,anestimeseriesstudy,title_1012
3,ICPSR,,,4,2015-11-10 00:00:00+00:00,"This study is part of a time-series collection of national surveys fielded continuously since 1952. The election studies are designed to present data on Americans' social backgrounds, enduring political predispositions, social and political values, perceptions and evaluations of groups and candidates, opinions on questions of public policy, and participation in political life.",,"[{'name': 'ICPSR data ID (dataId)', 'identifier': '10.3886/ICPSR07216'}]",[],,ANES 1960 Time Series Study,"candidates,congressional elections,domestic policy,economic conditions,Eisenhower Administration (1953-1961),foreign policy,government performance,information sources,national elections,political affiliation,political attitudes,political campaigns,political efficacy,political issues,political participation,presidential elections,public approval,public opinion,special interest groups,trust in government,voter expectations,voter history,voting behavior,United States,1960-09--1960-12",ANES 1960 Time Series Study,10.3886/ICPSR07216,anestimeseriesstudy,title_1012
4,ICPSR,,,5,2016-12-01 00:00:00+00:00,"This study is part of a time-series collection of national surveys fielded continuously since 1948. The election studies are designed to present data on Americans' social backgrounds, enduring political predispositions, social and political values, perceptions and evaluations of groups and candidates, opinions on questions of public policy, and participation in political life. The ANES 1962 Time Series Study is a traditional time series study, conducted face-to-face after the congressional election. The data were collected as part of the Survey Research Center Economic Behavior Program's Fall Omnibus Survey, which was designed to measure consumer confidence and optimism but also included questions in other areas such as political behavior and political attitudes. The questionnaire used served both the 1962 ANES and the Fall Omnibus, but the 1962 ANES excluded questions that were specifically gathered for the EBP survey alone. In addition to content on electoral participation, voting behavior, and public opinion, the 1962 ANES includes items on partisanship, government enforcement of school integration, and financial and business conditions.",,"[{'name': 'ICPSR data ID (dataId)', 'identifier': '10.3886/ICPSR07217'}]",[],,ANES 1962 Time Series Study,"candidates,congressional elections,domestic policy,economic conditions,foreign policy,government performance,information sources,Kennedy Administration (1961-1963),national elections,political affiliation,political attitudes,political campaigns,political efficacy,political issues,political participation,presidential elections,public approval,public opinion,special interest groups,trust in government,voter expectations,voter history,voting behavior,United States,1962-11--1962-12",ANES 1962 Time Series Study,10.3886/ICPSR07217,anestimeseriesstudy,title_1012
