# Comparing Big Data Technologies

# Get initial list

In [1]:
# Technologies to compare, one entry per name.
techs = [
    "cascading",
    "storm",
    "spark",
    "mapreduce",
    "hadoop",
    "hbase",
    "cassandra",
    "mongodb",
    "couchdb",
    "solr",
    "elastic search",
    "impala",
    "flink",
    "drill",
    "hive",
]

# Get metrics

In [2]:
# Enable IPython's autoreload extension; mode 2 reloads imported modules before
# each cell executes, so edits to imported .py files take effect without a
# kernel restart.
%load_ext autoreload
%autoreload 2

In [3]:
import pandas as pd
import numpy as np
import logging
import sys

import get_stack_overflow_data as get_stack
import get_github_data as get_git

In [4]:
# Configure the root logger at INFO level so INFO messages appear in the cell output.
logging.basicConfig(level=logging.INFO)

## Get GitHub data

In [5]:
# Read the GitHub API token from a local file (the ".nogit" suffix suggests it
# is kept out of version control — confirm).
# Text mode + strip(): a binary read yields bytes on Python 3, which would be
# formatted as "token b'...'", and a trailing newline in the file would
# otherwise end up inside the header value.
with open("secrets/github-token.nogit", "r") as f:
    token = f.read().strip()

# Authorization header handed to the get_git helper calls.
headers = {'Authorization': 'token %s' % token}

In [6]:
# test one: try the GitHub lookup on a single package before running the full list
get_git.from_search('hive', headers=headers)

{'forks': 1550, 'package': 'hive', 'repo': u'apache/hive', 'stars': 1345}

In [7]:
# Run the GitHub lookup for every entry in `techs`, passing the auth headers.
# NOTE(review): the "_delayed" suffix and the "Running chunk 1 of 1" log line
# suggest chunked / throttled requests — confirm in get_github_data.
git = get_git.search_from_list_delayed(techs, headers)

INFO:root:Running chunk 1 of 1...
INFO:root:DONE.


In [9]:
# Tabulate the GitHub results, keeping these four columns in a fixed order.
git_columns = ['package', 'repo', 'forks', 'stars']
df_git = pd.DataFrame(git)[git_columns]
df_git.head()

Unnamed: 0,package,repo,forks,stars
0,cascading,cwensel/cascading,220,295
1,storm,nathanmarz/storm,1791,8989
2,spark,apache/spark,12236,12987
3,mapreduce,cdmh/mapreduce,61,196
4,hadoop,apache/hadoop,3144,3302


## Get Stack Overflow data

### tags

In [10]:
# test: fetch tag counts for two names and view the raw result as a table
pd.DataFrame(get_stack.get_tag_counts(["hadoop", "spark"]))

Unnamed: 0,count,has_synonyms,is_moderator_only,is_required,name
0,33820,True,False,False,hadoop
1,27167,True,False,False,apache-spark


In [11]:
# Fetch Stack Overflow tag counts for the full list of technologies.
tags = get_stack.get_tag_counts(techs)

In [12]:
# Keep the tag name and its count, renamed to `package` / `so_tags`.
df_tags = (
    pd.DataFrame(tags)[['name', 'count']]
    .rename(columns={'name': 'package', 'count': 'so_tags'})
)
df_tags.head()

Unnamed: 0,package,so_tags
0,mongodb,82701
1,hadoop,33820
2,apache-spark,27167
3,elasticsearch,25616
4,solr,15746


### body

In [13]:
# test: single body-count lookup for 'mapreduce' with tag="r"
# (presumably restricts the search to questions tagged "r" — confirm in get_stack_overflow_data)
get_stack.get_single_body_count('mapreduce', tag="r")

{'query': 'mapreduce', 'tag': 'r', 'total': 121}

In [14]:
# Fetch Stack Overflow body counts for the full list of technologies.
body = get_stack.get_body_counts(techs)

In [15]:
# Tabulate the body-count results, keeping only the `query` and `total` fields.
df_body = pd.DataFrame(body)[['query', 'total']]

In [16]:
# Rename to `package` / `so_body` so this table shares the `package` key with the others.
df_body = df_body.rename(columns={'query': 'package', 'total': 'so_body'})
df_body.head()

Unnamed: 0,package,so_body
0,cascading,13908
1,storm,3952
2,spark,33120
3,mapreduce,14100
4,hadoop,39761


# Combine data

In [17]:
# Outer-join the three tables on their shared `package` column, so a package
# missing from one source is kept (with NaN) instead of being dropped.
# NOTE(review): Stack Overflow tag names do not always equal the search terms
# (the tag table contains `apache-spark` and `elasticsearch`), so those rows
# will not line up with the GitHub / body rows — confirm and normalise the
# names before merging.
df_git_body = df_git.merge(df_body, how='outer', on='package')
df = df_git_body.merge(df_tags, how='outer', on='package')

## TODO
See how Stack Overflow is good at resolving tags. But what about the question body? How should we handle it?

In [18]:
# Display the combined table (GitHub columns plus so_body / so_tags).
df

Unnamed: 0,package,repo,forks,stars,so_body,so_tags
0,cascading,cwensel/cascading,220.0,295.0,13908.0,307.0
1,storm,nathanmarz/storm,1791.0,8989.0,3952.0,
2,spark,apache/spark,12236.0,12987.0,33120.0,
3,mapreduce,cdmh/mapreduce,61.0,196.0,14100.0,10391.0
4,hadoop,apache/hadoop,3144.0,3302.0,39761.0,33820.0
5,hbase,apache/hbase,1126.0,1259.0,7001.0,5412.0
6,cassandra,apache/cassandra,1631.0,3584.0,15296.0,12281.0
7,mongodb,doctrine/mongodb,180.0,285.0,82715.0,82701.0
8,couchdb,apache/couchdb,682.0,3009.0,6156.0,4809.0
9,solr,ekoontz/solr,185.0,6.0,18478.0,15746.0
