# Comparing Big Data Technologies

# Get initial list

In [125]:
techs = ("cascading, storm, spark, mapreduce, hadoop, hbase, cassandra, "
         "mongo, mongodb, couchdb, lucene, solr, lucene-solr, "
         "elasticsearch, impala, flink, drill, apache-drill, apache drill, "
         "hive, scalding")
techs = techs.split(", ")
len(techs)

21

# Get metrics

In [126]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [127]:
import pandas as pd
import numpy as np
import logging
import sys

import get_stack_overflow_data as get_stack
import get_github_data as get_git

In [128]:
logging.basicConfig(level=logging.INFO)

In [129]:
with open("secrets/github-token.nogit", "rb") as f:
    token = f.read()
    
headers = {'Authorization': 'token %s' % token}    

## Get Github data

In [130]:
#get_git.from_search('lucene-solr', headers=headers)       # test

In [194]:
git = get_git.search_from_list_delayed(techs, headers)

df_git = pd.DataFrame(git)[['package', 'repo', 'forks', 'stars']]
df_git.tail()

INFO:root:Running chunk 1 of 1...
INFO:root:DONE.


Unnamed: 0,package,repo,forks,stars
0,cascading,cwensel/cascading,220,295
1,storm,nathanmarz/storm,1792,8989
2,spark,apache/spark,12250,13007
3,mapreduce,cdmh/mapreduce,61,196
4,hadoop,apache/hadoop,3145,3304


## Get Stack Overflow data

### tags

In [133]:
# pd.DataFrame(get_stack.get_tag_counts(["map reduce", "spark"]))   # test

In [165]:
tags = get_stack.get_tag_counts(techs)

In [166]:
df_tags = pd.DataFrame(tags)[['name', 'count']]
df_tags.columns = ['package', 'so_tags']
df_tags.head()

Unnamed: 0,package,so_tags
0,mongodb,82783
1,hadoop,33852
2,apache-spark,27237
3,elasticsearch,25662
4,solr,15756


### question body

In [167]:
# get_stack.get_body_counts(['mapreduce'])       # test

In [168]:
body = get_stack.get_body_counts(techs)

In [169]:
df_body = pd.DataFrame(body)[['query', 'total']]

In [170]:
df_body.columns = ['package', 'so_body']
df_body.head()

Unnamed: 0,package,so_body
0,cascading,13920
1,storm,3957
2,spark,33193
3,mapreduce,14105
4,hadoop,39803


# Process results

In [140]:
df_git

Unnamed: 0,package,repo,forks,stars
0,cascading,cwensel/cascading,220,295
1,storm,nathanmarz/storm,1792,8989
2,spark,apache/spark,12250,13007
3,mapreduce,cdmh/mapreduce,61,196
4,hadoop,apache/hadoop,3145,3304
5,hbase,apache/hbase,1126,1262
6,cassandra,apache/cassandra,1637,3583
7,mongo,mongodb/mongo,3133,11573
8,mongodb,doctrine/mongodb,180,285
9,couchdb,apache/couchdb,682,3014


In [181]:
df_body

Unnamed: 0,package,so_body
0,cascading,13920
1,storm,3957
2,spark,33193
3,mapreduce,14105
4,hadoop,39803
5,hbase,7003
6,cassandra,15307
7,mongo,32938
8,mongodb,82797
9,couchdb,6162


In [186]:
df_tags

Unnamed: 0,package,so_tags
0,mongodb,82783
1,hadoop,33852
2,apache-spark,27237
3,elasticsearch,25662
4,solr,15756
5,cassandra,12286
6,hive,10759
7,mapreduce,10395
9,hbase,5413
10,couchdb,4814


In [207]:
df_git

Unnamed: 0,package,repo,forks,stars
0,cascading,cwensel/cascading,220,295
1,storm,nathanmarz/storm,1792,8989
2,spark,apache/spark,12250,13007
4,hadoop,apache/hadoop,3145,3304
5,hbase,apache/hbase,1126,1262
6,cassandra,apache/cassandra,1637,3583
9,couchdb,apache/couchdb,682,3014
12,solr,apache/lucene-solr,851,877
13,elasticsearch,elastic/elasticsearch,8064,22937
14,impala,cloudera/Impala,722,1803


In [206]:
# mapreduce has no repo; mongodb -> mongo; lucene, solr -> lucene-solr
# drill, apache drill -> apache-drill
bad_git = 'mapreduce, mongodb, lucene, solr, drill, apache drill'.split(", ")

df_git = df_git[~df_git['package'].isin(bad_git)]

df_git['package'] = df_git['package'].str.replace('mongo$', 'mongodb')
df_git['package'] = df_git['package'].str.replace('lucene-solr', 'solr')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [201]:
# lucene -> solr, for merge with lucene-solr. also solr has more results so take it
# drill, apache drill -> apache-drill
bad_so = 'lucene, lucene-solr, mongo, drill, apache drill'.split(", ")

df_tags = df_tags[~df_tags['package'].isin(bad_so)]
df_body = df_body[~df_body['package'].isin(bad_so)]

In [202]:
# remove SO or git artifacts for merge

In [203]:
df_tags['package'] = df_tags['package'].str.replace('apache-', '')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':


# Results!

In [204]:
df = df_git.merge(df_body, how='outer').merge(df_tags, how='outer')

In [205]:
df

Unnamed: 0,package,repo,forks,stars,so_body,so_tags
0,cascading,cwensel/cascading,220.0,295.0,13920.0,307.0
1,storm,nathanmarz/storm,1792.0,8989.0,3957.0,2005.0
2,spark,apache/spark,12250.0,13007.0,33193.0,27237.0
3,hadoop,apache/hadoop,3145.0,3304.0,39803.0,33852.0
4,hbase,apache/hbase,1126.0,1262.0,7003.0,5413.0
5,cassandra,apache/cassandra,1637.0,3583.0,15307.0,12286.0
6,mongodb,mongodb/mongo,3133.0,11573.0,82797.0,82783.0
7,couchdb,apache/couchdb,682.0,3014.0,6162.0,4814.0
8,lucene-solr,apache/lucene-solr,851.0,877.0,,
9,elasticsearch,elastic/elasticsearch,8064.0,22937.0,28921.0,25662.0
