In [38]:
#!/bin/python
# xander johnson @metasyn
# data from: http://www.ethnologue.com/statistics/size

import json
import re
import pandas as pd
import matplotlib.pyplot as plt

% matplotlib inline

# read in data
with open('./ethnologue.txt', 'r') as f:
	data = f.readlines()
f.close()

children = []
for line in data[1:]:
	lang_dict={}
	splits = line.split('(')
	lang_dict["name"] = splits[0].strip()
	lang_dict["size"] = splits[1][:-2]
	children.append(lang_dict)

flare = {
	"name": "flare",
	"children": children
}

with open('numlangs.json', 'wb') as fp:
	json.dump(flare, fp)

with open('./stats.txt', 'r') as f:
	stats = f.readlines()
f.close()

regex = ur"""
(?P<num>\d+)?
\s+
(?P<family>[a-z\-A-Z]+)
\,?\s
(?P<variety>[^\[]+\s?)?
\s?
(?:\[)
(?P<code>\w+)
(?:\])
\t
(?P<location>[a-zA-Z]+\s?[a-zA-Z]+)
\t
(?P<total_countries>\d+)
\s+
(?P<speakers>[\d\.\,]+)
"""

main=[]
for line in stats:
	stats_extract = re.findall(regex, line, re.X)
	main.append(stats_extract[0])

df = pd.DataFrame(main,columns=['num', 'family', 'variety', 'code', 'place', 'numCountries', 'speakers'])


In [9]:
df.head()

Unnamed: 0,num,family,variety,code,place,numCountries,speakers
0,1.0,Chinese,,zho,China,33,1197.0
1,,Chinese,Gan,gan,China,1,20.6
2,,Chinese,Hakka,hak,China,13,30.1
3,,Chinese,Huizhou,czh,China,1,4.6
4,,Chinese,Jinyu,cjy,China,1,45.0


In [52]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 70 entries, 0 to 69
Data columns (total 7 columns):
num             70 non-null object
family          70 non-null object
variety         70 non-null object
code            70 non-null object
place           70 non-null object
numCountries    70 non-null int64
speakers        70 non-null object
dtypes: int64(1), object(6)
memory usage: 4.4+ KB


In [106]:
df.numCountries = df.numCountries.astype('int')
df.speakers = df.speakers.str.replace(',', '')
df.speakers = df.speakers.astype('float')

In [107]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 70 entries, 0 to 69
Data columns (total 7 columns):
num             70 non-null object
family          70 non-null object
variety         70 non-null object
code            70 non-null object
place           70 non-null object
numCountries    70 non-null int64
speakers        70 non-null float64
dtypes: float64(1), int64(1), object(5)
memory usage: 4.4+ KB


In [108]:
df[df['family']=='Chinese']

Unnamed: 0,num,family,variety,code,place,numCountries,speakers
0,1.0,Chinese,,zho,China,33,1197.0
1,,Chinese,Gan,gan,China,1,20.6
2,,Chinese,Hakka,hak,China,13,30.1
3,,Chinese,Huizhou,czh,China,1,4.6
4,,Chinese,Jinyu,cjy,China,1,45.0
5,,Chinese,Mandarin,cmn,China,12,848.0
6,,Chinese,Min Bei,mnp,China,2,10.3
7,,Chinese,Min Dong,cdo,China,6,9.12
8,,Chinese,Min Nan,,China,10,46.6
9,,Chinese,Min Zhong,czo,China,1,3.1


In [141]:
main = {}
main["name"] = "langs"
main_children = []
for fam in list(set(df.family.values)):
    current_fam = df[df.family==fam]
    varities = [v for v in current_fam.variety.values if len(v) > 0]
    children = []
    for v in varities:
        children.append({"name":fam+', '+v.strip(), "size": float(df[df.variety==v].speakers.values)})
    if len(varities) == 0:
        children.append({"name":fam, "size": float(df[df.family==fam].speakers.values)})
    family_dict = {"name": fam, "children": children}
    main_children.append(family_dict)
main["children"] = main_children

In [142]:
main

{'children': [{'children': [{'name': 'Telugu', 'size': 74.0}],
   'name': 'Telugu'},
  {'children': [{'name': 'Turkish', 'size': 70.9}], 'name': 'Turkish'},
  {'children': [{'name': 'Marathi', 'size': 71.8}], 'name': 'Marathi'},
  {'children': [{'name': 'English', 'size': 335.0}], 'name': 'English'},
  {'children': [{'name': 'Pahari-Potwari', 'size': 2.5}],
   'name': 'Pahari-Potwari'},
  {'children': [{'name': 'Hindi', 'size': 260.0}], 'name': 'Hindi'},
  {'children': [{'name': 'Korean', 'size': 77.2}], 'name': 'Korean'},
  {'children': [{'name': 'Saraiki', 'size': 20.1}], 'name': 'Saraiki'},
  {'children': [{'name': 'Indonesian', 'size': 23.2}], 'name': 'Indonesian'},
  {'children': [{'name': 'Vietnamese', 'size': 67.8}], 'name': 'Vietnamese'},
  {'children': [{'name': 'Malay, Central', 'size': 1.59},
    {'name': 'Malay, Jambi', 'size': 1.0},
    {'name': 'Malay, Kedah', 'size': 2.6},
    {'name': 'Malay, Pattani', 'size': 1.0}],
   'name': 'Malay'},
  {'children': [{'name': 'French

In [143]:
with open('./stats.json', 'wb') as fp:
    json.dump(main, fp)
fp.close()

In [144]:
ls

Untitled.ipynb  ethnologue.txt  langstats.json  stats.json      style.css       treemap.html
[34md3[m[m/             flare.json      numlangs.json   stats.txt       transform.py
