**38. Obtenga la matriz de correlación para saber idiomas distintos considerando que un
usuario sabe un idioma si indicó un nivel de 1 o superior (⭐⭐)**

In [None]:
import pandas as pd
import numpy as np
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials

# Authenticate and create the PyDrive client.
# Order matters: the Colab authentication runs first, then the
# application-default credentials are attached to the GoogleAuth object that
# backs the `drive` client used by later cells to fetch files.
auth.authenticate_user()
gauth = GoogleAuth()
# Hand PyDrive the application-default Google credentials.
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)

Para este ejercicio voy a utilizar **languages.csv**

In [None]:
# Drive file id of languages.csv. Named `file_id` so the builtin `id()` is not
# shadowed for the rest of the notebook session.
file_id = '1KHw8c5IEqjmijtyOau0y-wJAm-fenSqZ'
# Save the Drive file's contents locally as languages.csv.
downloaded = drive.CreateFile({'id': file_id})
downloaded.GetContentFile('languages.csv')

In [None]:
# Load the downloaded CSV (latin-1 encoded), discard every row that has a
# missing value, and preview the first rows.
# NOTE(review): read_csv's default NA markers include strings such as 'nan'
# and 'NA'; a language code equal to one of them would be parsed as missing
# and removed by dropna() -- confirm against the raw file.
languages = pd.read_csv('languages.csv', encoding='latin-1').dropna()
languages.head()

Unnamed: 0,babel_user,babel_lang,babel_level
0,4502458,FR,2
1,5928200,aa,0
2,46918,ab,0
3,2050449,ab,1
4,4715583,ace,0


Para tener una primera idea, veamos cuántos idiomas hay:

In [None]:
# Number of rows per language code, most frequent first.
languages.babel_lang.value_counts()

es     10463
en      9467
fr      3556
it      1788
de      1729
       ...  
gym        1
bn         1
mrj        1
tzh        1
rom        1
Name: babel_lang, Length: 317, dtype: int64

Podemos ver que hay **317** idiomas distintos, donde los que más predominan son el español e inglés.

Veamos ahora los distintos niveles:

In [None]:
# Number of rows per declared level, most frequent first.
languages.babel_level.value_counts()

N    11533
1     7589
2     6452
3     5850
4     2504
0     1090
5      644
Name: babel_level, dtype: int64

A nosotros solo nos interesan los datos cuyos niveles sean 1 o mayor:

In [None]:
# Discard the rows whose level is the string '0'; every other level stays.
languages = languages.loc[languages['babel_level'] != '0']
# Re-count the levels to confirm that '0' is gone.
languages.babel_level.value_counts()

N    11533
1     7589
2     6452
3     5850
4     2504
5      644
Name: babel_level, dtype: int64

Podemos ver que la cantidad de idiomas disminuyó: había muchos idiomas que figuraban pero de los que ningún usuario tenía conocimientos:

In [None]:
# Rows per language code among the remaining (non-'0') rows.
languages.babel_lang.value_counts()

es       10223
en        9454
fr        3487
it        1743
de        1644
         ...  
sdn          1
mi           1
de-at        1
miq          1
tzh          1
Name: babel_lang, Length: 249, dtype: int64

Como ya tenemos los niveles que nos interesan, elimino la columna que indica el nivel y agrego una columna que indica True(1) si sabe el idioma.

In [None]:
# The level is no longer needed once the '0' rows are gone: remove it and
# flag every remaining (user, language) row with a constant 1.
languages = languages.drop(columns=['babel_level'])
languages.loc[:, 'knows_language'] = 1
languages.head()

Unnamed: 0,babel_user,babel_lang,knows_language
0,4502458,FR,1
3,2050449,ab,1
6,4494742,acf,1
8,70090,af,1
9,407454,af,1


Hago un pivot donde tomo como valor la columna de si sabe el idioma y las columnas del mismo serán los distintos idiomas.
También reemplazo los NaN por ceros para luego poder hacer la matriz.

In [None]:
# Build a user x language table: one row per babel_user, one column per
# babel_lang, holding 1 where the pair exists and 0 (filled NaN) elsewhere.
# The user ids are then discarded in favour of a plain positional index.
# NOTE(review): codes are used verbatim, so differently-cased codes
# (e.g. 'FR' vs 'fr') end up as separate columns -- confirm whether they
# should be normalised before pivoting.
languages_pivot = (
    languages
    .pivot(index='babel_user', columns='babel_lang', values='knows_language')
    .fillna(0)
    .reset_index(drop=True)
)
languages_pivot

babel_lang,FR,ab,acf,af,agr,akk,aln,am,an,ang,ar,arc,arn,ary,ase,ast,avk,ay,az,azb,bar,bcl,be,bew,bg,bjn,bn,br,bs,ca,cak,cas,cbk,ceb,ch,chr,ckb,cmn,cnt,co,...,su,sv,sw,ta,te,tet,tg,th,ti,tk,tl,tmr,tpi,tr,tzh,tzm,tzo,ugy,uk,ur,uz,vec,vi,vsv,wa,war,wuu,xcw,yi,yua,yue,zap,zea,zh,zh-Hans,zh-Hans-CN,zh-Hant,zh-Hant-HK,zh-Hant-TW,zza
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10804,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10805,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10806,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10807,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


Finalmente la matriz de correlación entre los idiomas:

In [None]:
# Pairwise correlation between the language columns of the pivot table.
corr = languages_pivot.corr()
# The result is already labelled by language code on both rows and columns,
# so it is displayed as-is. Calling reset_index() here would turn the row
# labels into an ordinary data column and show a positional index instead.
corr

babel_lang,babel_lang.1,FR,ab,acf,af,agr,akk,aln,am,an,ang,ar,arc,arn,ary,ase,ast,avk,ay,az,azb,bar,bcl,be,bew,bg,bjn,bn,br,bs,ca,cak,cas,cbk,ceb,ch,chr,ckb,cmn,cnt,...,su,sv,sw,ta,te,tet,tg,th,ti,tk,tl,tmr,tpi,tr,tzh,tzm,tzo,ugy,uk,ur,uz,vec,vi,vsv,wa,war,wuu,xcw,yi,yua,yue,zap,zea,zh,zh-Hans,zh-Hans-CN,zh-Hant,zh-Hant-HK,zh-Hant-TW,zza
0,FR,1.000000,-0.000093,-0.000093,-0.000293,-0.000093,-0.000093,-0.000131,-0.000131,-0.000700,-0.000227,-0.001183,-0.000160,-0.000359,-0.000093,-0.000131,-0.001149,-0.000093,-0.000185,-0.000227,-0.000093,-0.000227,-0.000093,-0.000454,-0.000093,-0.000507,-0.000093,-0.000093,-0.000131,-0.000334,-0.003539,-0.000093,-0.000093,-0.000093,-0.000160,-0.000131,-0.000093,-0.000093,-0.000093,-0.000093,...,-0.000160,-0.000980,-0.000262,-0.000185,-0.000131,-0.000093,-0.000131,-0.000359,-0.000093,-0.000093,-0.000278,-0.000093,-0.000093,-0.000571,-0.000093,-0.000093,-0.000093,-0.000093,-0.000748,-0.000207,-0.000185,-0.000293,-0.000278,-0.000093,-0.000131,-0.000093,-0.000093,-0.000093,-0.000262,-0.000131,-0.000278,-0.000093,-0.000093,-0.001290,-0.000093,-0.000160,-0.000093,-0.000093,-0.000093,-0.000093
1,ab,-0.000093,1.000000,-0.000093,-0.000293,-0.000093,-0.000093,-0.000131,-0.000131,-0.000700,-0.000227,-0.001183,-0.000160,-0.000359,-0.000093,-0.000131,-0.001149,-0.000093,-0.000185,-0.000227,-0.000093,-0.000227,-0.000093,-0.000454,-0.000093,-0.000507,-0.000093,-0.000093,-0.000131,-0.000334,-0.003539,-0.000093,-0.000093,-0.000093,-0.000160,-0.000131,-0.000093,-0.000093,-0.000093,-0.000093,...,-0.000160,-0.000980,-0.000262,-0.000185,-0.000131,-0.000093,-0.000131,-0.000359,-0.000093,-0.000093,-0.000278,-0.000093,-0.000093,0.161944,-0.000093,-0.000093,-0.000093,-0.000093,-0.000748,-0.000207,-0.000185,-0.000293,-0.000278,-0.000093,-0.000131,-0.000093,-0.000093,-0.000093,-0.000262,-0.000131,-0.000278,-0.000093,-0.000093,-0.001290,-0.000093,-0.000160,-0.000093,-0.000093,-0.000093,-0.000093
2,acf,-0.000093,-0.000093,1.000000,-0.000293,-0.000093,-0.000093,-0.000131,-0.000131,-0.000700,-0.000227,-0.001183,-0.000160,-0.000359,-0.000093,-0.000131,-0.001149,-0.000093,-0.000185,-0.000227,-0.000093,-0.000227,-0.000093,-0.000454,-0.000093,-0.000507,-0.000093,-0.000093,-0.000131,-0.000334,-0.003539,-0.000093,-0.000093,-0.000093,-0.000160,-0.000131,-0.000093,-0.000093,-0.000093,-0.000093,...,-0.000160,-0.000980,-0.000262,-0.000185,-0.000131,-0.000093,-0.000131,-0.000359,-0.000093,-0.000093,-0.000278,-0.000093,-0.000093,-0.000571,-0.000093,-0.000093,-0.000093,-0.000093,-0.000748,-0.000207,-0.000185,-0.000293,-0.000278,-0.000093,-0.000131,-0.000093,-0.000093,-0.000093,-0.000262,-0.000131,-0.000278,-0.000093,-0.000093,-0.001290,-0.000093,-0.000160,-0.000093,-0.000093,-0.000093,-0.000093
3,af,-0.000293,-0.000293,-0.000293,1.000000,-0.000293,-0.000293,-0.000414,-0.000414,-0.002216,-0.000717,0.021380,-0.000507,-0.001134,-0.000293,-0.000414,0.022209,-0.000293,-0.000585,-0.000717,-0.000293,-0.000717,-0.000293,-0.001436,-0.000293,-0.001605,-0.000293,-0.000293,-0.000414,-0.001056,0.007582,-0.000293,-0.000293,-0.000293,-0.000507,-0.000414,-0.000293,-0.000293,-0.000293,-0.000293,...,-0.000507,0.057269,-0.000828,-0.000585,-0.000414,-0.000293,-0.000414,-0.001134,-0.000293,-0.000293,-0.000878,-0.000293,-0.000293,0.049606,-0.000293,-0.000293,-0.000293,-0.000293,-0.002367,-0.000655,-0.000585,-0.000926,-0.000878,-0.000293,-0.000414,-0.000293,-0.000293,-0.000293,-0.000828,-0.000414,-0.000878,-0.000293,-0.000293,-0.004081,-0.000293,-0.000507,-0.000293,-0.000293,-0.000293,-0.000293
4,agr,-0.000093,-0.000093,-0.000093,-0.000293,1.000000,1.000000,-0.000131,-0.000131,-0.000700,-0.000227,0.078226,-0.000160,-0.000359,-0.000093,0.707074,-0.001149,-0.000093,0.499931,-0.000227,-0.000093,-0.000227,-0.000093,-0.000454,-0.000093,-0.000507,-0.000093,-0.000093,-0.000131,-0.000334,-0.003539,-0.000093,-0.000093,-0.000093,-0.000160,-0.000131,-0.000093,-0.000093,-0.000093,-0.000093,...,-0.000160,-0.000980,0.353439,-0.000185,-0.000131,-0.000093,-0.000131,0.258032,-0.000093,-0.000093,-0.000278,-0.000093,-0.000093,0.161944,-0.000093,-0.000093,-0.000093,-0.000093,-0.000748,-0.000207,0.499931,-0.000293,-0.000278,-0.000093,-0.000131,-0.000093,-0.000093,-0.000093,-0.000262,-0.000131,-0.000278,-0.000093,-0.000093,0.071719,-0.000093,-0.000160,-0.000093,-0.000093,-0.000093,-0.000093
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
244,zh-Hans-CN,-0.000160,-0.000160,-0.000160,-0.000507,-0.000160,-0.000160,-0.000227,-0.000227,-0.001213,-0.000393,-0.002049,-0.000278,-0.000621,-0.000160,-0.000227,-0.001990,-0.000160,-0.000321,-0.000393,-0.000160,-0.000393,-0.000160,-0.000786,-0.000160,-0.000879,-0.000160,-0.000160,-0.000227,-0.000578,-0.006131,-0.000160,-0.000160,-0.000160,-0.000278,-0.000227,-0.000160,-0.000160,-0.000160,-0.000160,...,-0.000278,-0.001697,-0.000453,-0.000321,-0.000227,-0.000160,-0.000227,-0.000621,-0.000160,-0.000160,-0.000481,-0.000160,-0.000160,-0.000990,-0.000160,-0.000160,-0.000160,-0.000160,-0.001296,-0.000358,-0.000321,-0.000507,-0.000481,-0.000160,-0.000227,-0.000160,-0.000160,-0.000160,-0.000453,-0.000227,0.192076,-0.000160,-0.000160,0.082076,0.577297,1.000000,0.577297,0.577297,0.577297,-0.000160
245,zh-Hant,-0.000093,-0.000093,-0.000093,-0.000293,-0.000093,-0.000093,-0.000131,-0.000131,-0.000700,-0.000227,-0.001183,-0.000160,-0.000359,-0.000093,-0.000131,-0.001149,-0.000093,-0.000185,-0.000227,-0.000093,-0.000227,-0.000093,-0.000454,-0.000093,-0.000507,-0.000093,-0.000093,-0.000131,-0.000334,-0.003539,-0.000093,-0.000093,-0.000093,-0.000160,-0.000131,-0.000093,-0.000093,-0.000093,-0.000093,...,-0.000160,-0.000980,-0.000262,-0.000185,-0.000131,-0.000093,-0.000131,-0.000359,-0.000093,-0.000093,-0.000278,-0.000093,-0.000093,-0.000571,-0.000093,-0.000093,-0.000093,-0.000093,-0.000748,-0.000207,-0.000185,-0.000293,-0.000278,-0.000093,-0.000131,-0.000093,-0.000093,-0.000093,-0.000262,-0.000131,0.333210,-0.000093,-0.000093,0.071719,1.000000,0.577297,1.000000,1.000000,1.000000,-0.000093
246,zh-Hant-HK,-0.000093,-0.000093,-0.000093,-0.000293,-0.000093,-0.000093,-0.000131,-0.000131,-0.000700,-0.000227,-0.001183,-0.000160,-0.000359,-0.000093,-0.000131,-0.001149,-0.000093,-0.000185,-0.000227,-0.000093,-0.000227,-0.000093,-0.000454,-0.000093,-0.000507,-0.000093,-0.000093,-0.000131,-0.000334,-0.003539,-0.000093,-0.000093,-0.000093,-0.000160,-0.000131,-0.000093,-0.000093,-0.000093,-0.000093,...,-0.000160,-0.000980,-0.000262,-0.000185,-0.000131,-0.000093,-0.000131,-0.000359,-0.000093,-0.000093,-0.000278,-0.000093,-0.000093,-0.000571,-0.000093,-0.000093,-0.000093,-0.000093,-0.000748,-0.000207,-0.000185,-0.000293,-0.000278,-0.000093,-0.000131,-0.000093,-0.000093,-0.000093,-0.000262,-0.000131,0.333210,-0.000093,-0.000093,0.071719,1.000000,0.577297,1.000000,1.000000,1.000000,-0.000093
247,zh-Hant-TW,-0.000093,-0.000093,-0.000093,-0.000293,-0.000093,-0.000093,-0.000131,-0.000131,-0.000700,-0.000227,-0.001183,-0.000160,-0.000359,-0.000093,-0.000131,-0.001149,-0.000093,-0.000185,-0.000227,-0.000093,-0.000227,-0.000093,-0.000454,-0.000093,-0.000507,-0.000093,-0.000093,-0.000131,-0.000334,-0.003539,-0.000093,-0.000093,-0.000093,-0.000160,-0.000131,-0.000093,-0.000093,-0.000093,-0.000093,...,-0.000160,-0.000980,-0.000262,-0.000185,-0.000131,-0.000093,-0.000131,-0.000359,-0.000093,-0.000093,-0.000278,-0.000093,-0.000093,-0.000571,-0.000093,-0.000093,-0.000093,-0.000093,-0.000748,-0.000207,-0.000185,-0.000293,-0.000278,-0.000093,-0.000131,-0.000093,-0.000093,-0.000093,-0.000262,-0.000131,0.333210,-0.000093,-0.000093,0.071719,1.000000,0.577297,1.000000,1.000000,1.000000,-0.000093


Aclaración: sé que no está del todo bien la matriz, ya que lo ideal sería que la columna de índice sea la actual babel_lang. Sinceramente me peleé bastante con esto y no pude encontrar una solución.

Por lo que se puede ver en la matriz hay muchos valores iguales (y negativos), esto sucede ya que hay muchos idiomas que son poco conocidos.

Veamos algunos casos de los idiomas más hablados:

In [None]:
# Distribution of the correlations between 'es' and every language column.
# Round before counting: the raw floats can differ only in their last bits,
# so value_counts() would otherwise list the same displayed value on
# several separate rows.
corr['es'].round(6).value_counts()

 0.002303    12
 0.002303     6
 0.002303     4
-0.040176     4
 0.002303     4
             ..
 0.006516     1
-0.016556     1
 0.002303     1
 0.042766     1
 1.000000     1
Name: es, Length: 214, dtype: int64

In [None]:
# Distribution of the correlations between 'en' and every language column.
# Round before counting: the raw floats can differ only in their last bits,
# so value_counts() would otherwise list the same displayed value on
# several separate rows.
corr['en'].round(6).value_counts()

0.003642    12
0.003642     6
0.003642     4
0.003642     4
0.003642     4
            ..
0.005150     1
0.003642     1
0.011039     1
0.006308     1
1.000000     1
Name: en, Length: 211, dtype: int64