In [0]:
!pip install --upgrade -q gspread

In [29]:
from google.colab import files
import pandas as pd
from google.colab import auth
import getpass

import seaborn as sns
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from plotly.offline import download_plotlyjs, init_notebook_mode, iplot
from plotly.graph_objs import *

init_notebook_mode(connected=True)
%matplotlib inline

In [0]:
# setup connection to google drive
!apt-get install -y -qq software-properties-common python-software-properties module-init-tools
!add-apt-repository -y ppa:alessandro-strada/ppa 2>&1 > /dev/null
!apt-get update -qq 2>&1 > /dev/null
!apt-get -y install -qq google-drive-ocamlfuse fuse

# Connect a Google Drive folder 
named *memetics* to this Colab instance for data storage. Pew. Not completely easy-peasy.

In [9]:
!mkdir -p drive
!google-drive-ocamlfuse drive
!mkdir drive/memetics
!ls drive/memetics

fuse: mountpoint is not empty
fuse: if you are sure this is safe, use the 'nonempty' mount option


In [7]:
auth.authenticate_user()
from oauth2client.client import GoogleCredentials
creds = GoogleCredentials.get_application_default()


!google-drive-ocamlfuse -headless -id={creds.client_id} -secret={creds.client_secret} < /dev/null 2>&1 | grep URL
vcode = getpass.getpass()
!echo {vcode} | google-drive-ocamlfuse -headless -id={creds.client_id} -secret={creds.client_secret}

Please, open the following URL in a web browser: https://accounts.google.com/o/oauth2/auth?client_id=32555940559.apps.googleusercontent.com&redirect_uri=urn%3Aietf%3Awg%3Aoauth%3A2.0%3Aoob&scope=https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive&response_type=code&access_type=offline&approval_prompt=force
··········
Please, open the following URL in a web browser: https://accounts.google.com/o/oauth2/auth?client_id=32555940559.apps.googleusercontent.com&redirect_uri=urn%3Aietf%3Awg%3Aoauth%3A2.0%3Aoob&scope=https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive&response_type=code&access_type=offline&approval_prompt=force
Please enter the verification code: Access token retrieved correctly.


In [10]:
!ls drive/memetics

dataframe_survey_2018-01-23_jung-liwc-dt-jung_dummies-enriched.pickle
survey_2018-01-23_jung-liwc-dt-jung_dummies-enriched_no-text.csv
survey_2018-01-23_jung-liwc-dt-jung_dummies-enriched_no-text.csv.ods
survey_2018-01-23_jung-liwc-dt-jung_dummies-enriched_no-text_jung_actual_uclassify.ods
survey_2018-01-23_jung-liwc-dt-jung_dummies-enriched_no-text_liwc1.csv
survey_2018-01-23_jung-liwc-dt-jung_dummies-enriched_no-text_liwc2.csv
survey_2018-01-23_jung-liwc-dt-jung_dummies-enriched_no-text_liwc3.csv
survey_2018-01-23_jung-liwc-dt-jung_dummies-enriched_no-text_liwc4.csv
survey_2018-01-23_jung-liwc-dt-jung_dummies-enriched_no-text_liwc5.csv
survey_2018-01-23_jung-liwc-dt-jung_dummies-enriched_no-text_liwc6.csv


# Load survey-data to Pandas DataFrame

Cleaned and enriched version of the Typealyzer survey data with text column removed to save memory.

All pre-processing steps are documented in Notebooks in the private [Gitlab repo Jung-Meyers-tagger](https://gitlab.com/memetic-science/Jung-Myers-tagger).

The file is 37,4 MB and saved in Gitlab repo under */data/processed/*.

Files are converted to byte-strings when uploaded to Google Colaboratory and needs to be encoded with .encode("utf-8") to be manipulated.


In [26]:
df = pd.read_csv("drive/memetics/survey_2018-01-23_jung-liwc-dt-jung_dummies-enriched.csv", sep=";")
df.head()


Columns (17) have mixed types. Specify dtype option on import or set low_memory=False.



Unnamed: 0.1,Unnamed: 0,url,typealyzer,actual,e,s,t,sntf_s,sntf_n,sntf_t,...,home,cause,assent,verb,auxverb,death,time,discrep,health,datetime
0,0,http://jonkagstrom.com,ISTP,INFJ,0.420758,0.651605,0.652214,0.512359,0.274234,0.134025,...,0.0047,0.017518,0.04358,0.190558,0.125614,0.005982,0.065157,0.012177,0.02414,2012-08-28 09:08:55
1,1,http://adropofcolour.tumblr.com,ISFP,INFJ,0.291281,0.787844,0.460961,0.663515,0.178565,0.069282,...,0.005663,0.023783,0.03171,0.180068,0.10872,0.00453,0.069083,0.016988,0.011325,2012-08-28 08:08:11
2,2,http://godheadcomplex.tumblr.com,ESFP,INFP,0.883579,0.951693,0.238407,0.855921,0.046931,0.02185,...,0.019704,0.004926,0.039409,0.206897,0.147783,0.014778,0.024631,0.014778,0.014778,2012-08-28 09:08:34
3,3,http://chaotikaeon2.tumblr.com,INTJ,INTP,0.332444,0.357863,0.591322,0.147668,0.252326,0.339831,...,0.0,0.014011,0.028021,0.162872,0.084063,0.005254,0.049037,0.015762,0.028021,2012-08-28 10:08:31
4,4,http://codeode.com,ESTJ,I don't know,0.556231,0.881763,0.969999,0.449375,0.065664,0.466337,...,0.006375,0.021514,0.042231,0.18247,0.123506,0.008765,0.07012,0.009562,0.027888,2012-08-29 12:08:35


In [14]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 27959 entries, 0 to 27958
Data columns (total 83 columns):
Unnamed: 0    27959 non-null int64
url           27959 non-null object
typealyzer    27959 non-null object
actual        27959 non-null object
e             27959 non-null float64
s             27959 non-null float64
t             27959 non-null float64
sntf_s        27959 non-null float64
sntf_n        27959 non-null float64
sntf_t        27959 non-null float64
sntf_f        27959 non-null float64
date          27959 non-null object
text          27959 non-null object
domains       27959 non-null object
domain        27959 non-null object
func          27298 non-null object
att           27298 non-null object
funcatt       23804 non-null object
sexual        27959 non-null float64
i             27959 non-null float64
filler        27959 non-null float64
you           27959 non-null float64
preps         27959 non-null float64
friend        27959 non-null float64
affect        2

## Scikit-learn classification report
See [documentation](http://scikit-learn.org/stable/modules/model_evaluation.html#classification-report)

In [27]:
cr = classification_report(df['actual'], df['typealyzer'])
print(cr)

              precision    recall  f1-score   support

        ENFJ       0.04      0.01      0.02       613
        ENFP       0.06      0.01      0.02      1564
        ENTJ       0.04      0.02      0.03       461
        ENTP       0.07      0.03      0.05       989
        ESFJ       0.02      0.10      0.03       292
        ESFP       0.02      0.35      0.04       451
        ESTJ       0.02      0.06      0.03       140
        ESTP       0.01      0.21      0.03       239
I don't know       0.00      0.00      0.00       661
        INFJ       0.19      0.01      0.01      4107
        INFP       0.22      0.03      0.05      4792
        INTJ       0.30      0.05      0.09      4699
        INTP       0.20      0.10      0.13      3911
        ISFJ       0.05      0.03      0.04      1272
        ISFP       0.07      0.22      0.11      1508
        ISTJ       0.05      0.05      0.05      1138
        ISTP       0.05      0.21      0.08      1122

 avg / total       0.16   


Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples.



The result is bad precision and recall values.

## Any imbalance between predicted and actual types? 

In [30]:
actual = df['actual'].value_counts()
predicted = df['typealyzer'].value_counts()

data = [Bar(x = actual.index.values, y = actual.values, name="Actual"),
       Bar(x = predicted.index.values, y = predicted.values, name="Typealyzer")]
iplot(data)

In [0]:
df

In [16]:
!cat /proc/meminfo

MemTotal:       13341892 kB
MemFree:         1156064 kB
MemAvailable:   12147196 kB
Buffers:          143680 kB
Cached:         10525300 kB
SwapCached:            0 kB
Active:          2406484 kB
Inactive:        9109636 kB
Active(anon):     847464 kB
Inactive(anon):      328 kB
Active(file):    1559020 kB
Inactive(file):  9109308 kB
Unevictable:           0 kB
Mlocked:               0 kB
SwapTotal:             0 kB
SwapFree:              0 kB
Dirty:              2928 kB
Writeback:             0 kB
AnonPages:        847176 kB
Mapped:           168948 kB
Shmem:               668 kB
Slab:             601580 kB
SReclaimable:     576220 kB
SUnreclaim:        25360 kB
KernelStack:        3136 kB
PageTables:         6044 kB
NFS_Unstable:          0 kB
Bounce:                0 kB
WritebackTmp:          0 kB
CommitLimit:     6670944 kB
Committed_AS:    1907732 kB
VmallocTotal:   34359738367 kB
VmallocUsed:           0 kB
VmallocChunk:          0 kB
AnonHugePag