In [1]:
import pandas as pd
import numpy as np
import sklearn as sk 


In [2]:
# Load the cleaned MBTI posts, discard the 'Unnamed: 0' column, and keep
# only the rows where both 'clean_posts' and 'posts' are non-null.
df = (
    pd.read_csv('mbti_cleaned.csv', encoding="ISO-8859-1")
    .drop('Unnamed: 0', axis=1)
    .dropna(subset=['clean_posts', 'posts'])
)
df.head()

Unnamed: 0,type,posts,clean_posts
0,INFJ,enfp and intj moments https://www.youtube.com...,enfp intj moments sportscenter top ten plays p...
1,INFJ,What has been the most life-changing experienc...,lifechanging experience life
2,INFJ,http://www.youtube.com/watch?v=vXZeYwwRDw8 h...,repeat today
3,INFJ,May the PerC Experience immerse you.,may perc experience immerse
4,INFJ,The last thing my INFJ friend posted on his fa...,last thing infj friend posted facebook committ...


In [3]:
# Distinct values of the 'type' column, as a plain Python list.
type_labels = pd.unique(df['type'])
categories = type_labels.tolist()

In [4]:
from sklearn import model_selection


# NOTE(review): everything below is a bare triple-quoted string, so none of
# it is executed. The text inside it loops over `categories`, keeps only the
# rows of one type (dropping the 'type' and 'posts' columns), makes an 80/20
# train_test_split with no random_state passed, and writes the two parts to
# '<type>_train.csv' and '<type>_test.csv'.
# Presumably this is how the train/test corpora used later were generated —
# TODO confirm, and either restore it as real code or remove it.
"""for t in categories:
    new_df = df[df['type'] == t]
    new_df = new_df.drop(['type', 'posts'], axis=1)
    split_df = sk.model_selection.train_test_split(new_df, test_size=0.2, train_size=0.8)
    train, test = split_df[0], split_df[1]
    filename_train = t + '_train.csv'
    filename_test = t + '_test.csv'
    train.to_csv(filename_train)
    test.to_csv(filename_test)"""

In [5]:
import sklearn.datasets

# Read the training corpus from the "mbti_train" directory, restricted to
# the labels in `categories`; the shuffle is seeded with random_state=0.
train_load_options = dict(
    description=None,
    categories=categories,
    load_content=True,
    shuffle=True,
    encoding="ISO-8859-1",
    decode_error='strict',
    random_state=0,
)
posts_train = sk.datasets.load_files("mbti_train", **train_load_options)

In [6]:
# Print the label name behind each of the first 16 training targets.
for label_index in posts_train.target[:16]:
    print(posts_train.target_names[label_index])

ENFP
ESTJ
INFJ
INFP
ISFP
ESFJ
ENTJ
ISTJ
INTJ
ESTP
ISTP
INTP
ENTP
ENFJ
ESFP
ISFJ


In [7]:
from sklearn.feature_extraction.text import CountVectorizer
# Tokenize the training documents with scikit-learn and build the
# per-document token count matrix.
count_vect = CountVectorizer()
train_documents = posts_train.data
X_train_counts = count_vect.fit_transform(train_documents)
X_train_counts.shape

(16, 452881)

In [None]:
# Display the repr of the count matrix returned by count_vect.fit_transform.
X_train_counts

In [8]:
# Look up the entry stored for the token 'algorithm' in the fitted
# vocabulary (.get yields None when the token is absent).
count_vect.vocabulary_.get('algorithm')

329241

In [9]:
from sklearn.feature_extraction.text import TfidfTransformer
# Learn the TF-IDF weighting from the training counts, then apply it to
# those same counts.
tfidf_transformer = TfidfTransformer().fit(X_train_counts)
X_train_tfidf = tfidf_transformer.transform(X_train_counts)
X_train_tfidf.shape

(16, 452881)

In [10]:
from sklearn.naive_bayes import MultinomialNB
# Train a multinomial naive Bayes classifier on the TF-IDF features.
# fit() returns the estimator itself, so `clf` is the fitted model.
nb_model = MultinomialNB()
clf = nb_model.fit(X_train_tfidf, posts_train.target)

In [18]:
# Classify two ad-hoc sentences with the already-fitted vectorizer, TF-IDF
# transformer and naive Bayes model.
docs_new = ['God is love', 'OpenGL on the GPU is fast']
X_new_counts = count_vect.transform(docs_new)
X_new_tfidf = tfidf_transformer.transform(X_new_counts)

predicted1 = clf.predict(X_new_tfidf)

# Iterate over predicted1, the result computed just above. The loop used to
# zip over `predicted`, a name no visible cell assigns, so it either raised
# NameError on a fresh kernel or silently reported stale predictions.
for doc, category in zip(docs_new, predicted1):
    print('%r => %s' % (doc, posts_train.target_names[category]))


'God is love' => ENFP
'OpenGL on the GPU is fast' => ESFJ


In [23]:
from sklearn.pipeline import Pipeline
# Chain vectorizer -> TF-IDF -> naive Bayes into a single estimator.
nb_steps = [
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', MultinomialNB()),
]
text_clf = Pipeline(nb_steps)

In [28]:
# Fit the whole pipeline on the raw training documents and their labels.
text_clf.fit(X=posts_train.data, y=posts_train.target)

Pipeline(memory=None,
     steps=[('vect', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip...inear_tf=False, use_idf=True)), ('clf', MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))])

In [29]:
# Load the held-out documents from "mbti_test" with the same options that
# were used for "mbti_train".
posts_test = sk.datasets.load_files(
    "mbti_test",
    description=None,
    categories=categories,
    load_content=True,
    shuffle=True,
    encoding="ISO-8859-1",
    decode_error='strict',
    random_state=0,
)

# Score the naive Bayes pipeline: fraction of test documents whose
# predicted label equals the true label.
docs_test = posts_test.data
predicted2 = text_clf.predict(docs_test)
nb_hits = predicted2 == posts_test.target
np.mean(nb_hits)

0.875

In [32]:
from sklearn.linear_model import SGDClassifier
# Same vectorizer and TF-IDF steps as text_clf, but the final step is an
# SGD-trained classifier with hinge loss and L2 penalty.
sgd_model = SGDClassifier(
    loss='hinge',
    penalty='l2',
    alpha=1e-3,
    random_state=42,
    max_iter=5,
    tol=None,
)
sgd_steps = [
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', sgd_model),
]
text_clf2 = Pipeline(sgd_steps)

text_clf2.fit(posts_train.data, posts_train.target)


Pipeline(memory=None,
     steps=[('vect', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip...ty='l2', power_t=0.5, random_state=42, shuffle=True,
       tol=None, verbose=0, warm_start=False))])

In [33]:
# Accuracy of the SGD pipeline on the same test documents.
predicted3 = text_clf2.predict(docs_test)
sgd_hits = predicted3 == posts_test.target
np.mean(sgd_hits)

0.0625