In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
from matplotlib.colors import ListedColormap
import seaborn as sns

# to automate the NLP extraction...
from sklearn.feature_extraction.text import CountVectorizer

# Cross_val_score is the new class for today...
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn import metrics

# from sklearn.preprocessing import StandardScaler
# from sklearn.datasets import make_moons, make_circles, make_classification

# main ones to focus on for this sprint
from sklearn.linear_model import LogisticRegression
# from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import MultinomialNB
#from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis as QDA

# Covered in sprint 3
from sklearn.tree import DecisionTreeClassifier as DTC
from sklearn.ensemble import RandomForestClassifier

# Covered in sprint 4
# from sklearn.gaussian_process import GaussianProcessClassifier as GPC
# from sklearn.gaussian_process.kernels import RBF

# Neural Network!!
#from sklearn.neural_network import MLPClassifier as Neural

In [2]:
from sklearn.pipeline import Pipeline

In [3]:
# import the datasets

mbti = pd.read_csv('mbti_1.csv')

train = pd.read_csv('train.csv')

test = pd.read_csv('test.csv')

essays = pd.read_csv('Essay_data.csv')

<h1>Part I : Preprocessing</h1>

In [4]:
# functions for removing punctuation, url's, splitting the rows and converting posts to lower case

def split_rows(df):
    """
        Takes a dataframe as input, splits the posts in the posts column on the '|||' separating each post
        and returns a new, clean dataframe.
    """
    all_df = []
    for i, row in df.iterrows():
        for post in row['posts'].split('|||'):
            all_df.append([row['type'], post])
    all_df = pd.DataFrame(all_df, columns=['type', 'post'])
    return all_df

def remove_urls(df):
    """
        Takes a dataframe df as input and a raw string literal sub, returns a dataframe with posts
        where all the urls have been replaced by sub.
    """
    sub = r'url-web'
    pattern_url = r'http[s]?://(?:[A-Za-z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9A-Fa-f][0-9A-Fa-f]))+'
    df['post'] = df['post'].replace(to_replace = pattern_url, value = sub, regex = True)
    return df

## lambda function for converting posts into lowercase, takes a series as input and returns a series as output
lower = lambda x: x['post'].str.lower()

def remove_punc(all_mbti):
    """
        Removes punctuation from the posts in the dataframe.
    """
    #remove punctuation
    import string
    punc_numbers = string.punctuation + '0123456789'
    temp = lambda s: ''.join([l for l in s if l not in punc_numbers])
    all_mbti['post'] = all_mbti['post'].apply(temp)
    return all_mbti

In [5]:
def get_accuracy(data, y_tr,X_te,y_te, model):
    model.fit(data.toarray(), y_tr)
    
    pred = model.predict(X_te.toarray())
    
    return metrics.accuracy_score(y_te, pred)

def get_f1(data, y_tr, X_te, y_te, model):
    model.fit(data.toarray(), y_tr)
    
    pred = model.predict(X_te.toarray())
    return metrics.f1_score(y_te, pred)

<h2>Apply The Pre-Processing steps</h2>

In [6]:
mbti.shape

(8675, 2)

In [7]:
mbti.head()

Unnamed: 0,type,posts
0,INFJ,'http://www.youtube.com/watch?v=qsXHcwe3krw|||...
1,ENTP,'I'm finding the lack of me in these posts ver...
2,INTP,'Good one _____ https://www.youtube.com/wat...
3,INTJ,"'Dear INTP, I enjoyed our conversation the o..."
4,ENTJ,'You're fired.|||That's another silly misconce...


In [8]:
mbti.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8675 entries, 0 to 8674
Data columns (total 2 columns):
type     8675 non-null object
posts    8675 non-null object
dtypes: object(2)
memory usage: 135.6+ KB


In [9]:
mbti.describe()

Unnamed: 0,type,posts
count,8675,8675
unique,16,8675
top,INFP,'yes ; ; this has very much been my wish for a...
freq,1832,1


In [10]:
train.shape

(6506, 2)

In [11]:
# Unpack the posts

train = split_rows(train)

train.shape

(316548, 2)

In [12]:
train.head()

Unnamed: 0,type,post
0,INFJ,'http://www.youtube.com/watch?v=qsXHcwe3krw
1,INFJ,http://41.media.tumblr.com/tumblr_lfouy03PMA1q...
2,INFJ,enfp and intj moments https://www.youtube.com...
3,INFJ,What has been the most life-changing experienc...
4,INFJ,http://www.youtube.com/watch?v=vXZeYwwRDw8 h...


In [13]:
# remove web urls

train = remove_urls(train)

In [14]:
train.head()

Unnamed: 0,type,post
0,INFJ,'url-web
1,INFJ,url-web
2,INFJ,enfp and intj moments url-web sportscenter n...
3,INFJ,What has been the most life-changing experienc...
4,INFJ,url-web url-web On repeat for most of today.


In [15]:
# make lowercase

train['post'] = lower(train)

In [16]:
train.head()

Unnamed: 0,type,post
0,INFJ,'url-web
1,INFJ,url-web
2,INFJ,enfp and intj moments url-web sportscenter n...
3,INFJ,what has been the most life-changing experienc...
4,INFJ,url-web url-web on repeat for most of today.


In [17]:
# remove punctuation

train = remove_punc(train)

In [18]:
train.head()

Unnamed: 0,type,post
0,INFJ,urlweb
1,INFJ,urlweb
2,INFJ,enfp and intj moments urlweb sportscenter no...
3,INFJ,what has been the most lifechanging experience...
4,INFJ,urlweb urlweb on repeat for most of today


In [19]:
# We shuffle the dataframe we just split to reduce noise from the ordering of the posts.

train = train.sample(frac=1).reset_index(drop=True)

In [20]:
train.head()

Unnamed: 0,type,post
0,INTJ,maybe i can make up for it the brief history ...
1,INTJ,archery and i like italso i have practiced swi...
2,ENFP,i confess im afraid of being undeserving c dea...
3,ENTJ,well said i cant think of another way to say i...
4,ISTP,there isnt a person in my life that i would sa...


In [21]:
# create columns for each type

def create_cols(df):
    
    df['first'] = df.apply(lambda row: row['type'][0], axis = 1)
    df['second'] = df.apply(lambda row: row['type'][1], axis = 1)
    df['third'] = df.apply(lambda row: row['type'][2], axis = 1)
    df['fourth'] = df.apply(lambda row: row['type'][3], axis = 1)
    
    return df

In [22]:
train = create_cols(train)

In [23]:
train.head()

Unnamed: 0,type,post,first,second,third,fourth
0,INTJ,maybe i can make up for it the brief history ...,I,N,T,J
1,INTJ,archery and i like italso i have practiced swi...,I,N,T,J
2,ENFP,i confess im afraid of being undeserving c dea...,E,N,F,P
3,ENTJ,well said i cant think of another way to say i...,E,N,T,J
4,ISTP,there isnt a person in my life that i would sa...,I,S,T,P


<h3>I vs E</h3>

In [24]:
classifiers = [LogisticRegression(), MultinomialNB(), 
               GaussianNB(), LDA(), QDA(), DTC(max_depth = 5), 
               RandomForestClassifier(max_depth = 5, n_estimators = 10, max_features = 1)
              ]

names = ['Logistic Regression', 'Multinomial Naive Bayes',
         'Gaussian Naive Bayes', 'LDA', "QDA",
         "Decision Tree", "Random Forest"
        ]

In [25]:
X = train['post']

y1 = train['first']

y2 = train['second']

y3 = train['third']

y4 = train['fourth']

In [26]:
y1[y1 == 'I'].shape

(243222,)

In [27]:
y1[y1 == 'E'].shape

(73326,)

In [28]:
 def fit_all(xtr, ytr, xte, yte):
        
    import time
        
    results = []

    for name, clf in zip(names, classifiers):

        print ('... scoring')
        print('...' + name)
        start = time.time()
        accuracy  = get_accuracy( xtr, ytr, xte, yte, clf)
        f1 = get_f1(xtr, ytr, xte, yte, clf)
        duration = time.time() - start
        print('... done!')
        print()

        results.append([name, accuracy, f1, (accuracy+f1)/2.0, duration])


    results = pd.DataFrame(results, columns=['Classifier', 'Accuracy', 'f1', 'Mean (f1 and accuracy)', 'duration'])
    results.set_index('Classifier', inplace= True)
    
    return results

In [29]:
cv = CountVectorizer(stop_words = 'english', min_df = 0.1)

In [30]:
X = cv.fit_transform(X)

In [31]:
from sklearn.preprocessing import LabelEncoder

In [32]:
le1 = LabelEncoder()
y1 = le1.fit_transform(y1)

le2 = LabelEncoder()
y2 = le2.fit_transform(y2)

le3 = LabelEncoder()
y3 = le3.fit_transform(y3)

le4 = LabelEncoder()
y4 = le4.fit_transform(y4)

In [33]:
X_train1, X_test1, y_train1, y_test1 = train_test_split(X, y1)

X_train2, X_test2, y_train2, y_test2 = train_test_split(X, y2)

X_train3, X_test3, y_train3, y_test3 = train_test_split(X, y3)

X_train4, X_test4, y_train4, y_test4 = train_test_split(X, y4)

In [34]:
X_train2.shape

(237411, 5)

In [35]:
y_train1.shape

(237411,)

In [36]:
y_train1[0]

1

In [37]:
rdfs = [fit_all(X_train1, y_train1, X_test1, y_test1), fit_all(X_train2, y_train2, X_test2, y_test2), 
        fit_all(X_train3, y_train3, X_test3, y_test3), fit_all(X_train4, y_train4, X_test4, y_test4)
       ]

... scoring
...Logistic Regression
... done!

... scoring
...Multinomial Naive Bayes
... done!

... scoring
...Gaussian Naive Bayes
... done!

... scoring
...LDA
... done!

... scoring
...QDA
... done!

... scoring
...Decision Tree
... done!

... scoring
...Random Forest
... done!

... scoring
...Logistic Regression


  'precision', 'predicted', average, warn_for)


... done!

... scoring
...Multinomial Naive Bayes
... done!

... scoring
...Gaussian Naive Bayes
... done!

... scoring
...LDA
... done!

... scoring
...QDA
... done!

... scoring
...Decision Tree
... done!

... scoring
...Random Forest


  'precision', 'predicted', average, warn_for)


... done!

... scoring
...Logistic Regression
... done!

... scoring
...Multinomial Naive Bayes
... done!

... scoring
...Gaussian Naive Bayes
... done!

... scoring
...LDA
... done!

... scoring
...QDA
... done!

... scoring
...Decision Tree
... done!

... scoring
...Random Forest
... done!

... scoring
...Logistic Regression
... done!

... scoring
...Multinomial Naive Bayes
... done!

... scoring
...Gaussian Naive Bayes
... done!

... scoring
...LDA
... done!

... scoring
...QDA
... done!

... scoring
...Decision Tree
... done!

... scoring
...Random Forest
... done!



In [38]:
print('hjkh')

hjkh


In [39]:
pd.concat(rdfs, axis = 0, keys = ['First', 'Second', 'Third', 'Fourth'])

Unnamed: 0_level_0,Unnamed: 1_level_0,Accuracy,f1,Mean (f1 and accuracy),duration
Unnamed: 0_level_1,Classifier,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
First,Logistic Regression,0.76916,0.86952,0.81934,0.634491
First,Multinomial Naive Bayes,0.76916,0.86952,0.81934,0.114032
First,Gaussian Naive Bayes,0.769059,0.869442,0.81925,0.16801
First,LDA,0.76916,0.86952,0.81934,0.247898
First,QDA,0.769021,0.869414,0.819218,0.137647
First,Decision Tree,0.769147,0.869512,0.819329,0.14728
First,Random Forest,0.76916,0.86952,0.81934,0.810233
Second,Logistic Regression,0.863022,0.0,0.431511,0.577717
Second,Multinomial Naive Bayes,0.863022,0.0,0.431511,0.098968
Second,Gaussian Naive Bayes,0.861696,0.004366,0.433031,0.167082


<h3>Therefore, we will be using the following classifiers for each axis:</h3>

<ul>
    <ol><strong>I vs E</strong>: Multinomial Naive Bayes (best avg accuracy and f1 plus is fastest)</ol>
    <ol><strong>N vs S</strong>: Multinomial Naive Bayes (best avg accuracy and f1 plus is fastest)</ol>
    <ol><strong>T vs F</strong>: Qudratic Discriminant Analysis (best avg acc and f1 plus is fastest)</ol>
    <ol><strong>J vs P</strong>: Multinomial Naive Bayes (best avg accuracy and f1 plus is fastest)</ol>
</ul>

In [40]:
selected_classifiers = {'IE':MultinomialNB(), 'NS':MultinomialNB(), 'TF':QDA(), 'JP':MultinomialNB()}

In [41]:
# find the best combination of parameters

mnb = {'alpha': [0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0], 'fit_prior': (True, False)}

qda = {'priors':[[0.1, 0.9],[0.2, 0.8], [0.3, 0.7], [0.4, 0.6],[0.5, 0.5], [0.6, 0.7], [0.7, 0.3], [0.8, 0.2], [0.9, 0.1]] , 'reg_param': [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8 ,0.9, 1.0]}

In [45]:
from sklearn.model_selection import GridSearchCV

mcl1 = GridSearchCV(selected_classifiers['IE'], mnb)

clf1 = mcl1.fit(X_train1, y_train1)


mcl2 = GridSearchCV(selected_classifiers['NS'], mnb)

clf2 = mcl2.fit(X_train2, y_train2)


qda3 = GridSearchCV(selected_classifiers['TF'], qda)

clf3 = qda3.fit(X_train3.toarray(), y_train3)


mcl4 = GridSearchCV(selected_classifiers['JP'], mnb)

clf4 = mcl4.fit(X_train4, y_train4)

print(clf3)

  'setting alpha = %.1e' % _ALPHA_MIN)
  'setting alpha = %.1e' % _ALPHA_MIN)
  'setting alpha = %.1e' % _ALPHA_MIN)
  'setting alpha = %.1e' % _ALPHA_MIN)
  'setting alpha = %.1e' % _ALPHA_MIN)
  'setting alpha = %.1e' % _ALPHA_MIN)
  'setting alpha = %.1e' % _ALPHA_MIN)
  'setting alpha = %.1e' % _ALPHA_MIN)
  'setting alpha = %.1e' % _ALPHA_MIN)
  'setting alpha = %.1e' % _ALPHA_MIN)
  'setting alpha = %.1e' % _ALPHA_MIN)
  'setting alpha = %.1e' % _ALPHA_MIN)
  'setting alpha = %.1e' % _ALPHA_MIN)
  'setting alpha = %.1e' % _ALPHA_MIN)
  'setting alpha = %.1e' % _ALPHA_MIN)
  'setting alpha = %.1e' % _ALPHA_MIN)
  'setting alpha = %.1e' % _ALPHA_MIN)
  'setting alpha = %.1e' % _ALPHA_MIN)
  'setting alpha = %.1e' % _ALPHA_MIN)
  'setting alpha = %.1e' % _ALPHA_MIN)


GridSearchCV(cv=None, error_score='raise',
       estimator=QuadraticDiscriminantAnalysis(priors=None, reg_param=0.0,
               store_covariance=False, store_covariances=None, tol=0.0001),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'priors': [[0.1, 0.9], [0.2, 0.8], [0.3, 0.7], [0.4, 0.6], [0.5, 0.5], [0.6, 0.7], [0.7, 0.3], [0.8, 0.2], [0.9, 0.1]], 'reg_param': [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)


  'setting alpha = %.1e' % _ALPHA_MIN)


In [55]:



p1 = clf1.predict(cv.transform(test['posts']))

p2 = clf2.predict(cv.transform(test['posts']))

p3 = clf3.predict(cv.transform(test['posts']).toarray())

p4 = clf4.predict(cv.transform(test['posts']))

In [56]:
test['E/I'] = le1.inverse_transform(p1)

In [57]:
test.head()

Unnamed: 0,type,posts,E/I
0,,Musical Linguistic Logic & Naturalist (tied)|...,I
1,,'You: hello :) Stranger: hii You: how are you ...,I
2,,'What worked for me was knowing that limerence...,I
3,,'Please forget about him. You should definitel...,I
4,,"'Ooh, ENTJ Celestia is actually an interesting...",I


In [58]:
test['N/S'] = le2.inverse_transform(p2)

test['T/F'] = le3.inverse_transform(p3)

test['J/P'] = le4.inverse_transform(p4)

In [60]:
test.drop('type', inplace = True, axis = 1)

In [61]:
test.head()

Unnamed: 0,posts,E/I,N/S,T/F,J/P
0,Musical Linguistic Logic & Naturalist (tied)|...,I,N,F,P
1,'You: hello :) Stranger: hii You: how are you ...,I,N,F,P
2,'What worked for me was knowing that limerence...,I,N,F,P
3,'Please forget about him. You should definitel...,I,N,F,P
4,"'Ooh, ENTJ Celestia is actually an interesting...",I,N,F,P


In [62]:
test.to_csv('submission.csv')