# Pre-Jupyter Notebook Setup

In [6]:
# #Setup
# pyenv virtualenv 3.8.11 mbti_env
# cd ~Documents/Github/MBTI
# pyenv local mbti_env
# pyenv activate mbti_env
# python

# #Create kernel
# python -m ipykernel install --user --name mbti_env

In [None]:
# (Run only once) Install packages
!pip install matplotlib pandas numpy seaborn textstat keras

# Import Libraries

In [56]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Packages only necessary for Feature Extraction
import re
import emoji
from urllib.parse import urlparse
import textstat

In [47]:
# Clarify current working directory
import os
os.getcwd()

'/Users/nathantorento/Documents/GitHub/MBTI'

In [5]:
# Import Dataset
df = pd.read_csv('mbti_1.csv')

# Understanding the Dataset

In [195]:
# See number of rows and columns
df.shape

(8675, 2)

In [196]:
# See columns names
df.columns

Index(['type', 'posts'], dtype='object')

In [197]:
# Preview top of data
df.head()

Unnamed: 0,type,posts
0,INFJ,'http://www.youtube.com/watch?v=qsXHcwe3krw|||...
1,ENTP,'I'm finding the lack of me in these posts ver...
2,INTP,'Good one _____ https://www.youtube.com/wat...
3,INTJ,"'Dear INTP, I enjoyed our conversation the o..."
4,ENTJ,'You're fired.|||That's another silly misconce...


In [198]:
# Preview bottom of data
df.tail()

Unnamed: 0,type,posts
8670,ISFP,'https://www.youtube.com/watch?v=t8edHB_h908||...
8671,ENFP,'So...if this thread already exists someplace ...
8672,INTP,'So many questions when i do these things. I ...
8673,INFP,'I am very conflicted right now when it comes ...
8674,INFP,'It has been too long since I have been on per...


In [199]:
# Preview a random sample of the data
df.sample(n=5)

Unnamed: 0,type,posts
5786,INFP,'Are you saying that your love for someone is ...
5207,INFP,'Why I feel like this? 1. Todays world is ful...
27,ISFP,'They paint without numbers|||I'd guess at ist...
4106,INFJ,'When I was about 8 or 9 I started realizing h...
801,INTP,'I get annoyed more easily.|||I hate most rema...


In [200]:
# See column datatypes
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8675 entries, 0 to 8674
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   type    8675 non-null   object
 1   posts   8675 non-null   object
dtypes: object(2)
memory usage: 135.7+ KB


In [201]:
# See statistical summaries for each column
df.describe()

Unnamed: 0,type,posts
count,8675,8675
unique,16,8675
top,INFP,'http://www.youtube.com/watch?v=qsXHcwe3krw|||...
freq,1832,1


# Data Cleaning

In [48]:
# Count the amount of missing data
# Also achieveable with df.info()
df.isnull().sum()

type       0
post       0
emotion    0
dtype: int64

In [203]:
# Check for duplicates
df.duplicated().value_counts()

False    8675
Name: count, dtype: int64

# Feature Extraction

## Text separation
Looking up close at any one post reveals that each entry is actually a long string of 50 posts by the same user, each separated with '|||'. We should separate each post as it's own data entry or row. Remember, one of our ultimate goals is to create a model that most accurately predict the personality type of a user based on one post. Training on data closer to the test will generate more contextually accurate models.

In [18]:
# Examine the posts up close
df.iloc[0,1]

"'http://www.youtube.com/watch?v=qsXHcwe3krw|||http://41.media.tumblr.com/tumblr_lfouy03PMA1qa1rooo1_500.jpg|||enfp and intj moments  https://www.youtube.com/watch?v=iz7lE1g4XM4  sportscenter not top ten plays  https://www.youtube.com/watch?v=uCdfze1etec  pranks|||What has been the most life-changing experience in your life?|||http://www.youtube.com/watch?v=vXZeYwwRDw8   http://www.youtube.com/watch?v=u8ejam5DP3E  On repeat for most of today.|||May the PerC Experience immerse you.|||The last thing my INFJ friend posted on his facebook before committing suicide the next day. Rest in peace~   http://vimeo.com/22842206|||Hello ENFJ7. Sorry to hear of your distress. It's only natural for a relationship to not be perfection all the time in every moment of existence. Try to figure the hard times as times of growth, as...|||84389  84390  http://wallpaperpassion.com/upload/23700/friendship-boy-and-girl-wallpaper.jpg  http://assets.dornob.com/wp-content/uploads/2010/04/round-home-design.jpg ...

# Preprocess Object

In [81]:
# Class

class Preprocessor:
    def __init__(self, df=None, altered=False):
        if df is None:
            print("Please provide a dataframe using Preprocessor.fit(df) before continuing.")
        self.df = df
        self.altered = False

    def fit(self, df):
        # self.df = self.split_posts(df)
        return self.df

    # –––––––– CLEANING ––––––––
    # Separate multiple "posts" into their own rows
    def split_posts(self):
        posts_lists = []
        for i, row in self.df.iterrows():
            split_post = row['posts'].split('|||')
            for post in split_post:
                if len(post) != 0: #exclude empty posts
                    posts_lists.append({"type": row['type'], "post": post}) #store each separated post as a dictionary
        self.df = pd.DataFrame(posts_lists)
        
        return self.df
    
    # Convert emojis to text
    def convert_emojis_to_text(self):
        self.df['post'] = self.df['post'].apply(lambda x: emoji.demojize(x))

        self.altered = True
        return self.df


    # –––––––– FEATURE EXTRACTION ––––––––
    # Calculate readability through Flesch-Kincaid metric using 'textstat' package
    def calculate_readability(self):
        self.df['readability'] = self.df['post'].apply(lambda post: textstat.flesch_reading_ease(post))
        
        return self.df


    # Create dummy variables for E/I, S/N, T/F, J/P
    def create_mbti_binaries(self):
        self.df['E/I'] = self.df['type'].apply(lambda x: 1 if x[0] == 'E' else 0)
        self.df['S/N'] = self.df['type'].apply(lambda x: 1 if x[1] == 'S' else 0)
        self.df['T/F'] = self.df['type'].apply(lambda x: 1 if x[2] == 'T' else 0)
        self.df['J/P'] = self.df['type'].apply(lambda x: 1 if x[3] == 'J' else 0)

        return self.df
    
    # Create variable that measures length of post
    def measure_character_length(self):
        self.df['post_length'] = self.df['post'].apply(lambda x: len(x))
        
        return self.df
    
    # Create variable that checks if more than 50% of characters in a post are capitalized
    def check_percentage_caps(self):
        self.df['post_caps_over_50_percent'] = self.df['post'].apply(lambda post: True if sum(1 for c in post if c.isupper())/len(post) > 0.5 else False)
        
        return self.df
    
    # –––––––– FEATURE GROUP: TOP URL EXTRACTION ––––––––
    # Parse URLs in posts to examine the trends
    def extract_base_urls(self):
        extracted_based_urls = self.df['post'].apply(lambda post: urlparse(post).netloc)
        self.df['url'] = extracted_based_urls

        hyperlinks_count = pd.Series(extracted_based_urls).value_counts().reset_index().rename(columns={"post": "base_url"})

        return hyperlinks_count

    # Returns instance of top url or other across data
    def count_url(self, top_urls):
        url_tracker_columns = [f'has_{url}_url' for url in top_urls+['other']]
        
        # Fill in "has_{top_url}_url" columns
        for i in range(len(top_urls)):
            self.df[url_tracker_columns[i]] = self.df['url'].apply(lambda url: 1 if top_urls[i] in url else 0)
        
        # Fill in "has_other_url" columns
        self.df[url_tracker_columns[-1]] = self.df['url'].apply(lambda url: 1 if url != '' and not any(top_url in url for top_url in top_urls) else 0)

        return self.df
    # –––––––––––––––––––––––––––––––––––––––––––––––––––

    def count_mbti_mentions(self): # self_reference, references_others, total_references
        # List of MBTI types
        mbti_types = ['INTJ', 'INFJ', 'INFP', 'INTP', 'ENTJ', 'ENFJ', 'ENFP', 'ENTP', 'ISTJ', 'ISFJ', 'ISFP', 'ISTP', 'ESTJ', 'ESFJ', 'ESFP', 'ESTP']
        
        self.df[f'self_reference'] = self.df.apply(lambda row: row['post'].count(row['type']), axis=1)
        # self.df[f'self_reference'] = self.df.apply(lambda row: sum(row['type'] in row['post']), axis=1)
        # self.df[f'references_others'] = self.df.apply(lambda row: sum(mbti_type in row['post'] for mbti_type in filter(lambda x: x!= row['type'], mbti_types)), axis=1)
        
        return self.df



In [82]:
df = pd.read_csv('mbti_emotions.csv')
df = df.astype(str)

mbti_2017 = Preprocessor(df)
mbti_2017.count_mbti_mentions()

Unnamed: 0,type,post,emotion,self_reference
0,INFJ,'http://www.youtube.com/watch?v=qsXHcwe3krw,love,0
1,INFJ,http://41.media.tumblr.com/tumblr_lfouy03PMA1q...,joy,0
2,INFJ,enfp and intj moments https://www.youtube.com...,joy,0
3,INFJ,What has been the most life-changing experienc...,sadness,0
4,INFJ,http://www.youtube.com/watch?v=vXZeYwwRDw8 h...,joy,0
...,...,...,...,...
421752,INFP,I was going to close my facebook a few months ...,joy,0
421753,INFP,30 Seconds to Mars - All of my collections. It...,sadness,0
421754,INFP,"I have seen it, and i agree. I did actually th...",joy,0
421755,INFP,Ok so i have just watched Underworld 4 (Awaken...,joy,0


In [85]:
mbti_2017.df['self_reference'].value_counts()

self_reference
0    383992
1     32577
2      4545
3       553
4        76
5        10
7         3
6         1
Name: count, dtype: int64

# Data Cleaning Before Feature Extraction

## Split Posts

In [27]:
mbti_2017 = Preprocessor(df)
mbti_2017.split_posts()
mbti_2017.df

Unnamed: 0,type,post
0,INFJ,'http://www.youtube.com/watch?v=qsXHcwe3krw
1,INFJ,http://41.media.tumblr.com/tumblr_lfouy03PMA1q...
2,INFJ,enfp and intj moments https://www.youtube.com...
3,INFJ,What has been the most life-changing experienc...
4,INFJ,http://www.youtube.com/watch?v=vXZeYwwRDw8 h...
...,...,...
421752,INFP,I was going to close my facebook a few months ...
421753,INFP,30 Seconds to Mars - All of my collections. It...
421754,INFP,"I have seen it, and i agree. I did actually th..."
421755,INFP,Ok so i have just watched Underworld 4 (Awaken...


## Convert Emojis to Text

In [28]:
# mbti_2017.convert_emojis_to_text()

# Feature Extraction

## Statistical features using imported packages

### Flesch-Kincaid Readability
Read more: https://en.wikipedia.org/wiki/Flesch%E2%80%93Kincaid_readability_tests

In [29]:
# import textstat
mbti_2017.calculate_readability()

Unnamed: 0,type,post,readability
0,INFJ,'http://www.youtube.com/watch?v=qsXHcwe3krw,-132.59
1,INFJ,http://41.media.tumblr.com/tumblr_lfouy03PMA1q...,-640.19
2,INFJ,enfp and intj moments https://www.youtube.com...,48.47
3,INFJ,What has been the most life-changing experienc...,78.25
4,INFJ,http://www.youtube.com/watch?v=vXZeYwwRDw8 h...,4.14
...,...,...,...
421752,INFP,I was going to close my facebook a few months ...,56.26
421753,INFP,30 Seconds to Mars - All of my collections. It...,96.69
421754,INFP,"I have seen it, and i agree. I did actually th...",86.03
421755,INFP,Ok so i have just watched Underworld 4 (Awaken...,84.51


## Manually created features

## Binary Dummies for E/I, S/N, T/F, J/P

In [30]:
mbti_2017.create_mbti_binaries()

Unnamed: 0,type,post,readability,E/I,S/N,T/F,J/P
0,INFJ,'http://www.youtube.com/watch?v=qsXHcwe3krw,-132.59,0,0,0,1
1,INFJ,http://41.media.tumblr.com/tumblr_lfouy03PMA1q...,-640.19,0,0,0,1
2,INFJ,enfp and intj moments https://www.youtube.com...,48.47,0,0,0,1
3,INFJ,What has been the most life-changing experienc...,78.25,0,0,0,1
4,INFJ,http://www.youtube.com/watch?v=vXZeYwwRDw8 h...,4.14,0,0,0,1
...,...,...,...,...,...,...,...
421752,INFP,I was going to close my facebook a few months ...,56.26,0,0,0,0
421753,INFP,30 Seconds to Mars - All of my collections. It...,96.69,0,0,0,0
421754,INFP,"I have seen it, and i agree. I did actually th...",86.03,0,0,0,0
421755,INFP,Ok so i have just watched Underworld 4 (Awaken...,84.51,0,0,0,0


## Character length

In [31]:
mbti_2017.measure_character_length()

Unnamed: 0,type,post,readability,E/I,S/N,T/F,J/P,post_length
0,INFJ,'http://www.youtube.com/watch?v=qsXHcwe3krw,-132.59,0,0,0,1,43
1,INFJ,http://41.media.tumblr.com/tumblr_lfouy03PMA1q...,-640.19,0,0,0,1,61
2,INFJ,enfp and intj moments https://www.youtube.com...,48.47,0,0,0,1,151
3,INFJ,What has been the most life-changing experienc...,78.25,0,0,0,1,61
4,INFJ,http://www.youtube.com/watch?v=vXZeYwwRDw8 h...,4.14,0,0,0,1,117
...,...,...,...,...,...,...,...,...
421752,INFP,I was going to close my facebook a few months ...,56.26,0,0,0,0,193
421753,INFP,30 Seconds to Mars - All of my collections. It...,96.69,0,0,0,0,85
421754,INFP,"I have seen it, and i agree. I did actually th...",86.03,0,0,0,0,199
421755,INFP,Ok so i have just watched Underworld 4 (Awaken...,84.51,0,0,0,0,200


## >50% of Text in Caps?

In [32]:
mbti_2017.measure_character_length()

Unnamed: 0,type,post,readability,E/I,S/N,T/F,J/P,post_length,post_caps_over_50_percent
0,INFJ,'http://www.youtube.com/watch?v=qsXHcwe3krw,-132.59,0,0,0,1,43,False
1,INFJ,http://41.media.tumblr.com/tumblr_lfouy03PMA1q...,-640.19,0,0,0,1,61,False
2,INFJ,enfp and intj moments https://www.youtube.com...,48.47,0,0,0,1,151,False
3,INFJ,What has been the most life-changing experienc...,78.25,0,0,0,1,61,False
4,INFJ,http://www.youtube.com/watch?v=vXZeYwwRDw8 h...,4.14,0,0,0,1,117,False
...,...,...,...,...,...,...,...,...,...
421752,INFP,I was going to close my facebook a few months ...,56.26,0,0,0,0,193,False
421753,INFP,30 Seconds to Mars - All of my collections. It...,96.69,0,0,0,0,85,False
421754,INFP,"I have seen it, and i agree. I did actually th...",86.03,0,0,0,0,199,False
421755,INFP,Ok so i have just watched Underworld 4 (Awaken...,84.51,0,0,0,0,200,False


## Hyperlinks
Let's first see whether we have a wide diversity of hyperlinks, or a handful of popular ones, so that we know whether to either to 1. create a category of hyperlinks, or 2. create dummy variables for the most popularly referenced urls vs "others".

Results:
It seems that youtube, tumblr, imgur, and personalitycafe are most popular. Others like wikimedia and vimeo follow closely, but not the same scale, so we can probably just track these 5 variables: youtube, tumblr, imgur, personalitycafe, other_links.


In [35]:
# User will look at code below to identify the top few urls
mbti_2017.extract_base_urls().head(n=20)

Unnamed: 0,base_url,count
0,,406789
1,www.youtube.com,9363
2,youtu.be,629
3,i.imgur.com,396
4,personalitycafe.com,175
5,24.media.tumblr.com,85
6,s-media-cache-ak0.pinimg.com,81
7,uploads.tapatalk-cdn.com,69
8,25.media.tumblr.com,66
9,media.giphy.com,63


In [36]:
# Note: "youtube" will be referred to as youtu for now, for the code to recognize youtu.be as well
top_urls = ['youtu', 'tumblr', 'imgur', 'personalitycafe']
mbti_2017.count_url(top_urls)

Unnamed: 0,type,post,readability,E/I,S/N,T/F,J/P,post_length,post_caps_over_50_percent,url,has_youtu_url,has_tumblr_url,has_imgur_url,has_personalitycafe_url,has_other_url
0,INFJ,'http://www.youtube.com/watch?v=qsXHcwe3krw,-132.59,0,0,0,1,43,False,,0,0,0,0,0
1,INFJ,http://41.media.tumblr.com/tumblr_lfouy03PMA1q...,-640.19,0,0,0,1,61,False,41.media.tumblr.com,0,1,0,0,0
2,INFJ,enfp and intj moments https://www.youtube.com...,48.47,0,0,0,1,151,False,,0,0,0,0,0
3,INFJ,What has been the most life-changing experienc...,78.25,0,0,0,1,61,False,,0,0,0,0,0
4,INFJ,http://www.youtube.com/watch?v=vXZeYwwRDw8 h...,4.14,0,0,0,1,117,False,www.youtube.com,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
421752,INFP,I was going to close my facebook a few months ...,56.26,0,0,0,0,193,False,,0,0,0,0,0
421753,INFP,30 Seconds to Mars - All of my collections. It...,96.69,0,0,0,0,85,False,,0,0,0,0,0
421754,INFP,"I have seen it, and i agree. I did actually th...",86.03,0,0,0,0,199,False,,0,0,0,0,0
421755,INFP,Ok so i have just watched Underworld 4 (Awaken...,84.51,0,0,0,0,200,False,,0,0,0,0,0


## References to MBTI Types

In [37]:
mbti_2017.count_mbti_mentions()

Unnamed: 0,type,post,readability,E/I,S/N,T/F,J/P,post_length,post_caps_over_50_percent,url,...,ENFP_mentions,ENTP_mentions,ISTJ_mentions,ISFJ_mentions,ISFP_mentions,ISTP_mentions,ESTJ_mentions,ESFJ_mentions,ESFP_mentions,ESTP_mentions
0,INFJ,'http://www.youtube.com/watch?v=qsXHcwe3krw,-132.59,0,0,0,1,43,False,,...,0,0,0,0,0,0,0,0,0,0
1,INFJ,http://41.media.tumblr.com/tumblr_lfouy03PMA1q...,-640.19,0,0,0,1,61,False,41.media.tumblr.com,...,0,0,0,0,0,0,0,0,0,0
2,INFJ,enfp and intj moments https://www.youtube.com...,48.47,0,0,0,1,151,False,,...,0,0,0,0,0,0,0,0,0,0
3,INFJ,What has been the most life-changing experienc...,78.25,0,0,0,1,61,False,,...,0,0,0,0,0,0,0,0,0,0
4,INFJ,http://www.youtube.com/watch?v=vXZeYwwRDw8 h...,4.14,0,0,0,1,117,False,www.youtube.com,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
421752,INFP,I was going to close my facebook a few months ...,56.26,0,0,0,0,193,False,,...,0,0,0,0,0,0,0,0,0,0
421753,INFP,30 Seconds to Mars - All of my collections. It...,96.69,0,0,0,0,85,False,,...,0,0,0,0,0,0,0,0,0,0
421754,INFP,"I have seen it, and i agree. I did actually th...",86.03,0,0,0,0,199,False,,...,0,0,0,0,0,0,0,0,0,0
421755,INFP,Ok so i have just watched Underworld 4 (Awaken...,84.51,0,0,0,0,200,False,,...,0,0,0,0,0,0,0,0,0,0


## Emotional Analysis
Use a LSTM (long short-term memory) model to predict any one of the six primary emotions according to Dr. Ekman most conveyed by the post, trained on a labeled dataset. Model and dataset referenced from a research article published by researchers in Slovakia and made publicly available online to frontiersin.org.

Read more: https://www.frontiersin.org/articles/10.3389/fpsyg.2023.1190326/full#tab1

Search terms: sentiment analysis 6 or 7 emotions dataset

Extra: Emotion detection model trained using "CNN based on 1D convolution - Conv1D and RNN network - LSTM".

In [10]:
train_emotions_link = 'https://kristina.machova.website.tuke.sk/useful/DATA%20for%20EMOTION%20DETECTION/train%20dataset%20for%20NN%20model.txt'
train_emotions = pd.read_csv(train_emotions_link, sep=';', names=['text', 'emotion'])

train_emotions.head()

Unnamed: 0,text,emotion
0,i didnt feel humiliated,sadness
1,i can go from feeling so hopeless to so damned...,sadness
2,im grabbing a minute to post i feel greedy wrong,anger
3,i am ever feeling nostalgic about the fireplac...,love
4,i am feeling grouchy,anger


In [11]:
# DELETE – Only for testing
df = pd.read_csv('mbti_1.csv')
mbti_2017 = Preprocessor(df)
mbti_2017.split_posts()
df = mbti_2017.df

In [12]:
import pandas as pd
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Conv1D, GlobalMaxPooling1D, LSTM, Dense


In [13]:
# Tokenize and pad the text data
max_words = 10000  # Maximum number of words to keep
max_len = 100  # Maximum sequence length

tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(train_emotions['text'])

X = tokenizer.texts_to_sequences(train_emotions['text'])
X = pad_sequences(X, maxlen=max_len)
y = pd.get_dummies(train_emotions['emotion'])

# Split the data into train and validation sets
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Calculate the number of unique emotions (num_classes)
num_classes = len(train_emotions['emotion'].unique())

In [14]:
# Build the CNN model
model = Sequential()
model.add(Embedding(max_words, 128, input_length=max_len))
model.add(Conv1D(128, 5, activation='relu'))
model.add(GlobalMaxPooling1D())
model.add(Dense(128, activation='relu'))
model.add(Dense(num_classes, activation='softmax'))  # num_classes is the number of unique emotions

model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# Train the model
model.fit(X_train, y_train, epochs=5, batch_size=32, validation_data=(X_val, y_val)) # marginal improvement after 5 epochs

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.src.callbacks.History at 0x133b40940>

### Apply Model and Classify Post by Emotion
Note. Consider only predicting emotions for posts that aren't just urls.

In [18]:
# Tokenize and pad the text data from the df dataframe
X_test = tokenizer.texts_to_sequences(df['post'])
X_test = pad_sequences(X_test, maxlen=max_len)

# Predict emotions for the df dataset using the trained CNN model
predictions = model.predict(X_test)

# Get the predicted emotions and add them to the df dataframe
predicted_emotions = [y.columns[prediction.argmax()] for prediction in predictions]
df['emotion'] = predicted_emotions



In [25]:
# Store df for easy reference
df2 = df.copy()
df.to_csv('mbti_emotions.csv', index=False)

In [59]:
pd.read_csv('mbti_emotions.csv')

Unnamed: 0,type,post,emotion
0,INFJ,'http://www.youtube.com/watch?v=qsXHcwe3krw,love
1,INFJ,http://41.media.tumblr.com/tumblr_lfouy03PMA1q...,joy
2,INFJ,enfp and intj moments https://www.youtube.com...,joy
3,INFJ,What has been the most life-changing experienc...,sadness
4,INFJ,http://www.youtube.com/watch?v=vXZeYwwRDw8 h...,joy
...,...,...,...
421752,INFP,I was going to close my facebook a few months ...,joy
421753,INFP,30 Seconds to Mars - All of my collections. It...,sadness
421754,INFP,"I have seen it, and i agree. I did actually th...",joy
421755,INFP,Ok so i have just watched Underworld 4 (Awaken...,joy


# Optimize Columns
If we look at the number of columns after running all the feature extractions

In [63]:
df = pd.read_csv('mbti_emotions.csv')
df = df.astype(str)

mbti_2017 = Preprocessor(df)
mbti_2017.calculate_readability()
mbti_2017.create_mbti_binaries()
mbti_2017.measure_character_length()
mbti_2017.extract_base_urls().head(n=20)
top_urls = ['youtu', 'tumblr', 'imgur', 'personalitycafe']
mbti_2017.count_url(top_urls)
mbti_2017.count_mbti_mentions()

Unnamed: 0,type,post,emotion,readability,E/I,S/N,T/F,J/P,post_length,url,...,ENFP_mentions,ENTP_mentions,ISTJ_mentions,ISFJ_mentions,ISFP_mentions,ISTP_mentions,ESTJ_mentions,ESFJ_mentions,ESFP_mentions,ESTP_mentions
0,INFJ,'http://www.youtube.com/watch?v=qsXHcwe3krw,love,-132.59,0,0,0,1,43,,...,0,0,0,0,0,0,0,0,0,0
1,INFJ,http://41.media.tumblr.com/tumblr_lfouy03PMA1q...,joy,-640.19,0,0,0,1,61,41.media.tumblr.com,...,0,0,0,0,0,0,0,0,0,0
2,INFJ,enfp and intj moments https://www.youtube.com...,joy,48.47,0,0,0,1,151,,...,0,0,0,0,0,0,0,0,0,0
3,INFJ,What has been the most life-changing experienc...,sadness,78.25,0,0,0,1,61,,...,0,0,0,0,0,0,0,0,0,0
4,INFJ,http://www.youtube.com/watch?v=vXZeYwwRDw8 h...,joy,4.14,0,0,0,1,117,www.youtube.com,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
421752,INFP,I was going to close my facebook a few months ...,joy,56.26,0,0,0,0,193,,...,0,0,0,0,0,0,0,0,0,0
421753,INFP,30 Seconds to Mars - All of my collections. It...,sadness,96.69,0,0,0,0,85,,...,0,0,0,0,0,0,0,0,0,0
421754,INFP,"I have seen it, and i agree. I did actually th...",joy,86.03,0,0,0,0,199,,...,0,0,0,0,0,0,0,0,0,0
421755,INFP,Ok so i have just watched Underworld 4 (Awaken...,joy,84.51,0,0,0,0,200,,...,0,0,0,0,0,0,0,0,0,0


In [64]:
#
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 421757 entries, 0 to 421756
Data columns (total 31 columns):
 #   Column                   Non-Null Count   Dtype  
---  ------                   --------------   -----  
 0   type                     421757 non-null  object 
 1   post                     421757 non-null  object 
 2   emotion                  421757 non-null  object 
 3   readability              421757 non-null  float64
 4   E/I                      421757 non-null  int64  
 5   S/N                      421757 non-null  int64  
 6   T/F                      421757 non-null  int64  
 7   J/P                      421757 non-null  int64  
 8   post_length              421757 non-null  int64  
 9   url                      421757 non-null  object 
 10  has_youtu_url            421757 non-null  int64  
 11  has_tumblr_url           421757 non-null  int64  
 12  has_imgur_url            421757 non-null  int64  
 13  has_personalitycafe_url  421757 non-null  int64  
 14  has_

# Word Predictivity Analysis

In [None]:
!pip install sklearn

### Count Vectorizer

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# Initialize CountVectorizer
vectorizer = CountVectorizer()

# Create term-document matrix
term_document_matrix = vectorizer.fit_transform(df['post'])

# Get words from vectorizer
feature_names = vectorizer.get_feature_names_out()

# Create separate dataframe for term-document matrix
term_document_df = pd.DataFrame(term_document_matrix.toarray(), columns=feature_names)

term_document_df

### Train a predictive model on the matrix

In [28]:
from sklearn.model_selection import train_test_split

X = term_document_df 
y = df['type']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=123)

NameError: name 'term_document_df' is not defined

In [120]:
from sklearn.ensemble import RandomForestClassifier

model = RandomForestClassifier(random_state=123)
model.fit(X_train, y_train)

predictions = model.predict(X_test)
predictions

In [122]:
from sklearn.metrics import accuracy_score

accuracy = accuracy_score(y_test, predictions)
print(f'Accuracy: {accuracy:.2f}')

Accuracy: 0.38


# EDA

Now that we have all our selected features and a dataset of all variables narrowed-down, we then start comparing them to the target "type". 

We have two targets:
4 Binaries: (E/I...)
Type Groups: 'Type'

EDA is exploring relation
- bar charts (ex: x: E vs I, y: word_count) (ex: or bucket, for 1-5 (how many % are introverted))
- boxplots 

To Do:
    Most predictive words

    Write up an EDA Section:
    3-5 main findings

NOT YET
Try out RandomForest:
5 different models with each a different target

# Identity Matrix

In [135]:
# Create an identity matrix with the length of the vocabulary
identity_matrix = np.eye(len(feature_names))
identity_matrix

array([[1., 0., 0., ..., 0., 0., 0.],
       [0., 1., 0., ..., 0., 0., 0.],
       [0., 0., 1., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 1., 0., 0.],
       [0., 0., 0., ..., 0., 1., 0.],
       [0., 0., 0., ..., 0., 0., 1.]])

## Predicting the Identity Matrix Using the Models

In [131]:
# Transform the identity matrix using the same CountVectorizer
identity_matrix_transformed = vectorizer.transform([' '.join(feature_names)])

In [133]:
model.predict(identity_matrix_transformed)

array(['INFJ'], dtype=object)

In [134]:
identity_matrix_transformed

<1x145469 sparse matrix of type '<class 'numpy.int64'>'
	with 145469 stored elements in Compressed Sparse Row format>