# Pre-Jupyter Notebook Setup

In [6]:
# #Setup
# pyenv virtualenv 3.8.11 mbti_env
# cd ~Documents/Github/MBTI
# pyenv local mbti_env
# pyenv activate mbti_env
# python

# #Create kernel
# python -m ipykernel install --user --name mbti_env

In [None]:
# (Run only once) Install packages
!pip install matplotlib pandas numpy seaborn re

# Import Libraries

In [8]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [13]:
# Clarify current working directory
import os
os.getcwd()

'/Users/nathantorento/Documents/GitHub/MBTI'

In [9]:
# Import Dataset

In [63]:
df = pd.read_csv('mbti_1.csv')

# Understanding the Dataset

In [18]:
# See number of rows and columns
df.shape

(8675, 2)

In [17]:
# See columns names
df.columns

Index(['type', 'posts'], dtype='object')

In [15]:
# Preview top of data
df.head()

Unnamed: 0,type,posts
0,INFJ,'http://www.youtube.com/watch?v=qsXHcwe3krw|||...
1,ENTP,'I'm finding the lack of me in these posts ver...
2,INTP,'Good one _____ https://www.youtube.com/wat...
3,INTJ,"'Dear INTP, I enjoyed our conversation the o..."
4,ENTJ,'You're fired.|||That's another silly misconce...


In [19]:
# Preview bottom of data
df.tail()

Unnamed: 0,type,posts
8670,ISFP,'https://www.youtube.com/watch?v=t8edHB_h908||...
8671,ENFP,'So...if this thread already exists someplace ...
8672,INTP,'So many questions when i do these things. I ...
8673,INFP,'I am very conflicted right now when it comes ...
8674,INFP,'It has been too long since I have been on per...


In [20]:
# Preview a random sample of the data
df.sample(n=5)

Unnamed: 0,type,posts
3830,ENFP,"'Dear Anonymous, Thank you for your help toda..."
5766,INTJ,"'My first ones; did those go through?|||Hey, d..."
792,ENTP,'Heh. That one came out of a lady who would se...
6336,INFP,'Turtles mating are really cute though|||Their...
5414,ENTP,"'Welcome home, sonny :laughing:|||Just because..."


In [22]:
# See column datatypes
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8675 entries, 0 to 8674
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   type    8675 non-null   object
 1   posts   8675 non-null   object
dtypes: object(2)
memory usage: 135.7+ KB


In [23]:
# See statistical summaries for each column
df.describe()

Unnamed: 0,type,posts
count,8675,8675
unique,16,8675
top,INFP,'http://www.youtube.com/watch?v=qsXHcwe3krw|||...
freq,1832,1


# Data Cleaning

In [27]:
# Count the amount of missing data
# Also achieveable with df.info()
df.isnull().sum()

type     0
posts    0
dtype: int64

In [31]:
# Check for duplicates
df.duplicated().value_counts()

False    8675
Name: count, dtype: int64

# Feature Extraction

In [52]:
import emoji
import re

## Text separation
Looking up close at a post reveals that each entry is actually a long string of multiple posts, each separated with '|||'. It would serve us well to create a column containing a list of all the posts, this time, separated.

In [34]:
# Examine the posts up close
df.iloc[0,1]

"'http://www.youtube.com/watch?v=qsXHcwe3krw|||http://41.media.tumblr.com/tumblr_lfouy03PMA1qa1rooo1_500.jpg|||enfp and intj moments  https://www.youtube.com/watch?v=iz7lE1g4XM4  sportscenter not top ten plays  https://www.youtube.com/watch?v=uCdfze1etec  pranks|||What has been the most life-changing experience in your life?|||http://www.youtube.com/watch?v=vXZeYwwRDw8   http://www.youtube.com/watch?v=u8ejam5DP3E  On repeat for most of today.|||May the PerC Experience immerse you.|||The last thing my INFJ friend posted on his facebook before committing suicide the next day. Rest in peace~   http://vimeo.com/22842206|||Hello ENFJ7. Sorry to hear of your distress. It's only natural for a relationship to not be perfection all the time in every moment of existence. Try to figure the hard times as times of growth, as...|||84389  84390  http://wallpaperpassion.com/upload/23700/friendship-boy-and-girl-wallpaper.jpg  http://assets.dornob.com/wp-content/uploads/2010/04/round-home-design.jpg ...

In [64]:
def split_posts(df):
    df['posts_list'] = df['posts'].apply(lambda x: x.split('|||'))
    return df
df = split_posts(df)
df.head()

Unnamed: 0,type,posts,posts_list
0,INFJ,'http://www.youtube.com/watch?v=qsXHcwe3krw|||...,"['http://www.youtube.com/watch?v=qsXHcwe3krw, ..."
1,ENTP,'I'm finding the lack of me in these posts ver...,['I'm finding the lack of me in these posts ve...
2,INTP,'Good one _____ https://www.youtube.com/wat...,['Good one _____ https://www.youtube.com/wa...
3,INTJ,"'Dear INTP, I enjoyed our conversation the o...","['Dear INTP, I enjoyed our conversation the ..."
4,ENTJ,'You're fired.|||That's another silly misconce...,"['You're fired., That's another silly misconce..."


## Binary Dummies for E/I, S/N, T/F, J/P

In [68]:
# Dummy variables for E/I, S/N, T/F, J/P
def create_dummy_variables(df):
    df['E/I'] = df['type'].apply(lambda x: 1 if x[0] == 'E' else 0)
    df['S/N'] = df['type'].apply(lambda x: 1 if x[1] == 'S' else 0)
    df['T/F'] = df['type'].apply(lambda x: 1 if x[2] == 'T' else 0)
    df['J/P'] = df['type'].apply(lambda x: 1 if x[3] == 'J' else 0)
    return df

df = create_dummy_variables(df)

df.head()

Unnamed: 0,type,posts,posts_list,E/I,S/N,T/F,J/P
0,INFJ,'http://www.youtube.com/watch?v=qsXHcwe3krw|||...,"['http://www.youtube.com/watch?v=qsXHcwe3krw, ...",0,0,0,1
1,ENTP,'I'm finding the lack of me in these posts ver...,['I'm finding the lack of me in these posts ve...,1,0,1,0
2,INTP,'Good one _____ https://www.youtube.com/wat...,['Good one _____ https://www.youtube.com/wa...,0,0,1,0
3,INTJ,"'Dear INTP, I enjoyed our conversation the o...","['Dear INTP, I enjoyed our conversation the ...",0,0,1,1
4,ENTJ,'You're fired.|||That's another silly misconce...,"['You're fired., That's another silly misconce...",1,0,1,1


## Amount of Repetition

In [69]:
# # Amount of repetition
# def calculate_repetition(posts):
#     # Assuming posts is a string separated by '|||'
#     post_list = posts.split('|||')
#     repetitions = [len(post) - len(set(post)) for post in post_list]
#     return sum(repetitions)

# df['repetition'] = df['posts'].apply(calculate_repetition)
# df.head()

Unnamed: 0,type,posts,posts_list,E/I,S/N,T/F,J/P,repetition
0,INFJ,'http://www.youtube.com/watch?v=qsXHcwe3krw|||...,"['http://www.youtube.com/watch?v=qsXHcwe3krw, ...",0,0,0,1,3200
1,ENTP,'I'm finding the lack of me in these posts ver...,['I'm finding the lack of me in these posts ve...,1,0,1,0,5477
2,INTP,'Good one _____ https://www.youtube.com/wat...,['Good one _____ https://www.youtube.com/wa...,0,0,1,0,3920
3,INTJ,"'Dear INTP, I enjoyed our conversation the o...","['Dear INTP, I enjoyed our conversation the ...",0,0,1,1,4828
4,ENTJ,'You're fired.|||That's another silly misconce...,"['You're fired., That's another silly misconce...",1,0,1,1,4671


## Measure character length

In [70]:
# Measure the sum of characters of all posts in "posts_list"
def measure_character_length(posts_list):
    # Assuming posts_list is a list of strings
    total_length = sum(len(post) for post in posts_list)
    return total_length

df['posts_total_length'] = df['posts_list'].apply(measure_character_length)
df.head()

Unnamed: 0,type,posts,posts_list,E/I,S/N,T/F,J/P,repetition,posts_total_length
0,INFJ,'http://www.youtube.com/watch?v=qsXHcwe3krw|||...,"['http://www.youtube.com/watch?v=qsXHcwe3krw, ...",0,0,0,1,3200,4505
1,ENTP,'I'm finding the lack of me in these posts ver...,['I'm finding the lack of me in these posts ve...,1,0,1,0,5477,6906
2,INTP,'Good one _____ https://www.youtube.com/wat...,['Good one _____ https://www.youtube.com/wa...,0,0,1,0,3920,5118
3,INTJ,"'Dear INTP, I enjoyed our conversation the o...","['Dear INTP, I enjoyed our conversation the ...",0,0,1,1,4828,6124
4,ENTJ,'You're fired.|||That's another silly misconce...,"['You're fired., That's another silly misconce...",1,0,1,1,4671,5964


In [71]:
# Convert emojis to their text version
def convert_emojis_to_text(posts_list):
    # Assuming posts_list is a list of strings
    converted_posts = []
    for post in posts_list:
        converted_post = emoji.demojize(post)
        converted_posts.append(converted_post)
    return converted_posts

df['posts_list'] = df['posts_list'].apply(convert_emojis_to_text)
df.head()

Unnamed: 0,type,posts,posts_list,E/I,S/N,T/F,J/P,repetition,posts_total_length
0,INFJ,'http://www.youtube.com/watch?v=qsXHcwe3krw|||...,"['http://www.youtube.com/watch?v=qsXHcwe3krw, ...",0,0,0,1,3200,4505
1,ENTP,'I'm finding the lack of me in these posts ver...,['I'm finding the lack of me in these posts ve...,1,0,1,0,5477,6906
2,INTP,'Good one _____ https://www.youtube.com/wat...,['Good one _____ https://www.youtube.com/wa...,0,0,1,0,3920,5118
3,INTJ,"'Dear INTP, I enjoyed our conversation the o...","['Dear INTP, I enjoyed our conversation the ...",0,0,1,1,4828,6124
4,ENTJ,'You're fired.|||That's another silly misconce...,"['You're fired., That's another silly misconce...",1,0,1,1,4671,5964


In [73]:
def check_percentage_caps(posts_list):
    # Concatenate all posts into a single string
    all_posts = ''.join(posts_list)
    
    total_chars = len(all_posts)
    capitalized_chars = sum(1 for c in all_posts if c.isupper())
    
    if total_chars > 0:
        percentage_caps = (capitalized_chars / total_chars) * 100
    else:
        percentage_caps = 0
        
    return 1 if percentage_caps > 50 else 0


df['posts_caps_over_50_percent'] = df['posts_list'].apply(check_percentage_caps)
df.head()

Unnamed: 0,type,posts,posts_list,E/I,S/N,T/F,J/P,repetition,posts_total_length,percentage_caps,posts_caps_over_50_percent
0,INFJ,'http://www.youtube.com/watch?v=qsXHcwe3krw|||...,"['http://www.youtube.com/watch?v=qsXHcwe3krw, ...",0,0,0,1,3200,4505,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",0
1,ENTP,'I'm finding the lack of me in these posts ver...,['I'm finding the lack of me in these posts ve...,1,0,1,0,5477,6906,"[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",0
2,INTP,'Good one _____ https://www.youtube.com/wat...,['Good one _____ https://www.youtube.com/wa...,0,0,1,0,3920,5118,"[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",0
3,INTJ,"'Dear INTP, I enjoyed our conversation the o...","['Dear INTP, I enjoyed our conversation the ...",0,0,1,1,4828,6124,"[0, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 1, ...",0
4,ENTJ,'You're fired.|||That's another silly misconce...,"['You're fired., That's another silly misconce...",1,0,1,1,4671,5964,"[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",0


## References to MBTI Types

In [90]:
def count_mbti_mentions(df):
    # List of MBTI types
    mbti_types = ['INTJ', 'INFJ', 'INFP', 'INTP', 'ENTJ', 'ENFJ', 'ENFP', 'ENTP', 'ISTJ', 'ISFJ', 'ISFP', 'ISTP', 'ESTJ', 'ESFJ', 'ESFP', 'ESTP']
    
    # Create counter variables for each MBTI type
    for mbti_type in mbti_types:
        df[f'{mbti_type}_mentions'] = df['posts_list'].apply(lambda x: sum(1 for post in x if mbti_type in post))
    
    return df

df = count_mbti_mentions(df)
df.head()

Unnamed: 0,type,posts,posts_list,E/I,S/N,T/F,J/P,repetition,posts_total_length,percentage_caps,...,ENFP_mentions,ENTP_mentions,ISTJ_mentions,ISFJ_mentions,ISFP_mentions,ISTP_mentions,ESTJ_mentions,ESFJ_mentions,ESFP_mentions,ESTP_mentions
0,INFJ,'http://www.youtube.com/watch?v=qsXHcwe3krw|||...,"['http://www.youtube.com/watch?v=qsXHcwe3krw, ...",0,0,0,1,3200,4505,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",...,1,0,0,0,0,0,0,0,0,0
1,ENTP,'I'm finding the lack of me in these posts ver...,['I'm finding the lack of me in these posts ve...,1,0,1,0,5477,6906,"[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",...,0,7,0,0,0,0,2,0,0,0
2,INTP,'Good one _____ https://www.youtube.com/wat...,['Good one _____ https://www.youtube.com/wa...,0,0,1,0,3920,5118,"[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",...,0,0,0,0,0,1,0,0,0,0
3,INTJ,"'Dear INTP, I enjoyed our conversation the o...","['Dear INTP, I enjoyed our conversation the ...",0,0,1,1,4828,6124,"[0, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 1, ...",...,0,1,0,0,0,0,0,3,0,0
4,ENTJ,'You're fired.|||That's another silly misconce...,"['You're fired., That's another silly misconce...",1,0,1,1,4671,5964,"[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",...,0,0,0,0,0,0,0,0,0,0


## Sentiment Analysis
Train model with existing twitter data that has already been classified, where the 1st column identifies the polarity as such
    0 - the polarity of the tweet (0 = negative, 2 = neutral, 4 = positive)

Source: http://help.sentiment140.com/for-students/

Reference: https://www.baeldung.com/cs/sentiment-analysis-training-data

"Sentiment 140 contains an impressive 1,600,000 tweets from various English-speaker users"

In [88]:
import gdown
import zipfile

# Google Drive file ID from the shared link
file_id = '0B04GJPshIjmPRnZManQwWEdTZjg'

# Destination file path after downloading and extracting the zip file
output_path = 'training.1600000.processed.noemoticon.csv'

# Download the zip file
gdown.download(f'https://drive.google.com/uc?id={file_id}', 'trainingandtestdata.zip', quiet=False)

# Extract the zip file
with zipfile.ZipFile('trainingandtestdata.zip', 'r') as zip_ref:
    zip_ref.extractall('temp_folder')

# Find the CSV file in the extracted folder and read it into a pandas DataFrame
csv_file_path = os.path.join('temp_folder', 'training.1600000.processed.noemoticon.csv')
sentiment_df = pd.read_csv(csv_file_path, encoding='ISO-8859-1')

# Clean up: Remove temporary files and folders
for file_name in os.listdir('temp_folder'):
    file_path = os.path.join('temp_folder', file_name)
    if os.path.isfile(file_path):
        os.remove(file_path)
    else:
        os.rmdir(file_path)

os.remove('trainingandtestdata.zip')
os.rmdir('temp_folder')

Downloading...
From (uriginal): https://drive.google.com/uc?id=0B04GJPshIjmPRnZManQwWEdTZjg
From (redirected): https://drive.google.com/uc?id=0B04GJPshIjmPRnZManQwWEdTZjg&confirm=t&uuid=d024c15b-58bf-479d-9b9d-b1970edf9f54
To: /Users/nathantorento/Documents/GitHub/MBTI/trainingandtestdata.zip
100%|██████████| 81.4M/81.4M [00:06<00:00, 12.0MB/s]


In [89]:
# Preview sentiment training data
sentiment_df.head()

Unnamed: 0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, that's a bummer. You shoulda got David Carr of Third Day to do it. ;D"
0,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
1,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
2,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
3,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."
4,0,1467811372,Mon Apr 06 22:20:00 PDT 2009,NO_QUERY,joy_wolf,@Kwesidei not the whole crew


## Emotional Analysis

Reference: https://www.baeldung.com/cs/ml-emotion-detection#public-datasets-for-emotion-detection

Source: https://web.eecs.umich.edu/~mihalcea/affectivetext/#datasets

In [92]:
import requests
import tarfile
import os

# URL of the .tar.gz file
url = 'http://web.eecs.umich.edu/~mihalcea/downloads/AffectiveText.Semeval.2007.tar.gz'

# Download the file
response = requests.get(url, stream=True)
file_name = os.path.basename(url)

# Save the downloaded file
with open(file_name, 'wb') as f:
    for chunk in response.iter_content(chunk_size=8192):
        f.write(chunk)

# Extract the .tar.gz file
with tarfile.open(file_name, 'r:gz') as tar:
    tar.extractall()

# Clean up: remove the downloaded .tar.gz file
os.remove(file_name)

# Word Predictivity Analysis

In [None]:
!pip install sklearn

In [95]:
from sklearn.feature_extraction.text import CountVectorizer

### Count Vectorizer

In [101]:
# Create a new DataFrame with only a 'posts_text' column for testing purposes
df2 = pd.DataFrame()
df2['type'] = df['type'].copy()
df2['posts_text'] = df['posts_list'].apply(lambda x: ' '.join(x))
df2.head()

Unnamed: 0,type,posts_text
0,INFJ,'http://www.youtube.com/watch?v=qsXHcwe3krw ht...
1,ENTP,'I'm finding the lack of me in these posts ver...
2,INTP,'Good one _____ https://www.youtube.com/wat...
3,INTJ,"'Dear INTP, I enjoyed our conversation the o..."
4,ENTJ,'You're fired. That's another silly misconcept...


In [113]:
# Initialize CountVectorizer
vectorizer = CountVectorizer()

# Create term-document matrix
term_document_matrix = vectorizer.fit_transform(df2['posts_text'])

# Get words from vectorizer
feature_names = vectorizer.get_feature_names_out()

# Create separate dataframe for term-document matrix
term_document_df = pd.DataFrame(term_document_matrix.toarray(), columns=feature_names)

term_document_df

Unnamed: 0,00,000,0000,000000,0000000000,000000000000000,00000000000000000000000000000000027,00000011,000000111,0000001111,...,ｓｏ,ｔｒｕｍｐu3000ｉｓu3000ａｎu3000ｅｓｔｐ,ｖａｐｏｒｗａｖｅ,ｗｈｙu3000ｉｓu3000ａｎｙｏｎｅu3000ｓｔｉｌｌu3000ｄｉｓｃｕｓｓｉｎｇu3000ｔｈｉｓ,ﾉωﾉ,ﾉｼ,ﾉﾞ,ﾉﾟ,ﾟдﾟщ,ﾟﾟ
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8670,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8671,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8672,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8673,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


### Train a predictive model on the matrix

In [116]:
from sklearn.model_selection import train_test_split

X = term_document_df 
y = df['type']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=123)

In [120]:
from sklearn.ensemble import RandomForestClassifier

model = RandomForestClassifier(random_state=123)
model.fit(X_train, y_train)

In [121]:
predictions = model.predict(X_test)
predictions

array(['INFP', 'INTP', 'INTP', ..., 'INFP', 'INFP', 'INFP'], dtype=object)

In [122]:
from sklearn.metrics import accuracy_score

accuracy = accuracy_score(y_test, predictions)
print(f'Accuracy: {accuracy:.2f}')

Accuracy: 0.38


# Identity Matrix

In [135]:
# Create an identity matrix with the length of the vocabulary
identity_matrix = np.eye(len(feature_names))
identity_matrix

array([[1., 0., 0., ..., 0., 0., 0.],
       [0., 1., 0., ..., 0., 0., 0.],
       [0., 0., 1., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 1., 0., 0.],
       [0., 0., 0., ..., 0., 1., 0.],
       [0., 0., 0., ..., 0., 0., 1.]])

## Predicting the Identity Matrix Using the Models

In [131]:
# Transform the identity matrix using the same CountVectorizer
identity_matrix_transformed = vectorizer.transform([' '.join(feature_names)])

In [133]:
model.predict(identity_matrix_transformed)

array(['INFJ'], dtype=object)

In [134]:
identity_matrix_transformed

<1x145469 sparse matrix of type '<class 'numpy.int64'>'
	with 145469 stored elements in Compressed Sparse Row format>