# Pre-Jupyter Notebook Setup

In [6]:
# #Setup
# pyenv virtualenv 3.8.11 mbti_env
# cd ~Documents/Github/MBTI
# pyenv local mbti_env
# pyenv activate mbti_env
# python

# #Create kernel
# python -m ipykernel install --user --name mbti_env

In [None]:
# (Run only once) Install packages
!pip install matplotlib pandas numpy seaborn

# Import Libraries

In [192]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [193]:
# Clarify current working directory
import os
os.getcwd()

'/Users/nathantorento/Documents/GitHub/MBTI'

In [194]:
# Import Dataset
df = pd.read_csv('mbti_1.csv')

# Understanding the Dataset

In [195]:
# See number of rows and columns
df.shape

(8675, 2)

In [196]:
# See columns names
df.columns

Index(['type', 'posts'], dtype='object')

In [197]:
# Preview top of data
df.head()

Unnamed: 0,type,posts
0,INFJ,'http://www.youtube.com/watch?v=qsXHcwe3krw|||...
1,ENTP,'I'm finding the lack of me in these posts ver...
2,INTP,'Good one _____ https://www.youtube.com/wat...
3,INTJ,"'Dear INTP, I enjoyed our conversation the o..."
4,ENTJ,'You're fired.|||That's another silly misconce...


In [198]:
# Preview bottom of data
df.tail()

Unnamed: 0,type,posts
8670,ISFP,'https://www.youtube.com/watch?v=t8edHB_h908||...
8671,ENFP,'So...if this thread already exists someplace ...
8672,INTP,'So many questions when i do these things. I ...
8673,INFP,'I am very conflicted right now when it comes ...
8674,INFP,'It has been too long since I have been on per...


In [199]:
# Preview a random sample of the data
df.sample(n=5)

Unnamed: 0,type,posts
5786,INFP,'Are you saying that your love for someone is ...
5207,INFP,'Why I feel like this? 1. Todays world is ful...
27,ISFP,'They paint without numbers|||I'd guess at ist...
4106,INFJ,'When I was about 8 or 9 I started realizing h...
801,INTP,'I get annoyed more easily.|||I hate most rema...


In [200]:
# See column datatypes
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8675 entries, 0 to 8674
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   type    8675 non-null   object
 1   posts   8675 non-null   object
dtypes: object(2)
memory usage: 135.7+ KB


In [201]:
# See statistical summaries for each column
df.describe()

Unnamed: 0,type,posts
count,8675,8675
unique,16,8675
top,INFP,'http://www.youtube.com/watch?v=qsXHcwe3krw|||...
freq,1832,1


# Data Cleaning

In [202]:
# Count the amount of missing data
# Also achieveable with df.info()
df.isnull().sum()

type     0
posts    0
dtype: int64

In [203]:
# Check for duplicates
df.duplicated().value_counts()

False    8675
Name: count, dtype: int64

# Feature Extraction

In [204]:
# Packages only necessary for Feature Extraction
import re
import emoji
from urllib.parse import urlparse

## Text separation
Looking up close at any one post reveals that each entry is actually a long string of 50 posts by the same user, each separated with '|||'. We should separate each post as it's own data entry or row. Remember, one of our ultimate goals is to create a model that most accurately predict the personality type of a user based on one post. Training on data closer to the test will generate more contextually accurate models.

In [18]:
# Examine the posts up close
df.iloc[0,1]

"'http://www.youtube.com/watch?v=qsXHcwe3krw|||http://41.media.tumblr.com/tumblr_lfouy03PMA1qa1rooo1_500.jpg|||enfp and intj moments  https://www.youtube.com/watch?v=iz7lE1g4XM4  sportscenter not top ten plays  https://www.youtube.com/watch?v=uCdfze1etec  pranks|||What has been the most life-changing experience in your life?|||http://www.youtube.com/watch?v=vXZeYwwRDw8   http://www.youtube.com/watch?v=u8ejam5DP3E  On repeat for most of today.|||May the PerC Experience immerse you.|||The last thing my INFJ friend posted on his facebook before committing suicide the next day. Rest in peace~   http://vimeo.com/22842206|||Hello ENFJ7. Sorry to hear of your distress. It's only natural for a relationship to not be perfection all the time in every moment of existence. Try to figure the hard times as times of growth, as...|||84389  84390  http://wallpaperpassion.com/upload/23700/friendship-boy-and-girl-wallpaper.jpg  http://assets.dornob.com/wp-content/uploads/2010/04/round-home-design.jpg ...

# Preprocess Object

In [419]:
# Class

class Preprocessor:
    def __init__(self, df=None, altered=False):
        if df is None:
            print("Please provide a dataframe using Preprocessor.fit(df) before continuing.")
        self.df = df
        self.altered = False

    def fit(self, df):
        # self.df = self.split_posts(df)
        return self.df

    # –––––––– CLEANING ––––––––
    # Separate multiple "posts" into their own rows
    def split_posts(self):
        posts_lists = []
        for i, row in self.df.iterrows():
            split_post = row['posts'].split('|||')
            for post in split_post:
                if len(post) != 0: #exclude empty posts
                    posts_lists.append({"type": row['type'], "post": post}) #store each separated post as a dictionary
        self.df = pd.DataFrame(posts_lists)
        
        return self.df
    
    # Convert emojis to text
    def convert_emojis_to_text(self):
        self.df['post'] = self.df['post'].apply(lambda x: emoji.demojize(x))

        self.altered = True
        return self.df


    # –––––––– FEATURE EXTRACTION ––––––––
    # Create dummy variables for E/I, S/N, T/F, J/P
    def create_mbti_binaries(self):
        self.df['E/I'] = self.df['type'].apply(lambda x: 1 if x[0] == 'E' else 0)
        self.df['S/N'] = self.df['type'].apply(lambda x: 1 if x[1] == 'S' else 0)
        self.df['T/F'] = self.df['type'].apply(lambda x: 1 if x[2] == 'T' else 0)
        self.df['J/P'] = self.df['type'].apply(lambda x: 1 if x[3] == 'J' else 0)

        return self.df
    
    # Create variable that measures length of post
    def measure_character_length(self):
        self.df['post_length'] = self.df['post'].apply(lambda x: len(x))
        
        return self.df
    
    # Create variable that checks if more than 50% of characters in a post are capitalized
    def check_percentage_caps(self):
        self.df['post_caps_over_50_percent'] = self.df['post'].apply(lambda post: True if sum(1 for c in post if c.isupper())/len(post) > 0.5 else False)
        
        return self.df
    
    # –––––––– FEATURE GROUP: TOP URL EXTRACTION ––––––––
    # Parse URLs in posts to examine the trends
    def extract_base_urls(self):
        extracted_based_urls = self.df['post'].apply(lambda post: urlparse(post).netloc)
        self.df['url'] = extracted_based_urls

        hyperlinks_count = pd.Series(extracted_based_urls).value_counts().reset_index().rename(columns={"post": "base_url"})

        return hyperlinks_count

    # Returns instance of top url or other across data
    def count_url(self, top_urls):
        url_tracker_columns = [f'has_{url}_url' for url in top_urls+['other']]
        
        # Fill in "has_{top_url}_url" columns
        for i in range(len(top_urls)):
            self.df[url_tracker_columns[i]] = self.df['url'].apply(lambda url: 1 if top_urls[i] in url else 0)
        
        # Fill in "has_other_url" columns
        self.df[url_tracker_columns[-1]] = self.df['url'].apply(lambda url: 1 if url != '' and not any(top_url in url for top_url in top_urls) else 0)

        return self.df
    # –––––––––––––––––––––––––––––––––––––––––––––––––––

    def count_mbti_mentions(self):
        # List of MBTI types
        mbti_types = ['INTJ', 'INFJ', 'INFP', 'INTP', 'ENTJ', 'ENFJ', 'ENFP', 'ENTP', 'ISTJ', 'ISFJ', 'ISFP', 'ISTP', 'ESTJ', 'ESFJ', 'ESFP', 'ESTP']
        
        # Create counter variables for each MBTI type
        for mbti_type in mbti_types:
            self.df[f'{mbti_type}_mentions'] = self.df['post'].apply(lambda post: 1 if mbti_type in post else 0)
        
        return self.df



# Data Cleaning Before Feature Extraction

## Split Posts

In [411]:
mbti_2017 = Preprocessor(df)
mbti_2017.split_posts()
mbti_2017.df

Unnamed: 0,type,post
0,INFJ,'http://www.youtube.com/watch?v=qsXHcwe3krw
1,INFJ,http://41.media.tumblr.com/tumblr_lfouy03PMA1q...
2,INFJ,enfp and intj moments https://www.youtube.com...
3,INFJ,What has been the most life-changing experienc...
4,INFJ,http://www.youtube.com/watch?v=vXZeYwwRDw8 h...
...,...,...
421752,INFP,I was going to close my facebook a few months ...
421753,INFP,30 Seconds to Mars - All of my collections. It...
421754,INFP,"I have seen it, and i agree. I did actually th..."
421755,INFP,Ok so i have just watched Underworld 4 (Awaken...


## Convert Emojis to Text

In [412]:
# mbti_2017.convert_emojis_to_text()

# Feature Extraction

## Binary Dummies for E/I, S/N, T/F, J/P

In [413]:
mbti_2017.create_mbti_binaries()

Unnamed: 0,type,post,E/I,S/N,T/F,J/P
0,INFJ,'http://www.youtube.com/watch?v=qsXHcwe3krw,0,0,0,1
1,INFJ,http://41.media.tumblr.com/tumblr_lfouy03PMA1q...,0,0,0,1
2,INFJ,enfp and intj moments https://www.youtube.com...,0,0,0,1
3,INFJ,What has been the most life-changing experienc...,0,0,0,1
4,INFJ,http://www.youtube.com/watch?v=vXZeYwwRDw8 h...,0,0,0,1
...,...,...,...,...,...,...
421752,INFP,I was going to close my facebook a few months ...,0,0,0,0
421753,INFP,30 Seconds to Mars - All of my collections. It...,0,0,0,0
421754,INFP,"I have seen it, and i agree. I did actually th...",0,0,0,0
421755,INFP,Ok so i have just watched Underworld 4 (Awaken...,0,0,0,0


## Character length

In [414]:
mbti_2017.measure_character_length()

Unnamed: 0,type,post,E/I,S/N,T/F,J/P,post_length
0,INFJ,'http://www.youtube.com/watch?v=qsXHcwe3krw,0,0,0,1,43
1,INFJ,http://41.media.tumblr.com/tumblr_lfouy03PMA1q...,0,0,0,1,61
2,INFJ,enfp and intj moments https://www.youtube.com...,0,0,0,1,151
3,INFJ,What has been the most life-changing experienc...,0,0,0,1,61
4,INFJ,http://www.youtube.com/watch?v=vXZeYwwRDw8 h...,0,0,0,1,117
...,...,...,...,...,...,...,...
421752,INFP,I was going to close my facebook a few months ...,0,0,0,0,193
421753,INFP,30 Seconds to Mars - All of my collections. It...,0,0,0,0,85
421754,INFP,"I have seen it, and i agree. I did actually th...",0,0,0,0,199
421755,INFP,Ok so i have just watched Underworld 4 (Awaken...,0,0,0,0,200


## >50% of Text in Caps?

In [415]:
mbti_2017.check_percentage_caps()

Unnamed: 0,type,post,E/I,S/N,T/F,J/P,post_length,post_caps_over_50_percent
0,INFJ,'http://www.youtube.com/watch?v=qsXHcwe3krw,0,0,0,1,43,False
1,INFJ,http://41.media.tumblr.com/tumblr_lfouy03PMA1q...,0,0,0,1,61,False
2,INFJ,enfp and intj moments https://www.youtube.com...,0,0,0,1,151,False
3,INFJ,What has been the most life-changing experienc...,0,0,0,1,61,False
4,INFJ,http://www.youtube.com/watch?v=vXZeYwwRDw8 h...,0,0,0,1,117,False
...,...,...,...,...,...,...,...,...
421752,INFP,I was going to close my facebook a few months ...,0,0,0,0,193,False
421753,INFP,30 Seconds to Mars - All of my collections. It...,0,0,0,0,85,False
421754,INFP,"I have seen it, and i agree. I did actually th...",0,0,0,0,199,False
421755,INFP,Ok so i have just watched Underworld 4 (Awaken...,0,0,0,0,200,False


## Hyperlinks
Let's first see whether we have a wide diversity of hyperlinks, or a handful of popular ones, so that we know whether to either to 1. create a category of hyperlinks, or 2. create dummy variables for the most popularly referenced urls vs "others".

Results:
It seems that youtube, tumblr, imgur, and personalitycafe are most popular. Others like wikimedia and vimeo follow closely, but not the same scale, so we can probably just track these 5 variables: youtube, tumblr, imgur, personalitycafe, other_links.


In [416]:
# import re 
# # Regex
# # Source: https://stackoverflow.com/questions/839994/extracting-a-url-in-python
# link_regex = '\b((?:https?://)?(?:(?:www\.)?(?:[\da-z\.-]+)\.(?:[a-z]{2,6})|(?:(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.){3}(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)|(?:(?:[0-9a-fA-F]{1,4}:){7,7}[0-9a-fA-F]{1,4}|(?:[0-9a-fA-F]{1,4}:){1,7}:|(?:[0-9a-fA-F]{1,4}:){1,6}:[0-9a-fA-F]{1,4}|(?:[0-9a-fA-F]{1,4}:){1,5}(?::[0-9a-fA-F]{1,4}){1,2}|(?:[0-9a-fA-F]{1,4}:){1,4}(?::[0-9a-fA-F]{1,4}){1,3}|(?:[0-9a-fA-F]{1,4}:){1,3}(?::[0-9a-fA-F]{1,4}){1,4}|(?:[0-9a-fA-F]{1,4}:){1,2}(?::[0-9a-fA-F]{1,4}){1,5}|[0-9a-fA-F]{1,4}:(?:(?::[0-9a-fA-F]{1,4}){1,6})|:(?:(?::[0-9a-fA-F]{1,4}){1,7}|:)|fe80:(?::[0-9a-fA-F]{0,4}){0,4}%[0-9a-zA-Z]{1,}|::(?:ffff(?::0{1,4}){0,1}:){0,1}(?:(?:25[0-5]|(?:2[0-4]|1{0,1}[0-9]){0,1}[0-9])\.){3,3}(?:25[0-5]|(?:2[0-4]|1{0,1}[0-9]){0,1}[0-9])|(?:[0-9a-fA-F]{1,4}:){1,4}:(?:(?:25[0-5]|(?:2[0-4]|1{0,1}[0-9]){0,1}[0-9])\.){3,3}(?:25[0-5]|(?:2[0-4]|1{0,1}[0-9]){0,1}[0-9])))(?::[0-9]{1,4}|[1-5][0-9]{4}|6[0-4][0-9]{3}|65[0-4][0-9]{2}|655[0-2][0-9]|6553[0-5])?(?:/[\w\.-]*)*/?)\b'

# mbti_2017.df['post'].apply(lambda x: re.findall(link_regex, x))

In [417]:
# For testing
# top_urls = ['youtube', 'tumblr', 'imgur', 'personalitycafe', 'others']
# top_urls_df = pd.DataFrame({'top_url': top_urls, 'count': [0] * len(top_urls)})

# all_urls_df = mbti_2017.extract_base_urls()

# for _, row in all_urls_df.iterrows():
#     for i in range(len(top_urls)-1): # -1 to exclude "others"
#         if top_urls[i] in row['base_url']:
#             top_urls_df.at[i, 'count'] += row['count']
#         elif row['base_url'] != '': # only add if not empty
#             top_urls_df.at[len(top_urls)-1, 'count'] += row['count']

# top_urls_df

In [418]:
# User will look at code below to identify the top few urls
mbti_2017.extract_base_urls().head(n=20)

Unnamed: 0,base_url,count
0,,406789
1,www.youtube.com,9363
2,youtu.be,629
3,i.imgur.com,396
4,personalitycafe.com,175
5,24.media.tumblr.com,85
6,s-media-cache-ak0.pinimg.com,81
7,uploads.tapatalk-cdn.com,69
8,25.media.tumblr.com,66
9,media.giphy.com,63


In [410]:
# Note: "youtube" will be referred to as youtu for now, for the code to recognize youtu.be as well
top_urls = ['youtu', 'tumblr', 'imgur', 'personalitycafe']
mbti_2017.count_url(top_urls)

Unnamed: 0,type,post,url,has_youtube_url,has_tumblr_url,has_imgur_url,has_personalitycafe_url,has_other_url,has_youtu_url
0,INFJ,'http://www.youtube.com/watch?v=qsXHcwe3krw,,0,0,0,0,0,0
1,INFJ,http://41.media.tumblr.com/tumblr_lfouy03PMA1q...,41.media.tumblr.com,0,1,0,0,0,0
2,INFJ,enfp and intj moments https://www.youtube.com...,,0,0,0,0,0,0
3,INFJ,What has been the most life-changing experienc...,,0,0,0,0,0,0
4,INFJ,http://www.youtube.com/watch?v=vXZeYwwRDw8 h...,www.youtube.com,1,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...
421752,INFP,I was going to close my facebook a few months ...,,0,0,0,0,0,0
421753,INFP,30 Seconds to Mars - All of my collections. It...,,0,0,0,0,0,0
421754,INFP,"I have seen it, and i agree. I did actually th...",,0,0,0,0,0,0
421755,INFP,Ok so i have just watched Underworld 4 (Awaken...,,0,0,0,0,0,0


## References to MBTI Types

In [421]:
mbti_2017.count_mbti_mentions()

Unnamed: 0,type,post,INTJ_mentions,INFJ_mentions,INFP_mentions,INTP_mentions,ENTJ_mentions,ENFJ_mentions,ENFP_mentions,ENTP_mentions,ISTJ_mentions,ISFJ_mentions,ISFP_mentions,ISTP_mentions,ESTJ_mentions,ESFJ_mentions,ESFP_mentions,ESTP_mentions
0,INFJ,'http://www.youtube.com/watch?v=qsXHcwe3krw,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,INFJ,http://41.media.tumblr.com/tumblr_lfouy03PMA1q...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,INFJ,enfp and intj moments https://www.youtube.com...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,INFJ,What has been the most life-changing experienc...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,INFJ,http://www.youtube.com/watch?v=vXZeYwwRDw8 h...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
421752,INFP,I was going to close my facebook a few months ...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
421753,INFP,30 Seconds to Mars - All of my collections. It...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
421754,INFP,"I have seen it, and i agree. I did actually th...",0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
421755,INFP,Ok so i have just watched Underworld 4 (Awaken...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


## Sentiment Analysis
Train model with existing twitter data that has already been classified, where the 1st column identifies the polarity as such
    0 - the polarity of the tweet (0 = negative, 2 = neutral, 4 = positive)

Source: http://help.sentiment140.com/for-students/

Reference: https://www.baeldung.com/cs/sentiment-analysis-training-data

"Sentiment 140 contains an impressive 1,600,000 tweets from various English-speaker users"

In [88]:
import gdown
import zipfile

# Google Drive file ID from the shared link
file_id = '0B04GJPshIjmPRnZManQwWEdTZjg'

# Destination file path after downloading and extracting the zip file
output_path = 'training.1600000.processed.noemoticon.csv'

# Download the zip file
gdown.download(f'https://drive.google.com/uc?id={file_id}', 'trainingandtestdata.zip', quiet=False)

# Extract the zip file
with zipfile.ZipFile('trainingandtestdata.zip', 'r') as zip_ref:
    zip_ref.extractall('temp_folder')

# Find the CSV file in the extracted folder and read it into a pandas DataFrame
csv_file_path = os.path.join('temp_folder', 'training.1600000.processed.noemoticon.csv')
sentiment_df = pd.read_csv(csv_file_path, encoding='ISO-8859-1')

# Clean up: Remove temporary files and folders
for file_name in os.listdir('temp_folder'):
    file_path = os.path.join('temp_folder', file_name)
    if os.path.isfile(file_path):
        os.remove(file_path)
    else:
        os.rmdir(file_path)

os.remove('trainingandtestdata.zip')
os.rmdir('temp_folder')

Downloading...
From (uriginal): https://drive.google.com/uc?id=0B04GJPshIjmPRnZManQwWEdTZjg
From (redirected): https://drive.google.com/uc?id=0B04GJPshIjmPRnZManQwWEdTZjg&confirm=t&uuid=d024c15b-58bf-479d-9b9d-b1970edf9f54
To: /Users/nathantorento/Documents/GitHub/MBTI/trainingandtestdata.zip
100%|██████████| 81.4M/81.4M [00:06<00:00, 12.0MB/s]


In [89]:
# Preview sentiment training data
sentiment_df.head()

Unnamed: 0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, that's a bummer. You shoulda got David Carr of Third Day to do it. ;D"
0,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
1,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
2,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
3,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."
4,0,1467811372,Mon Apr 06 22:20:00 PDT 2009,NO_QUERY,joy_wolf,@Kwesidei not the whole crew


## Emotional Analysis

Reference: https://www.baeldung.com/cs/ml-emotion-detection#public-datasets-for-emotion-detection

Source: https://web.eecs.umich.edu/~mihalcea/affectivetext/#datasets

In [92]:
import requests
import tarfile
import os

# URL of the .tar.gz file
url = 'http://web.eecs.umich.edu/~mihalcea/downloads/AffectiveText.Semeval.2007.tar.gz'

# Download the file
response = requests.get(url, stream=True)
file_name = os.path.basename(url)

# Save the downloaded file
with open(file_name, 'wb') as f:
    for chunk in response.iter_content(chunk_size=8192):
        f.write(chunk)

# Extract the .tar.gz file
with tarfile.open(file_name, 'r:gz') as tar:
    tar.extractall()

# Clean up: remove the downloaded .tar.gz file
os.remove(file_name)

# Word Predictivity Analysis

In [None]:
!pip install sklearn

In [95]:
from sklearn.feature_extraction.text import CountVectorizer

### Count Vectorizer

In [101]:
# Create a new DataFrame with only a 'posts_text' column for testing purposes
df2 = pd.DataFrame()
df2['type'] = df['type'].copy()
df2['posts_text'] = df['posts_list'].apply(lambda x: ' '.join(x))
df2.head()

Unnamed: 0,type,posts_text
0,INFJ,'http://www.youtube.com/watch?v=qsXHcwe3krw ht...
1,ENTP,'I'm finding the lack of me in these posts ver...
2,INTP,'Good one _____ https://www.youtube.com/wat...
3,INTJ,"'Dear INTP, I enjoyed our conversation the o..."
4,ENTJ,'You're fired. That's another silly misconcept...


In [113]:
# Initialize CountVectorizer
vectorizer = CountVectorizer()

# Create term-document matrix
term_document_matrix = vectorizer.fit_transform(df2['posts_text'])

# Get words from vectorizer
feature_names = vectorizer.get_feature_names_out()

# Create separate dataframe for term-document matrix
term_document_df = pd.DataFrame(term_document_matrix.toarray(), columns=feature_names)

term_document_df

Unnamed: 0,00,000,0000,000000,0000000000,000000000000000,00000000000000000000000000000000027,00000011,000000111,0000001111,...,ｓｏ,ｔｒｕｍｐu3000ｉｓu3000ａｎu3000ｅｓｔｐ,ｖａｐｏｒｗａｖｅ,ｗｈｙu3000ｉｓu3000ａｎｙｏｎｅu3000ｓｔｉｌｌu3000ｄｉｓｃｕｓｓｉｎｇu3000ｔｈｉｓ,ﾉωﾉ,ﾉｼ,ﾉﾞ,ﾉﾟ,ﾟдﾟщ,ﾟﾟ
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8670,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8671,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8672,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8673,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


### Train a predictive model on the matrix

In [116]:
from sklearn.model_selection import train_test_split

X = term_document_df 
y = df['type']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=123)

In [120]:
from sklearn.ensemble import RandomForestClassifier

model = RandomForestClassifier(random_state=123)
model.fit(X_train, y_train)

In [121]:
predictions = model.predict(X_test)
predictions

array(['INFP', 'INTP', 'INTP', ..., 'INFP', 'INFP', 'INFP'], dtype=object)

In [122]:
from sklearn.metrics import accuracy_score

accuracy = accuracy_score(y_test, predictions)
print(f'Accuracy: {accuracy:.2f}')

Accuracy: 0.38


# Identity Matrix

In [135]:
# Create an identity matrix with the length of the vocabulary
identity_matrix = np.eye(len(feature_names))
identity_matrix

array([[1., 0., 0., ..., 0., 0., 0.],
       [0., 1., 0., ..., 0., 0., 0.],
       [0., 0., 1., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 1., 0., 0.],
       [0., 0., 0., ..., 0., 1., 0.],
       [0., 0., 0., ..., 0., 0., 1.]])

## Predicting the Identity Matrix Using the Models

In [131]:
# Transform the identity matrix using the same CountVectorizer
identity_matrix_transformed = vectorizer.transform([' '.join(feature_names)])

In [133]:
model.predict(identity_matrix_transformed)

array(['INFJ'], dtype=object)

In [134]:
identity_matrix_transformed

<1x145469 sparse matrix of type '<class 'numpy.int64'>'
	with 145469 stored elements in Compressed Sparse Row format>