
## COUNTS & VECTORIZATION

In [1]:
# importing dependencies here
import numpy as np
import pandas as pd
import os

# feature engineering
import re
import nltk

# vectorization
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

# performance check
import time

# code formatter
%load_ext nb_black

<IPython.core.display.Javascript object>

In [2]:
# Load the cleaned dataset produced by the previous stage (clean_data_2.csv).
personality_data = pd.read_csv(
    filepath_or_buffer=os.path.join("..", "data", "clean_data_2.csv"),
)

<IPython.core.display.Javascript object>

In [5]:
# looking at the top 5 rows of the dataset
personality_data.head()

Unnamed: 0,type,is_Extrovert,is_Sensing,is_Thinking,is_Judging,posts,clean_posts,compound_sentiment,pos_sentiment,neg_sentiment,neu_sentiment,tag_posts
0,INFJ,0,0,0,1,'When asked of the things you wish you did ear...,when asked the things you wish you did earli...,0.9999,0.346041,0.128155,0.567954,"[""'When asked of the things you wish you did e..."
1,INFJ,0,0,0,1,'I love both and they are equally important to...,love both and they are equally important ...,0.99995,0.532258,0.143689,0.373391,"[""'I love both and they are equally important ..."
2,INFJ,0,0,0,1,Really? You think implying that everyone who i...,really you think implying that everyone who i...,0.9973,0.312317,0.316505,0.462089,"[""Really? You think implying that everyone who..."
3,ENFJ,1,0,0,1,'Love is a crazy thing. Se is our best form ...,love crazy thing our best form commun...,0.99985,0.394428,0.264078,0.422031,"[""'Love is a crazy thing. Se is our best for..."
4,INTP,0,0,1,0,'I am a physics undergrad with a computation e...,physics undergrad with computation emphas...,0.99985,0.397361,0.198058,0.46495,"[""'I am a physics undergrad with a computation..."


In [6]:
# checking the number of rows and columns
personality_data.shape

(8588, 12)

### Feature Engineering - III

#### COUNTING

#### Question/Exclamation/Colon/Emoji Count

In [3]:
def unique_words(s):
    """Return the number of distinct single-space-separated tokens in ``s``, divided by 50.

    The text is split on one literal space character, so consecutive spaces
    (or an empty string) contribute an empty-string token to the distinct set.
    """
    distinct_tokens = {token for token in s.split(" ")}
    return len(distinct_tokens) / 50


def emojis(post):
    """Count whitespace-separated tokens that contain exactly two colons, divided by 50.

    Tokens containing "http" are ignored so that colons inside links are not
    counted. Emoticons built purely from symbols are not detected; only the
    :word: form is targeted.
    """
    two_colon_tokens = [
        token
        for token in post.split()
        if "http" not in token and token.count(":") == 2
    ]
    return len(two_colon_tokens) / 50


def colons(post):
    """Return the total number of colons in ``post``, divided by 50.

    Whitespace-separated tokens containing "http" are skipped so link colons
    are excluded; colons that belong to :word: emojis are included.
    """
    countable_tokens = (token for token in post.split() if "http" not in token)
    return sum(token.count(":") for token in countable_tokens) / 50

<IPython.core.display.Javascript object>

In [8]:
# Punctuation features computed on the raw posts text. Every count is scaled by
# 1/50 (presumably 50 posts per row -- TODO confirm against the dataset).
personality_data["qm"] = personality_data["posts"].map(
    lambda text: text.count("?") / 50
)
personality_data["em"] = personality_data["posts"].map(
    lambda text: text.count("!") / 50
)
personality_data["colons"] = personality_data["posts"].map(colons)
personality_data["emojis"] = personality_data["posts"].map(emojis)

#### Word Count

In [9]:
# Word count is approximated as (number of space characters + 1), scaled by 1/50.
personality_data["word_count"] = personality_data["posts"].map(
    lambda text: (text.count(" ") + 1) / 50
)
# Distinct-token count (see unique_words), also scaled by 1/50.
personality_data["unique_words"] = personality_data["posts"].map(unique_words)

#### Word Stats

* CAUTION - This cell may take a long time to run!

In [10]:
# stats: variance of the per-post word counts within each row.
# Individual posts inside the "posts" text are delimited by "|||".

t = time.time()

personality_data["post_length_var"] = personality_data["posts"].map(
    lambda joined: np.var([len(single.split()) for single in joined.split("|||")])
)

print(f"Time Taken: {time.time() - t}")

Time Taken: 0.9359219074249268


#### Upper Case Count

In [11]:
# Number of whitespace-separated tokens that are entirely upper case, scaled by 1/50.
personality_data["upper"] = personality_data["posts"].map(
    lambda text: sum(1 for token in text.split() if token.isupper()) / 50
)

#### Link Count

In [12]:
# Links are detected by counting occurrences of the substring "http", scaled by 1/50.
personality_data["link_count"] = personality_data["posts"].map(
    lambda text: text.count("http") / 50
)

#### Ellipses Count

In [13]:
# Count occurrences of three dots followed by a space, scaled by 1/50.
ellipses_count = (
    personality_data["posts"]
    .map(lambda text: len(re.findall(r"\.\.\.\ ", text)) / 50)
    .tolist()
)
personality_data["ellipses"] = ellipses_count

#### Image Count

In [14]:
# Count image-file extensions (.jpg, .jpeg, .gif, .png) in the raw text, scaled by 1/50.
personality_data["img_count"] = personality_data["posts"].map(
    lambda text: len(re.findall(r"(\.jpg)|(\.jpeg)|(\.gif)|(\.png)", text)) / 50
)

In [15]:
personality_data.head(2)

Unnamed: 0,type,is_Extrovert,is_Sensing,is_Thinking,is_Judging,posts,clean_posts,compound_sentiment,pos_sentiment,neg_sentiment,...,em,colons,emojis,word_count,unique_words,post_length_var,upper,link_count,ellipses,img_count
0,INFJ,0,0,0,1,'When asked of the things you wish you did ear...,when asked the things you wish you did earli...,0.9999,0.346041,0.128155,...,0.22,0.32,0.08,30.98,14.92,78.414931,1.46,0.04,0.62,0.0
1,INFJ,0,0,0,1,'I love both and they are equally important to...,love both and they are equally important ...,0.99995,0.532258,0.143689,...,0.36,0.14,0.0,28.58,12.72,160.7444,1.62,0.02,0.04,0.0


In [16]:
# checking the data types to make sure they still look good
personality_data.dtypes

type                   object
is_Extrovert            int64
is_Sensing              int64
is_Thinking             int64
is_Judging              int64
posts                  object
clean_posts            object
compound_sentiment    float64
pos_sentiment         float64
neg_sentiment         float64
neu_sentiment         float64
tag_posts              object
qm                    float64
em                    float64
colons                float64
emojis                float64
word_count            float64
unique_words          float64
post_length_var       float64
upper                 float64
link_count            float64
ellipses              float64
img_count             float64
dtype: object

In [17]:
# checking for null values again
personality_data.isnull().sum()

type                  0
is_Extrovert          0
is_Sensing            0
is_Thinking           0
is_Judging            0
posts                 0
clean_posts           0
compound_sentiment    0
pos_sentiment         0
neg_sentiment         0
neu_sentiment         0
tag_posts             0
qm                    0
em                    0
colons                0
emojis                0
word_count            0
unique_words          0
post_length_var       0
upper                 0
link_count            0
ellipses              0
img_count             0
dtype: int64

In [4]:
# Write the current frame to ../data/clean_data_3.csv without the index column.
personality_data.to_csv(
    path_or_buf=os.path.join("..", "data", "clean_data_3.csv"),
    index=False,
)

<IPython.core.display.Javascript object>

### Vectorize - For analysis purposes only. For the model, vectorization will be added to the pipeline.

In [5]:
from sklearn.pipeline import Pipeline


def _feature_names_or_fallback(transformer, fallback):
    """Return the transformer's output feature names, or ``fallback`` if it exposes none."""
    # ``get_feature_names_out`` is the current scikit-learn API and is what the
    # rest of this notebook uses; ``get_feature_names`` only exists on older
    # releases, so it is tried second.
    for getter_name in ("get_feature_names_out", "get_feature_names"):
        try:
            return getattr(transformer, getter_name)()
        except AttributeError:
            continue
    return fallback


def get_column_names_from_ColumnTransformer(column_transformer):
    """Collect the output column names of a fitted ColumnTransformer.

    Parameters
    ----------
    column_transformer : fitted ColumnTransformer-like object
        Must expose ``transformers_``, an iterable of
        ``(name, transformer, columns)`` tuples.

    Returns
    -------
    list
        Feature names in transformer order. For each entry the names come from
        the transformer's ``get_feature_names_out()`` (or the legacy
        ``get_feature_names()``); when neither exists, the raw column
        specification is used instead. For a Pipeline, the last step supplies
        the names.

    Notes
    -----
    * The ``'remainder'`` entry is skipped by name. It is only present when
      columns are left over, so slicing off the last entry unconditionally
      would discard a real transformer whenever there is no remainder.
    * Entries whose transformer is the string ``'drop'`` are skipped because
      their columns do not appear in the transformed output.
    * Raw column specifications that are not a list, ndarray or str (for
      example slices or boolean masks) are not expanded and contribute nothing.
    """
    col_name = []
    for entry_name, estimator, raw_col_name in column_transformer.transformers_:
        if entry_name == "remainder":
            continue
        if isinstance(estimator, str) and estimator == "drop":
            continue

        # For a pipeline, the final step determines the emitted feature names.
        if isinstance(estimator, Pipeline):
            transformer = estimator.steps[-1][1]
        else:
            transformer = estimator

        names = _feature_names_or_fallback(transformer, raw_col_name)

        if isinstance(names, np.ndarray):
            col_name += names.tolist()
        elif isinstance(names, list):
            col_name += names
        elif isinstance(names, str):
            col_name.append(names)
    return col_name

<IPython.core.display.Javascript object>

In [6]:
# Using TfidfVectorizer
# Builds a TF-IDF document-term table from the "clean_posts" column.
# Per the scikit-learn docs, an integer min_df=25 drops terms found in fewer
# than 25 documents and a float max_df=0.8 drops terms found in more than 80%
# of documents.
# NOTE(review): this cell requires personality_data to contain "clean_posts";
# run the notebook top to bottom from a fresh kernel.

tfidf_vectorizer = TfidfVectorizer(min_df=25, max_df=0.8)
tfidf_words = tfidf_vectorizer.fit_transform(personality_data["clean_posts"])
# .toarray() materialises the full matrix densely so it can be wrapped in a
# DataFrame whose columns are the vocabulary terms.
tfidf_vectorized_data = pd.DataFrame(
    data=tfidf_words.toarray(), columns=tfidf_vectorizer.get_feature_names_out()
)

KeyError: 'clean_posts'

<IPython.core.display.Javascript object>

In [26]:
tfidf_vectorized_data.head()

Unnamed: 0,aback,abandon,abandoned,abandoning,abandonment,abbey,abhor,abide,abilities,ability,...,zero,zodiac,zombie,zombies,zone,zoned,zones,zoning,zoo,zoom
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [7]:
# Write the TF-IDF table to ../data/tfidf_vectorized_data.csv without the index column.
tfidf_vectorized_data.to_csv(
    path_or_buf=os.path.join("..", "data", "tfidf_vectorized_data.csv"),
    index=False,
)

NameError: name 'tfidf_vectorized_data' is not defined

<IPython.core.display.Javascript object>

In [9]:
# Using CountVectorizer
# Builds a raw term-count document-term table from the "clean_posts" column,
# with the same min_df / max_df thresholds as the TF-IDF cell.
# decode_error="ignore" only matters for byte input that fails to decode.
# NOTE(review): this cell requires personality_data to contain "clean_posts";
# run the notebook top to bottom from a fresh kernel.

count_vectorizer = CountVectorizer(decode_error="ignore", min_df=25, max_df=0.8,)

count_words = count_vectorizer.fit_transform(personality_data["clean_posts"])
# .toarray() materialises the full matrix densely so it can be wrapped in a
# DataFrame whose columns are the vocabulary terms.
count_vectorized_data = pd.DataFrame(
    data=count_words.toarray(), columns=count_vectorizer.get_feature_names_out()
)

KeyError: 'clean_posts'

<IPython.core.display.Javascript object>

In [30]:
count_vectorized_data.head()

Unnamed: 0,aback,abandon,abandoned,abandoning,abandonment,abbey,abhor,abide,abilities,ability,...,zero,zodiac,zombie,zombies,zone,zoned,zones,zoning,zoo,zoom
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [8]:
# Write the term-count table to ../data/count_vectorized_data.csv without the index column.
count_vectorized_data.to_csv(
    path_or_buf=os.path.join("..", "data", "count_vectorized_data.csv"),
    index=False,
)

NameError: name 'count_vectorized_data' is not defined

<IPython.core.display.Javascript object>