In [1]:
# import
import re
import json
import numpy as np
import pandas as pd

In [2]:
# Load the MBTI posts dataset from a local CSV file.
# Source: https://www.kaggle.com/datasnaek/mbti-type
# Later cells read the 'type' and 'posts' columns of this frame.
mbti = pd.read_csv('mbti.csv')

In [3]:
def tokenize(text):
    """Split a string into its lowercase alphabetic words.

    The text is lowercased first; every maximal run of the letters a-z
    then becomes one token. Digits, punctuation and whitespace act only
    as separators and never appear in the output.

    Params: {text: String}
    Returns: List of Strings (empty when the text has no letters)
    """
    # TODO: links are not stripped yet -- the alphabetic pieces of a URL
    # (e.g. "http", "www", "com") still come through as ordinary tokens.
    lowered = text.lower()
    return [match.group() for match in re.finditer('[a-z]+', lowered)]

In [4]:
def tokenize_mbti(df):
    """Tokenize the posts of every row in the dataset.

    Each row's 'posts' value is passed through `tokenize`, and the result
    is paired with that row's 'type' value.

    Params: {df: DataFrame with 'type' and 'posts' columns}
    Returns: List of (type, tokens) tuples, one per row, in row order
    """
    return [
        (row['type'], tokenize(row['posts']))
        for _, row in df.iterrows()
    ]

In [5]:
# Tokenize every row of the dataset into (type, tokens) pairs.
tokenized = tokenize_mbti(mbti)

In [6]:
# create dictionary with mbti and tokens
# Maps each MBTI type to the concatenation of the token lists of all of
# its rows, in the order the rows appear in `tokenized`.
mbti_dict = {}
for (a, b) in tokenized:
    # Extend a list owned by mbti_dict instead of storing `b` itself:
    # storing `b` and later doing `+=` would grow the list that lives
    # inside `tokenized`, so re-running this cell would duplicate tokens.
    mbti_dict.setdefault(a, []).extend(b)

In [7]:
# dictionary with word and mbti counts
# For each word: the number of MBTI types whose combined tokens contain it.
word_dict = {}
for key, tokens in mbti_dict.items():
    # De-duplicate so that each type contributes at most 1 to a word's count.
    for word in set(tokens):
        word_dict[word] = word_dict.get(word, 0) + 1

In [8]:
import math
def compute_idf(word_dict, n_docs, min_df=1, max_df_ratio=1):
    """Compute an unsmoothed, base-2 IDF for every sufficiently common word.

    A word is kept only when its document frequency is at least `min_df`
    and its share of documents is strictly below `max_df_ratio`. With the
    defaults, a word that occurs in every document is therefore dropped.

    NOTE: a later cell defines another `compute_idf` that takes an inverted
    index and adds 1 to the denominator; once that cell runs, it replaces
    this definition.

    Params: {word_dict: Dictionary mapping word -> document frequency,
             n_docs: Integer,
             min_df: Integer,
             max_df_ratio: Float
            }

    Returns: Dictionary mapping word -> log2(n_docs / document frequency)
    """
    kept = {}
    for word, word_df in word_dict.items():
        ratio = word_df / n_docs
        if not (word_df >= min_df and ratio < max_df_ratio):
            continue
        # No +1 in the denominator: smoothing is not applied here.
        kept[word] = math.log2(n_docs / word_df)
    return kept

# The "documents" are the MBTI types collected in mbti_dict, so take the
# document count from it (as tf_matrix does for its row count) instead of
# hard-coding 16.
idf_dict = compute_idf(word_dict, len(mbti_dict))

In [9]:
def output_words_to_analyze(input_word_counts):
    """Select the words whose count is greater than 1, sorted ascending.

        Params: {input_word_counts: Dict mapping word -> count}
        Returns: List of words in sorted order
    """
    frequent = [
        word
        for word, count in input_word_counts.items()
        if count > 1
    ]
    frequent.sort()
    return frequent

# Vocabulary for the analysis: words whose count in word_dict exceeds 1,
# in sorted order.
words_to_analyze = output_words_to_analyze(word_dict)

In [10]:
from collections import Counter

# Term-frequency matrix: one row per MBTI type (in mbti_keys order) and one
# column per word in words_to_analyze.
tf_matrix = np.zeros((len(mbti_dict), len(words_to_analyze)))
mbti_keys = list(mbti_dict.keys())

# The loop variable is deliberately NOT called `mbti`: that name holds the
# dataset DataFrame, and overwriting it with a type string breaks any earlier
# cell that is re-run afterwards. enumerate() also supplies the row number
# directly, so no mbti_keys.index() search is needed for every cell.
for row, mbti_type in enumerate(mbti_keys):
    c = Counter(mbti_dict[mbti_type])
    for index, word in enumerate(words_to_analyze):
        # Counter returns 0 for words this type never used.
        tf_matrix[row, index] = c[word]

In [11]:
def valid_query(input_query, words_to_analyze=words_to_analyze):
    """Tokenize a query and keep only the tokens found in the vocabulary.

    Token order and repeated tokens are preserved.

    The filtered list is built fresh rather than removing items from the
    token list while looping over it: removing during iteration skips the
    element after each removal, so out-of-vocabulary tokens could survive.

    Note: the default for `words_to_analyze` is bound when this function is
    defined, so re-run this cell after rebuilding the vocabulary.

    Params: {input_query: String,
             words_to_analyze: Iterable of Strings
            }

    Returns: List of Strings
    """
    # A set gives constant-time membership checks for each token.
    vocabulary = set(words_to_analyze)
    # tokenize() already lowercases its input, so no extra .lower() is needed.
    return [token for token in tokenize(input_query) if token in vocabulary]

In [30]:
def compute_doc_norms(index, idf, n_docs):
    """
    Returns a numpy array holding the tf-idf vector norm of each MBTI.

    For every word present in both `index` and `idf`, each (position, tf)
    entry adds (tf * idf[word]) ** 2 to that position's running total; the
    square root of the totals is returned. Words missing from `idf` are
    ignored.

    Params: {index: Dictionary mapping word -> iterable of (position, tf),
                    where position is an index into the result array,
           idf: Dictionary mapping word -> IDF value,
           n_docs: Integer (length of the result array)
          }

    Returns: Numpy Array
    """
    squared_sums = np.zeros(n_docs)

    for word, postings in index.items():
        if word not in idf:
            continue
        weight = idf[word]
        for position, tf in postings:
            squared_sums[position] += math.pow(tf * weight, 2)

    return np.sqrt(squared_sums)

In [31]:
# Write the document norms to 'doc_norms.txt'.
# NOTE(review): in the notebook as shown, `doc_norms` is only assigned in a
# later cell (doc_norms = compute_doc_norms(...)), so this cell fails on a
# fresh kernel unless that cell has already been run.
# NOTE(review): np.savetxt returns None, so `txt` is always None.
txt = np.savetxt('doc_norms.txt', doc_norms)

In [32]:
# Write the IDF dictionary to 'idf.json'. The `with` block closes the file
# (and so flushes the data to disk) as soon as the dump finishes, instead of
# leaving an open handle for the garbage collector.
with open('idf.json', 'w') as idf_file:
    json.dump(idf_dict, idf_file)

In [33]:
def compute_idf(inv_idx, n_docs, min_df=1, max_df_ratio=1.0):
    """
    Returns a dictionary with valid words as keys and their IDFs as values.

    A word's document frequency is the length of its entry in `inv_idx`.
    The word is valid when that frequency is at least `min_df` and its
    share of documents is strictly below `max_df_ratio`. The IDF is
    log2(n_docs / (1 + document frequency)).

    Params: {inv_idx: Dictionary mapping word -> sized collection of entries,
           n_docs: Integer,
           min_df: Integer,
           max_df_ratio: Float
          }

    Returns: Dictionary
    """
    valid = {}

    for word, postings in inv_idx.items():
        doc_freq = len(postings)
        ratio = doc_freq / n_docs
        if not (doc_freq >= min_df and ratio < max_df_ratio):
            continue
        # The +1 in the denominator smooths the IDF.
        valid[word] = math.log2(n_docs / (1 + doc_freq))

    return valid

In [34]:
def mbti_tokenized(tokenized):
    """
    Returns a dictionary with MBTI keys and tokenized text values.

    All token lists that share an MBTI are concatenated in input order.
    The returned lists are new objects: the lists inside `tokenized` are
    never modified, so calling this function again on the same input gives
    the same result.

    Params: {tokenized: List of Tuples (MBTI, list of tokens)}

    Returns: Dictionary
    """
    # initialization
    mbti_dict = {}

    # Extend a list owned by the dictionary. Storing the caller's list and
    # then using += on it would grow the caller's list in place.
    for mbti_type, tokens in tokenized:
        mbti_dict.setdefault(mbti_type, []).extend(tokens)

    return mbti_dict

In [35]:
# End-to-end pipeline: load the data, tokenize it, group tokens by MBTI type,
# build the inverted index, then compute IDF values and document norms.
# The CSV is re-read here; an earlier cell reuses the name `mbti` as a loop
# variable, which replaces the DataFrame.
mbti = pd.read_csv('mbti.csv')
tokenized = tokenize_mbti(mbti)
mbti_dict = mbti_tokenized(tokenized)
# NOTE(review): build_inverted_index is not defined in the cells shown here;
# confirm it is defined before this cell runs on a fresh kernel.
inv_idx = build_inverted_index(mbti_dict)
# 16 is presumably the number of MBTI types -- TODO confirm it matches
# len(mbti_dict) for this dataset.
idf = compute_idf(inv_idx, 16)
doc_norms = compute_doc_norms(inv_idx, idf, 16)

In [36]:
# Write the inverted index to 'inv_idx.json'. The `with` block closes the
# file (and so flushes the data to disk) as soon as the dump finishes,
# instead of leaving an open handle for the garbage collector.
with open('inv_idx.json', 'w') as inv_idx_file:
    json.dump(inv_idx, inv_idx_file)