In [None]:
import pandas as pd
import numpy as np

In [None]:
from itertools import chain
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer


def custom_tokenizer(para):
    """Tokenize a paragraph into a flat list of whitespace-separated words.

    The paragraph is first cut into sentences on the literal separator
    ``' . '`` (space, period, space); each sentence is then split on
    whitespace and the pieces are concatenated in order.

    Parameters
    ----------
    para : str
        Text to tokenize.

    Returns
    -------
    list of str
        Tokens in their original order; empty when ``para`` holds no words.
    """
    return [token
            for sentence in para.split(' . ')
            for token in sentence.split()]


def count_vectorizer(sentences, params={}):
    """Fit a ``CountVectorizer`` on ``sentences`` and return it with its output.

    Parameters
    ----------
    sentences : iterable of str
        Documents handed to ``fit_transform``.
    params : dict, optional
        Keyword overrides merged on top of the defaults listed below before
        the vectorizer is constructed. The argument is only read, never
        modified.

    Returns
    -------
    tuple
        ``(cv, cv_trans_sent)``: the fitted vectorizer and the value returned
        by ``cv.fit_transform(sentences)``.
    """
    default_params = {'strip_accents': None,
                      'lowercase': True,
                      'preprocessor': None,
                      'tokenizer': None,
                      'stop_words': None,
                      'ngram_range': (1, 1),
                      'analyzer': 'word',
                      'max_df': 1.0,
                      'min_df': 1,
                      'max_features': None,
                      'vocabulary': None}
    default_params.update(params)

    # The corpus belongs to fit_transform only. Passing it positionally to the
    # constructor would bind it to the `input` option on old scikit-learn
    # releases and raises TypeError on releases where constructor arguments
    # are keyword-only.
    cv = CountVectorizer(**default_params)
    cv_trans_sent = cv.fit_transform(sentences)

    return cv, cv_trans_sent


def tfidf_vectorizer(sentences, params={}):
    """Fit a ``TfidfVectorizer`` on ``sentences`` and return it with its output.

    Parameters
    ----------
    sentences : iterable of str
        Documents handed to ``fit_transform``.
    params : dict, optional
        Keyword overrides applied on top of the baseline settings below.
        The argument is only read, never modified.

    Returns
    -------
    tuple
        ``(vectorizer, transformed)``: the fitted vectorizer and the value
        returned by its ``fit_transform(sentences)`` call.
    """
    # Baseline keyword arguments; caller-supplied entries win on conflict.
    settings = dict(
        smooth_idf=True,
        use_idf=True,
        strip_accents=None,
        lowercase=True,
        preprocessor=None,
        tokenizer=None,
        stop_words=None,
        ngram_range=(1, 1),
        analyzer='word',
        max_df=1.0,
        min_df=1,
        max_features=None,
        vocabulary=None,
    )
    settings.update(params)

    vectorizer = TfidfVectorizer(**settings)
    return vectorizer, vectorizer.fit_transform(sentences)


def top_words_tfidf(tf_obj, doc, topn=20):
    """Return the ``topn`` highest-scoring features of ``doc`` under ``tf_obj``.

    Parameters
    ----------
    tf_obj : fitted vectorizer
        Object exposing ``transform`` plus ``get_feature_names_out`` (or the
        legacy ``get_feature_names``).
    doc : iterable of str
        Documents passed straight to ``tf_obj.transform``.
    topn : int, default 20
        Maximum number of (feature, score) entries to keep.

    Returns
    -------
    dict
        Feature name mapped to its score rounded to 3 decimals, inserted in
        descending score order. If a feature occurs more than once among the
        kept entries, the later (lower-ranked) score is the one retained.
    """
    # Function code credits: https://kavita-ganesan.com/extracting-keywords-from-text-tfidf/
    # Convert to COO once; both the column indices and the values come from it.
    coo = tf_obj.transform(doc).tocoo()

    # Highest score first; equal scores are ordered by larger column index.
    ranked = sorted(zip(coo.col, coo.data),
                    key=lambda item: (item[1], item[0]),
                    reverse=True)

    # get_feature_names() no longer exists in current scikit-learn releases;
    # prefer its replacement and fall back only for older vectorizers.
    if hasattr(tf_obj, 'get_feature_names_out'):
        feature_names = tf_obj.get_feature_names_out()
    else:
        feature_names = tf_obj.get_feature_names()

    # Use only the topn items, mapping each word to its rounded score.
    results = {}
    for idx, score in ranked[:topn]:
        # str() keeps keys as plain strings even when names come from an array.
        results[str(feature_names[idx])] = round(score, 3)

    return results

In [None]:
# Load at most 200 rows of the workbook and discard rows with any missing value.
df = pd.read_excel(
    "../Data/Market_Manipulation/2_enron_data_with_stop.xlsx",
    nrows=200,
).dropna()
df.head()

In [None]:
# Fit the count vectorizer on the processed text using the custom tokenizer.
cv_obj, cv_sent = count_vectorizer(
    sentences=df['processed_text'].tolist(),
    params={'tokenizer': custom_tokenizer},
)

In [None]:
# Fit the tf-idf vectorizer on the processed text using the custom tokenizer.
tf_obj, tf_sent = tfidf_vectorizer(
    sentences=df['processed_text'].tolist(),
    params={'tokenizer': custom_tokenizer},
)

In [None]:
# Score one ad-hoc sentence against the fitted tf-idf model and show its 4 top words.
doc = [
    "this is to test the native vectorization technique and some random words here and there",
]
top_words_tfidf(tf_obj, doc, topn=4)