In [5]:
import numpy as np
import pandas as pd
from nltk.tokenize import word_tokenize, sent_tokenize
import math
import nltk
nltk.download('punkt')
import re
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from cudf import DataFrame
import nvstrings, nvcategory


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [6]:
%%time

def count_words(sent):
    count = 0
    words = word_tokenize(sent)
    for word in words:
        count += 1
    return count


def get_doc(corpus):
    doc_info = []
    i = 0
    for i in range(0,len(corpus)):      
        count = count_words(corpus[i])
        temp = {'doc_id':i, 'doc_length' : count}
        doc_info.append(temp)
        i += 1
    return doc_info


def create_freq_dict(sents):
    i = 0
    freqDict_list = []
    for sent in sents:
        i += 1
        freq_dict = {}
        words = word_tokenize(sent)
        for word in words:
            word = word.lower()
            if word in freq_dict:
                freq_dict[word] += 1
            else:
                freq_dict[word] = 1
            temp = {'doc_id': i, 'freq_dict': freq_dict}
        freqDict_list.append(temp)
    return freqDict_list


def computeTF(doc_info, freqDict_list):
    """
    tf = (frequency of the term in the doc/total number of terms in the doc)
    """
    TF_scores = []
    for tempDict in freqDict_list:
        id = tempDict['doc_id']
        for k in tempDict['freq_dict']:
            temp = {'doc_id': id,
                    'TF_score': tempDict['freq_dict'][k]/doc_info[id-1]['doc_length'],
                    'key': k}
            TF_scores.append(temp)
    return TF_scores

def computeIDF(doc_info, freqDict_list):
    """
    idf = ln(total number of docs/number of docs with term in it)
    """
    IDF_scores = []
    counter = 0
    for dict in freqDict_list:
        counter += 1
        for k in dict['freq_dict'].keys():
            count = sum([k in tempDict['freq_dict'] for tempDict in freqDict_list])
            temp = {'doc_id': counter, 'IDF_score': math.log(len(doc_info)/count), 'key' : k}
            
            IDF_scores.append(temp)
            
    return IDF_scores


def computeTFIDF(TF_scores, IDF_scores):
    TFIDF_scores = []
    for j in IDF_scores:
        for i in TF_scores:
            if j['key'] == i['key'] and j['doc_id'] == i['doc_id']:
                temp = {'doc_id': j['doc_id'],
                        'TFIDF_score': j['IDF_score']*i['TF_score'],
                        'key': i['key']}
                TFIDF_scores.append(temp)
    return TFIDF_scores

# 1.6M data
# text_sents_clean = list(pd.read_csv('/data/tweet_data.csv', delimiter=',', names=['note'],skiprows=1).note)

# 1k data
text_sents_clean = list(pd.read_csv('/data/note1000.csv', delimiter=',', names=['note'],skiprows=1).note)
doc_info = get_doc(text_sents_clean)
freqDict_list = create_freq_dict(text_sents_clean)
TF_scores = computeTF(doc_info, freqDict_list)
IDF_scores = computeIDF(doc_info, freqDict_list)
TFIDF_scores = computeTFIDF(TF_scores, IDF_scores)
TFIDF_scores[:5]

CPU times: user 17.5 s, sys: 23.6 ms, total: 17.5 s
Wall time: 17.5 s


[{'doc_id': 1, 'TFIDF_score': 0.0, 'key': 'the'},
 {'doc_id': 1, 'TFIDF_score': 0.09902102579427789, 'key': 'man'},
 {'doc_id': 1, 'TFIDF_score': 0.09902102579427789, 'key': 'went'},
 {'doc_id': 1, 'TFIDF_score': 0.09902102579427789, 'key': 'out'},
 {'doc_id': 1, 'TFIDF_score': 0.09902102579427789, 'key': 'for'}]