In [None]:


# Mount Google Drive at /content/drive so the cells below can read from and
# write to files stored there.
# Note: mounting requires your authorization.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import numpy as np
import random
from tqdm import tqdm
import pandas as pd

**Utility Functions**

In [None]:
import string
import nltk
# Download the NLTK resources used below: the 'punkt' tokenizer models and the
# 'stopwords' corpus (get_tokens reads the English list from it).
nltk.download('punkt')
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem.porter import *

# Porter stemmer from NLTK; this single instance is reused by get_tokens.
stemmer = PorterStemmer()
# Translation table for str.translate: maps the code point of every character
# in string.punctuation to None, which deletes that character.
remove_punctuation_map = dict((ord(char), None) for char in string.punctuation)

def get_tokens(text):
    """Turn a raw document into a list of stemmed, stop-word-free tokens.

    Pipeline: lowercase -> delete punctuation -> tokenize -> drop English
    stop words -> Porter-stem each remaining token.

    Args:
        text (str): Raw document text.

    Returns:
        list[str]: Stemmed tokens in document order (duplicates are kept).
    """
    # Lowercase first, then delete every punctuation character.
    no_punctuation = text.lower().translate(remove_punctuation_map)
    tokens = nltk.word_tokenize(no_punctuation)

    # Build the stop-word set once per call. Evaluating
    # stopwords.words("english") inside the filter re-created the whole list
    # for every token and searched it linearly.
    stop_words = set(stopwords.words("english"))

    return [stemmer.stem(token) for token in tokens if token not in stop_words]


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [None]:
import numpy as np


def get_dict(fpath):
    """Load a vocabulary file into a ``word -> line index`` mapping.

    Each line of the file is one entry; surrounding whitespace is stripped and
    the 0-based line number becomes the word's index. If an entry occurs more
    than once, the index of its last occurrence is kept.

    Args:
        fpath (str): Path to the vocabulary text file.

    Returns:
        dict[str, int]: Mapping from stripped line content to line index.
    """
    with open(fpath, "r") as vocab_file:
        return {entry.strip(): position for position, entry in enumerate(vocab_file)}


def get_doc_tf(word_set, dictionary):
    """Compute the max-normalized term-frequency vector of one document.

    Args:
        word_set (Iterable[str]): The document's tokens. Despite the name,
            repeated tokens are expected and each occurrence is counted.
            Every token must be a key of ``dictionary``.
        dictionary (dict[str, int]): Maps each vocabulary word to its column
            index in the vector.

    Returns:
        numpy.ndarray: Vector of length ``len(dictionary)`` where each entry is
        the word's count divided by the count of the most frequent word. A
        document with no tokens yields the all-zero vector.

    Raises:
        KeyError: If a token is not present in ``dictionary``.
    """
    tf_vec = np.zeros(len(dictionary))
    for word in word_set:
        tf_vec[dictionary[word]] += 1.0

    # Guard the size so an empty vocabulary does not call max() on an empty array.
    max_cnt = tf_vec.max() if tf_vec.size else 0.0
    if max_cnt == 0:
        # No counted words: dividing would be 0/0 and fill the vector with NaN,
        # which would then propagate into the TF-IDF matrix.
        return tf_vec

    return tf_vec / max_cnt



def get_tf_idf(tf_dict, df_vec, n_doc, n_words):
    """Combine per-document TF vectors and document frequencies into TF-IDF.

    Args:
        tf_dict (dict[int, numpy.ndarray]): Maps a document's row index to its
            term-frequency vector of length ``n_words``.
        df_vec (array-like): For each word, the number of documents that
            contain it (length ``n_words``).
        n_doc (int): Number of documents (rows of the result).
        n_words (int): Vocabulary size (columns of the result).

    Returns:
        numpy.ndarray: ``(n_doc, n_words)`` matrix whose entry ``[d, w]`` is
        ``tf[d][w] * log(n_doc / df[w])``. Rows with no entry in ``tf_dict``
        and columns of words that occur in no document are zero.
    """
    tf_idf_mtx = np.zeros((n_doc, n_words))

    # Only take the log where the word occurs in at least one document. For
    # df == 0 the ratio is infinite and 0 * inf would put NaN in that column.
    df_vec = np.asarray(df_vec, dtype=float)
    seen = df_vec > 0
    idf = np.zeros(df_vec.shape)
    idf[seen] = np.log(n_doc / df_vec[seen])

    for doc_idx, tf_vec in tf_dict.items():
        tf_idf_mtx[doc_idx, :] = tf_vec * idf

    return tf_idf_mtx


def write(d, fpath):
    """Write the keys of ``d`` to ``fpath``, one key per line.

    Values are ignored. An existing file is overwritten.

    Args:
        d (dict): Mapping whose keys are written in iteration order.
        fpath (str): Destination path.
    """
    with open(fpath, "w") as out_file:
        out_file.writelines(f"{key}\n" for key, _ in d.items())


def filter_top_k(counter_sorted, limit):
    """Keep the first ``limit`` entries of an already-ordered mapping.

    Args:
        counter_sorted (dict): Mapping whose iteration order is the ranking.
        limit (int): Number of leading entries to keep. Copying stops when the
            running position equals ``limit``, so a value that is never reached
            (larger than the mapping, or negative) keeps every entry.

    Returns:
        dict: New mapping with the leading entries, in the same order.
    """
    selected = {}

    for position, key in enumerate(counter_sorted.keys()):
        if position == limit:
            return selected
        selected[key] = counter_sorted[key]

    return selected

**Compute TF-IDF Matrix**

In [None]:
def tfidf_main(fpath, dictionary):
    """Build the TF-IDF matrix for a CSV corpus.

    The file's first line is skipped as a header. Every following line is one
    document laid out as ``id,text,category``. The line is split at its first
    and last comma, so commas inside the text no longer break parsing.

    Args:
        fpath (str): Path to the CSV file.
        dictionary (dict[str, int]): Vocabulary mapping each word to its column
            index. Tokens that are not in the vocabulary are ignored.

    Returns:
        numpy.ndarray: Matrix of shape ``(n_doc, len(dictionary))`` with one
        row per document, in file order.

    Raises:
        ValueError: If a data line contains fewer than two commas.
    """
    n_words = len(dictionary)
    tf = {}
    doc_freq = np.zeros(n_words)

    with open(fpath, 'r') as f:
        # Drop the header. Slicing also makes an empty file yield zero
        # documents instead of a negative document count.
        rows = f.readlines()[1:]
    n_doc = len(rows)

    # The progress bar now counts documents only (the header is not a step).
    for doc_idx, line in tqdm(enumerate(rows), total=n_doc):
        # Split off the id from the left and the category from the right;
        # whatever remains in the middle is the text.
        _doc_id, rest = line.split(",", 1)
        txt, _cat = rest.rsplit(",", 1)

        in_vocab = [word for word in get_tokens(txt) if word in dictionary]

        # Term frequency of this document.
        tf[doc_idx] = get_doc_tf(in_vocab, dictionary)

        # Document frequency: each distinct word counts once per document.
        for word in set(in_vocab):
            doc_freq[dictionary[word]] += 1

    return get_tf_idf(tf, doc_freq, n_doc, n_words)

In [None]:
# Load the vocabulary, build the TF-IDF matrix for the training set, and save
# the matrix as comma-separated text with four decimals per value.
dictionary = get_dict(fpath="/content/drive/My Drive/Colab Notebooks/dictionary.txt")
tfidf = tfidf_main(
    fpath="/content/drive/My Drive/Colab Notebooks/news-train.csv",
    dictionary=dictionary,
)
np.savetxt(
    fname="/content/drive/My Drive/Colab Notebooks/tfidf.txt",
    X=tfidf,
    fmt="%.4f",
    delimiter=",",
)

100%|██████████| 1491/1491 [01:08<00:00, 21.83it/s]


In [None]:
# Show the TF-IDF matrix: one row per document, one column per dictionary word.
tfidf

array([[0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.36351873, ..., 0.        , 0.        ,
        0.        ],
       [0.12511637, 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.2796298 , ..., 0.20404393, 0.        ,
        0.        ],
       [0.2144852 , 0.36719042, 0.        , ..., 0.        , 0.        ,
        0.        ]])

**Word Frequencies**

In [None]:
def frequency_main(limit, fpath, dictionary):
    """Find the most frequent in-vocabulary stems for each category.

    The file's first line is skipped as a header. Every following line is one
    document laid out as ``id,text,category``. The line is split at its first
    and last comma, so commas inside the text no longer break parsing.

    Args:
        limit (int): Number of top words to keep per category (passed to
            ``filter_top_k``).
        fpath (str): Path to the CSV file.
        dictionary (dict[str, int]): Vocabulary; tokens that are not keys of it
            are not counted.

    Returns:
        dict[str, dict[str, int]]: For each category, its ``limit`` most
        frequent words mapped to their total counts, highest first. The five
        known categories are always present; any other label found in the file
        is added as well.

    Raises:
        ValueError: If a data line contains fewer than two commas.
    """
    with open(fpath, 'r') as f:
        # Drop the header line.
        rows = f.readlines()[1:]

    # Pre-seed the known categories so they appear in the result (and in this
    # order) even if the file has no documents for one of them.
    stratified_cntr = {
        "sport": {},
        "business": {},
        "politics": {},
        "entertainment": {},
        "tech": {},
    }

    # The progress bar now counts documents only (the header is not a step).
    for line in tqdm(rows, total=len(rows)):
        _doc_id, rest = line.split(",", 1)
        txt, cat = rest.rsplit(",", 1)

        # setdefault gives a label outside the pre-seeded five its own counter
        # instead of raising KeyError.
        cat_counts = stratified_cntr.setdefault(cat.strip(), {})

        for token in get_tokens(txt):
            if token in dictionary:
                cat_counts[token] = cat_counts.get(token, 0) + 1

    stratified_output = {}
    for cat, cnts in stratified_cntr.items():
        # Stable descending sort: words with equal counts keep first-seen order.
        ranked = dict(sorted(cnts.items(), key=lambda item: item[1], reverse=True))
        stratified_output[cat] = filter_top_k(ranked, limit)

    return stratified_output

In [None]:
# Three most frequent in-dictionary stems for each news category.
counts = frequency_main(
    limit=3,
    fpath="/content/drive/My Drive/Colab Notebooks/news-train.csv",
    dictionary=dictionary,
)

100%|██████████| 1491/1491 [01:14<00:00, 20.15it/s]


In [None]:
# Show the per-category top words and their counts. Note that the displayed
# key order is alphabetical, not ranked by count.
counts

{'business': {'said': 1100, 'us': 511, 'year': 574},
 'entertainment': {'best': 404, 'film': 706, 'said': 594},
 'politics': {'mr': 1100, 'said': 1445, 'would': 710},
 'sport': {'game': 482, 'said': 635, 'win': 419},
 'tech': {'peopl': 646, 'said': 1064, 'use': 662}}

**Average TF-IDF Scores by Category**

In [None]:
def mean_tfidf_main(trn_fpath, tfidf_fpath, dictionary, k):
    """Find the ``k`` words with the highest mean TF-IDF score per category.

    Args:
        trn_fpath (str): Path to the training CSV. The first line is skipped as
            a header; every following line is ``id,text,category``. Lines are
            split at the first and last comma, so the text may contain commas.
        tfidf_fpath (str): Path to the comma-separated TF-IDF matrix. Row ``i``
            must belong to the ``i``-th data line of ``trn_fpath``.
        dictionary (dict[str, int]): Maps each word to its column index in the
            matrix.
        k (int): Number of words to report per category. Values of 0 or less
            yield empty results; values above the vocabulary size yield every
            word.

    Returns:
        dict[str, dict[str, float]]: Categories in sorted order, each mapping
        its top ``k`` words to their mean score. Within a category the words
        are inserted in ascending score order (best word last).

    Raises:
        ValueError: If a data line has fewer than two commas, or if the matrix
            row count differs from the number of documents.
    """
    idx_to_word = {idx: word for word, idx in dictionary.items()}

    with open(trn_fpath, 'r') as f:
        # Drop the header line.
        rows = f.readlines()[1:]

    # Collect the matrix row indices that belong to each category. No progress
    # bar here: the loop only splits strings and finishes almost instantly.
    rows_by_cat = {}
    for doc_idx, line in enumerate(rows):
        _doc_id, rest = line.split(",", 1)
        _txt, cat = rest.rsplit(",", 1)
        rows_by_cat.setdefault(cat.strip(), []).append(doc_idx)

    # ndmin=2 keeps the matrix two-dimensional even when the file holds a
    # single document or a single word.
    tfidf = np.loadtxt(tfidf_fpath, delimiter=",", ndmin=2)
    if tfidf.shape[0] != len(rows):
        raise ValueError(
            f"TF-IDF matrix has {tfidf.shape[0]} rows but {trn_fpath} "
            f"has {len(rows)} documents"
        )

    output = {}
    for cat in sorted(rows_by_cat):
        mean = tfidf[rows_by_cat[cat]].mean(axis=0)

        # Ascending order; a stable sort makes ties deterministic.
        order = np.argsort(mean, kind="stable")
        # Slice from an explicit start index: order[-k:] returns the WHOLE
        # array when k == 0 because -0 == 0.
        start = max(len(order) - k, 0)

        output[cat] = {
            idx_to_word[int(idx)]: float(mean[idx]) for idx in order[start:]
        }

    return output

In [None]:
# Three words with the highest mean TF-IDF score in each category, computed
# from the matrix saved earlier as tfidf.txt.
avg_tfidf = mean_tfidf_main(
    trn_fpath="/content/drive/My Drive/Colab Notebooks/news-train.csv",
    tfidf_fpath="/content/drive/My Drive/Colab Notebooks/tfidf.txt",
    dictionary=dictionary,
    k=3,
)

100%|██████████| 1491/1491 [00:00<00:00, 340189.70it/s]


In [None]:
# Show each category's top words with their mean TF-IDF scores.
avg_tfidf

{'business': {'bank': 0.27173065476190483,
  'compani': 0.26359047619047643,
  'firm': 0.28594910714285693},
 'entertainment': {'award': 0.3742216117216118,
  'film': 0.6771014652014654,
  'star': 0.37003956043956027},
 'politics': {'elect': 0.42791788321167834,
  'labour': 0.44298394160583926,
  'mr': 0.43852372262773653},
 'sport': {'england': 0.2966031791907518,
  'game': 0.34770780346820834,
  'win': 0.31158150289017345},
 'tech': {'mobil': 0.3374674329501916,
  'softwar': 0.31584597701149414,
  'technolog': 0.3140869731800769}}