## TF-IDF on 20news dataset

### Import data

Import modules:

In [1]:
import os
import nltk
import numpy as np
import math
import pandas as pd
from nltk.corpus import stopwords
nltk.download('stopwords')
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
nltk.download('punkt')
from num2words import num2words

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\marin\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\marin\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Index dataset:

In [2]:
# Root folder of the training split; every file below it is indexed.
rootdir = '20news-bydate-train'
dataset = []
# Index the dataset: one (path, file name) tuple per file found under rootdir.
for subdir, dirs, files in os.walk(rootdir):
    for file in files:
        # os.path.join uses the platform's separator instead of a hard-coded
        # backslash, so the stored paths can be opened on any OS.
        dataset.append((os.path.join(subdir, file), file))

### Preprocess data

#### Preprocessing functions

In [3]:
def convert_lower_case(data):
    """Return ``data`` lower-cased element-wise via ``np.char.lower``.

    The result is a NumPy string array (0-d when a plain string is passed).
    """
    lowered = np.char.lower(data)
    return lowered

def remove_stop_words(data):
    """Drop English stop words and single-character tokens from ``data``.

    ``data`` is converted with ``str()`` and tokenized with NLTK's
    ``word_tokenize``. The return value is a plain string in which every
    kept token is preceded by one space (so a non-empty result starts with
    a space); it is the empty string when no token is kept.
    """
    # A set gives constant-time membership tests; the list returned by
    # stopwords.words() would be scanned once per token.
    stop_words = set(stopwords.words('english'))
    kept = [
        w for w in word_tokenize(str(data))
        if w not in stop_words and len(w) > 1
    ]
    # Build the output in one pass instead of repeated string concatenation.
    return "".join(" " + w for w in kept)

def remove_punctuation(data):
    """Replace punctuation characters in ``data`` with spaces and drop commas.

    Each character of the symbol list (including the backslash and the
    newline) is replaced by a single space; after every replacement one pass
    collapses double spaces into single ones. Commas are removed afterwards
    without inserting a space. Returns a NumPy string array (0-d for a plain
    string input).
    """
    # The backslash is written as "\\" -- the former "\]" was an invalid
    # escape sequence that only worked by accident (SyntaxWarning on 3.12+).
    symbols = "!\"#$%&()*+-./:;<=>?@[\\]^_`{|}~\n"
    for symbol in symbols:
        data = np.char.replace(data, symbol, ' ')
        data = np.char.replace(data, "  ", " ")
    data = np.char.replace(data, ',', '')
    return data

def remove_apostrophe(data):
    """Return ``data`` with every apostrophe (') deleted.

    Uses ``np.char.replace``, so the result is a NumPy string array
    (0-d when a plain string is passed).
    """
    without_apostrophes = np.char.replace(data, "'", "")
    return without_apostrophes

def stemming(data):
    """Stem every token of ``data`` with NLTK's ``PorterStemmer``.

    ``data`` is converted with ``str()`` and split by ``word_tokenize``.
    The stems are returned as one string, each preceded by a single space;
    the result is empty when there are no tokens.
    """
    porter = PorterStemmer()
    stems = (porter.stem(token) for token in word_tokenize(str(data)))
    return "".join(" " + stem for stem in stems)

def convert_numbers(data):
    """Spell out integer tokens of ``data`` in words using ``num2words``.

    ``data`` is converted with ``str()`` and tokenized with
    ``word_tokenize``. Tokens that ``int()`` can parse are replaced by their
    ``num2words`` spelling; all other tokens are kept unchanged. Hyphens in
    the joined text are replaced by spaces. Returns a 0-d NumPy string array
    in which every token is preceded by one space.
    """
    pieces = []
    for w in word_tokenize(str(data)):
        try:
            w = num2words(int(w))
        except (ValueError, OverflowError):
            # ValueError: the token is not an integer literal.
            # OverflowError: presumably raised by num2words for numbers
            # beyond its supported range -- TODO confirm against the
            # installed num2words version. Either way the token is kept.
            pass
        pieces.append(" " + w)
    return np.char.replace("".join(pieces), "-", " ")


def preprocess(data):
    """Run the complete text-cleaning pipeline on ``data`` and return the result.

    The steps are applied strictly in the order listed below; several of
    them run more than once because later steps reintroduce material that
    earlier steps removed.
    """
    pipeline = (
        convert_lower_case,
        remove_punctuation,   # commas are removed separately inside this step
        remove_apostrophe,
        remove_stop_words,
        convert_numbers,
        stemming,
        remove_punctuation,
        convert_numbers,
        stemming,             # stem again so the spelled-out numbers are stemmed too
        remove_punctuation,   # num2words yields a few hyphens and commas (e.g. forty-one)
        remove_stop_words,    # num2words yields stop words (101 -> one hundred and one)
    )
    for step in pipeline:
        data = step(data)
    return data

Process text

In [4]:
# One token list per indexed document, in the same order as `dataset`.
processed_text = []

# Number of documents; reused by the later TF / DF / IDF cells.
N = len(dataset)

for entry in dataset:
    # entry[0] is the file path. The context manager closes the file even if
    # reading fails; undecodable bytes are skipped (errors='ignore').
    with open(entry[0], 'r', encoding="utf8", errors='ignore') as file:
        text = file.read().strip()

    processed_text.append(word_tokenize(str(preprocess(text))))










































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































































Unique words

In [5]:
# Vocabulary: every distinct token that occurs in any processed document.
# Start from a real set (``{}`` is a dict) and grow it in place, rather than
# building a brand-new union set for every document.
uniqueWords = set()
for i in range(N):
    uniqueWords.update(processed_text[i])

Number of occurrences of every vocabulary word in every document

In [6]:
# numOfWords[doc] maps every vocabulary word to its count in that document
# (0 for words the document does not contain).
numOfWords = {}
for doc_id in range(N):
    counts = dict.fromkeys(uniqueWords, 0)
    for token in processed_text[doc_id]:
        counts[token] = counts[token] + 1
    numOfWords[doc_id] = counts

### Calculate TF

In [7]:
# TF[doc][word] = occurrences of word in doc / number of tokens in doc.
TF = {}
for i in range(N):
    # Hoisted out of the inner loop; it is constant per document.
    doc_len = len(processed_text[i])
    TF[i] = {
        # A document that is empty after preprocessing gets a frequency of
        # 0.0 for every word instead of raising ZeroDivisionError.
        word: (count / float(doc_len) if doc_len else 0.0)
        for word, count in numOfWords[i].items()
    }

In [8]:
# Number of entries in TF (one per document index in range(N)).
len(TF)

2245

### Calculate DF

In [9]:
# DF[word] = number of documents that contain the word at least once.
DF = {}
for i in range(N):
    # dict.fromkeys() de-duplicates the tokens while keeping first-occurrence
    # order, so each document contributes at most 1 per word and DF keeps the
    # same key order as before -- without a bare ``except`` for control flow.
    for w in dict.fromkeys(processed_text[i]):
        DF[w] = DF.get(w, 0) + 1

### Calculate IDF

In [10]:
# IDF[word] = natural log of (number of documents / document frequency).
IDF = {
    term: math.log(N / float(doc_count))
    for term, doc_count in DF.items()
}

### Calculate TF-IDF

In [11]:
# tfidf[doc][word] = TF[doc][word] * IDF[word]
tfidf = {}
for doc_id in range(N):
    tfidf[doc_id] = {
        term: freq * IDF[term]
        for term, freq in TF[doc_id].items()
    }

In [12]:
# Rows are document indices, columns are words (DataFrame(tfidf) has the
# documents as columns, hence the transpose).
df = pd.DataFrame(tfidf).T

### Query results

In [13]:
# Print the 10 largest TF-IDF values in the 'matthew' column together with
# their row labels (document indices).
print(df['matthew'].astype(float).nlargest(10))

877     0.110105
1523    0.108729
142     0.100145
901     0.093530
178     0.092046
944     0.086983
1818    0.082841
418     0.061228
2       0.059741
889     0.051167
Name: matthew, dtype: float64
