<a href="https://colab.research.google.com/github/onertartan/NLP-Reuters-CountVectorizer/blob/main/Reuters_CountVectorizer.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>


### Reuters-21578 benchmark corpus <br>
Abstract: This is a collection of documents that appeared on Reuters newswire in 1987. The documents were assembled and indexed with categories.
https://www.kaggle.com/datasets/nltkdata/reuters


### 1.GET and EXPLORE DATA

In [1]:
# Check data directory
import os
reuters_folder = "drive/MyDrive/Data/reuters"
os.listdir(reuters_folder)

['README', 'cats.txt', 'stopwords', 'test', 'training']

In [2]:
#  Number of files in training and test folders
training_folder = os.listdir(os.path.join(reuters_folder,"training"))
test_folder =os.listdir(os.path.join(reuters_folder,"test"))
print("Number of files in training folder:",len(training_folder) )
print("Number of files in test folder:",len(test_folder) )

Number of files in training folder: 7769
Number of files in test folder: 3019


In [3]:
# Contents of training folder
os.listdir(os.path.join(reuters_folder,"training"))[:5]

['8213', '7901', '722', '9515', '8986']

In [4]:
# Contents of cats.txt file. 
# Rows have the format   label_type/file_id category1 category2 ...
with open(os.path.join(reuters_folder,"cats.txt") ) as f:
    for i in range(5):
        print(f.readline(),end="")

test/14826 trade
test/14828 grain
test/14829 nat-gas crude
test/14832 rubber tin sugar corn rice grain trade
test/14833 palm-oil veg-oil


### 2-PREPROCESS DATA

#### 2.1Read and parse **cats.txt** as dataframe 

In [5]:
import pandas as pd
df = pd.read_csv(os.path.join(reuters_folder,"cats.txt"),header=None,sep="/")
df.head()

Unnamed: 0,0,1
0,test,14826 trade
1,test,14828 grain
2,test,14829 nat-gas crude
3,test,14832 rubber tin sugar corn rice grain trade
4,test,14833 palm-oil veg-oil


In [6]:
# Split file_id and categories, rename  columns
df["file_id"] = df[1].apply(lambda x: x.strip().split()[0] )
df["categories"] = df[1].apply(lambda x: x.strip().split()[1:])
df.drop(1,axis=1,inplace=True)
df.head()

Unnamed: 0,0,file_id,categories
0,test,14826,[trade]
1,test,14828,[grain]
2,test,14829,"[nat-gas, crude]"
3,test,14832,"[rubber, tin, sugar, corn, rice, grain, trade]"
4,test,14833,"[palm-oil, veg-oil]"


In [7]:

df.rename(columns={0: "label_type"},inplace = True)
df.head()


Unnamed: 0,label_type,file_id,categories
0,test,14826,[trade]
1,test,14828,[grain]
2,test,14829,"[nat-gas, crude]"
3,test,14832,"[rubber, tin, sugar, corn, rice, grain, trade]"
4,test,14833,"[palm-oil, veg-oil]"


In [8]:
df.tail()

Unnamed: 0,label_type,file_id,categories
10783,training,14779,"[money-fx, dlr]"
10784,training,14783,[rubber]
10785,training,14785,[money-fx]
10786,training,14805,[copper]
10787,training,14818,[ship]


In [9]:
# Encode categories as integers
#from sklearn.preprocessing import LabelEncoder
#label_encoder = LabelEncoder()
categories_set = set()
for categories in df["categories"]:
  categories_set.update(categories)
print(categories_set)
print("Number of labels:",len(categories_set))

{'ipi', 'yen', 'heat', 'copra-cake', 'money-fx', 'coconut-oil', 'cpi', 'grain', 'cotton', 'earn', 'rye', 'nkr', 'cpu', 'housing', 'sunseed', 'lumber', 'jet', 'gold', 'livestock', 'rubber', 'income', 'strategic-metal', 'sorghum', 'acq', 'zinc', 'crude', 'tin', 'nat-gas', 'trade', 'meal-feed', 'castor-oil', 'rapeseed', 'money-supply', 'silver', 'dfl', 'alum', 'fuel', 'nickel', 'palm-oil', 'lin-oil', 'soybean', 'palladium', 'instal-debt', 'corn', 'copper', 'bop', 'nzdlr', 'sun-oil', 'wheat', 'dmk', 'jobs', 'naphtha', 'orange', 'groundnut', 'pet-chem', 'iron-steel', 'gas', 'hog', 'platinum', 'cocoa', 'sugar', 'coffee', 'cotton-oil', 'barley', 'ship', 'oilseed', 'l-cattle', 'gnp', 'palmkernel', 'rape-oil', 'interest', 'reserves', 'veg-oil', 'lei', 'potato', 'carcass', 'retail', 'sun-meal', 'dlr', 'groundnut-oil', 'tea', 'rand', 'soy-oil', 'oat', 'wpi', 'soy-meal', 'lead', 'propane', 'coconut', 'rice'}
Number of labels: 90


#### 2.2 Create a column for text files and read texts according to file_id column

In [10]:
def read_text(label_type,file_id):
    file_path = os.path.join(reuters_folder,label_type,file_id)
    with open(file_path,encoding="latin-1") as f:
        return f.read()
df["text"] = df.apply(lambda row: read_text(row["label_type"],row["file_id"]),axis=1 )
df.head()


Unnamed: 0,label_type,file_id,categories,text
0,test,14826,[trade],ASIAN EXPORTERS FEAR DAMAGE FROM U.S.-JAPAN RI...
1,test,14828,[grain],CHINA DAILY SAYS VERMIN EAT 7-12 PCT GRAIN STO...
2,test,14829,"[nat-gas, crude]",JAPAN TO REVISE LONG-TERM ENERGY DEMAND DOWNWA...
3,test,14832,"[rubber, tin, sugar, corn, rice, grain, trade]",THAI TRADE DEFICIT WIDENS IN FIRST QUARTER\n ...
4,test,14833,"[palm-oil, veg-oil]",INDONESIA SEES CPO PRICE RISING SHARPLY\n Ind...


#### 2.3 Fit the count vectorizer to texts

In [11]:
from sklearn.feature_extraction.text import CountVectorizer
count_vectorizer = CountVectorizer()
count_vectorizer.fit(df["text"])

CountVectorizer()

In [12]:
from sklearn.preprocessing import MultiLabelBinarizer
binarizer = MultiLabelBinarizer()
binarizer.fit(df["categories"])
binarizer.classes_

array(['acq', 'alum', 'barley', 'bop', 'carcass', 'castor-oil', 'cocoa',
       'coconut', 'coconut-oil', 'coffee', 'copper', 'copra-cake', 'corn',
       'cotton', 'cotton-oil', 'cpi', 'cpu', 'crude', 'dfl', 'dlr', 'dmk',
       'earn', 'fuel', 'gas', 'gnp', 'gold', 'grain', 'groundnut',
       'groundnut-oil', 'heat', 'hog', 'housing', 'income', 'instal-debt',
       'interest', 'ipi', 'iron-steel', 'jet', 'jobs', 'l-cattle', 'lead',
       'lei', 'lin-oil', 'livestock', 'lumber', 'meal-feed', 'money-fx',
       'money-supply', 'naphtha', 'nat-gas', 'nickel', 'nkr', 'nzdlr',
       'oat', 'oilseed', 'orange', 'palladium', 'palm-oil', 'palmkernel',
       'pet-chem', 'platinum', 'potato', 'propane', 'rand', 'rape-oil',
       'rapeseed', 'reserves', 'retail', 'rice', 'rubber', 'rye', 'ship',
       'silver', 'sorghum', 'soy-meal', 'soy-oil', 'soybean',
       'strategic-metal', 'sugar', 'sun-meal', 'sun-oil', 'sunseed',
       'tea', 'tin', 'trade', 'veg-oil', 'wheat', 'wpi', 'yen', '

### 3. SPLIT DATA and CREATE DATA GENERATORS

In [13]:
df_train = df[df["label_type"]=="training"]
df_test= df[df["label_type"]=="test"]


In [14]:
from sklearn.model_selection import train_test_split
df_val, df_test = train_test_split(df_test, test_size=0.5)
df_train.shape, df_test.shape, df_val.shape

((7769, 4), (1510, 4), (1509, 4))

In [17]:
import numpy as np
def data_generator(df,  batch_size, steps_per_epoch,shuffle):
    indices =list(range(steps_per_epoch ))
    while True:
       if shuffle:
           np.random.shuffle(indices)
       for step in range(steps_per_epoch):
            start_batch = indices[step]
            x= count_vectorizer.transform(df.iloc[start_batch*batch_size:(start_batch+1)*batch_size]["text"]).todense()  
            y = binarizer.transform(df.iloc[start_batch*batch_size:(start_batch+1)*batch_size]["categories"].to_numpy())
            yield x,y

In [18]:
NUM_EPOCHS=5
BATCH_SIZE= 32
STEPS_PER_EPOCH = len(df_train) // BATCH_SIZE
VALIDATION_STEPS= len(df_val) // BATCH_SIZE
STEPS = len(df_test)//BATCH_SIZE

train_gen =data_generator(df_train, BATCH_SIZE, STEPS_PER_EPOCH, shuffle=True)
val_gen =data_generator(df_val, BATCH_SIZE, VALIDATION_STEPS, shuffle = False)
test_gen =data_generator(df_test, BATCH_SIZE, VALIDATION_STEPS, shuffle = False)

### 4.CREATE, COMPILE and FIT MODEL

In [19]:
import tensorflow as tf
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Dense
model = Sequential([Dense(64,activation="relu"),
                   Dense(64,activation="relu"),
                   Dense(len(categories_set)) ])

model.compile(optimizer=tf.keras.optimizers.Adam(), loss = tf.keras.losses.BinaryCrossentropy(from_logits=True),metrics = ["binary_accuracy"])

history = model.fit(train_gen, epochs= NUM_EPOCHS, steps_per_epoch =STEPS_PER_EPOCH,validation_data= val_gen,validation_steps =VALIDATION_STEPS, verbose=2  )

Epoch 1/5
242/242 - 6s - loss: 0.1201 - binary_accuracy: 0.9868 - val_loss: 0.0364 - val_binary_accuracy: 0.9931 - 6s/epoch - 24ms/step
Epoch 2/5
242/242 - 5s - loss: 0.0224 - binary_accuracy: 0.9939 - val_loss: 0.0225 - val_binary_accuracy: 0.9946 - 5s/epoch - 21ms/step
Epoch 3/5
242/242 - 5s - loss: 0.0120 - binary_accuracy: 0.9960 - val_loss: 0.0186 - val_binary_accuracy: 0.9953 - 5s/epoch - 20ms/step
Epoch 4/5
242/242 - 5s - loss: 0.0072 - binary_accuracy: 0.9974 - val_loss: 0.0171 - val_binary_accuracy: 0.9957 - 5s/epoch - 20ms/step
Epoch 5/5
242/242 - 5s - loss: 0.0048 - binary_accuracy: 0.9982 - val_loss: 0.0186 - val_binary_accuracy: 0.9958 - 5s/epoch - 21ms/step


### 5.EVALUATE and TEST WITH SAMPLE TEXT

In [21]:
#Evaluate the model using test_gen
model.evaluate(test_gen,steps=STEPS)



[0.02354593388736248, 0.995161235332489]

In [74]:
#Test with sample texts
random_index= np.random.randint(len(df_test))
x_sample_text= df_test.iloc[random_index:random_index+1]["text"]
x_sample=count_vectorizer.transform(x_sample_text).todense()
x_sample.shape

(1, 30916)

In [75]:
# Predict 
threshold = .5 # can be tuned according to confusion matrix
y_predict = model(x_sample)
y_predict = tf.keras.activations.sigmoid(y_predict)
y_predict=tf.where(y_predict>threshold, 1,0)
y_predict # tensor with shape (num_samples, num_categories)

<tf.Tensor: shape=(1, 90), dtype=int32, numpy=
array([[0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0,
        0, 0]], dtype=int32)>

In [76]:
# Convert indices to label names
binarizer.inverse_transform(y_predict.numpy())

[('bop', 'trade')]

In [78]:
#Check true label names
df_test.iloc[random_index:random_index+1]["categories"]

2919    [trade, bop]
Name: categories, dtype: object