In [2]:
import zipfile
import pandas as pd
import numpy as np
import utils
from sklearn.metrics import f1_score
from collections import Counter

# Reading data

In [192]:
test_data = []

In [193]:
with zipfile.ZipFile("test.zip", 'r') as z:
    for file_info in z.infolist():
        file_name = file_info.filename
        parts = file_name.strip().split('/')
        if parts[0] == 'test' and parts[-1] != '':
            category = parts[1]
            file_name_number = parts[2]
            with z.open(file_name) as file:
                content = file.read().decode('utf-8', errors='replace')
                test_data.append({'content': content, 'category': category, 'file_name': file_name_number})


In [194]:
test_data = pd.DataFrame(test_data)
test_data

Unnamed: 0,content,category,file_name
0,\n\r\nTIN TRADERS' RESPONSE MUTED TO KL FUTURE...,tin,0011164
1,\n\r\nITC CONTESTS USE OF DOCUMENTS AS COURT E...,tin,0011649
2,\n\r\nSPOT TIN EASIER ON EUROPEAN FREE MARKET\...,tin,0010147
3,\n\r\nTHAI SMELTER FACES TIN CONCENTRATE SUPPL...,tin,0011710
4,\n\r\nTIN COUNCIL ALLOWED APPEAL ON USE OF DOC...,tin,0012316
...,...,...,...
4019,\n\r\nU.K. RESERVES RISE UNDERLYING 4.8 BILLIO...,reserves,0011269
4020,\n\r\nTAIWAN FOREIGN EXCHANGE RESERVES HIT REC...,reserves,0012215
4021,\n\r\n German net currency reserves rise 500 m...,reserves,0012152
4022,\n\r\nSPAIN'S FOREIGN RESERVES RISE IN FEBRUAR...,reserves,0010235


# Splitting data
I will splitting data to train, dev, and test sets, ensuring all have same distribution. 

In [7]:
# splitting occuring an error:
# ValueError: The least populated class in y has only 1 member, which is too few. The minimum number of groups for any class cannot be less than 2.
# So I will duplicate the single instance
data = utils.sampling_data(data)

In [8]:
data.shape

(11420, 3)

In [9]:
X = data[['content', 'file_name']]
y = data['category']

In [10]:
X_train, X_test, y_train, y_test, train_data, test_data = utils.split_create_dfs(X, y, dist=True)

Training set size: 9136
Testing set size: 2284
Training set class distribution:
 earn          0.251861
unknown       0.160245
acq           0.144483
money-fx      0.047067
grain         0.037872
                ...   
palladium     0.000219
nzdlr         0.000219
palmkernel    0.000219
sun-meal      0.000219
naphtha       0.000219
Name: category, Length: 91, dtype: float64
Testing set class distribution:
 earn           0.252189
unknown        0.160245
acq            0.144483
money-fx       0.047285
grain          0.038091
                 ...   
l-cattle       0.000438
coconut-oil    0.000438
platinum       0.000438
propane        0.000438
cpu            0.000438
Name: category, Length: 77, dtype: float64


In [11]:
utils.display_dist_random_class(data['category'], train_data['category'], test_data['category'])

             Train      Test
earn      0.392595  0.392638
unknown   0.249787  0.249489
acq       0.225218  0.224949
money-fx  0.073366  0.073620
grain     0.059034  0.059305


I will contonue with training dataset

# Preprocessing

In [12]:
train_data

Unnamed: 0,content,file_name,category
228,\n\r\nFED DATA INDICATE NO POLICY CHANGE LIKEL...,0003194,interest
5020,\n\r\nVOLCKER URGES GREATER EXCHANGE RATE STAB...,0009437,money-fx
7126,\n SAY NEW DOW COMPONENTS AID AVERAGE\r\n\n ...,0002741,unknown
6823,"\n\r\nEGYPT REJECTS VEG OIL OFFERS, TO RETENDE...",0004459,sun-oil
6785,\n\r\nU.S. SAID PROMISED BULK OF MAIZE EXPORT ...,0008512,corn
...,...,...,...
2530,\n\r\nHOLLY SUGAR CORP <HLY> SETS REGULAR DIVI...,0001841,earn
1690,\n\r\nOREGON LUMBER COMPANY TO SELL WOOD TO IR...,0002719,lumber
6752,\n\r\nUSSR CORN BUYING MAY BE 3.5 MLN TONNES--...,0004709,corn
5849,\n\r\nMORE HEAVY RAINS IN ARGENTINE GRAIN AREA...,0002756,grain


In [13]:
# remove any escape code and convert file name to number
train_data['content'], train_data['file_name'] = utils.reorganize_data(train_data['content'], train_data['file_name'])

In [14]:
print(train_data.content[7111])
print(type(train_data.file_name[7111]))

CORP <HIA> TO REDEEM TWO BOND ISSUES NEW YORK, March 2 - Holiday Corp said it would redeem on March 11 all outstanding 9-1/2 pct first mortgage bonds due 1995 of its Holiday Inns Inc unit and all 9-1/2 pct first mortgage bonds, Series A, due 1996 of its Harrah's subsidiary. The Holiday bonds will be bought back at 101.6 pct of the bonds' principal amount plus accrued interest, or 1,038.69 dlrs per 1,000 dlr face amount. Holiday will redeem the Harrah's bonds at 104.5 pct of the principal amount plus accrued interest, or 1,079.31 dlrs per 1,000 dlr face amount.
<class 'numpy.int32'>


## Lowercasing

In [15]:
train_data['lowercasing'] = train_data['content'].apply(utils.lowercase_text)
train_data['lowercasing']

228     fed data indicate no policy change likely <aut...
5020    volcker urges greater exchange rate stability ...
7126    say new dow components aid average new york, m...
6823    egypt rejects veg oil offers, to retender lond...
6785    u.s. said promised bulk of maize export to spa...
                              ...                        
2530    holly sugar corp <hly> sets regular dividend c...
1690    oregon lumber company to sell wood to iraq por...
6752    ussr corn buying may be 3.5 mln tonnes--amstut...
5849    more heavy rains in argentine grain areas <aut...
309     fed expected to add reserves new york, march 2...
Name: lowercasing, Length: 9136, dtype: object

## Tokenize text

In [16]:
train_data['tokenized'] = train_data['lowercasing'].apply(utils.tokenize_text)

In [17]:
train_data['tokenized']

228     [fed, data, indicate, no, policy, change, like...
5020    [volcker, urges, greater, exchange, rate, stab...
7126    [say, new, dow, components, aid, average, new,...
6823    [egypt, rejects, veg, oil, offers, ,, to, rete...
6785    [u.s., said, promised, bulk, of, maize, export...
                              ...                        
2530    [holly, sugar, corp, <, hly, >, sets, regular,...
1690    [oregon, lumber, company, to, sell, wood, to, ...
6752    [ussr, corn, buying, may, be, 3.5, mln, tonnes...
5849    [more, heavy, rains, in, argentine, grain, are...
309     [fed, expected, to, add, reserves, new, york, ...
Name: tokenized, Length: 9136, dtype: object

In [18]:
train_data['clean_special'] = train_data['tokenized'].apply(utils.remove_special_chars)
train_data['clean_special']

228     [fed, data, indicate, no, policy, change, like...
5020    [volcker, urges, greater, exchange, rate, stab...
7126    [say, new, dow, components, aid, average, new,...
6823    [egypt, rejects, veg, oil, offers, , to, reten...
6785    [u.s., said, promised, bulk, of, maize, export...
                              ...                        
2530    [holly, sugar, corp, , hly, , sets, regular, d...
1690    [oregon, lumber, company, to, sell, wood, to, ...
6752    [ussr, corn, buying, may, be, 3.5, mln, tonnes...
5849    [more, heavy, rains, in, argentine, grain, are...
309     [fed, expected, to, add, reserves, new, york, ...
Name: clean_special, Length: 9136, dtype: object

## Stop Words Removal

In [19]:
train_data['clean_stopwords'] = train_data['clean_special'].apply(utils.remove_stopword)

In [20]:
train_data['clean_stopwords']

228     [fed, data, indicate, policy, change, likely, ...
5020    [volcker, urges, greater, exchange, rate, stab...
7126    [say, new, dow, components, aid, average, new,...
6823    [egypt, rejects, veg, oil, offers, , retender,...
6785    [u.s., said, promised, bulk, maize, export, sp...
                              ...                        
2530    [holly, sugar, corp, , hly, , sets, regular, d...
1690    [oregon, lumber, company, sell, wood, iraq, po...
6752    [ussr, corn, buying, may, 3.5, mln, tonnes, , ...
5849    [heavy, rains, argentine, grain, areas, , auth...
309     [fed, expected, add, reserves, new, york, , ma...
Name: clean_stopwords, Length: 9136, dtype: object

## Lemmatization

In [21]:
train_data['lemmatized'] = train_data['clean_stopwords'].apply(utils.lemmatize_sentence)

In [22]:
train_data['lemmatized']

228     [fed, data, indicate, policy, change, likely, ...
5020    [volcker, urge, greater, exchange, rate, stabi...
7126    [say, new, dow, component, aid, average, new, ...
6823    [egypt, reject, veg, oil, offer, , retender, l...
6785    [u.s., said, promised, bulk, maize, export, sp...
                              ...                        
2530    [holly, sugar, corp, , hly, , set, regular, di...
1690    [oregon, lumber, company, sell, wood, iraq, po...
6752    [ussr, corn, buying, may, 3.5, mln, tonne, , a...
5849    [heavy, rain, argentine, grain, area, , author...
309     [fed, expected, add, reserve, new, york, , mar...
Name: lemmatized, Length: 9136, dtype: object

## Remove empty elements

In [23]:
train_data['content_ready'] = train_data['lemmatized'].apply(utils.remove_extra_spaces)
train_data['content_ready']

228     [fed, data, indicate, policy, change, likely, ...
5020    [volcker, urge, greater, exchange, rate, stabi...
7126    [say, new, dow, component, aid, average, new, ...
6823    [egypt, reject, veg, oil, offer, retender, lon...
6785    [u.s., said, promised, bulk, maize, export, sp...
                              ...                        
2530    [holly, sugar, corp, hly, set, regular, divide...
1690    [oregon, lumber, company, sell, wood, iraq, po...
6752    [ussr, corn, buying, may, 3.5, mln, tonne, ams...
5849    [heavy, rain, argentine, grain, area, author, ...
309     [fed, expected, add, reserve, new, york, march...
Name: content_ready, Length: 9136, dtype: object

## Remove periods elements

In [24]:
train_data['content_ready'] = train_data['content_ready'].apply(utils.remove_periods)

# Vocabulary Creation

In [25]:
vocabulary = [word for sublist in train_data['content_ready'] for word in sublist]
print(f"Samples: {vocabulary[:15]}\nlength of vocabulary: {len(vocabulary)}")

Samples: ['fed', 'data', 'indicate', 'policy', 'change', 'likely', 'author', 'martin', 'cherrin', 'reuters', 'author', 'new', 'york', 'march', '12']
length of vocabulary: 910916


In [26]:
# Remove duplicates words
vocabulary = list(set(vocabulary))

In [27]:
print(f"Samples: {vocabulary[:15]}\nlength of vocabulary: {len(vocabulary)}")

Samples: ['boone', 'littleexplored', 'beaverton', 'mallet', '13.8', 'oda', 'refinance', '2393622', 'goldstonmorris', 'freetrading', '7072000', 'creditworthiness', 'profound', '35.1', 'stranded']
length of vocabulary: 39400


# Transforming data into numerical features using Bag-of-words

In [28]:
bow = utils.create_frequency_table(train_data['content_ready'], vocabulary)

In [29]:
# example of words occures in the first document 
bow.iloc[0][bow.iloc[0] > 0] 

one           1
recent        1
stress        1
grew          1
argue         1
             ..
raise         1
santow        1
key           1
commenting    1
rate          6
Name: 0, Length: 231, dtype: int64

# Naive Bayes Algorithm

## Calculate Prior Probabilities
 For each class, calculate the prior probability of the class, which is the number of documents in the class divided by the total number of documents.

In [30]:
# add category class to each document
bow.insert(len(bow.columns), "class_category", train_data['category'].values)

bow

Unnamed: 0,boone,littleexplored,beaverton,mallet,13.8,oda,refinance,2393622,goldstonmorris,freetrading,...,serviced,ropak,summitville,foot,manganese,renfrew,nashimoto,632100,converting,class_category
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,interest
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,money-fx
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,unknown
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,sun-oil
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,corn
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9131,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,earn
9132,0,0,0,0,0,0,0,0,0,0,...,0,0,0,2,0,0,0,0,0,lumber
9133,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,corn
9134,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,grain


In [53]:
log_prior, big_doc = utils.calculate_prior_and_bigdoc(train_data['content_ready'], train_data['category'])

## Calculate Likelihood with Laplace Smoothing
 For each word in your vocabulary, calculate the likelihood of the word given each class. 

In [82]:
# Separate features and target
freq_words = bow.drop('class_category', axis=1)
categories = bow['class_category']

In [146]:
def estimate_likelihood(texts, labels, vocabulary):
    # Initialize counters for each class
    documents = [' '.join(doc) for doc in texts]

    word_counts_per_class = {label: Counter() for label in set(labels)}
    total_words_per_class = {label: 0 for label in set(labels)}
    
    # Count words in each class
    for text, label in zip(documents, labels):
        counts = Counter(text.split())  # Assuming text is preprocessed
        word_counts_per_class[label] += counts
        total_words_per_class[label] += sum(counts.values())
    
    # Number of features (unique words in vocabulary)
    num_features = len(vocabulary)

    # Calculate likelihood with Laplace smoothing
    likelihood = {
        label: {word: np.log((word_counts_per_class[label][word] + 1) / (total_words_per_class[label] + num_features))
                for word in vocabulary} for label in word_counts_per_class
    }

    return likelihood

likelihood = estimate_likelihood(train_data['content_ready'], train_data['category'], vocabulary)


In [147]:
likelihood

{'housing': {'boone': -10.666463921439238,
  'littleexplored': -10.666463921439238,
  'beaverton': -10.666463921439238,
  'mallet': -10.666463921439238,
  '13.8': -10.666463921439238,
  'oda': -10.666463921439238,
  'refinance': -10.666463921439238,
  '2393622': -10.666463921439238,
  'goldstonmorris': -10.666463921439238,
  'freetrading': -10.666463921439238,
  '7072000': -10.666463921439238,
  'creditworthiness': -10.666463921439238,
  'profound': -10.666463921439238,
  '35.1': -10.666463921439238,
  'stranded': -10.666463921439238,
  'mcgill': -10.666463921439238,
  '596000': -10.666463921439238,
  'chinkuli': -10.666463921439238,
  '9.1250': -10.666463921439238,
  '120150': -10.666463921439238,
  '160000': -10.666463921439238,
  '1637592': -10.666463921439238,
  'vahid': -10.666463921439238,
  '49.3': -10.666463921439238,
  'chavin': -10.666463921439238,
  '2.78': -10.666463921439238,
  '50.4': -10.666463921439238,
  'consolidate': -10.666463921439238,
  '100': -10.666463921439238,

# testing and evaluating

In [195]:
test_data['content'], test_data['file_name'] = utils.reorganize_data(test_data['content'], test_data['file_name'])
test_data['lowercasing'] = test_data['content'].apply(utils.lowercase_text)
test_data['tokenized'] = test_data['lowercasing'].apply(utils.tokenize_text)
test_data['clean_special'] = test_data['tokenized'].apply(utils.remove_special_chars)
test_data['clean_stopwords'] = test_data['clean_special'].apply(utils.remove_stopword)
test_data['lemmatized'] = test_data['clean_stopwords'].apply(utils.lemmatize_sentence)
test_data['content_ready'] = test_data['lemmatized'].apply(utils.remove_extra_spaces).apply(utils.remove_periods)

In [196]:
test_data

Unnamed: 0,content,category,file_name,lowercasing,tokenized,clean_special,clean_stopwords,lemmatized,content_ready
0,TIN TRADERS' RESPONSE MUTED TO KL FUTURES MARK...,tin,11164,tin traders' response muted to kl futures mark...,"[tin, traders, ', response, muted, to, kl, fut...","[tin, traders, , response, muted, to, kl, futu...","[tin, traders, , response, muted, kl, futures,...","[tin, trader, , response, muted, kl, future, m...","[tin, trader, response, muted, kl, future, mar..."
1,ITC CONTESTS USE OF DOCUMENTS AS COURT EVIDENC...,tin,11649,itc contests use of documents as court evidenc...,"[itc, contests, use, of, documents, as, court,...","[itc, contests, use, of, documents, as, court,...","[itc, contests, use, documents, court, evidenc...","[itc, contest, use, document, court, evidence,...","[itc, contest, use, document, court, evidence,..."
2,SPOT TIN EASIER ON EUROPEAN FREE MARKET LONDON...,tin,10147,spot tin easier on european free market london...,"[spot, tin, easier, on, european, free, market...","[spot, tin, easier, on, european, free, market...","[spot, tin, easier, european, free, market, lo...","[spot, tin, easier, european, free, market, lo...","[spot, tin, easier, european, free, market, lo..."
3,THAI SMELTER FACES TIN CONCENTRATE SUPPLY SHOR...,tin,11710,thai smelter faces tin concentrate supply shor...,"[thai, smelter, faces, tin, concentrate, suppl...","[thai, smelter, faces, tin, concentrate, suppl...","[thai, smelter, faces, tin, concentrate, suppl...","[thai, smelter, face, tin, concentrate, supply...","[thai, smelter, face, tin, concentrate, supply..."
4,TIN COUNCIL ALLOWED APPEAL ON USE OF DOCUMENTS...,tin,12316,tin council allowed appeal on use of documents...,"[tin, council, allowed, appeal, on, use, of, d...","[tin, council, allowed, appeal, on, use, of, d...","[tin, council, allowed, appeal, use, documents...","[tin, council, allowed, appeal, use, document,...","[tin, council, allowed, appeal, use, document,..."
...,...,...,...,...,...,...,...,...,...
4019,U.K. RESERVES RISE UNDERLYING 4.8 BILLION DLRS...,reserves,11269,u.k. reserves rise underlying 4.8 billion dlrs...,"[u.k., reserves, rise, underlying, 4.8, billio...","[u.k., reserves, rise, underlying, 4.8, billio...","[u.k., reserves, rise, underlying, 4.8, billio...","[u.k., reserve, rise, underlying, 4.8, billion...","[u.k., reserve, rise, underlying, 4.8, billion..."
4020,TAIWAN FOREIGN EXCHANGE RESERVES HIT RECORD HI...,reserves,12215,taiwan foreign exchange reserves hit record hi...,"[taiwan, foreign, exchange, reserves, hit, rec...","[taiwan, foreign, exchange, reserves, hit, rec...","[taiwan, foreign, exchange, reserves, hit, rec...","[taiwan, foreign, exchange, reserve, hit, reco...","[taiwan, foreign, exchange, reserve, hit, reco..."
4021,German net currency reserves rise 500 mln mark...,reserves,12152,german net currency reserves rise 500 mln mark...,"[german, net, currency, reserves, rise, 500, m...","[german, net, currency, reserves, rise, 500, m...","[german, net, currency, reserves, rise, 500, m...","[german, net, currency, reserve, rise, 500, ml...","[german, net, currency, reserve, rise, 500, ml..."
4022,SPAIN'S FOREIGN RESERVES RISE IN FEBRUARY MADR...,reserves,10235,spain's foreign reserves rise in february madr...,"[spain, 's, foreign, reserves, rise, in, febru...","[spain, s, foreign, reserves, rise, in, februa...","[spain, foreign, reserves, rise, february, mad...","[spain, foreign, reserve, rise, february, madr...","[spain, foreign, reserve, rise, february, madr..."


In [197]:
predicted_class = test_data['content_ready'].apply(
    lambda doc: classify_new_document_optimized(doc, vocabulary, log_prior, likelihood)
)

In [199]:
test_data['predicted_class'] = predicted_class

In [200]:
true_labels = test_data['category'].tolist() 
predicted_labels = test_data['predicted_class'].tolist()

# Calculate the F1 score
f1 = f1_score(true_labels, predicted_labels, average='macro')

print(f"F1 Score: {f1}")

F1 Score: 0.18013473398264598
