<a href="https://colab.research.google.com/github/51stDimension/AIML/blob/main/Experiments/TextClassification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [8]:
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn import metrics
import pandas as pd
import re
import numpy as np
import math

In [9]:
def labelPredictions(Y):
    labels = {}
    total = len(Y)
    
    for label in Y:            
        if label in labels:
            labels[label] += 1
        else:
            labels[label] = 1
    for i in labels:
        val = labels[i]
        labels[i] = val / total;

    return labels

def split_words(sentence):
    words = re.sub("[^\w]", " ",  sentence).split()
    words = list(map(lambda x:x.lower(),words))
    return words

def split_words_unique(sentence):
    words = split_words(sentence)
    _words = {}
    for w in words:
        if w not in _words:
            _words[w] = 1
        else:
            _words[w] += 1
    return _words

def calculateVocabulary(X):
    amount = 0
    stack = []

    for sentence in X:
        words = split_words(sentence)
        

        for w in words:
            if w not in stack:
                stack.append(w)
                amount += 1

    return amount

def determineWordsCount(X):
    count = 0

    for sentence in X:
        words = split_words(sentence)

        for w in words:
            count += 1

    return count

def getWordCountInClass(payload,word,c):
    df = dataFrameForClass(payload,c)

    sentences = df[payload['f_text']]

    count = 0

    for sentence in sentences:
        words = split_words(sentence)

        for w in words:
            if w == word:
                count += 1

    return count

def dataFrameForClass(payload,c):
    return payload['X'].loc[payload['X'][payload['f_label']] == c]

def getWordsCount(payload,c):
    df = dataFrameForClass(payload,c)

    return determineWordsCount(df[payload['f_text']])

def fit(X,Y,f_text = 'text',f_label = 'label'):

    payload = {};
    
    payload['classes'] = set(Y)
    payload['predictions'] = labelPredictions(Y)
    payload['vocabulary'] = calculateVocabulary(X[f_text])
    payload['X'] = X
    payload['Y'] = Y
    payload['f_text'] = f_text
    payload['f_label'] = f_label

    return payload

def predict(payload,text):
    words = split_words_unique(text)
    m_estimate = {}
    for c in payload['classes']:
        n = getWordsCount(payload,c)

        m_estimate[c] = {}

        for word in words:
            force = words[word]

            n_c = getWordCountInClass(payload,word,c)

            _estimate = (n_c + 1) / (n + payload['vocabulary'])
            
            _estimate = math.pow(_estimate,force)

            m_estimate[c][word] = _estimate

    tags = {}
    
    for c in payload['predictions']:
        p = payload['predictions'][c]

        m = np.prod(list(m_estimate[c].values()))

        final = m * p

        tags[c] = final

        
    return max(tags,key= lambda x: tags[x])

In [10]:
df = pd.read_csv('https://raw.githubusercontent.com/51stDimension/AIML/main/Data/TextClassification.csv')
df.head()

Unnamed: 0,label,text
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [20]:
from sklearn.model_selection import train_test_split
train, test = train_test_split(df,test_size=0.008)
payload = fit(train,train['label'])

In [21]:
test.shape

(45, 2)

In [35]:
count=0
train_arr =[]
test_arr=[]
for ind in test.index:
  result = predict(payload,test['text'][ind])
  train_arr.append(result)
  test_arr.append(test['label'][ind])
  # y_test = y_test.append({'label': test['label'][ind]}, ignore_index=True)
  count=count+1
  print(count,result,test['label'][ind])


1 ham ham
2 ham ham
3 ham ham
4 spam ham
5 ham ham
6 ham ham
7 ham ham
8 ham ham
9 ham ham
10 ham ham
11 ham ham
12 ham ham
13 spam spam
14 ham ham
15 ham ham
16 ham ham
17 spam spam
18 ham ham
19 spam spam
20 spam spam
21 ham ham
22 ham ham
23 ham ham
24 ham ham
25 ham ham
26 ham spam
27 ham ham
28 ham ham
29 spam spam
30 ham ham
31 ham ham
32 ham ham
33 spam spam
34 ham ham
35 ham ham
36 ham ham
37 spam spam
38 ham ham
39 ham ham
40 ham ham
41 spam spam
42 ham ham
43 ham ham
44 ham ham
45 ham ham


In [36]:
y_pred = pd.DataFrame({'label':train_arr})
y_test = pd.DataFrame({'label':test_arr})


In [37]:
metrics.accuracy_score(y_test,y_pred)

0.9555555555555556

In [38]:
print(metrics.classification_report(y_test,y_pred))

              precision    recall  f1-score   support

         ham       0.97      0.97      0.97        36
        spam       0.89      0.89      0.89         9

    accuracy                           0.96        45
   macro avg       0.93      0.93      0.93        45
weighted avg       0.96      0.96      0.96        45



In [39]:
metrics.confusion_matrix(y_test,y_pred)

array([[35,  1],
       [ 1,  8]])