In [65]:
import csv
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import BernoulliNB
from sklearn import datasets,svm,tree, metrics

import numpy as np


In [37]:
def load_file(fileName):
    dataset = pd.read_table(fileName, header=0, sep=",", encoding="unicode_escape")
    
    return dataset

In [38]:
# preprocess creates the term frequency matrix for the review data set
def preprocess(data):
    count_vectorizer = CountVectorizer()
    data = count_vectorizer.fit_transform(data)
    
    #tfidf_data = TfidfTransformer(use_idf=False).fit_transform(data)

    return data

In [39]:
def learn_model(data,target):
    classifier = None
    #Split the Dataset according to the Classes
    Array = data.toarray()
    array_row,array_col = Array.shape
    Classes = list(np.unique(target))
    A_given_C = np.zeros((len(Classes),array_col))
    Probability = [0]*len(Classes)

    for index  in range(len(Classes)):
        temp = Array[target == Classes[index]]
        class_row,class_col = temp.shape
        A_given_C[index] = np.sum(temp,axis=0)
        A_given_C[index]+=1
        Sum_A_given_C = np.sum(A_given_C[index])
        A_given_C[index] = A_given_C[index]/Sum_A_given_C
        Probability[index] = class_row/array_row
    Class_Probability = pd.Series(Probability,Classes)

    classifier = (A_given_C,Class_Probability,Classes)
    
    return classifier

In [40]:
def classify(classifier, testdata):
    predicted_val=[]
    A_given_C = classifier[0]
    Class_Probability = classifier[1]
    Classes = classifier[2]
    TestArray = testdata.toarray()
    
    ArrayRow,ArrayCol = TestArray.shape
    for i in range(ArrayRow): # ArrayRow
        temp_values = [0]*len(Classes)
        for j in range(len(Classes)):
            temp_values[j] = Class_Probability[Classes[j]]*A_given_C[j][TestArray[i].astype(bool)].prod()
        temp_series = pd.Series(temp_values,Classes)
        predicted_val.append(temp_series.idxmax())
  
    return predicted_val

In [88]:
def evaluate(actual_class, predicted_class):
    accuracy = -1    
    #Your code to evaluate the model will go here. The code will print overall model's accuracy  and precision 
    #and recall for each class label.
    
    Classes = list(np.unique(actual_class))
    print("The Class Labels : ")
    print(Classes)
    ConfusionMatrix = np.zeros((len(Classes),len(Classes)))
    Series_Class = pd.Series([i for i in range(len(Classes))],Classes)
    Act_Predict_Dict  = {"Actual":actual_class,"Predicted":predicted_class}
    Act_Predict_df = pd.DataFrame(Act_Predict_Dict)
    for actual in Classes:
        for predict in Classes:
            ConfusionMatrix[Series_Class[actual]][Series_Class[predict]]=int(np.sum(np.logical_and(Act_Predict_df['Actual']==actual,Act_Predict_df['Predicted']==predict))) 
    print("The confusion matrix : ")
    print(ConfusionMatrix.astype(int))
    precision = np.sum(np.diag(ConfusionMatrix)) / np.sum(np.sum(ConfusionMatrix, axis = 0))     
    accuracy = np.sum([actual_class == predicted_class]) / len(actual_class)
    recall =np.diag(ConfusionMatrix)/ np.sum(ConfusionMatrix,axis=1)

    print("The accuracy score is :",accuracy)
    print("The precision score is :",precision)
    print("The recall score for each class label is : ",recall)
    # Refrence : https://www.youtube.com/watch?v=FAr2GmWNbT0

    

In [89]:
features = ["SUMMARY", "categories", "sub_categories"]

print("Loading data.....")
dataset = load_file("TextClassification_Data.csv")
data,target = dataset[features[0]].fillna(" "), dataset[features[1]]

print("preprocessing data.....")
word_vectors = preprocess(data)
    
trainingX,testX,trainingY,testY = train_test_split(word_vectors,target,test_size=0.4,random_state=43)

print("Learning model.....")
# classifier = BernoulliNB()
# classifier.fit(trainingX,trainingY)

model = learn_model(trainingX,trainingY)

print("Classifying test data......") 
#predictedY = classifier.predict(testX)     
predictedY = classify(model, testX)

print("Evaluating results.....")
evaluate(testY,predictedY)
# print(metrics.accuracy_score(testY,predictedY))
# print(metrics.recall_score(predictedY, testY,average = 'micro'))
# print(metrics.precision_score(predictedY, testY,average = 'micro'))
# print(metrics.f1_score(predictedY,testY,average = 'micro'))
# #print(np.mean(metrics.precision_score(predictedY, testY,average = None)))

Loading data.....
preprocessing data.....
Learning model.....
Classifying test data......
Evaluating results.....
The Class Labels : 
['APPOINTMENTS', 'ASK_A_DOCTOR', 'JUNK', 'LAB', 'MISCELLANEOUS', 'PRESCRIPTION']
The confusion matrix : 
[[4196  421    0   72  478  442]
 [ 320 2912    0  100  631  678]
 [   0    0    0    0    9    1]
 [ 115  135    0 1085  289  105]
 [ 343  425    0  102 3044  969]
 [ 158  394    0  105  471 4912]]
The accuracy score is : 0.7048271648044693
The precision score is : 0.7048271648044693
The recall score for each class label is :  [0.74808344 0.62745098 0.         0.62753036 0.62338726 0.81324503]
