In [184]:
import csv
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.model_selection import train_test_split


In [185]:
def load_file(fileName):
    dataset = pd.read_csv(fileName, header=0, sep=",", encoding="unicode_escape")
    return dataset
  
dataPath = 'data/TextClassification_Data.csv'
data = load_file(dataPath)
#data.head(5)

In [186]:
# preprocess creates the term frequency matrix for the review data set
def preprocess(data):
    count_vectorizer = CountVectorizer()
    data = count_vectorizer.fit_transform(data)
    
    #tfidf_data = TfidfTransformer(use_idf=False).fit_transform(data)
    #print(data[0].toarray())
    #print(count_vectorizer.vocabulary_[i[0][1]])
    return data, count_vectorizer

In [187]:
'''
print(data.toarray().shape)
for i in range(data.shape[0]):
    print(np.where(data[i].toarray() > 0)[1])
    print(data[i].toarray()[np.where(data[i].toarray() > 0)[1]])
    break;
'''

'\nprint(data.toarray().shape)\nfor i in range(data.shape[0]):\n    print(np.where(data[i].toarray() > 0)[1])\n    print(data[i].toarray()[np.where(data[i].toarray() > 0)[1]])\n    break;\n'

In [188]:
def learn_model(data, target):
    #Your custom implementation of NaiveBayes classifier will go here.
    classes, counts = np.unique(np.array(target), return_counts=True)
    classCount = dict([(i, j) for i,j in zip(classes, counts)])
    sumClass = sum(counts)
    classifier = dict()
    
    seenWordsEachClass = dict([(i, set()) for i in classes])
    #Your custom implementation of NaiveBayes classifier will go here.
    for i, clas in zip(data, target): 
        tempdata = i.toarray()[0]
        nonZeros = np.where(tempdata > 0)[0] #basically index of words present in the current sentence
                                              #(index here is with respect to bag of words)
        for j in nonZeros:
            if j not in seenWordsEachClass[clas]:
                seenWordsEachClass[clas].add(j)
            if j not in classifier:
                classifier[j] = dict([(cl, [0, count]) for cl, count in classCount.items()])
            classifier[j][clas][0] += tempdata[j]  
    seenWords = set()
    #Laplacian smoothing:n
    for word in range(len(data[0].toarray()[0])):
        if word in classifier:
            if word not in seenWords:
                seenWords.add(word)
                for prob in classifier[word]:  
                      classifier[word][prob][0] += 1
        else:
            classifier[word] = dict([(cl, [1, count]) for cl, count in classCount.items()])

    for j in classifier: 
        for i in classifier[j]:
            classifier[j][i][0] /= (classifier[j][i][1] + len(seenWordsEachClass[i]))
            classifier[j][i][1] /= sumClass
    return classifier

In [189]:
def classify(classifier, testdata):
    predicted_val=[]
    classProbs = dict([(i, classifier[0][i][1]) for i in classifier[0]])
    #Your code to classify test data using the learned model will go here
    probClassesGivenSentence = dict([(j, 1) for j in classifier[0]])
    for i in testdata:
        probClassesGivenSentence = dict([(j, 1) for j in classifier[0]])
        arr = i.toarray()[0]
        nonZeros = np.where(arr > 0)[0]
        probs = dict([(j, classifier[j]) for j in nonZeros])
        for x in probs:
            for y in probs[x]:               
                probClassesGivenSentence[y] *= probs[x][y][0]
        probClassesGivenSentence = dict([(y, probClassesGivenSentence[y]*classProbs[y]) for y in probClassesGivenSentence])
        prediction = max(probClassesGivenSentence, key=lambda k: probClassesGivenSentence[k]) 
        predicted_val.append(prediction)   
   
    return predicted_val

In [190]:
def evaluate(actual_class, predicted_class):
    #Your code to evaluate the model will go here. The code will print overall model's accuracy and precision 
    #and recall for each class label.
    count = 0
    for i in range(len(actual_class)):
        if actual_class[i] == predicted_class[i]:
            count += 1
    accuracy = count*100/len(actual_class)
    
    print("The accuracy score is :",accuracy)

In [191]:
features = ["SUMMARY", "categories", "sub_categories"]

print("Loading data.....")
dataset = load_file(dataPath)
data, target = dataset[features[0]].fillna(" "), dataset[features[1]]

print("preprocessing data.....")
word_vectors, cv = preprocess(data)
#print(cv.get_feature_names()[2415], cv.get_feature_names()[7090])
trainingX,testX,trainingY,testY = train_test_split(word_vectors,target,test_size=0.4,random_state=43)
#print(trainingX.shape,testX.shape,trainingY.shape,testY.shape)
print("Learning model.....")
model = learn_model(trainingX,trainingY)
testY = testY.tolist()
print("Classifying test data......")      
predictedY = classify(model, testX)
classes, counts = np.unique(np.array(predictedY), return_counts=True)
print("Evaluating results.....")
evaluate(testY,predictedY)

Loading data.....
preprocessing data.....
Learning model.....
Classifying test data......
Evaluating results.....
The accuracy score is : 39.114874301675975
