In [None]:
# -------------------------------------------------------
# Project #2 Hacker News Dataset Analysis
# Written by Naitik Bhise (40106507) and Paras Kapoor (40114178)
# For COMP 6721 Section FI – Fall 2019
# --------------------------------------------------------

In [None]:
from sklearn.metrics import confusion_matrix
import matplotlib.pyplot as plt
%matplotlib inline

from generateWordFrequency import *
from naiveBayes import *
from fileWriteFunctions import *

In [None]:
def drawPlot(X,Y,xlabel,ylabel,title):
    """Show a scatter plot of Y against X on a new figure.

    Parameters
    ----------
    X, Y : sequences of numbers of equal length (point coordinates).
    xlabel, ylabel : axis label strings.
    title : figure title string.

    Points are drawn as small blue stars; the figure is displayed
    immediately and nothing is returned.
    """
    _, axes = plt.subplots()
    axes.scatter(X, Y, s=10, marker='*', facecolor='blue')
    axes.set_xlabel(xlabel)
    axes.set_ylabel(ylabel)
    axes.set_title(title)
    plt.show()

## COMMON DATA LOADING 

In [None]:
# Hand the `unwanted_tokens` collection to WriteList together with the
# target name 'remove_word.txt'.
# NOTE(review): neither `WriteList` nor `unwanted_tokens` is defined in this
# notebook; presumably both come from the star imports above and the call
# writes the removed tokens to that file -- confirm against the helper modules.
WriteList(unwanted_tokens,'remove_word.txt')

In [None]:
# Class labels used throughout the notebook, and the prefix handed to the
# model helpers for their per-class columns.
AllClasses = ['story', 'ask_hn', 'show_hn', 'poll']
appendClassPrefix = 'prob_'

# Training split: getDataframe(2018, ...) followed by title tokenization.
# NOTE(review): presumably the first argument selects the year of the posts -- confirm.
unfilteredTrainData = addTokenizedColumnofTitle(
    getDataframe(2018,"hn2018_2019.csv"))

# Test split: same pipeline for 2019, then the index is reset
# (reset_index() keeps the previous index as a column).
unfilteredTestData = addTokenizedColumnofTitle(
    getDataframe(2019,"hn2018_2019.csv")).reset_index()

In [None]:
def generateModel(trainData, delta, filename = 'temp-model.txt'):
    """Build a model from `trainData` and write it to `filename`.

    Parameters
    ----------
    trainData : training data frame, passed to the prior-probability and
        word-frequency helpers.
    delta : value forwarded to obtainDataframeWithClassProbabilities
        (presumably the additive smoothing constant -- confirm in naiveBayes).
    filename : path given to writeModel for the model dump.

    Returns
    -------
    list
        ``[wordTable, priorProbabilities]`` where ``wordTable`` is the
        per-word table after renameModelRows has been applied.
    """
    priors = getPriorProbabilities(trainData)
    wordFrequencies = getWordFrequencyDataframe(trainData, AllClasses)
    wordProbabilities = obtainDataframeWithClassProbabilities(
        wordFrequencies, AllClasses, delta, appendClassPrefix)
    # The model file is written before the rows are renamed.
    writeModel(wordProbabilities, filename, AllClasses, appendClassPrefix)
    renamedTable = renameModelRows(wordProbabilities, AllClasses, appendClassPrefix)
    return [renamedTable, priors]

def showAccuracies(y_true,y_pred, display = False):
    """Compute the confusion matrix and per-class precision/recall/F1.

    Parameters
    ----------
    y_true : true class labels.
    y_pred : predicted class labels, same length as `y_true`.
    display : when true, print the confusion matrix and the metrics table.

    Returns
    -------
    pandas.DataFrame
        One row per class in `AllClasses` with the columns
        'precision', 'recall' and 'F1score'. A metric whose denominator is
        zero is reported as 0.
    """
    cf_m = confusion_matrix(y_true, y_pred, labels=AllClasses)
    df = pd.DataFrame(cf_m,
                      columns=['pred_' + className for className in AllClasses],
                      index=['true_' + className for className in AllClasses])
    df_ = pd.DataFrame(np.zeros((len(AllClasses),3)),
                       index=AllClasses,columns=['precision','recall','F1score'])

    # Column sums = number of predictions per class, row sums = number of
    # true samples per class; computed once instead of on every iteration.
    predictedTotals = df.sum(axis=0)
    actualTotals = df.sum(axis=1)

    for className in AllClasses:
        predCol = 'pred_' + className
        trueRow = 'true_' + className
        truePositives = df.loc[trueRow, predCol]

        if predictedTotals[predCol] == 0:
            precision = 0
        else:
            precision = truePositives/predictedTotals[predCol]
        if actualTotals[trueRow] == 0:
            recall = 0
        else:
            recall = truePositives/actualTotals[trueRow]
        if precision == 0 or recall == 0:
            f1score = 0
        else:
            f1score = 2*precision*recall/(precision + recall)

        # Single-step .loc assignment: the chained form df_['col'][row] = v
        # writes to a temporary under pandas Copy-on-Write and is lost.
        df_.loc[className, 'recall'] = recall
        df_.loc[className, 'precision'] = precision
        df_.loc[className, 'F1score'] = f1score
    if display:
        print(df)
        #print(df.to_latex(index=True))
        print(df_)
        #print(df_.to_latex(index=True))
    return df_

def testModel(testData, model, filename = 'temp-results.txt', display = False):
    """Run `model` on `testData`, write the results and return the accuracy.

    Parameters
    ----------
    testData : test data frame; a 'predicted' column is added to the frame
        returned by generateCondClassProb.
    model : model object as returned by generateModel.
    filename : path given to writeDataframe for the result dump.
    display : forwarded to showAccuracies to print the confusion matrix
        and per-class metrics.

    Returns
    -------
    float
        Fraction of rows whose `comparision` flag is True, or 0.0 when
        there are no rows to score.
    """
    testData = generateCondClassProb(testData, model)
    testData['predicted'] = generatePrediction(testData,AllClasses)
    testResults = comparePredictions(testData,AllClasses)
    writeDataframe(testResults,filename)
    showAccuracies(testResults['Post Type'],testResults['predicted'],display)
    check = testResults.comparision.value_counts()
    # value_counts() omits values that never occur, so an all-correct or
    # all-wrong run has no False / True key; default the missing count to 0.
    correct = check.get(True, 0)
    incorrect = check.get(False, 0)
    total = correct + incorrect
    if total == 0:
        return 0.0
    accuracy = correct/total
    return accuracy

## TASK 1:  Extract the data and build the model

In [None]:
# Baseline model: built from an untouched copy of the training data with
# delta 0.5 and written to 'model-2018.txt'.
trainData = unfilteredTrainData.copy()
model = generateModel(trainData, delta=0.5, filename='model-2018.txt')

## TASK 2: Use ML Classifier to test dataset

In [None]:
# Score the baseline model on an untouched copy of the test data, print the
# confusion matrix / per-class metrics, and report the overall accuracy.
testData = unfilteredTestData.copy()
accuracy = testModel(testData, model,
                     filename='baseline-result.txt', display=True)
print('accuracy:',accuracy)

## Task 3: Experiments with the classifier

### EXP 3.1:  Stop-word Filtering

In [None]:
# Read the stop-word list, one word per line.
filename = 'Stopwords.txt'
with open (filename, "r") as myfile:
    data = myfile.readlines()
# strip() removes the line terminator and surrounding whitespace. Slicing off
# the last character (word[0:-1]) would truncate the final stop word whenever
# the file does not end with a newline. Blank lines are skipped.
filteredWordList = [word.strip() for word in data if word.strip()]

# Apply the same word list to copies of the train and test data, then build
# and evaluate a model with delta 0.5.
# NOTE(review): presumably filterTokensByWordList drops the listed words from
# the tokenized titles -- confirm in the helper module.
trainData = filterTokensByWordList(unfilteredTrainData.copy(),filteredWordList)
testData = filterTokensByWordList(unfilteredTestData.copy(),filteredWordList)
model = generateModel(trainData, 0.5, 'stopword-model.txt')
accuracy = testModel(testData, model, 'stopword-result.txt', True)
print(accuracy)

### EXP 3.2:   Word Length Filtering

In [None]:
# Word-length experiment: run filterTokensByWordLength (default settings) on
# copies of both splits, then train and evaluate with delta 0.5.
trainData = filterTokensByWordLength(unfilteredTrainData.copy())
testData = filterTokensByWordLength(unfilteredTestData.copy())
model = generateModel(trainData, delta=0.5, filename='wordlength-model.txt')
accuracy = testModel(testData, model,
                     filename='wordlength-result.txt', display=True)
print(accuracy)

### EXP 3.3: Infrequent Word Filtering

#### 3.3.1 COUNT BASED

In [None]:
# Count-based filtering: for each threshold, build a word list from the
# training word frequencies, filter both splits with it, then train/evaluate
# and record the accuracy together with the resulting vocabulary size.
accuracies = []
VocabularySize = []
for count in [1,5,10,15,20]:
    # NOTE(review): this table does not depend on `count`; it could be
    # computed once before the loop if getWordListBasedOnCount is confirmed
    # not to modify it.
    trainWords = getWordFrequencyDataframe(unfilteredTrainData,AllClasses)
    filteredWordList = getWordListBasedOnCount(trainWords,maxCount = count)
    trainData = filterTokensByWordList(unfilteredTrainData.copy(),filteredWordList)
    testData = filterTokensByWordList(unfilteredTestData.copy(),filteredWordList)
    model = generateModel(trainData, 0.5)
    accuracy = testModel(testData, model)
    # Number of columns of the model's word table.
    vocabSize = len(model[0].columns)
    accuracies.append(accuracy)
    VocabularySize.append(vocabSize)
    print(count,accuracy,vocabSize)
# Axis label spelling corrected ('Vocubalary' -> 'Vocabulary').
drawPlot(VocabularySize,accuracies,'Vocabulary Size','Test Accuracy','Filtering based on WordCount')

#### 3.3.2 TOP X % FREQUENT WORDS

In [None]:
# Percent-based filtering: for each percentage, build a word list from the
# training word frequencies, filter both splits with it, then train/evaluate
# and record the accuracy together with the resulting vocabulary size.
accuracies = []
VocabularySize = []
for percent in [5,10,15,20,25]:
    # NOTE(review): this table does not depend on `percent`; it could be
    # computed once before the loop if getWordListBasedOnPercent is confirmed
    # not to modify it.
    trainWords = getWordFrequencyDataframe(unfilteredTrainData,AllClasses)
    filteredWordList = getWordListBasedOnPercent(trainWords,Percent = percent)
    trainData = filterTokensByWordList(unfilteredTrainData.copy(),filteredWordList)
    testData = filterTokensByWordList(unfilteredTestData.copy(),filteredWordList)
    model = generateModel(trainData, 0.5)
    accuracy = testModel(testData, model)
    # Number of columns of the model's word table.
    vocabSize = len(model[0].columns)
    accuracies.append(accuracy)
    VocabularySize.append(vocabSize)
    print(percent,accuracy,vocabSize)
# Axis label spelling corrected ('Vocubalary' -> 'Vocabulary').
drawPlot(VocabularySize,accuracies,'Vocabulary Size','Test Accuracy','Filtering based on Top x% Frequent Words')

### EXP 3.4: Delta Smoothing

In [None]:
# Sweep delta over 0.0, 0.1, ..., 1.0 on the unfiltered data and plot the
# test accuracy obtained for each value.
Deltas = 0.1*np.arange(0,11)
accuracies = []
for delta in Deltas:
    # Fresh copies on every pass so no run sees another run's changes.
    trainData = unfilteredTrainData.copy()
    testData = unfilteredTestData.copy()
    model = generateModel(trainData, delta)
    accuracy = testModel(testData, model)
    accuracies.append(accuracy)
drawPlot(X=Deltas, Y=accuracies,
         xlabel='Delta Values', ylabel='Test Accuracy',
         title='Smoothening Factor')