In [None]:
import math
import re

import nltk
import numpy as np
import pandas as pd
import sklearn

from GenerateCorpusDataframe import *
from generateWordFrequency import *

In [None]:
# Fetch the frame for 2018 and attach the tokenized-title column in one step.
# Both helpers come from the star imports; their exact output is not visible here.
df_2018 = addTokenizedColumnofTitle(getDataframe(2018))

In [None]:
# Preview the first rows of df_2018 (rendered as the cell's output).
df_2018.head()

In [None]:
# Post-type labels used by the cells below. They are hard-coded rather than
# derived with np.unique(df_2018['Post Type']): a data-derived list would omit
# any class that does not occur in the 2018 data.
AllClasses = ['ask_hn', 'poll', 'show_hn', 'story']
# Smoothing constant handed to obtainDataframeWithClassProbabilities below.
delta = 0.5

## Calculating the frequency of each word and its conditional probability given the post type

In [None]:
# Build the word-frequency frame from df_2018 for the classes in AllClasses
# (helper from the star imports; its output layout is not visible here).
words_2018 = getWordFrequencyDataframe(df_2018,AllClasses)
# Taken from the frequency frame before words_2018 is rebound on the next line.
totalNumberOfWords = getTotalWordCount(words_2018)
# Rebinds words_2018 to the helper's returned frame and keeps the second return
# value, which later cells pass to generateScore as the fallback for words
# missing from the table (presumably keyed by 'prob_<class>' -- TODO confirm).
words_2018,absentWordConditionalProbability = obtainDataframeWithClassProbabilities(words_2018, AllClasses, delta)

In [None]:
# Column order for the model file: every class name immediately followed by
# its 'prob_<class>' companion.
orderedColumns = [
    column
    for postType in ('story', 'ask_hn', 'show_hn', 'poll')
    for column in (postType, 'prob_' + postType)
]
filename = 'model-2018.txt'
writeModel(words_2018, orderedColumns, filename)

In [None]:
# Inspect the first ten rows of words_2018.
words_2018.head(10)

## PART 2

In [None]:
# Every class starts at 0 so a post type that never occurs in df_2018 still
# has an entry; observed classes are then overwritten with their share of posts.
priorProbabilities = dict.fromkeys(
    ['prob_story', 'prob_ask_hn', 'prob_show_hn', 'prob_poll'], 0
)
unique, counts = np.unique(df_2018['Post Type'], return_counts=True)
totalPosts = np.sum(counts)
for postType, postCount in zip(unique, counts):
    priorProbabilities['prob_' + postType] = postCount/totalPosts
print(priorProbabilities)

In [None]:
# Remove the raw-count entries so only the remaining (probability) rows are kept.
# NOTE(review): the labels are dropped from the row index, so this assumes
# words_2018 carries the class names as row labels -- confirm.
conditionalProbabilities = words_2018.drop(index=['story', 'poll', 'show_hn', 'ask_hn'])
conditionalProbabilities.head(10)

In [None]:
def getConditionalProbability(word,className,absentWordConditionalProbability,conditionalProbabilities):
    """Look up the probability stored for ``word`` under ``className``.

    Parameters
    ----------
    word : hashable
        Token to look up (first-level key / column of the table).
    className : hashable
        Second-level key; the notebook passes 'prob_<class>' labels.
    absentWordConditionalProbability : mapping
        Fallback value per ``className`` for tokens missing from the table.
    conditionalProbabilities : mapping or DataFrame
        Two-level table addressed as ``conditionalProbabilities[word][className]``.

    Returns
    -------
    The stored value, or ``absentWordConditionalProbability[className]`` when
    the lookup key does not exist.

    Raises
    ------
    KeyError
        If ``className`` is missing from the fallback mapping as well.
    """
    try:
        return conditionalProbabilities[word][className]
    except (KeyError, IndexError):
        # Only a failed lookup means "word not in the table". Anything else
        # (KeyboardInterrupt, a broken table object, ...) must surface instead
        # of being silently scored as an unseen word.
        return absentWordConditionalProbability[className]

In [None]:
def generateScore(arrayOfTokenizedTitle,className,priorProbabilities,absentWordConditionalProbability,conditionalProbabilities):
    """Compute one log10 score per title for a single class.

    The score of a title is ``log10(prior[className])`` plus the sum of
    ``log10`` of the value returned by ``getConditionalProbability`` for every
    token of that title.

    Parameters
    ----------
    arrayOfTokenizedTitle : sequence of token lists (list or pandas Series)
        One list of tokens per title; consumed in positional order.
    className : hashable
        Key into ``priorProbabilities`` and the probability tables.
    priorProbabilities : mapping
        Prior per class, indexed by ``className``.
    absentWordConditionalProbability, conditionalProbabilities
        Forwarded unchanged to ``getConditionalProbability``.

    Returns
    -------
    numpy.ndarray of float, same length as ``arrayOfTokenizedTitle``.
    A probability of exactly 0 contributes ``-inf`` instead of raising.
    """
    def _log10(probability):
        # math.log10(0) raises ValueError; a zero probability simply means the
        # class can never win, which -inf expresses without crashing the run.
        return -math.inf if probability == 0 else math.log10(probability)

    score = np.full(len(arrayOfTokenizedTitle), _log10(priorProbabilities[className]), dtype=float)
    # Tokens repeat heavily across titles, so each lookup is done only once.
    logLikelihoodCache = {}
    # Positional iteration: label-based arrayOfTokenizedTitle[index] fails for
    # a Series whose index is not exactly 0..n-1.
    for position, wordsList in enumerate(arrayOfTokenizedTitle):
        for word in wordsList:
            if word not in logLikelihoodCache:
                logLikelihoodCache[word] = _log10(
                    getConditionalProbability(word,className,absentWordConditionalProbability,conditionalProbabilities)
                )
            score[position] += logLikelihoodCache[word]
    return score

In [None]:
# Select the rows whose "year" equals the string '2019' as test data.
# NOTE(review): `data` is not defined in any visible cell -- presumably it
# comes from one of the star imports; confirm.
testData = data[data["year"]=='2019'][['Title','Post Type']].copy()
testData['Title'] = testData['Title'].map(lambda x:x.lower())
# Raw string: '\[', '\^' and '\]' are invalid escapes in a normal string
# literal (a warning today, an error in future Python). The pattern itself is
# unchanged.
# NOTE(review): this pattern matches the literal text "[^a-zA-Z]", so titles
# are effectively tokenized on whitespace -- confirm that is intended.
testData['tokenized_title'] = testData['Title'].map(lambda x:re.split(r'\[\^a-zA-Z\]',x)[0].split())
# Renumber rows 0..n-1; the previous index is kept as an 'index' column.
testData = testData.reset_index()

In [None]:
# Preview the first rows of testData (rendered as the cell's output).
testData.head()

In [None]:
# Add one score column per class ('prob_<class>') to testData.
# generateScore takes five arguments; besides the titles and the class key it
# needs the priors, the unseen-word fallback and the per-word table.
for className in priorProbabilities.keys():
    testData[className] = generateScore(
        testData['tokenized_title'],
        className,
        priorProbabilities,
        absentWordConditionalProbability,
        conditionalProbabilities,
    )
testData.head()

In [None]:
# Map every score column ('prob_<class>') back to the bare class name.
columnNamesExchange = {
    'prob_' + postType: postType
    for postType in ['ask_hn', 'poll', 'show_hn', 'story']
}

# Scores only, relabelled so that idxmax returns the class name directly.
classScores = testData[list(columnNamesExchange)].copy().rename(columns=columnNamesExchange)
classScores['predicted'] = classScores.idxmax(axis=1)

# Put title and true label next to the scores, then flag correct predictions.
dfObj = pd.concat([classScores, testData[['Title', 'Post Type']]], axis=1)
dfObj['comparision'] = (dfObj['predicted'] == dfObj['Post Type'])

cols = ['Title', 'predicted', 'story', 'ask_hn', 'show_hn', 'poll', 'Post Type', 'comparision']
dfObj = dfObj[cols]
dfObj.head()

In [None]:
# Write the results space-separated, then rewrite the file with every single
# space doubled.
filename = 'result.txt'
# The encoding is pinned on all three calls: to_csv writes UTF-8 by default,
# while open() would fall back to the locale encoding and could fail or
# garble non-ASCII titles on systems where that is not UTF-8.
dfObj.to_csv(filename, header = None, index = False, sep = ' ', mode = 'w', encoding = 'utf-8')
with open(filename, 'r', encoding = 'utf-8') as f:
    lines = f.readlines()
lines = [line.replace(' ', '  ') for line in lines]
with open(filename, 'w', encoding = 'utf-8') as f:
    f.writelines(lines)

In [None]:
# Tally how many predictions matched (True) and missed (False).
check = dfObj['comparision'].value_counts()

In [None]:
# Show which outcomes (True / False) are present in the tally.
dict(check).keys()

In [None]:
# Share of correct predictions. check[True] / check[False] raise KeyError
# when every prediction is right (or every one is wrong), so the True count
# is read with a default and divided by the total of the tally.
accuracy = check.get(True, 0) / check.sum()

In [None]:
# Display the accuracy computed in the previous cell.
accuracy

In [None]:
def _tokenize_titles(titles, stopwords = None, blocks = None):
    """Tokenize a Series of titles, optionally filtering tokens by length or stopword list."""
    def tokenize(title):
        # Raw string keeps the pattern unchanged while avoiding the invalid
        # escape sequences '\[', '\^', '\]' of a normal string literal.
        # NOTE(review): the pattern matches the literal text "[^a-zA-Z]", so
        # titles are effectively split on whitespace -- confirm intent.
        tokens = re.split(r'\[\^a-zA-Z\]', title)[0].split()
        if blocks:
            # Length filter wins over stopwords when both are supplied.
            return [token for token in tokens if (len(token)>blocks[0] and len(token)<blocks[1])]
        if stopwords:
            return [token for token in tokens if token not in stopwords]
        return tokens
    return titles.map(tokenize)


def _write_double_spaced(frame, filename, index):
    """Write ``frame`` space-separated without header, then double every space in the file."""
    # Encoding is pinned so the re-read does not depend on the locale default.
    frame.to_csv(filename, header = None, index = index, sep = ' ', mode = 'w', encoding = 'utf-8')
    with open(filename, 'r', encoding = 'utf-8') as f:
        lines = f.readlines()
    lines = [line.replace(' ', '  ') for line in lines]
    with open(filename, 'w', encoding = 'utf-8') as f:
        f.writelines(lines)


def model_making(df_train, df_test, labels, delta, file=True, stopwords = None, blocks = None, frequency = None, model_filename = 'model-2018.txt', result_filename = 'result.txt'):
    """Train the smoothed word-probability model on ``df_train`` and score ``df_test``.

    Parameters
    ----------
    df_train, df_test : pandas.DataFrame
        Need 'Title' and 'Post Type' columns. Both frames are modified in
        place: 'Title' is lower-cased and 'tokenized_title' is (re)written;
        ``df_test`` additionally receives one 'prob_<class>' score column per
        class found in ``df_train``.
    labels : sequence of str
        Class names; passed to ``input_phrase`` and used as count-column names.
    delta : float
        Additive smoothing constant: (count + delta) / (class total + delta * vocabulary size).
    file : bool
        When true, the model and the per-title results are written to
        ``model_filename`` and ``result_filename``.
    stopwords : container, optional
        Tokens to discard. Ignored when ``blocks`` is given.
    blocks : sequence of two ints, optional
        Keep only tokens with ``blocks[0] < len(token) < blocks[1]``.
    frequency : number, optional
        Counts below this value are set to 0 before smoothing.
    model_filename, result_filename : str
        Output paths; the defaults are the previously hard-coded names.

    Returns
    -------
    None. Progress messages and the accuracy on ``df_test`` are printed.
    """
    labels = list(labels)  # allows arrays/tuples; list concatenation is used below
    df_train['Title'] = df_train['Title'].map(lambda x:x.lower())
    df_test['Title'] = df_test['Title'].map(lambda x:x.lower())
    df_train['tokenized_title'] = _tokenize_titles(df_train['Title'], stopwords, blocks)
    df_test['tokenized_title'] = _tokenize_titles(df_test['Title'], stopwords, blocks)

    # Class priors come from the training frame that was passed in, not from
    # the notebook-level df_2018 the earlier version silently depended on.
    unique, counts = np.unique(df_train['Post Type'], return_counts=True)
    totalPosts = np.sum(counts)
    priorProbabilities = {}
    for postType, postCount in zip(unique, counts):
        priorProbabilities['prob_' + postType] = postCount/totalPosts

    d = {}
    print("Training Started ")
    # zip() walks the rows positionally, so df_train does not need a 0..n-1 index.
    for tokens, postType in zip(df_train['tokenized_title'], df_train['Post Type']):
        d = input_phrase(tokens, postType, d, labels)
    wordCounts = pd.DataFrame(d)
    if frequency:
        wordCounts[wordCounts<frequency] = 0
    wordCounts = wordCounts.transpose()
    uniqueWords = len(wordCounts)

    absentWordConditionalProbability = {}
    list_classes = []
    for className in labels:
        wordsPerClass = np.sum(wordCounts[className])
        conditionalClassLabel = 'prob_' + className
        denominator = wordsPerClass + delta*uniqueWords
        wordCounts[conditionalClassLabel] = wordCounts[className].map(lambda x: (int(x) + delta)/denominator)
        absentWordConditionalProbability[conditionalClassLabel] = delta/denominator
        list_classes.append(className)
        list_classes.append(conditionalClassLabel)

    # One column per word; the raw-count rows are dropped so only 'prob_<class>' rows remain.
    conditionalProbabilities = wordCounts.transpose().drop(labels)

    if file:
        print("Writing the training model to file ")
        writeWords = wordCounts.copy()
        writeWords.index.name = 'TokenName'
        writeWords = writeWords.reset_index()
        writeWords = writeWords.sort_values(by ='TokenName')
        writeWords = writeWords.reset_index(drop=True)
        cols = ['TokenName'] + list_classes
        print(cols)
        writeWords = writeWords[cols]
        writeWords.index += 1  # line numbers in the model file start at 1
        _write_double_spaced(writeWords, model_filename, index = True)

    print("Testing started")
    for className in list(priorProbabilities.keys()):
        df_test[className] = generateScore(df_test['tokenized_title'],className,priorProbabilities,absentWordConditionalProbability,conditionalProbabilities)

    columnNamesExchange = {}
    for className in labels:
        columnNamesExchange['prob_' + className] = className
    # Results are built from the df_test argument, not the notebook-level testData.
    dfObj = df_test[list(priorProbabilities.keys())].copy()
    dfObj = dfObj.rename(columns=columnNamesExchange)
    dfObj['predicted'] = dfObj.idxmax(axis=1)
    dfObj = pd.concat([dfObj, df_test[['Title', 'Post Type']]], axis=1)
    dfObj['comparision'] = (dfObj['predicted'] == dfObj['Post Type'])
    # NOTE(review): unlike the stand-alone result cell, 'predicted' is not
    # part of the written columns here -- confirm whether that is intended.
    cols = ['Title'] + labels +['Post Type', 'comparision']
    dfObj = dfObj[cols]
    if file:
        print("Writing a Test file")
        _write_double_spaced(dfObj, result_filename, index = False)

    # mean() of the boolean column is the share of correct predictions and,
    # unlike check[True]/check[False], works when only one outcome occurs.
    accuracy = dfObj['comparision'].mean()
    # The figure is measured on df_test, so it is labelled as test accuracy.
    print("Test Accuracy :",round(accuracy,2))

In [None]:
# Run the whole pipeline, keeping only tokens longer than 2 and shorter than
# 9 characters (blocks gives the exclusive length bounds).
model_making(
    df_train=df_2018,
    df_test=testData,
    labels=list(np.unique(df_2018['Post Type'])),
    delta=0.5,
    blocks=[2, 9],
)