#Importing Libraries

In [1]:
import nltk
import sklearn
import string
import pickle	# this is for saving and loading your trained classifiers.

from sklearn.svm import SVC
from nltk.classify.scikitlearn import SklearnClassifier
from nltk import NaiveBayesClassifier

#Preprocessing
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.stem import PorterStemmer
from nltk import FreqDist
from nltk.lm import Vocabulary
from nltk.util import ngrams

#Evaluating
from nltk.metrics import ConfusionMatrix


import random

nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

#Constant values

In [2]:
genres=["philosophy", "science-fiction", "romance", "horror", "science", "religion", "mystery", "sports"]
# If txt files are not in same folder with notebook this should be added here
train_path = "" 
dev_path = ""
test_path = ""

#File operations

In [3]:
def read_file_lines(file_path):
    file = open(file_path, 'r',encoding='utf-8')
    lines = file.read().splitlines()
    file.close()
    return lines

def save_classifier(classifier, filename):	#filename should end with .pickle and type(filename)=string
	with open(filename, "wb") as f:
		pickle.dump(classifier, f)
	return
def load_classifier(filename):	#filename should end with .pickle and type(filename)=string
	classifier_file = open(filename, "rb")
	classifier = pickle.load(classifier_file)
	classifier_file.close()
	return classifier
def write_list(lst,name):
	with open(name, 'w') as file:
		for item in lst:
			file.write(f"{item}\n")

#Preproccess Operations

In [4]:
def preprocess(texts,class_name):
    lemmatizer = WordNetLemmatizer()
    processed = []
    stop_words = set(stopwords.words('english'))
    total_book_count = len(texts)//2
    for i in range(total_book_count):
        header = texts[2*i]
        text = texts[2*i+1]

        processed_text = text.lower()
        words_text = word_tokenize(processed_text)
        processed_header = header.lower()
        words_header = word_tokenize(processed_header)

        words_text =words_header + words_text
        processed_words_text = [lemmatizer.lemmatize(word) for word in words_text if word.isalnum() and (word not in stop_words) and len(word) > 1]
        processed_words_header = ["HEAD_" + lemmatizer.lemmatize(word) for word in words_header  if word.isalnum() and (word not in stop_words) and len(word) > 1]

        # porter_stemmer = PorterStemmer()
        # processed_words_text = [porter_stemmer.stem(word) for word in words_text if word.isalnum() and (word not in stop_words) and len(word) > 1]
        # processed_words_header = ["HEAD_" + porter_stemmer.stem(word) for word in words_header  if word.isalnum() and (word not in stop_words) and len(word) > 1]

        processed_text = ' '.join(processed_words_text)
        processed_header = ' '.join(processed_words_header)

        book = processed_header + " " + processed_text
        processed.append((book ,class_name))
    return processed

def create_megadoc(dataset_type,path=""): #dataset_type type as train, test, dev
    file_extension = "_"+dataset_type + ".txt"
    megadoc = []
    for genre in genres:
        file_name = genre + file_extension
        txt = read_file_lines(path + file_name)
        megadoc += preprocess(texts=txt,class_name=genre)
    return megadoc

def create_vocabulary(mega_doc):
    all_words = [word for text_class_pair in mega_doc for word in text_class_pair[0].split()]
    vocab = Vocabulary(all_words, unk_cutoff=10)
    return vocab
def split_mega_doc(megadoc):
    genre_texts={genre:[] for genre in genres}
    for text, label in megadoc:
        genre_texts[label] += [text]
    return genre_texts


Creating mega documents

In [5]:
train_megadoc = create_megadoc(dataset_type="train")
test_megadoc = create_megadoc(dataset_type = "test")
dev_megadoc = create_megadoc(dataset_type = "dev")
print(f"train_megadoc size = {len(train_megadoc)}")
print(f"dev_megadoc size = {len(dev_megadoc)}")
print(f"test_megadoc size = {len(test_megadoc)}")

train_megadoc size = 6536
dev_megadoc size = 933
test_megadoc size = 1865


## Feature Extraction

In [6]:
def extract_features(megadoc,features):
    features_labels = []
    for text,label in megadoc:
        cur_features = {}
        for feature in features:
            cur_features[feature] = feature in text
        features_labels.append((cur_features,label))
    return features_labels

def get_ngrams(text:str,n):
    tokens = ngrams(word_tokenize(text),n)
    tokens = [" ".join(ngram) for ngram in tokens]
    return tokens

def most_frequent_ngrams(texts:list,count,ngram_n):
    whole_texts = " ".join(texts)
    tokens = get_ngrams(whole_texts,ngram_n)
    freqs = FreqDist(tokens).most_common(count)
    # print(freqs)
    words = [word for word,freq in freqs]
    return words

def most_freq_ngram_each_label(megadoc,count,ngram_n):
    splitted_megadoc = split_mega_doc(train_megadoc)
    freq_words = []
    for label in splitted_megadoc.keys():
        freq_words.extend(most_frequent_ngrams(splitted_megadoc[label],count[label],ngram_n))

    return list(set(freq_words))

# Training

In [7]:
def train(classifier, training_set):
    return classifier.train(training_set)


def test(classifier, test_set):
    actual_labels = [label for feature,label in test_set]
    feature_list = [feature for feature,label in test_set]
    predictions = classifier.classify_many(feature_list)
    confusion_matrix = ConfusionMatrix(actual_labels,predictions)
    print(confusion_matrix.pretty_format())
    print(confusion_matrix.evaluate())

    accuracy_score = nltk.scores.accuracy(actual_labels, predictions)
    print(f"Accuracy: {accuracy_score}")
    return confusion_matrix



# Analyze Data

In [8]:
splitted_megadoc = split_mega_doc(train_megadoc)
for key in splitted_megadoc.keys():
    print(key)
    book_lst = splitted_megadoc[key]
    print(len(book_lst))
    text = " ".join(book_lst)
    words = word_tokenize(text)
    print(f"word count: {len(words)}")
    print("---------")

philosophy
798
word count: 67852
---------
science-fiction
840
word count: 66522
---------
romance
798
word count: 70148
---------
horror
832
word count: 63491
---------
science
805
word count: 91706
---------
religion
805
word count: 80866
---------
mystery
840
word count: 72274
---------
sports
818
word count: 82386
---------


#Naive Bayes

## Unigrams
In this part I am trying to use most frequent unigrams in each class as feature.

I am taking most frequent 500 words in each class since there are many same words in each class. The feature set has size nearly 1500.

In [9]:
counts_per_label={genre:500 for genre in genres}
unigrams = most_freq_ngram_each_label(train_megadoc,counts_per_label,1)
print(len(unigrams))
print(unigrams[:10])

1429
['destroyed', 'want', 'robot', 'subject', 'best', 'bone', 'root', 'prayer', 'church', 'design']


In [10]:
# training_set = extract_features(train_megadoc,unigrams)
# dev_set = extract_features(dev_megadoc,unigrams)

In [11]:
# naive_classifier_unig = train(NaiveBayesClassifier,training_set)
# save_classifier(naive_classifier_unig,"naive_bayes_unigram_freqs.pickle")
# naive_bayes_unig_cm = test(naive_classifier_unig,dev_set)

In [12]:
# naive_classifier_unig.show_most_informative_features(100)

## Bigrams
In this part I am trying to use most frequent bigrams in each class as feature.

In [13]:
counts_per_label={genre:150 for genre in genres}
bigrams = most_freq_ngram_each_label(train_megadoc,counts_per_label,2)
print(len(bigrams))
print(bigrams[:3])

851
['story collection', 'history christianity', 'take one']


In [14]:
# training_set = extract_features(train_megadoc,bigrams)
# dev_set = extract_features(dev_megadoc,bigrams)

In [15]:
# naive_classifier_bigr = train(NaiveBayesClassifier,training_set)
# save_classifier(naive_classifier_bigr,"naive_bayes_bigram_freqs.pickle")
# naive_bayes_bigr_cm = test(naive_classifier_bigr,dev_set)

In [16]:
# naive_classifier_bigr.show_most_informative_features(300)

## Trigrams

In [17]:
# counts_per_label={genre:80 for genre in genres}
# # counts_per_label["science-fiction"] = 40
# trigrams = most_freq_ngram_each_label(train_megadoc,counts_per_label,3)
# print(len(trigrams))
# # print(trigrams[:3])

In [18]:
# training_set = extract_features(train_megadoc,trigrams)
# dev_set = extract_features(dev_megadoc,trigrams)

In [19]:
# naive_classifier_trig = train(NaiveBayesClassifier,training_set)
# naive_bayes_trig_cm = test(naive_classifier_trig,dev_set)

In [20]:
# naive_classifier_trig.show_most_informative_features(100)

In [21]:
# # naive_classifier_trig.show_most_informative_features(100)
# trigram_most_inf = [trigram for trigram,correct in naive_classifier_trig.most_informative_features(100) if correct]
# print(trigram_most_inf)

## Combining n-grams

Before running the cell below cells assigning the unigram and bigram should be run.

In [22]:
trigram_most_inf = ['york time bestselling', 'time bestselling author', 'major league baseball', 'edgar allan poe', 'high school football', 'york time bestseller', 'world war ii', 'usa today bestselling', 'two thousand year', 'novel new york', 'science fiction novel', 'new york city', 'time book review', 'york time book', 'change way see', 'science fiction adventure', 'librarian note alternate', 'time literary supplement', 'note alternate cover', 'make u human', 'tour de force', 'world fantasy award', 'time usa today', 'york time usa', 'orson scott card', 'alternate cover edition', 'today bestselling author', 'must work together', 'los angeles time', 'million year ago', 'alternate cover isbn', 'ralph waldo emerson', 'second world war', 'cover edition found', 'john stuart mill', 'one new york', 'nature space time', 'dream come true', 'cover edition isbn', 'professional hockey player', 'one science fiction', 'new york time', 'find falling love', 'installment new york', 'since first publication', 'thing need know', 'national book award', 'shed new light', 'waitress sookie stackhouse', 'way see world', 'school football team', 'author new york', 'time bestselling novel', 'twenty year ago', 'one give full', 'five year ago', 'make matter worse', 'lieutenant eve dallas', 'essential reading anyone', 'theory natural selection', 'thousand year ago', 'arthur conan doyle', 'edition isbn found', 'new world order', 'everything thought knew', 'hit rock bottom', 'alternative cover edition', 'science fiction author', 'science fiction writer', 'stephen jay gould', 'boy next door', 'four year ago', 'brave new world', 'cover isbn found', 'get life back', 'one thing certain', 'york time author', 'one got away', 'year high school', 'romantic comedy novel', 'man woman child', 'anthropologist temperance brennan', 'ayaan hirsi ali', 'come back haunt', 'forensic anthropologist temperance', 'hundred year future', 'year passed since', 'forced question everything', 'masterpiece science fiction', 'science fiction classic', 'desert planet arrakis', 'cocktail waitress sookie', 'last thing expects', 'world around u', 'bestselling mortal instrument', 'love first sight', 'never knew existed', 'would change life', 'da vinci code', 'one twentieth century']
ngram_features = unigrams + bigrams + trigram_most_inf
print(len(ngram_features))

2380


In [23]:
training_set = extract_features(train_megadoc,ngram_features)
dev_set = extract_features(dev_megadoc,ngram_features)

In [24]:
naive_classifier_ngr = train(NaiveBayesClassifier,training_set)
save_classifier(naive_classifier_ngr,"naive_bayes_ngram_freqs.pickle")
naive_bayes_ngr_cm = test(naive_classifier_ngr,dev_set)

                |                    s    |
                |                    c    |
                |                    i    |
                |                    e    |
                |                    n    |
                |        p           c    |
                |        h           e    |
                |        i  r        -    |
                |     m  l  e  r  s  f    |
                |  h  y  o  l  o  c  i  s |
                |  o  s  s  i  m  i  c  p |
                |  r  t  o  g  a  e  t  o |
                |  r  e  p  i  n  n  i  r |
                |  o  r  h  o  c  c  o  t |
                |  r  y  y  n  e  e  n  s |
----------------+-------------------------+
         horror |<80>15  2  .  9  1 10  1 |
        mystery | 13<95> .  .  9  .  2  1 |
     philosophy |  3  2<82>14  . 11  2  . |
       religion |  6  3 27<69> 3  3  4  . |
        romance |  6  1  1  2<90> . 10  4 |
        science |  5  1 11  3  .<86> 9  . |
science-fiction | 18  5  .  1  7

## Naive Bayes Testing
Testing model with test_set by using model in n-grams which is combination of unigram+bigram+trigram.

In [25]:
final_features = ngram_features
test_set = extract_features(test_megadoc,ngram_features)
naive_bayes_ngr_cm = test(naive_classifier_ngr,test_set)

                |                           s     |
                |                           c     |
                |                           i     |
                |                           e     |
                |                           n     |
                |           p               c     |
                |           h               e     |
                |           i   r           -     |
                |       m   l   e   r   s   f     |
                |   h   y   o   l   o   c   i   s |
                |   o   s   s   i   m   i   c   p |
                |   r   t   o   g   a   e   t   o |
                |   r   e   p   i   n   n   i   r |
                |   o   r   h   o   c   c   o   t |
                |   r   y   y   n   e   e   n   s |
----------------+---------------------------------+
         horror |<167> 21   3   3  17   .  23   . |
        mystery |  29<180>  .   2  21   1   7   . |
     philosophy |   5   1<171> 21   2  20   8   . |
       relig

Accuracy: 0.7018766756032172

# SVC

In [26]:
training_set = extract_features(train_megadoc,final_features)
dev_set = extract_features(dev_megadoc,final_features)

In [27]:
SVC_classifier = train(SklearnClassifier(SVC()),training_set)
save_classifier(SVC_classifier,"SVC.pickle")
SVC_cm = test(SVC_classifier,dev_set)

                |                    s    |
                |                    c    |
                |                    i    |
                |                    e    |
                |                    n    |
                |        p           c    |
                |        h           e    |
                |        i  r        -    |
                |     m  l  e  r  s  f    |
                |  h  y  o  l  o  c  i  s |
                |  o  s  s  i  m  i  c  p |
                |  r  t  o  g  a  e  t  o |
                |  r  e  p  i  n  n  i  r |
                |  o  r  h  o  c  c  o  t |
                |  r  y  y  n  e  e  n  s |
----------------+-------------------------+
         horror |<73>14  3  .  9  . 18  1 |
        mystery | 18<89> .  .  9  .  4  . |
     philosophy |  2  2<70>22  1 14  3  . |
       religion |  1  4 19<79> 3  4  5  . |
        romance |  6  5  2  .<84> . 10  7 |
        science |  4  .  5  5  .<92> 8  1 |
science-fiction | 20  6  1  2  9

## SVC Testing

In [28]:
test_set = extract_features(test_megadoc,final_features)
SVC_cm = test(SVC_classifier,test_set)

                |                           s     |
                |                           c     |
                |                           i     |
                |                           e     |
                |                           n     |
                |           p               c     |
                |           h               e     |
                |           i   r           -     |
                |       m   l   e   r   s   f     |
                |   h   y   o   l   o   c   i   s |
                |   o   s   s   i   m   i   c   p |
                |   r   t   o   g   a   e   t   o |
                |   r   e   p   i   n   n   i   r |
                |   o   r   h   o   c   c   o   t |
                |   r   y   y   n   e   e   n   s |
----------------+---------------------------------+
         horror |<163> 25   3   1  17   1  23   1 |
        mystery |  36<169>  1   2  19   .  12   1 |
     philosophy |   1   1<161> 31   2  25   7   . |
       relig