In [1]:
import numpy as np
import pandas as pd
import random as rd
import csv
import nltk
from nltk.corpus import stopwords 
from nltk.tokenize import word_tokenize 
nltk.download('stopwords')
nltk.download('punkt')
import string


[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/LinYichen/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /Users/LinYichen/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [2]:
# Load the train and test datasets from .pkl files in the working directory.
# NOTE(review): allow_pickle=True lets np.load unpickle the file contents, and
# unpickling can execute arbitrary code — only load files from a trusted source.
data_train_file = np.load('data_train.pkl', allow_pickle=True)
data_test_file = np.load('data_test.pkl', allow_pickle=True)

In [3]:
# Element 0 of the training file is used as the comments and element 1 as the
# topic labels; the test file object is used as the list of comments as-is.
# NOTE(review): presumably the test file contains comments only (no labels) — confirm.
train_comment_list = data_train_file[0]
train_topic_list = data_train_file[1]
test_comment_list = data_test_file

In [4]:
# Column-name -> values mapping pairing each training comment with its topic.
train_data_dict = {'Comment': train_comment_list, 'Topic': train_topic_list}

In [5]:
# Tabular view of the training set with 'Comment' and 'Topic' columns.
train_df = pd.DataFrame(train_data_dict)
# TODO: drop duplicate rows (e.g. train_df.drop_duplicates()) — currently not applied
# TODO: shuffle the data — currently not implemented

In [6]:
# Compute the distinct topics and how many comments carry each one in a single
# pass, keeping both arrays for later inspection instead of discarding them.
unique_topics, topic_counts = np.unique(train_topic_list, return_counts=True)
# Number of distinct topics (displayed as the cell output).
len(unique_topics)

20

In [9]:
# Export test result labels to CSV file
class CSVExporter:
    """Writes predicted labels to a two-column (Id, Category) CSV file."""

    def __init__(self):
        pass

    def export(self, result_list, filename='submission.csv'):
        """Write a header row followed by one ``Id,Category`` row per prediction.

        Args:
            result_list: Iterable of predicted labels. The Id written for each
                label is its position in the iteration order, starting at 0.
            filename: Destination path. Defaults to 'submission.csv', the
                location that used to be hard-coded, so existing calls behave
                exactly as before.

        Returns:
            The string "Export Success!" once the file has been written.
        """
        # newline='' lets the csv module control line endings itself.
        with open(filename, 'w', newline='') as csvfile:
            csvwriter = csv.writer(csvfile, delimiter=',', quotechar='|', quoting=csv.QUOTE_MINIMAL)
            csvwriter.writerow(['Id', 'Category'])
            # enumerate yields (position, label) pairs, i.e. exactly the rows to write.
            csvwriter.writerows(enumerate(result_list))

        return "Export Success!"

In [10]:
# Random Classifier, randomly assign labels to comments
class RandomClassifier:
    """Baseline classifier that assigns each input a randomly drawn training label."""

    def __init__(self):
        pass

    def train(self, train_inputs, train_labels):
        """Store the training data; nothing is actually fitted.

        Args:
            train_inputs: Training samples (kept only for reference).
            train_labels: Indexable, sized collection of labels to draw from.
        """
        self.train_inputs = train_inputs
        self.train_labels = train_labels

    def compute_predictions(self, test_inputs):
        """Return one randomly chosen training label per test input.

        Every position of the stored training labels is equally likely, so a
        label is drawn in proportion to how often it occurs in the training set.

        Args:
            test_inputs: Sized collection of samples; only its length is used.

        Returns:
            A list with ``len(test_inputs)`` labels.

        Raises:
            ValueError: If predictions are requested but no training labels
                were stored.
        """
        label_count = len(self.train_labels)
        prediction_count = len(test_inputs)
        if prediction_count and not label_count:
            raise ValueError("cannot draw predictions from an empty label collection")
        # Use the labels stored on the instance (not a same-named global), and let
        # randrange cover every index 0..label_count-1 so the last label is reachable.
        return [
            self.train_labels[rd.randrange(label_count)]
            for _ in range(prediction_count)
        ]

# Compute error rates on different classifiers    
class ErrorRate:
    """Computes the misclassification rate of the notebook's classifiers.

    Each public method trains one classifier on the stored training data,
    predicts the stored test inputs and compares the predictions against
    ``test_labels``.
    """

    # test_inputs and test_labels must have the same length: predictions are
    # compared with test_labels position by position.
    def __init__(self, train_inputs, train_labels, test_inputs, test_labels):
        self.train_inputs = train_inputs
        self.train_labels = train_labels
        self.test_inputs = test_inputs
        self.test_labels = test_labels

    def _error_rate(self, result_list):
        """Return the fraction of predictions that differ from ``test_labels``.

        The denominator is ``len(self.test_labels)``, so an empty label
        collection raises ZeroDivisionError.
        """
        error_count = sum(
            1
            for i, predicted in enumerate(result_list)
            if predicted != self.test_labels[i]
        )
        return error_count / len(self.test_labels)

    def random_classifier(self):
        """Error rate of RandomClassifier on the stored test set."""
        rc = RandomClassifier()
        rc.train(self.train_inputs, self.train_labels)
        return self._error_rate(rc.compute_predictions(self.test_inputs))

    def naive_bayes(self):
        """Error rate of NaiveBayesClassifier on the stored test set."""
        nb = NaiveBayesClassifier(self.train_inputs, self.train_labels)
        nb.train()
        return self._error_rate(nb.predict(self.test_inputs))
        

In [11]:
# Seed the stdlib RNG used by RandomClassifier so this baseline is reproducible
# on a fresh "Restart & Run All".
rd.seed(0)

train_inputs = train_df.Comment # Or train_comment_list
train_labels = train_df.Topic # Or train_topic_list
test_inputs = test_comment_list

# Random Classifier predictions for the real test set (kept for optional export)
rc = RandomClassifier()
rc.train(train_inputs, train_labels)
result_list = rc.compute_predictions(test_inputs)

# Random Classifier accuracy, measured against the training labels themselves
er = ErrorRate(train_inputs, train_labels, train_inputs, train_labels)
random_classifier_error_rate = er.random_classifier()
print('Accuracy is ', 1 - random_classifier_error_rate)


Accruracy is  0.04942857142857138


In [184]:
# Disabled on purpose: uncomment to export the random-baseline predictions instead.
# csvexp = CSVExporter()
# csvexp.export(result_list)

In [12]:
import re
# Comments processor, remove punctuations and stopwords
def list_processor(stop_words, comment):
    """Turn a raw comment into a list of lower-cased, non-stopword tokens.

    Args:
        stop_words: Container of lower-case words to discard.
        comment: Raw comment text.

    Returns:
        The surviving tokens, lower-cased, in their original order.
    """
    # Collapse every run of non-alphanumeric characters into a single space
    alnum_only = re.sub('[^A-Za-z0-9]+', ' ', comment)

    kept = []
    for token in word_tokenize(alnum_only):
        lowered = token.lower()
        # Skip stopwords; everything else is kept in lower case
        if lowered in stop_words:
            continue
        kept.append(lowered)

    return kept

In [13]:
# Naive Bayes Classifier
class NaiveBayesClassifier:
    """Multinomial Naive Bayes text classifier with add-one smoothing.

    Attributes set by ``train``:
        unique_topics: Sorted array of the distinct topic labels.
        topic_freq: List with the prior P(topic), aligned with unique_topics.
        percent_dict: Maps each vocabulary word to an array holding the
            smoothed P(word | topic) for every topic, aligned with unique_topics.
    """

    def __init__(self, train_comment_list, train_topic_list):
        self.train_comment_list = train_comment_list
        self.topic_list = train_topic_list
        self.unique_topics = np.unique(train_topic_list)
        self.percent_dict = {}
        self.topic_freq = []

    # Get word frequency dictionary in each topics
    def train(self):
        """Estimate topic priors and smoothed per-topic word probabilities.

        All learned state is rebuilt from scratch, so calling ``train`` more
        than once is safe. Labels may be any 1-D collection np.unique accepts
        (list, tuple, ndarray, Series), not only a list.
        """
        # One pass gives the distinct topics, each sample's topic index and the
        # per-topic sample counts.
        unique_topics, topic_ids, topic_counts = np.unique(
            self.topic_list, return_inverse=True, return_counts=True
        )
        topic_ids = np.asarray(topic_ids).ravel()
        self.unique_topics = unique_topics
        total_count = len(topic_ids)
        # Assign (rather than append) so repeated training does not grow the list.
        self.topic_freq = [int(count) / total_count for count in topic_counts]

        # Count how often each word occurs in the comments of each topic.
        n_topics = len(unique_topics)
        stop_words = set(stopwords.words('english'))
        word_dict = {}
        for i, comment in enumerate(self.train_comment_list):
            topic_id = topic_ids[i]
            for word in list_processor(stop_words, comment):
                if word not in word_dict:
                    word_dict[word] = [0] * n_topics
                word_dict[word][topic_id] += 1

        if not word_dict:
            # No usable word in the training comments: nothing to estimate.
            self.percent_dict = {}
            return

        # P(w|c) = (count(w,c) + 1)/(count(c) + total words)
        vocab_size = len(word_dict)
        values = np.array(list(word_dict.values()))
        sums = np.sum(values, axis=0)
        percents = (values + 1) / (sums + vocab_size)
        self.percent_dict = dict(zip(word_dict.keys(), percents))

    # Return: result label list
    def predict(self, predict_comment_list):
        """Predict the most probable topic for each comment.

        Words that were not seen during training are ignored. Ties are resolved
        in favour of the topic that comes first in ``unique_topics``. If no
        topic has a score above negative infinity, the label is ''.

        Args:
            predict_comment_list: Iterable of raw comment strings.

        Returns:
            A list with one predicted topic label per comment.
        """
        stop_words = set(stopwords.words('english'))
        # Use log probabilities to avoid underflow; the priors are shared by all comments.
        log_prior = np.log(np.asarray(self.topic_freq, dtype=float))

        result = []
        for comment in predict_comment_list:
            scores = log_prior.copy()
            for word in list_processor(stop_words, comment):
                word_probs = self.percent_dict.get(word)
                if word_probs is not None:
                    # Updates the score of every topic at once.
                    scores += np.log(word_probs)

            label = ''
            if scores.size:
                best = int(np.argmax(scores))
                if scores[best] > float("-inf"):
                    label = self.unique_topics[best]
            result.append(label)
        return result
            

In [14]:
# Validation test, split dataset into train set(60000) and validation set(10000)
# The split is positional: the first 60000 samples train the model and the next
# 10000 are held out; no shuffling is applied before slicing.
# NOTE: this rebinds train_inputs / train_labels / test_inputs from the
# random-baseline cell above to plain slices of the original lists.
train_inputs= train_comment_list[:60000]
train_labels= train_topic_list[:60000]
test_inputs= train_comment_list[60000:70000]
test_labels= train_topic_list[60000:70000]
er = ErrorRate(train_inputs, train_labels, test_inputs, test_labels)

In [15]:
# Error rate of Naive Bayes on the held-out validation slice, reported as
# accuracy (1 - error rate).
rate = er.naive_bayes()
print(1-rate)

0.5525


In [16]:
# Export result label list
# Train on the complete training lists (not only the 60000-sample split),
# predict a topic for every test comment and write the predictions to CSV.
nb = NaiveBayesClassifier(train_comment_list, train_topic_list)
nb.train()
result_list = nb.predict(test_comment_list)
csvexp = CSVExporter()
csvexp.export(result_list)

'Export Success!'