In [2]:
from post_parser_record import PostParserRecord
from collections import Counter

## Getting the top-20 frequent tags in LawSE -- There is a reason for passing 21
def get_frequent_tags(post_parser, topk=21):
  lst_tags = []
  for question_id in post_parser.map_questions:
    question = post_parser.map_questions[question_id]
    creation_date_year = int(question.creation_date.split("-")[0])
    tag = question.tags[0]
    lst_tags.append(tag)
  tag_freq_dic = dict(Counter(lst_tags))
  tag_freq_dic = dict(sorted(tag_freq_dic.items(), key=lambda item: item[1], reverse=True))
  return list(tag_freq_dic.keys())[:topk]

In [3]:
# Getting dictionary of train and test samples in form of
# key: tag value: list of tuples in form of (title, body)
def build_train_test(post_parser, lst_frequent_tags):
  dic_training = {}
  dic_test = {}
  for question_id in post_parser.map_questions:
    question = post_parser.map_questions[question_id]
    creation_date_year = int(question.creation_date.split("-")[0])
    tag = question.tags[0]
    if tag in lst_frequent_tags:
      title = question.title
      body = question.body
      if creation_date_year > 2021:
        if tag in dic_test:
          dic_test[tag].append((title, body))
        else:
          dic_test[tag] = [(title, body)]
      else:
        if tag in dic_training:
          dic_training[tag].append((title, body))
        else:
          dic_training[tag] = [(title, body)]
  return dic_test, dic_training

In [4]:
import re

# Removes HTML tags
def remove_tags(string):
    return re.sub("<[^>]*>", "", string)

In [5]:
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords

# Breaks up sentence into list of words with no stop words.
def sentence_to_words(str):
    tokenizer = RegexpTokenizer(r'\w+\'\w+|\w+')
    stop_words = set(stopwords.words('english'))
    words = tokenizer.tokenize(remove_tags(str.lower()))
    word_no_stop = []
    for word in words:
        if word not in stop_words:
            word_no_stop.append(word)
    return word_no_stop

In [6]:
def naive_bayes(str):
    # 
    #
    # Return
    # dict
    #   wc: count number
    #   word_freq: dict
    #      word: probability
    native_bayes_dict = {}
    word_dict = {}

    words = sentence_to_words(str)
    count = 0
    for word in words:
        if word in word_dict:
            word_dict[word] = word_dict[word] + 1
        else:
            word_dict[word] = 1
        count = count + 1
    
    for word in word_dict:
        word_dict[word] = (word_dict[word] + 1) / (count + 1) 
    
    native_bayes_dict["wc"] = count
    native_bayes_dict["word_freq"] = word_dict
    return native_bayes_dict

In [7]:
# Change a bunch of data into on string
def data_to_string(data):
    lst = []
    for i in data:
        lst.append("".join(i))
    return "".join(lst)

In [8]:

def get_native_dic(dic_training, training_data_count):
    
    native_bayes = {}

    for item in dic_training:
        native_bayes_info = {}
        native_bayes_info["probability"] = naive_bayes(data_to_string(dic_training[item]))
        native_bayes_info["normal"] = len(dic_training[item]) / training_data_count
        native_bayes[item] = native_bayes_info
    
    return native_bayes

In [9]:
import math

# Gets answer result of one native bayes model with results.
def answer_privacy_get(native_bayes, answer_privacy):
  test = {}

  for item in native_bayes:
    total = math.log(native_bayes[item]["normal"])
    for word in words:
      if word in native_bayes[item]["probability"]["word_freq"]:
        total = total + math.log(native_bayes[item]["probability"]["word_freq"][word])
      else:
        total = total + math.log(1 / (native_bayes[item]["probability"]["wc"] + 1))
    
    test[item] = total

  maxProp = 0
  maxItem = ""
  for item in test:
    if maxProp == 0 or test[item] > maxProp:
      maxProp = test[item]
      maxItem = item


  if maxItem in answer_privacy:
    answer_privacy[maxItem] = answer_privacy[maxItem] + 1
  else:
    answer_privacy[maxItem] = 1
  
  return answer_privacy

In [10]:
# Displays the results from the data to console with some statistics on F scores.
def displayAnswers(data, items):
  print("                            crimina  copyrig       US       UK   employ internat   canada intel-pr eng-wale       EU  license californ internet business   rental software contract  privacy constitu     gdpr")
  micro_precision_lst = []
  numerator_precision_total = 0
  denominator_precision_total = 0

  micro_accurate_lst = []
  numerator_accurate_total = 0
  denominator_accurate_total = 0

  for item in items:
    numerator_precision = 0
    denominator_precision = 0

    numerator_accurate = 0
    denominator_accurate = 0
    for item2 in items:
      if not item2 in data[item]:
        data[item][item2] = 0
      if not item in data[item2]:
        data[item2][item] = 0
      if item2 == item:
        numerator_precision = data[item][item]
        numerator_accurate = data[item][item]
        denominator_precision = data[item][item] + denominator_precision
        denominator_accurate = data[item][item] + denominator_accurate
      else:
        denominator_precision = data[item][item2] + denominator_precision
        denominator_accurate = data[item2][item] + denominator_accurate

    print("%25s: %8d %8d %8d %8d %8d %8d %8d %8d %8d %8d %8d %8d %8d %8d %8d %8d %8d %8d %8d %8d" % (item, data[item]["criminal-law"], data[item]["copyright"], data[item]["united-states"], data[item]["united-kingdom"],  data[item]["employment"], data[item]["international"], data[item]["canada"], data[item]["intellectual-property"], data[item]["england-and-wales"], data[item]["european-union"], data[item]["licensing"], data[item]["california"],  data[item]["internet"], data[item]["business"], data[item]["rental-property"], data[item]["software"], data[item]["contract-law"], data[item]["privacy"], data[item]["constitutional-law"],  data[item]["gdpr"]))
    
    numerator_precision_total = numerator_precision_total + numerator_precision
    denominator_precision_total = denominator_precision_total + denominator_precision

    numerator_accurate_total = numerator_accurate_total + numerator_accurate
    denominator_accurate_total = denominator_accurate_total + denominator_accurate

    # print()
    if denominator_precision == 0:
      micro_precision_lst.append(1)
    else:
      micro_precision_lst.append(numerator_precision / denominator_precision)
    
    if denominator_accurate == 0:
      micro_accurate_lst.append(1)
    else:
      micro_accurate_lst.append(numerator_accurate / denominator_accurate)

  macro_p = numerator_precision_total / (denominator_precision_total + numerator_precision_total)
  macro_a = numerator_accurate_total / (denominator_accurate_total + numerator_accurate_total)
  micro_p = 0
  micro_a = 0

  print("Macro Precision: " + str(macro_p))
  micro_total = 0
  print(str(micro_precision_lst))
  for num in micro_precision_lst:
    micro_total = micro_total + num
  micro_p = micro_total / len(micro_precision_lst)
  print("Micro Precision: " + str(micro_p))


  print("Macro Accurate:  " + str(macro_a))
  micro_total = 0
  print(str(micro_accurate_lst))
  for num in micro_accurate_lst:
    micro_total = micro_total + num
  micro_a = micro_total / len(micro_accurate_lst)
  print("Micro Accurate:  " + str(micro_a))


  print("F1 Macro Score:  " + str((2 * macro_p * macro_a) / (macro_p + macro_a)))
  print("F1 Micro Score:  " + str((2 * micro_p * micro_a) / (micro_p + micro_a)))

In [11]:
def get_data():
  post_parser = PostParserRecord("Posts_law.xml")
  lst_frequent_tags = get_frequent_tags(post_parser)
  # We removed contract as it had no post after 2021
  lst_frequent_tags.remove("contract")
  dic_test, dic_training = build_train_test(post_parser, lst_frequent_tags)
  print("class\t#training\t#test")
  training_data_count = 0
  for item in dic_training:
    training_data_count = training_data_count + len(dic_training[item])
    # print(str(item) + "\t" +str(len(dic_training[item]))+"\t"+str(len(dic_test[item])))

  return training_data_count, dic_training, dic_test

In [12]:
# Question titles and post
training_data_count, dic_training, dic_test = get_data()

native_bayes = get_native_dic(dic_training, training_data_count)

testResult = {}

for testItem in dic_test:
  answer_privacy = {}

  for post in dic_test[testItem]:
    words = sentence_to_words(" ".join(post)) # Question titles and post
    test = {}

    answer_privacy = answer_privacy_get (native_bayes, answer_privacy)

  testResult[testItem] = answer_privacy


itemss = dic_training.keys()

displayAnswers(testResult, itemss)

class	#training	#test
                            crimina  copyrig       US       UK   employ internat   canada intel-pr eng-wale       EU  license californ internet business   rental software contract  privacy constitu     gdpr
             criminal-law:       18        0        0        0        0        6        1        0       10        3        0        0        5       14        3        1        1        1       15        0
                copyright:        0       68        0        0        0        1        0       27        0        0       50        1        9       12        0       11        0        0        0        2
            united-states:       61       21        9        1       43       33       15       36       22       16       20       40       15      223       25       17       19       25      214        8
           united-kingdom:       13        0        0       11       18        5        2        4       35       25        0        4        3       

In [13]:
# Just question titles

training_data_count, dic_training, dic_test = get_data()

native_bayes = get_native_dic(dic_training, training_data_count)

testResult = {}

for testItem in dic_test:
  answer_privacy = {}

  for post in dic_test[testItem]:
    words = sentence_to_words(post[0]) # Question titles
    test = {}

    answer_privacy = answer_privacy_get (native_bayes, answer_privacy)

  testResult[testItem] = answer_privacy


itemss = dic_training.keys()

displayAnswers(testResult, itemss)

class	#training	#test
                            crimina  copyrig       US       UK   employ internat   canada intel-pr eng-wale       EU  license californ internet business   rental software contract  privacy constitu     gdpr
             criminal-law:       23        1       18        1        1        9        5        0        4        2        0        0        1        2        0        1        1        4        5        0
                copyright:        0      131        2        0        0        1        0       11        1        1       15        0        7        2        0        7        1        0        0        2
            united-states:       66       54      207        3       25       35       18       20       25       21       14       33       15       98       13       25       35       24      121       11
           united-kingdom:       20        1       21       57       16        9        7        6       17       16        1        5        2       

In [None]:
# Question posts
training_data_count, dic_training, dic_test = get_data()

native_bayes = get_native_dic(dic_training, training_data_count)

testResult = {}

for testItem in dic_test:
  answer_privacy = {}

  for post in dic_test[testItem]:
    words = sentence_to_words(" ".join(post[1:len(post)])) # Question posts
    test = {}

    answer_privacy = answer_privacy_get (native_bayes, answer_privacy)

  testResult[testItem] = answer_privacy


itemss = dic_training.keys()

displayAnswers(testResult, itemss)

class	#training	#test
                            crimina  copyrig       US       UK   employ internat   canada intel-pr eng-wale       EU  license californ internet business   rental software contract  privacy constitu     gdpr
             criminal-law:       17        0        1        1        1        7        1        0       10        4        0        0        3       13        3        1        1        1       14        0
                copyright:        0       61        1        0        0        1        0       23        1        0       54        2        9       15        0       11        0        1        0        2
            united-states:       56       20       18        0       42       33       12       36       22       16       21       40       13      228       26       16       18       29      209        8
           united-kingdom:       15        0        1        9       16        5        2        4       33       21        0        4        3       

: 