In [1]:
import numpy as np
import pandas as pd
import sklearn
import nltk
import os  
import re  
import copy
import math
from collections import Counter
from operator import itemgetter
from nltk.corpus import stopwords 
from project_utilities import import_train_data, import_test_data

In [2]:
os.chdir(r'C:\Users\35904\Desktop\Mcgill_Study\551ML\project2\train\train')
path = os.getcwd()

train_data = import_train_data(path)
#train_data = pd.DataFrame(import_train_data(path))
os.chdir(r'C:\Users\35904\Desktop\Mcgill_Study\551ML\project2\test\test')
path = os.getcwd()
test_data = import_test_data(path)
#test_data = pd.DataFrame(import_test_data(path))

In [3]:
# find binary features

def split_text (data_list):
    
    splited_text = []
    result = [0]*len(data_list)
    for index, value in enumerate(data_list):
        result[index] = re.split(r'[\s\,\;\:\.\?\!\)\(\'\"\/>\<]+',value['text'].lower())
    return result

def concatenate_all_text(data_list):
    """
    Concatenate and return each datapoint's text key value into one list

    Arguments:
    data_list -- list of dictionary type data to concatenate text from

    Return:
    List of all text from each data point from data_list
    """
    all_text = []
    splited_text = split_text(data_list)
    for index, value in enumerate(splited_text):
        all_text.extend(value)
    return all_text

def filter_stop_word (data_list):
    """
    Function: it is used to filte the stop word out from the top words we have got

    Arguments: 
    top_words -- List of top occuring words in the dataset

    Return: 
    top words without stop word
    """
    all_text = concatenate_all_text(data_list)
    stop_words = set(stopwords.words('english')) 
    top_word_wsw = [w for w in all_text if not w in stop_words] #wsw means without stop word
    return top_word_wsw


def get_top_words(data_list, n_top_words):
    """
    Get list of top words from given dataset

    Arguments:
    data_list -- Dataset to determine top words from 
    n_top_words -- Number of top words (default 160)

    Return:
    List of strings of the top 160 words
    """
    n_top_words = n_top_words + 1
    top_words = []
    
    d = Counter(filter_stop_word(data_list))
    d_sorted = sorted(d.items(), key=itemgetter(1), reverse=True)
    
    assert len(d_sorted) >= n_top_words, 'Too many top words'
    
    for i in range(n_top_words):
        top_words.append(d_sorted[i][0])
        
    while '' in top_words:
        top_words.remove('')  # delate the empety element

    return top_words

def occurrence_top_words (data_point, top_words):
    
    occurrence = [0]*len(top_words)
    
    for index, value in enumerate (top_words):
        occurrence[index] = int (value in data_point['text'])
    
    return occurrence

def insert_top_words_occurrence (data_list,top_words):
    result = copy.deepcopy(data_list)
    
    for index_result, value_result in enumerate (result):
        top_words_occurrence = occurrence_top_words(value_result, top_words)
        
        for index_word, word in enumerate (top_words_occurrence):
            column_name = 'top_word_occurrence_' + str(index_word + 1).zfill(3)
            result[index_result][column_name] = top_words_occurrence[index_word]

    return result

def get_num_pos_neg_train_data (dictionary):
    os.chdir(dictionary)
    path = os.getcwd()

    path_pos = os.path.join(path,'pos')
    path_neg = os.path.join(path,'neg')

    num_pos = len(os.listdir(path_pos))
    num_neg = len(os.listdir(path_neg))
    
    return num_pos,num_neg

def get_probability_pos_neg (num_pos,num_neg):
    
    Pro_pos = num_pos/(num_pos+num_neg)
    Pro_neg = num_neg/(num_pos+num_neg)
    
    return Pro_pos, Pro_neg

def split_dataset_to_pos_neg (data_list, num_pos, num_neg):
    data_list_pos = [0]*num_pos
    data_list_neg = [0]*num_neg
    index_pos = int (0)
    index_neg = int (0)
    for index, value in enumerate (data_list):
        if data_list[index]['category'] == 1:
            data_list_pos[index_pos] = data_list[index]
            index_pos = index_pos + 1
        else:
            data_list_neg[index_neg] = data_list[index]
            index_neg = index_neg + 1
    return data_list_pos, data_list_neg

def get_pro_x1_given_y1_or_x1_given_y0 (input_data):
    
    data_list = copy.deepcopy(input_data)
    
    for index, value in enumerate (data_list):
        del data_list[index]['category']
        del data_list[index]['text']
    
    features_list = list(data_list[0].keys())
    length_pos = len(data_list)
    Pro_pos_feature = [0]*len(features_list)
    add_up = 0

    for index_feature, value_feature in enumerate (features_list):
        for index_data, value_data in enumerate (data_list):
            if data_list[index_data][value_feature] == 1:
                add_up = add_up + 1
        Pro_pos_feature[index_feature] = (add_up+1)/(length_pos+2)
        add_up = 0
    
    return Pro_pos_feature

def pre_processing_test_data (input_data):
    
    id_num = []
    data_list = copy.deepcopy(input_data)
    
    for index, value in enumerate (data_list):
        id_num.append(value['id'])
        del value['id']
        del value['text']

    return data_list, id_num

def make_decision (input_data):
    
    decision_value = 0
    decision = [0]*len(input_data)
    data_list = copy.deepcopy(input_data)
    features_list = list(data_list[0].keys())
    
    for index, value in enumerate (data_list):
        for index_feature, value_feature in enumerate (features_list):
            decision_value = decision_value + value[value_feature]* math.log10(pro_x1_given_y1[index_feature]/pro_x1_given_y0[index_feature])+ (1-value[value_feature])*math.log10((1-pro_x1_given_y1[index_feature])/(1-pro_x1_given_y0[index_feature]))
        if decision_value > 0:
            decision[index] = 1
        else:
            decision[index] = 0
        decision_value = 0
    return decision

In [4]:
top_words = get_top_words(train_data, 150)
train_data = insert_top_words_occurrence (train_data, top_words)
test_data = insert_top_words_occurrence(test_data,top_words)

In [5]:
num_pos,num_neg = get_num_pos_neg_train_data(r'C:\Users\35904\Desktop\Mcgill_Study\551ML\project2\train\train')
Pro_pos, Pro_neg = get_probability_pos_neg(num_pos,num_neg)

In [6]:
train_data_pos,train_data_neg =  split_dataset_to_pos_neg (train_data, num_pos, num_neg)

In [7]:
pro_x1_given_y1 = get_pro_x1_given_y1_or_x1_given_y0(train_data_pos)
pro_x1_given_y0 = get_pro_x1_given_y1_or_x1_given_y0(train_data_neg)

In [8]:
test_data1,id_num = pre_processing_test_data(test_data)

In [9]:
category = []
for index, value in enumerate (train_data):
    category.append(value['category'])
    del train_data[index]['category']
    del train_data[index]['text']

In [10]:
decision = make_decision(train_data)

In [11]:
currect_sum = 0

for index, value in enumerate (category):
    if decision[index] == value:
        currect_sum = currect_sum + 1
        
currect_rate = currect_sum/len(category)

In [12]:
currect_rate

0.74572