In [16]:
import os
import re  
import math
import sklearn
from project_utilities import import_train_data, import_test_data

### Load train_data and test_data from files:

In [17]:
# Load the training data through the project helper, which is given the current directory.
# NOTE(review): the absolute Windows path is machine-specific, so this cell only
# runs on a computer that has exactly this folder layout.
os.chdir(r'C:\Users\35904\Desktop\Mcgill_Study\551ML\project2\train\train')
path = os.getcwd()
row_data = import_train_data(path)

# Same procedure for the test data; the process working directory stays changed
# to the test folder after this cell.
os.chdir(r'C:\Users\35904\Desktop\Mcgill_Study\551ML\project2\test\test')
path = os.getcwd()
test_data = import_test_data(path)

### Define helper functions:

In [18]:
from collections import Counter
from operator import itemgetter
from nltk.corpus import stopwords 
import copy
import nltk

def split_train_validation_data(input_data):
    """
    Split the full data set into a training part and a validation part.

    The input is deep-copied first, so the returned lists share no objects
    with it. Positions 0-9999 and 12500-22499 form the training part,
    positions 10000-12499 and 22500-24999 form the validation part.

    Arguments:
    input_data -- list of data points; presumably the first 12500 entries are
                  positive and the remaining ones negative (TODO confirm
                  against import_train_data)

    Return:
    Tuple (train_data, validation_data) of two new lists
    """
    snapshot = copy.deepcopy(input_data)
    training = snapshot[:10000] + snapshot[12500:22500]
    held_out = snapshot[10000:12500] + snapshot[22500:25000]
    return training, held_out

def split_text(data_list):
    """
    Lower-case and tokenize the 'text' value of every data point.

    The text is split on runs of whitespace and the punctuation characters
    , ; : . ? ! ) ( ' " / > <. A text that starts or ends with one of these
    delimiters yields an empty string at that end of its token list.

    Arguments:
    data_list -- list of dictionaries that each have a 'text' key

    Return:
    List with one token list per data point, in input order
    """
    delimiters = r'[\s\,\;\:\.\?\!\)\(\'\"\/>\<]+'
    return [re.split(delimiters, entry['text'].lower()) for entry in data_list]

def concatenate_all_text(data_list):
    """
    Flatten the tokens of all data points into a single list.

    Arguments:
    data_list -- list of dictionaries that each have a 'text' key

    Return:
    One list holding every token produced by split_text, data point after
    data point, in input order
    """
    return [token for tokens in split_text(data_list) for token in tokens]

def filter_stop_word(data_list):
    """
    Tokenize all texts of the data set and drop the English stop words.

    Arguments:
    data_list -- list of dictionaries that each have a 'text' key

    Return:
    List of all tokens (duplicates kept, original order) that are not in
    NLTK's English stop-word list
    """
    tokens = concatenate_all_text(data_list)
    english_stop_words = set(stopwords.words('english'))
    kept = []
    for token in tokens:
        if token not in english_stop_words:
            kept.append(token)
    return kept


def get_top_words(data_list, n_top_words):
    """
    Get the most frequent non-stop-words of the given data set.

    Arguments:
    data_list -- Dataset to determine top words from
    n_top_words -- Number of top words

    Return:
    List of exactly n_top_words strings, most frequent first; words with
    equal counts keep the order in which they were first encountered

    Raises:
    AssertionError -- if the data set has fewer than n_top_words distinct
                      non-empty, non-stop-word tokens
    """
    # The tokenizer emits '' when a text starts or ends with a delimiter.
    # Dropping it before counting means it can never occupy a top-word slot,
    # so the result always has exactly n_top_words entries (the former
    # "take n + 1, then remove ''" approach returned n + 1 words whenever ''
    # was not among the most frequent tokens).
    word_counts = Counter(word for word in filter_stop_word(data_list) if word != '')

    assert len(word_counts) >= n_top_words, 'Too many top words'

    return [word for word, _ in word_counts.most_common(n_top_words)]

def occurrence_top_words(data_point, top_words):
    '''
    Build the binary occurrence vector of the top words for one data point.

    The text is lower-cased and tokenized with the same delimiter pattern that
    split_text uses, and each top word is looked up as a whole token. A plain
    substring test on the raw text would count 'the' inside "Another" and
    would miss "Great" for the lower-case top word 'great'.

    Argument:
    data_point: dictionary with a 'text' key
    top_words: list of lower-case words to look for

    Return:
    A list with 1 where the corresponding top word occurs in the text and
    0 where it does not, in the order of top_words
    '''
    # Keep this pattern identical to the one in split_text.
    tokens = set(re.split(r'[\s\,\;\:\.\?\!\)\(\'\"\/>\<]+', data_point['text'].lower()))
    return [int(word in tokens) for word in top_words]

def insert_top_words_occurrence(data_list, top_words):
    """
    Return a deep copy of the data set with one occurrence feature per top word.

    Each record of the copy gets the keys 'top_word_occurrence_001',
    'top_word_occurrence_002', ... (numbers are zero-padded to at least three
    digits; positions above 999 simply use more digits) holding the 0/1 values
    returned by occurrence_top_words. The input list is not modified.

    Arguments:
    data_list -- list of dictionaries that each have a 'text' key
    top_words -- list of words; position i becomes feature number i + 1

    Return:
    New list of dictionaries with the additional feature keys
    """
    # Build every column name once so that all records share the same key
    # string objects instead of allocating len(data_list) * len(top_words)
    # separate strings.
    column_names = ['top_word_occurrence_' + str(position + 1).zfill(3)
                    for position in range(len(top_words))]

    result = copy.deepcopy(data_list)
    for record in result:
        flags = occurrence_top_words(record, top_words)
        record.update(zip(column_names, flags))

    return result

def get_num_pos_neg_train_data(input_data):
    """
    Count the positive and negative data points.

    The data is only read, so it is not copied; deep-copying the whole data
    set for a read-only pass needlessly doubles peak memory.

    Arguments:
    input_data -- list of dictionaries that each have a 'category' key

    Return:
    Tuple (counter_pos, counter_neg): the number of records whose category
    equals 1 and the number of all remaining records
    """
    counter_pos = sum(1 for record in input_data if record['category'] == 1)
    counter_neg = len(input_data) - counter_pos
    return counter_pos, counter_neg


def get_probability_pos_neg(num_pos, num_neg):
    """
    Turn the class counts into class probabilities.

    Arguments:
    num_pos -- number of positive data points
    num_neg -- number of negative data points

    Return:
    Tuple (Pro_pos, Pro_neg) with each count divided by the sum of both;
    a ZeroDivisionError is raised when both counts are 0
    """
    total = num_pos + num_neg
    return num_pos / total, num_neg / total

def split_dataset_to_pos_neg(input_data, num_pos, num_neg):
    """
    Partition a deep copy of the data set into positive and negative records.

    The result lists are grown by appending, so they are correct even when
    the supplied counts do not match the data (pre-sizing the lists from the
    counts raised IndexError for counts that were too small and left 0
    placeholders for counts that were too large).

    Arguments:
    input_data -- list of dictionaries that each have a 'category' key
    num_pos -- expected number of positive records; kept for backward
               compatibility, not needed to build the result
    num_neg -- expected number of negative records; kept for backward
               compatibility, not needed to build the result

    Return:
    Tuple (data_list_pos, data_list_neg): copies of the records whose
    category equals 1 and copies of all other records, in input order
    """
    data_list_pos = []
    data_list_neg = []
    for record in copy.deepcopy(input_data):
        if record['category'] == 1:
            data_list_pos.append(record)
        else:
            data_list_neg.append(record)
    return data_list_pos, data_list_neg

def get_pro_x1_given_y1_or_x1_given_y0(input_data):
    """
    Estimate P(x_j = 1 | class) for every feature from the records of one class.

    Every key of the first record except 'category' and 'text' is treated as
    a feature. The estimate uses add-one smoothing:
    (number of records with feature == 1, plus 1) / (number of records, plus 2).

    The data is only read; the non-feature keys are skipped instead of being
    deleted from a deep copy of the whole data set.

    Arguments:
    input_data -- non-empty list of dictionaries that all belong to the same
                  class and all contain the feature keys of the first record

    Return:
    List of probabilities, one per feature, in the key order of the first
    record

    Raises:
    IndexError -- if input_data is empty
    """
    features_list = [key for key in input_data[0].keys() if key not in ('category', 'text')]
    n_records = len(input_data)

    return [
        (sum(1 for record in input_data if record[feature] == 1) + 1) / (n_records + 2)
        for feature in features_list
    ]

def make_decision(input_data, prob_given_pos=None, prob_given_neg=None):
    """
    Classify every record by the sign of its log-likelihood ratio.

    For each record the function sums, over all features j,
    x_j * log10(p1_j / p0_j) + (1 - x_j) * log10((1 - p1_j) / (1 - p0_j)),
    where p1_j and p0_j are the probabilities of x_j = 1 given the positive
    and the negative class. The class prior term log(P(y=1) / P(y=0)) is left
    out, which is only valid when both classes are equally likely.

    Arguments:
    input_data -- list of dictionaries holding only feature keys; every key
                  of the first record is treated as a feature
    prob_given_pos -- list of P(x_j = 1 | y = 1) in feature order; defaults
                      to the notebook-level variable pro_x1_given_y1
    prob_given_neg -- list of P(x_j = 1 | y = 0) in feature order; defaults
                      to the notebook-level variable pro_x1_given_y0

    Return:
    List with 1 for records whose summed score is > 0 and 0 otherwise;
    an empty list for empty input
    """
    if len(input_data) == 0:
        return []

    # Fall back to the notebook globals so existing one-argument calls keep working.
    if prob_given_pos is None:
        prob_given_pos = pro_x1_given_y1
    if prob_given_neg is None:
        prob_given_neg = pro_x1_given_y0

    features_list = list(input_data[0].keys())

    # The log ratios depend only on the feature, so compute them once instead
    # of once per record.
    weight_present = [math.log10(prob_given_pos[j] / prob_given_neg[j])
                      for j in range(len(features_list))]
    weight_absent = [math.log10((1 - prob_given_pos[j]) / (1 - prob_given_neg[j]))
                     for j in range(len(features_list))]

    decision = []
    for record in input_data:
        score = 0
        for j, feature in enumerate(features_list):
            x = record[feature]
            score = score + x * weight_present[j] + (1 - x) * weight_absent[j]
        decision.append(1 if score > 0 else 0)
    return decision

def get_correction_rate(classification, reference):
    """
    Compute the fraction of predictions that equal the reference labels.

    Arguments:
    classification -- predicted labels, indexable by position
    reference -- true labels; its length determines how many positions are
                 compared

    Return:
    Number of matching positions divided by len(reference); a
    ZeroDivisionError is raised when reference is empty
    """
    hits = sum(
        1
        for position, expected in enumerate(reference)
        if classification[position] == expected
    )
    return hits / len(reference)

def get_category_list(input_data):
    """
    Collect the 'category' value of every data point.

    The data is only read, so it is not deep-copied first.

    Arguments:
    input_data -- list of dictionaries that each have a 'category' key

    Return:
    List of the category values in input order
    """
    return [record['category'] for record in input_data]

def pre_processing_test_data(input_data):
    """
    Separate the ids from the test records and drop the raw text.

    Works on a deep copy, so the input list and its dictionaries are left
    untouched.

    Arguments:
    input_data -- list of dictionaries that each have an 'id' and a 'text' key

    Return:
    Tuple (data_list, id_num): the copied records without their 'id' and
    'text' keys, and the list of removed ids in the same order
    """
    stripped = copy.deepcopy(input_data)
    id_num = []

    for record in stripped:
        id_num.append(record.pop('id'))
        del record['text']

    return stripped, id_num

### Split the data into train_data (20000) and validation_data (5000):

In [19]:
# Deep-copies row_data and splits the copy by fixed index ranges into a training and a validation list.
train_data, validation_data = split_train_validation_data(row_data)

### Get the top-word features and insert them into the data sets:

In [20]:
# Choose the most frequent non-stop-words of the training split, then attach one
# 'top_word_occurrence_NNN' entry per word to every training and validation record.
# NOTE(review): the saved output of this cell is a MemoryError; 10000 extra
# dictionary entries per record appear to be too much for the available memory.
top_words = get_top_words(train_data, 10000)
train_data = insert_top_words_occurrence (train_data, top_words)
validation_data = insert_top_words_occurrence(validation_data,top_words)

MemoryError: 

### Count the positive and negative comments, then calculate the class probabilities:

In [None]:
# Count the records with category == 1 and all other records, then convert both counts to relative frequencies.
# NOTE(review): Pro_pos and Pro_neg are not referenced by any later cell of this notebook.
num_pos,num_neg = get_num_pos_neg_train_data(train_data)
Pro_pos, Pro_neg = get_probability_pos_neg(num_pos,num_neg)

### Split train_data into a positive data set and a negative data set:

In [None]:
# Partition a deep copy of the training records by category, using the counts computed above.
train_data_pos,train_data_neg =  split_dataset_to_pos_neg (train_data, num_pos, num_neg)

### Calculate the probability of xj = 1 given y = 1 or y = 0:

In [None]:
# Smoothed per-feature probabilities of x_j = 1 for each class.
# make_decision reads these two names as notebook-level globals, so this cell must run before it is called.
pro_x1_given_y1 = get_pro_x1_given_y1_or_x1_given_y0(train_data_pos)
pro_x1_given_y0 = get_pro_x1_given_y1_or_x1_given_y0(train_data_neg)

### Generate the lists of categories (used for comparison):

In [None]:
# Save the true labels now; the next code cell deletes the 'category' key from both data sets.
category_train_data = get_category_list(train_data)
category_validation_data = get_category_list(validation_data)

### Make decisions and calculate the correct-classification rate on train_data and validation_data:

In [None]:
# make_decision treats every key of the first record as a feature, so the
# non-feature keys are removed first. pop(key, None) instead of del keeps this
# cell re-runnable: a second execution finds the keys already gone and simply
# does nothing instead of raising KeyError.
for record in train_data:
    record.pop('category', None)
    record.pop('text', None)

decision_train_data = make_decision(train_data)

for record in validation_data:
    record.pop('category', None)
    record.pop('text', None)

decision_validation_data = make_decision(validation_data)

In [None]:
# Fraction of predictions that equal the labels saved before the 'category' keys were removed.
currect_rate_train_data = get_correction_rate(decision_train_data, category_train_data)
currect_rate_validation_data = get_correction_rate(decision_validation_data, category_validation_data)

In [None]:
# Report the fraction of correctly classified records for both splits.
print('currention rate of train_data is :', currect_rate_train_data)
print('currention rate of validation_data is :', currect_rate_validation_data)

### Make prediction on test data:

In [13]:
# Attach the same top-word occurrence features to the test records.
# NOTE(review): this cell's execution count (13) is lower than that of the cells
# above (16-20), so its saved state comes from an earlier run; re-run top to bottom.
test_data = insert_top_words_occurrence (test_data, top_words)

In [14]:
# Separate the ids and drop 'id' and 'text' so only feature keys remain for make_decision.
# Rebinding test_data means re-running the previous cell afterwards fails, because 'text' is gone.
test_data,id_num = pre_processing_test_data(test_data)

In [15]:
# Predict a 0/1 label for every test record; the order matches id_num.
decision_test_data = make_decision(test_data)