In [1]:
import pandas as pd
import numpy as np

My own classifer

In [2]:
NUM_BINS = 10
SMOOTH = 1

class MyMultinomialNaiveBayes:
    def __init__(self): 
        self.prob_label = {}
        self.prob_feature_under_label = {}
        self.trainX = None
        self.trainY = None
        self.feature_cols = []
        self.label_name = ""
        self.train_step = None
        self.train_min = None
        self.class_labels = []
        
    
    '''
    fit is a training process
    '''
    def fit(self, trainX, trainY):
        # The first way to pick values from columns
        # trainX = train_data[["Feature_1", "Feature_2"]]
        # print(trainX)

        #The second way to pick values using "iloc[]"
        # .iloc[ : , : 6] the first : means select all the rows, :6 means select the columns from 0 to 5.
        # trainX = train_data.iloc[:, :6]
        # now trainX is a dataframe.
        # in pandas, dataframe can use matrix calculation

        # the third way to pick values using "loc[]"
        self.trainX = trainX
        self.trainY = trainY
        self.feature_cols = trainX.columns.tolist()

        # initialization
        # trainX.max()means find the max value for each column in the dataframe,
        # when axis=0, it is finding columns max value, when axis=1, finding each row's max value
        # print(trainX.max(axis=0)) 
        # print(trainX.min())
        # calculate the range, divide the bin numbers and get the steps
        self.train_min = self.trainX.min()
        self.train_step = (self.trainX.max() - self.trainX.min()) / NUM_BINS
        
        # decide which bin the feature data in, // means get integers
        self.trainX = (self.trainX - self.trainX.min()) // self.train_step
        
        
        y_unique_labels = self.trainY.unique()
        #pandas中矩阵取长度的两种方法（取rows的长度）
        #方法一：
        total_num_rows = len(self.trainY)
        #方法二
        # total_num_rows = self.trainX.shape[0]
        
        
        for possible_y in y_unique_labels:
            self.class_labels.append(possible_y)
            #y=0时的所有行构成的矩阵，第二遍取出y=1时所有行构成的矩阵
            df_with_possible_y = self.trainX.loc[self.trainY == possible_y, :] 
            self.prob_label[possible_y] = len(df_with_possible_y)/total_num_rows
            
            for col_name in self.feature_cols:
                # find the frequence of bin number in each column(feature)
                feature_value_counts = df_with_possible_y[col_name].value_counts()
                #print(feature_value_counts)
                # feature_value_counts is now a series has tow columns, sth like a dict
                unique_feature_values = feature_value_counts.index.tolist()

                for i in range(NUM_BINS):
                    float_i = float(i)
                    if float_i not in unique_feature_values:
                        self.prob_feature_under_label[(possible_y, col_name, float_i)] \
                            = SMOOTH/(SMOOTH + len(df_with_possible_y))
                    else:
                        self.prob_feature_under_label[(possible_y, col_name, float_i)] = \
                            (feature_value_counts[float_i] + SMOOTH) / (len(df_with_possible_y) + SMOOTH)
        # print(self.prob_feature_under_label)
        # print(self.prob_label)
            
    
    
    def predict(self, testX):
        df = (testX - self.train_min)//self.train_step
        predict_y = []
        
        for i in range(df.shape[0]):
            row = df.iloc[i, :]
            max_prob = 0
            true_y = 0
            for possible_y in self.class_labels:
                prob_product = 1
                prob_c = self.prob_label[possible_y]
                for col_name in self.feature_cols:
                    if row[col_name] < 0:
                        row[col_name] = 0
                    elif row[col_name] > NUM_BINS-1:
                        row[col_name] = NUM_BINS -1
                    if (possible_y, col_name, row[col_name]) in self.prob_feature_under_label:
                        prob_feature = self.prob_feature_under_label[(possible_y, col_name, row[col_name])]
                        prob_product *= prob_feature
                prob_product *= prob_c
                if prob_product > max_prob:
                    max_prob = prob_product
                    true_y = possible_y
            predict_y.append(true_y)
        return predict_y
            
                

train_data = pd.read_csv("hw1_trainingset.csv")
test_data = pd.read_csv("hw1_testset.csv")
my_nb = MyMultinomialNaiveBayes()
my_nb.fit(train_data.iloc[:, : -1], train_data.iloc[:, -1])
predicted_y = my_nb.predict(test_data)

test_data['Label'] = predicted_y
test_data.to_csv("my_multinomial_naive_bayes_predict_result.csv")

My own crossvalidation

In [3]:
from cross_validation import my_cross_validation
scores = my_cross_validation(MyMultinomialNaiveBayes, train_data.iloc[:, : -1], train_data.iloc[:, -1], 10)
print("The cv scores from my cross validation is " + str(scores) + " and the average is " 
      + str(sum(scores) / len(scores)) )


The cv scores from my cross validation is [0.36950146627565983, 0.3861671469740634, 0.37362637362637363, 0.32294617563739375, 0.40816326530612246, 0.3592814371257485, 0.37777777777777777, 0.3575418994413408, 0.31671554252199413, 0.39759036144578314] and the average is 0.3669311446132258


Using sklean

In [4]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import cross_val_score

train_data = pd.read_csv("hw1_trainingset.csv")
test_data = pd.read_csv("hw1_testset.csv")

train_data = train_data.sample(frac=1, random_state=42)
trainX = train_data.iloc[:, : -1]
trainY = train_data.iloc[:, -1]

train_min = trainX.min()
train_step = (trainX.max() - trainX.min()) / NUM_BINS
trainX = (trainX - train_min) / train_step  
        
clf = MultinomialNB()
clf.fit(trainX, trainY)

testX = (test_data - train_min) / train_step  

predicted_y = clf.predict(testX)
test_data['Label'] = predicted_y
test_data.to_csv("sklearn_multinomial_naive_bayes_predict_result.csv")

scores = cross_val_score(MultinomialNB(), trainX, trainY,
                scoring="f1",
                cv=10)

print("The cv scores from sklearn is " + str(scores) + " and the average is " 
      + str(sum(scores) / len(scores)) )

The cv scores from sklearn is [0.33819242 0.29012346 0.34582133 0.34202899 0.33898305 0.3625731
 0.32748538 0.38692098 0.34005764 0.3253012 ] and the average is 0.3397487540783795
