Logistic Regression 
(data split into 4 categories — bold & italic, bold & non-italic, non-bold & italic, non-bold & non-italic — with a separate model trained on each)

In [1]:
import numpy as np
import pandas
from sklearn import linear_model
from sklearn import metrics
import matplotlib.pyplot as plt

from sklearn.linear_model import LogisticRegression

In [2]:
# Load each CSV into a DataFrame and keep a plain NumPy view of its contents
# for the modelling code further down.
Trainfile = pandas.read_csv('train_data.csv')
X_train = Trainfile.to_numpy()

Trainlabels = pandas.read_csv('train_labels.csv')
Y_train = Trainlabels.to_numpy()

Testfile = pandas.read_csv('test_data.csv')
X_test = Testfile.to_numpy()

In [3]:
# Sanity check: shape of the training-label array (rows, columns).
Y_train.shape

(65000, 1)

In [4]:
# Sanity check: shape of the training-feature array (rows, columns).
X_train.shape

(65000, 407)

In [5]:
def train_and_test(model, filename, print=False):
    """Fit `model` on the full training set and write its test predictions to a CSV.

    Parameters
    ----------
    model : estimator exposing `fit(X, y)` and `predict(X)`
        Trained in place on the module-level `X_train` / `Y_train`.
    filename : str or path-like
        Destination of the prediction file. It has the header ``ID,Font`` and
        one ``<1-based row number>,<prediction>`` line per row of `X_test`.
    print : bool, default False
        When true, the training accuracy is reported on stdout.
    """
    # The `print` parameter shadows the builtin inside this function, so the
    # real print function has to be reached through the `builtins` module.
    import builtins

    labels = Y_train.flatten()
    model.fit(X_train, labels)
    if print:
        train_accuracy = metrics.accuracy_score(labels, model.predict(X_train))
        builtins.print('training accuracy:', train_accuracy)
    Y_test_pred = model.predict(X_test).astype(object)
    # Pair every prediction with its 1-based row id; the object dtype lets the
    # "%d,%s" format handle integer ids next to arbitrary label values.
    rows = np.dstack((np.arange(1, Y_test_pred.size + 1), Y_test_pred))[0]
    np.savetxt(filename, rows, "%d,%s", header="ID,Font", comments="")

In [6]:
def filter_boldanditalic(x,y=None):
    """Select the bold and italic examples.

    A row qualifies when column 1 of `x` equals 0.7 and column 2 equals 1.

    Returns `(x_rows, y_rows, idx)` when `y` is given, otherwise
    `(x_rows, idx)`. `idx` is a one-element tuple holding the array of
    matching row positions.
    """
    is_bold = x[:, 1] == 0.7
    is_italic = x[:, 2] == 1.
    idx = np.nonzero(is_bold & is_italic)
    if y is None:
        return x[idx], idx
    return x[idx], y[idx], idx

def filter_boldandnonitalic(x,y=None):
    """Select the bold and non-italic examples.

    A row qualifies when column 1 of `x` equals 0.7 and column 2 equals 0.

    Returns `(x_rows, y_rows, idx)` when `y` is given, otherwise
    `(x_rows, idx)`. `idx` is a one-element tuple holding the array of
    matching row positions.
    """
    is_bold = x[:, 1] == 0.7
    is_upright = x[:, 2] == 0.
    idx = np.nonzero(is_bold & is_upright)
    if y is None:
        return x[idx], idx
    return x[idx], y[idx], idx

def filter_nonboldanditalic(x,y=None):
    """Select the non-bold and italic examples.

    A row qualifies when column 1 of `x` equals 0.4 and column 2 equals 1.

    Returns `(x_rows, y_rows, idx)` when `y` is given, otherwise
    `(x_rows, idx)`. `idx` is a one-element tuple holding the array of
    matching row positions.
    """
    is_regular_weight = x[:, 1] == 0.4
    is_italic = x[:, 2] == 1.
    idx = np.nonzero(is_regular_weight & is_italic)
    if y is None:
        return x[idx], idx
    return x[idx], y[idx], idx

def filter_nonboldandnonitalic(x,y=None):
    """Select the non-bold and non-italic examples.

    A row qualifies when column 1 of `x` equals 0.4 and column 2 equals 0.

    Returns `(x_rows, y_rows, idx)` when `y` is given, otherwise
    `(x_rows, idx)`. `idx` is a one-element tuple holding the array of
    matching row positions.
    """
    is_regular_weight = x[:, 1] == 0.4
    is_upright = x[:, 2] == 0.
    idx = np.nonzero(is_regular_weight & is_upright)
    if y is None:
        return x[idx], idx
    return x[idx], y[idx], idx

In [7]:
# Split the training data into the four style subsets, in this order:
# (bold, italic), (bold, non-italic), (non-bold, italic), (non-bold, non-italic).
# The row indices returned for the training split are not needed afterwards.
(X_train_bolditalic,
 Y_train_bolditalic,
 _) = filter_boldanditalic(X_train, Y_train)
(X_train_boldnonitalic,
 Y_train_boldnonitalic,
 _) = filter_boldandnonitalic(X_train, Y_train)
(X_train_nonbolditalic,
 Y_train_nonbolditalic,
 _) = filter_nonboldanditalic(X_train, Y_train)
(X_train_nonboldnonitalic,
 Y_train_nonboldnonitalic,
 _) = filter_nonboldandnonitalic(X_train, Y_train)

# Split the test data the same way. The row indices (bi, bni, nbi, nbni) are
# kept so the per-subset predictions can be put back in original row order.
(X_test_bolditalic, bi) = filter_boldanditalic(X_test)
(X_test_boldnonitalic, bni) = filter_boldandnonitalic(X_test)
(X_test_nonbolditalic, nbi) = filter_nonboldanditalic(X_test)
(X_test_nonboldnonitalic, nbni) = filter_nonboldandnonitalic(X_test)

In [8]:
# standard normalization
def normalize_std(X, reference=None):
    """Standardize `X` column-wise using the mean and std of a reference set.

    Parameters
    ----------
    X : array-like, shape (n_rows, n_columns)
        Data to normalize.
    reference : array-like, optional
        Data the per-column mean and standard deviation are computed from.
        Defaults to the module-level `X_train`, so train and test subsets
        are all scaled with the same full-training-set statistics.

    Returns
    -------
    ndarray
        `(X - mean) / std`, column by column. Columns that are constant in
        the reference set (std == 0) are divided by 1 instead, so they come
        out as 0 rather than NaN/inf.
    """
    if reference is None:
        reference = X_train
    mean = np.mean(reference, axis=0)
    std = np.std(reference, axis=0)
    # Guard against division by zero for constant columns.
    safe_std = np.where(std == 0, 1.0, std)
    return (X - mean) / safe_std

# Standardize every per-style subset through normalize_std.
# NOTE: each name is rebound to its normalized version, so running this cell a
# second time would normalize data that has already been normalized.
(X_train_bolditalic,
 X_train_boldnonitalic,
 X_train_nonbolditalic,
 X_train_nonboldnonitalic) = [
    normalize_std(subset)
    for subset in (X_train_bolditalic, X_train_boldnonitalic,
                   X_train_nonbolditalic, X_train_nonboldnonitalic)
]

(X_test_bolditalic,
 X_test_boldnonitalic,
 X_test_nonbolditalic,
 X_test_nonboldnonitalic) = [
    normalize_std(subset)
    for subset in (X_test_bolditalic, X_test_boldnonitalic,
                   X_test_nonbolditalic, X_test_nonboldnonitalic)
]

In [15]:
def _fit_style_subset(X_subset_train, Y_subset_train, X_subset_test, label):
    """Train one multinomial logistic regression on a single style subset.

    The first three columns are dropped before fitting. Columns 1 and 2 are
    the values the subsets were split on; column 0 is dropped as well
    (NOTE(review): presumably an id/metadata column — confirm).

    Prints the subset's training accuracy, prefixed with `label`, and returns
    `(model, train_accuracy, test_predictions)`.
    """
    features = X_subset_train[:, 3:]
    targets = Y_subset_train.flatten()
    model = linear_model.LogisticRegression(multi_class='multinomial', max_iter=4000)
    model.fit(features, targets)
    train_accuracy = metrics.accuracy_score(targets, model.predict(features))
    print(label + ' train accuracy:', train_accuracy)
    if len(X_subset_test) == 0:
        # predict() rejects an empty input; an empty subset has no predictions.
        test_predictions = np.empty(0, dtype=model.classes_.dtype)
    else:
        test_predictions = model.predict(X_subset_test[:, 3:])
    return model, train_accuracy, test_predictions


# 1. bold and italic
lr, acc_a, Y_test_bi = _fit_style_subset(
    X_train_bolditalic, Y_train_bolditalic, X_test_bolditalic, 'bold and italic')

# 2. bold and non-italic
lr, acc_b, Y_test_bni = _fit_style_subset(
    X_train_boldnonitalic, Y_train_boldnonitalic, X_test_boldnonitalic, 'bold and nonitalic')

# 3. non-bold and italic
lr, acc_c, Y_test_nbi = _fit_style_subset(
    X_train_nonbolditalic, Y_train_nonbolditalic, X_test_nonbolditalic, 'nonbold and italic')

# 4. non-bold and non-italic
lr, acc_d, Y_test_nbni = _fit_style_subset(
    X_train_nonboldnonitalic, Y_train_nonboldnonitalic, X_test_nonboldnonitalic, 'nonbold and nonitalic')

# Overall training accuracy: weight each subset's accuracy by its size.
# TOTAL TRAINING ACCURACY: 58.09692307692308%  (value recorded from a previous run)
num_correct_exs = (
    acc_a * len(Y_train_bolditalic)
    + acc_b * len(Y_train_boldnonitalic)
    + acc_c * len(Y_train_nonbolditalic)
    + acc_d * len(Y_train_nonboldnonitalic)
)
print('\033[1mLogistic Regression training accuracy:', num_correct_exs / len(Y_train))

bold and italic train accuracy: 0.6125482766159003
bold and nonitalic train accuracy: 0.508389375721758
nonbold and italic train accuracy: 0.608834873735536
nonbold and nonitalic train accuracy: 0.5920227920227921
[1mLogistic Regression training accuracy: 0.5809692307692308


In [16]:
# Put the per-subset predictions back into the original test-row order.
# Each index array (bi[0], bni[0], ...) lists its subset's row positions in
# ascending order, and the matching prediction array follows that same order,
# so a direct index assignment reproduces the row-by-row merge.
Y_test_pred = np.empty(len(X_test), dtype=object)
is_filled = np.zeros(len(X_test), dtype=bool)
for row_idx, subset_pred in (
    (bi[0], Y_test_bi),
    (bni[0], Y_test_bni),
    (nbni[0], Y_test_nbni),
    (nbi[0], Y_test_nbi),
):
    Y_test_pred[row_idx] = subset_pred
    is_filled[row_idx] = True

# A row that matches none of the four style filters would otherwise be written
# out without a real prediction; fail loudly instead.
if not is_filled.all():
    raise ValueError(
        '%d test rows do not belong to any bold/italic subset'
        % int((~is_filled).sum())
    )

# write to file!
np.savetxt('lr.csv', np.dstack((np.arange(1, Y_test_pred.size+1),Y_test_pred))[0],"%d,%s",header="ID,Font",comments="")

# TEST ACCURACY: 52.553%

In [5]:
# Sanity check: shape of the test-feature array (rows, columns).
# NOTE(review): this cell's execution count (In [5]) is out of sequence with
# the cells above it, so it was run out of order.
X_test.shape

(29221, 407)