In [1]:
import pandas as pd
import numpy as np
from scipy import optimize 

In [2]:
def create_dict_list(dict_dict): # on self.df
    newDict = dict.fromkeys(dict_dict.keys())
    for i in dict_dict.keys():
        newDict[i] = list(dict_dict[i].values())
    return newDict

In [3]:
# select variables ! put as a method 
def get_DictX(full_dataframe, Y): # e.g. create_dict_list(dataset_train.df)
    
    # temporary fix: can put the variables as arguments
    DictX = full_dataframe.copy()
    del DictX['First Name']
    del DictX['Last Name']
    del DictX['Birthday']
    del DictX['Index'] 
    del DictX[Y]
    return DictX

def get_dummies(full_dataframe, variable):
    Dict = {variable:full_dataframe[variable]}.copy()
    Categories = list(set(Dict[variable])) 
    for i in Categories: 
        Dict[i] = [(element == i)*1 for element in Dict[variable]]
    del Dict[variable]
    return Dict

def get_Y(full_dict, Y):
    return np.array(full_dict[Y].copy())

In [4]:
# for a given feature 
def one_hot_encoder(dataframe, column_to_encode) : 
    Encoded_Dict = dataframe.copy()
    NewCategories = list(set(Encoded_Dict[column_to_encode]))
    NewCategories.pop() # remove the last element 
    for i in NewCategories: 
        Encoded_Dict[i] = [(element == i)*1 for element in Encoded_Dict[column_to_encode]]
    del Encoded_Dict[column_to_encode]
    return Encoded_Dict

# for every feature 
def full_one_hot_encoder(dataframe) : # dataframe = NewDict
    keys_str = [keys for keys in dataframe.keys() if type(list(dataframe[keys])[0]) == str]
    Full_Dict = dataframe.copy()
    for key in keys_str:
        Full_Dict = one_hot_encoder(Full_Dict, key)
    return np.column_stack(list(Full_Dict.values())) # return the dictionary as a matrix for the LogReg

# Getting X and Y 

In [5]:
from louisdataset import MyDataSet

name_Y = 'Hogwarts House'
name_subY = 'Gryffindor'


dataset_train = MyDataSet().read_csv('resources/dataset_train.csv')
dataset_train_dict = dataset_train.dict_list()

# getting X
DictX = get_DictX(dataset_train_dict, name_Y)
X = full_one_hot_encoder(DictX)
# getting Y
SubDict = get_dummies(dataset_train_dict, name_Y)
Y = get_Y(SubDict, name_subY)

#print(X)
#print(Y)

In [6]:
print(Y.shape)
print(X.shape)

(1600,)
(1600, 14)


# Loss Function

In [7]:
def neg_loglikelihood(beta, Y, X):
    # sum without NAs
    return -np.nansum(Y*np.matmul(X,beta) - np.log(1+np.exp(np.matmul(X,beta))))

In [43]:
def train_binary(Y_train, X_train):
    m, p = X_train.shape
    intercept = np.ones(m)
    X_one = np.column_stack((intercept,X_train))
    n, d = X_one.shape
    init_w = np.zeros(d)
    res = optimize.minimize(neg_loglikelihood,init_w, method = 'BFGS', args = (Y_train,X_one))
    return res.x

In [44]:
train_binary(Y, X)

  This is separate from the ipykernel package so we can avoid doing imports until
  grad[k] = (f(*((xk + d,) + args)) - f0) / d[k]
  This is separate from the ipykernel package so we can avoid doing imports until


array([-9.06761533e-03, -6.73562112e-04,  1.94258102e-02,  2.10183915e+00,
       -3.14361858e-02, -1.80814430e+00, -2.46670317e-02,  2.71601098e-01,
       -2.48633049e+00, -1.82562228e-01, -1.67489096e+00,  8.22004827e-02,
       -4.31557973e-01, -3.70733014e-01,  8.27656049e-01])

# Further Test 

In [46]:
import numpy as np
from scipy import optimize 

from louisdataset import MyDataSet
from louis_get_matrix import get_DictX, get_dummies, get_Y, full_one_hot_encoder
from louismodel import LogRegModel

name_Y = 'Hogwarts House'
name_subY = 'Gryffindor'

dataset_train = MyDataSet().read_csv('resources/dataset_train.csv')
dataset_train_dict = dataset_train.dict_list()

# getting X
DictX = get_DictX(dataset_train_dict, name_Y)
X = full_one_hot_encoder(DictX)
# getting Y
SubDict = get_dummies(dataset_train_dict, name_Y)
Y = get_Y(SubDict, name_subY)

model = LogRegModel()
log_params = model.fit_binary(Y_train = Y, X_train = X)
print(log_params)

  return -np.nansum(Y*np.matmul(X,beta) - np.log(1+np.exp(np.matmul(X,beta))))
  grad[k] = (f(*((xk + d,) + args)) - f0) / d[k]
  return -np.nansum(Y*np.matmul(X,beta) - np.log(1+np.exp(np.matmul(X,beta))))


[-9.03570213e-03 -6.73554860e-04  1.94265234e-02  2.10181886e+00
 -3.13406120e-02 -1.80811512e+00 -2.46667569e-02  2.71598131e-01
 -2.48631069e+00 -1.82561077e-01 -1.67486543e+00  8.22003830e-02
 -4.31556335e-01 -3.70729114e-01  8.27654662e-01]


# Multinomial stuff

In [51]:
dataset_train = MyDataSet().read_csv('resources/dataset_train.csv')
dataset_train_dict = dataset_train.dict_list()
Y_full = dataset_train_dict['Hogwarts House']
name_Y = 'Hogwarts House'

In [63]:
Y_full = {'Hogwarts House':dataset_train_dict['Hogwarts House']}.copy()
Y_full.keys()

dict_keys(['Hogwarts House'])

In [57]:
SubDict = get_dummies(dataset_train_dict, name_Y)
Categories = list(set(Y_full))
for i in Categories: 
    Y_train = get_Y(SubDict, i)
    LogRegModel().fit_binary(Y_train, X)

  return -np.nansum(Y*np.matmul(X,beta) - np.log(1+np.exp(np.matmul(X,beta))))
  grad[k] = (f(*((xk + d,) + args)) - f0) / d[k]
  return -np.nansum(Y*np.matmul(X,beta) - np.log(1+np.exp(np.matmul(X,beta))))
  return -np.nansum(Y*np.matmul(X,beta) - np.log(1+np.exp(np.matmul(X,beta))))
  grad[k] = (f(*((xk + d,) + args)) - f0) / d[k]
  return -np.nansum(Y*np.matmul(X,beta) - np.log(1+np.exp(np.matmul(X,beta))))
  return -np.nansum(Y*np.matmul(X,beta) - np.log(1+np.exp(np.matmul(X,beta))))
  grad[k] = (f(*((xk + d,) + args)) - f0) / d[k]
  return -np.nansum(Y*np.matmul(X,beta) - np.log(1+np.exp(np.matmul(X,beta))))
  return -np.nansum(Y*np.matmul(X,beta) - np.log(1+np.exp(np.matmul(X,beta))))
  grad[k] = (f(*((xk + d,) + args)) - f0) / d[k]
  return -np.nansum(Y*np.matmul(X,beta) - np.log(1+np.exp(np.matmul(X,beta))))


In [59]:
LogRegModel().coef()

AttributeError: 'LogRegModel' object has no attribute 'coef'

# IMPORT A SAMPLE TEST 

In [49]:
datatest = pd.read_csv('resources/dataset_train.csv')
datatest

Unnamed: 0,Index,Hogwarts House,First Name,Last Name,Birthday,Best Hand,Arithmancy,Astronomy,Herbology,Defense Against the Dark Arts,Divination,Muggle Studies,Ancient Runes,History of Magic,Transfiguration,Potions,Care of Magical Creatures,Charms,Flying
0,0,Ravenclaw,Tamara,Hsu,2000-03-30,Left,58384.0,-487.886086,5.727180,4.878861,4.722,272.035831,532.484226,5.231058,1039.788281,3.790369,0.715939,-232.79405,-26.89
1,1,Slytherin,Erich,Paredes,1999-10-14,Right,67239.0,-552.060507,-5.987446,5.520605,-5.612,-487.340557,367.760303,4.107170,1058.944592,7.248742,0.091674,-252.18425,-113.45
2,2,Ravenclaw,Stephany,Braun,1999-11-03,Left,23702.0,-366.076117,7.725017,3.660761,6.140,664.893521,602.585284,3.555579,1088.088348,8.728531,-0.515327,-227.34265,30.42
3,3,Gryffindor,Vesta,Mcmichael,2000-08-19,Left,32667.0,697.742809,-6.497214,-6.977428,4.026,-537.001128,523.982133,-4.809637,920.391449,0.821911,-0.014040,-256.84675,200.64
4,4,Gryffindor,Gaston,Gibbs,1998-09-27,Left,60158.0,436.775204,-7.820623,,2.236,-444.262537,599.324514,-3.444377,937.434724,4.311066,-0.264070,-256.38730,157.98
5,5,Slytherin,Corrine,Hammond,1999-04-04,Right,21209.0,-613.687160,-4.289197,6.136872,-6.592,-440.997704,396.201804,5.380286,1052.845164,11.751212,1.049894,-247.94549,-34.69
6,6,Gryffindor,Tom,Guido,2000-09-30,Left,49167.0,628.046051,-4.861976,-6.280461,,-926.892512,583.742442,-7.322486,923.539573,1.646666,0.153022,-257.83447,261.55
7,7,Hufflepuff,Alicia,Hayward,1997-07-08,Right,33010.0,411.412727,5.931832,-4.114127,2.769,-502.021336,439.351416,,1041.091935,6.581791,-0.171704,-244.03492,72.25
8,8,Gryffindor,Bella,Leatherman,1998-12-07,Left,20278.0,496.394945,-5.215891,-4.963949,5.855,-626.552041,567.842402,-6.198661,925.255500,1.086518,1.147032,-252.27561,244.11
9,9,Hufflepuff,Hayden,Aponte,2001-10-13,Right,46316.0,527.193585,7.922205,-5.271936,3.356,-398.101991,341.475606,4.978614,1041.414665,2.068824,-0.529579,-244.57527,-0.09
