In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from tqdm import tqdm
from sys import argv

In [2]:
def describe(df):
    """
    Program that describe a dataset.
    """
    dfNum = df.select_dtypes(include=['int64','float64'])
    stats = pd.DataFrame(columns=dfNum.columns)
    col_name=stats.columns
    for i in range (len(stats.columns)):
        col = df.loc[:, col_name[i]]
        col_allNum = df.loc[~col.isna(), col_name[i]]
        count = len(col_allNum)
        mean = sum(col_allNum) / len(col_allNum)
        var =  sum([(x - mean)**2 for x in col_allNum])/count
        std = var**(0.5)
        stats.loc['Mean', col_name[i]] = mean
        stats.loc['Std', col_name[i]] = std
    return stats

In [3]:
def normalize_value(df):
    dfNum = df.select_dtypes(include=['int64','float64'])
    stats = describe(dfNum)
    dfN = df.copy()
    for i in range(len(dfNum.columns)):
        mean = stats.loc["Mean", dfNum.columns[i]]
        std = stats.loc["Std", dfNum.columns[i]]
        dfN.loc[:, dfNum.columns[i]] = (dfN.loc[:, dfNum.columns[i]] - mean) / std
    return dfN


In [4]:
def computeCost(X, y, W, b, lambda_ = 1):
    """
    X (ndarray (m,n): Data, m examples with n features
    y (ndarray (m,)): target values
    w (ndarray (n,)): model parameters
    b (scalar)      : model parameter
    lambda_ (scalar): Controls amount of regularization
    Returns:
        cost (scalar):  cost
    """
    m, n = X.shape
    cost = 0
    for i in range(m):
        z = np.dot(W, X.iloc[i].values) + b
        f_wb = 1 / (1 + np.exp(-z))
        cost += -y.iloc[i].item() * np.log(f_wb) - (1 - y.iloc[i].item())*np.log(1-f_wb)
    cost = cost / m
    reg_part = 0
    for i in range(n):
        reg_part += W[i]**2
    cost = cost + (lambda_ / (2*m)) * reg_part
    return cost

In [5]:
def updateWb(X, y_slytherin, y_ravenclaw, y_gryffindor, y_hufflepuff, w, b, lambda_, alpha):
    m, n = X.shape
    dj_dw = pd.DataFrame(columns=X.columns,
                     index=["Slytherin", "Ravenclaw", "Gryffindor", "Hufflepuff"], data=0.0)
    dj_db = pd.DataFrame(columns=["b"],
                     index=["Slytherin", "Ravenclaw", "Gryffindor", "Hufflepuff"], data=0.0)
    for i in range(m):
        z_slytherin = np.dot(X.iloc[i].values, w.loc["Slytherin", :].values) + b.loc["Slytherin", "b"].item()
        f_wb_i_slytherin = 1 / (1 + np.exp(-z_slytherin))
        err_i_slytherin  = f_wb_i_slytherin  - y_slytherin.iloc[i].item()

        z_ravenclaw = np.dot(X.iloc[i].values, w.loc["Ravenclaw", :].values) + b.loc["Ravenclaw", "b"].item()
        f_wb_i_ravenclaw = 1 / (1 + np.exp(-z_ravenclaw))
        err_i_ravenclaw  = f_wb_i_ravenclaw  - y_ravenclaw.iloc[i].item()
        
        z_gryffindor = np.dot(X.iloc[i].values, w.loc["Gryffindor", :].values) + b.loc["Gryffindor", "b"].item()
        f_wb_i_gryffindor = 1 / (1 + np.exp(-z_gryffindor))
        err_i_gryffindor  = f_wb_i_gryffindor  - y_gryffindor.iloc[i].item()

        z_hufflepuff = np.dot(X.iloc[i].values, w.loc["Hufflepuff", :].values) + b.loc["Hufflepuff", "b"].item()
        f_wb_i_hufflepuff = 1 / (1 + np.exp(-z_hufflepuff))
        err_i_hufflepuff = f_wb_i_hufflepuff  - y_hufflepuff.iloc[i].item()

        for j in range(n):
            col_name = X.columns[j]
            dj_dw.loc["Slytherin", col_name] = dj_dw.loc["Slytherin", col_name].item() + err_i_slytherin * X.iloc[i,j].item()
            dj_dw.loc["Ravenclaw", col_name] = dj_dw.loc["Ravenclaw", col_name].item() + err_i_ravenclaw * X.iloc[i,j].item()
            dj_dw.loc["Gryffindor", col_name] = dj_dw.loc["Gryffindor", col_name].item() + err_i_gryffindor * X.iloc[i,j].item()
            dj_dw.loc["Hufflepuff", col_name] = dj_dw.loc["Hufflepuff", col_name].item() + err_i_hufflepuff * X.iloc[i,j].item()
        
        dj_db.loc["Slytherin", "b"] = dj_db.loc["Slytherin", "b"].item() + err_i_slytherin
        dj_db.loc["Ravenclaw", "b"] = dj_db.loc["Ravenclaw", "b"].item() + err_i_ravenclaw
        dj_db.loc["Gryffindor", "b"] = dj_db.loc["Gryffindor", "b"].item() + err_i_gryffindor
        dj_db.loc["Hufflepuff", "b"] = dj_db.loc["Hufflepuff", "b"].item() + err_i_hufflepuff
        
    dj_dw =  dj_dw/m
    dj_db = dj_db/m

    for j in range(n):
        col_name = X.columns[j]
        dj_dw.loc["Slytherin", col_name] = dj_dw.loc["Slytherin", col_name].item() + (lambda_/m) * w.loc["Slytherin", col_name].item()
        dj_dw.loc["Ravenclaw", col_name] = dj_dw.loc["Ravenclaw", col_name].item() + (lambda_/m) * w.loc["Ravenclaw", col_name].item()
        dj_dw.loc["Gryffindor", col_name] = dj_dw.loc["Gryffindor", col_name].item() + (lambda_/m) * w.loc["Gryffindor", col_name].item()
        dj_dw.loc["Hufflepuff", col_name] = dj_dw.loc["Hufflepuff", col_name].item() + (lambda_/m) * w.loc["Hufflepuff", col_name].item()
    
    W_updated = w - alpha * dj_dw
    b_updated = b - alpha * dj_db
    return (W_updated, b_updated)

In [6]:
def executeGradientDescentAlgo(X, y_slytherin, y_ravenclaw, y_gryffindor, y_hufflepuff, alpha, lambda_, nb_iterations):
    w = pd.DataFrame(columns=X.columns,
                     index=["Slytherin", "Ravenclaw", "Gryffindor", "Hufflepuff"], data=0)
    b = pd.DataFrame(columns=["b"],
                     index=["Slytherin", "Ravenclaw", "Gryffindor", "Hufflepuff"], data=0)
    result = pd.DataFrame(columns=["Slytherin", "Ravenclaw", "Gryffindor", "Hufflepuff"])
    for i in tqdm(range(nb_iterations)):
        w, b = updateWb(X, y_slytherin,y_ravenclaw, y_gryffindor, y_hufflepuff, w, b, lambda_, alpha)
        # result.append(computeCost(X, y, w, b, lambda_))
    return(w, b)

In [7]:
def logreg(df):
    # On supprime les valeurs NaN
    for i in range(len(df.columns)):
        df = df.loc[~df.isna()[df.columns[i]], :]

    # On ajoute les test
    slytherin_df = df.copy()
    slytherin_df.loc[:, "Test Slytherin"] = 0
    slytherin_df.loc[df.loc[:,"Hogwarts House"] == "Slytherin", "Test Slytherin"] = 1
    ravenclaw_df= df.copy()
    ravenclaw_df.loc[:, "Test Ravenclaw"] = 0
    ravenclaw_df.loc[df.loc[:,"Hogwarts House"] == "Ravenclaw", "Test Ravenclaw"] = 1
    gryffindor_df= df.copy()
    gryffindor_df.loc[:, "Test Gryffindor"] = 0
    gryffindor_df.loc[df.loc[:,"Hogwarts House"] == "Gryffindor", "Test Gryffindor"] = 1
    hufflepuff_df= df.copy()
    hufflepuff_df.loc[:, "Test Hufflepuff"] = 0
    hufflepuff_df.loc[df.loc[:,"Hogwarts House"] == "Hufflepuff", "Test Hufflepuff"] = 1
    slytherin_df.drop("Hogwarts House", axis=1, inplace=True)
    ravenclaw_df.drop("Hogwarts House", axis=1, inplace=True)
    gryffindor_df.drop("Hogwarts House", axis=1, inplace=True)
    hufflepuff_df.drop("Hogwarts House", axis=1, inplace=True)

    X = slytherin_df.iloc[:, :len(slytherin_df.columns)-1]
    y_slytherin = slytherin_df.iloc[:, len(slytherin_df.columns)-1:]
    y_ravenclaw = ravenclaw_df.iloc[:, len(ravenclaw_df.columns)-1:]
    y_gryffindor = gryffindor_df.iloc[:, len(gryffindor_df.columns)-1:]
    y_hufflepuff = hufflepuff_df.iloc[:, len(hufflepuff_df.columns)-1:]
    alpha = 0.3
    lambda_ = 1
    nb_iterations = 150
    return (executeGradientDescentAlgo(X, y_slytherin, y_ravenclaw, y_gryffindor, y_hufflepuff, alpha, lambda_, nb_iterations))

In [8]:
df = pd.read_csv("../datasets/dataset_train.csv", index_col = "Index")

In [9]:
df_bis = df.drop(['First Name', 'Last Name', "Birthday", "Best Hand"], axis=1, inplace=False)

In [10]:
stats = describe(df_bis)

In [11]:
stats

Unnamed: 0,Arithmancy,Astronomy,Herbology,Defense Against the Dark Arts,Divination,Muggle Studies,Ancient Runes,History of Magic,Transfiguration,Potions,Care of Magical Creatures,Charms,Flying
Mean,49634.570243,39.797131,1.14102,-0.387863,3.15391,-224.589915,495.74797,2.963095,1030.096946,5.950373,-0.053427,-243.374409,21.958012
Std,16674.479577,520.13233,5.218016,5.211132,4.15397,486.189433,106.251202,4.424353,44.111025,3.146852,0.971146,8.780895,97.601087


In [12]:
df_Normilised = normalize_value(df_bis)

In [13]:
df_Normilised.drop("Astronomy", axis=1, inplace=True)

In [14]:
df_Normilised

Unnamed: 0_level_0,Hogwarts House,Arithmancy,Herbology,Defense Against the Dark Arts,Divination,Muggle Studies,Ancient Runes,History of Magic,Transfiguration,Potions,Care of Magical Creatures,Charms,Flying
Index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
0,Ravenclaw,0.524720,0.878909,1.010668,0.377492,1.021466,0.345749,0.512609,0.219703,-0.686402,0.792225,1.204930,-0.500486
1,Slytherin,1.055771,-1.366126,1.133817,-2.110249,-0.540429,-1.204576,0.258586,0.653978,0.412593,0.149413,-1.003297,-1.387362
2,Ravenclaw,-1.555225,1.261782,0.776918,0.718852,1.829500,1.005516,0.133914,1.314669,0.882837,-0.475624,1.825755,0.086700
3,Gryffindor,-1.017577,-1.463820,-1.264517,0.209941,-0.642571,0.265730,-1.756806,-2.487031,-1.629712,0.040557,-1.534279,1.830738
4,Gryffindor,0.631110,-1.717442,,-0.220972,-0.451825,0.974827,-1.448228,-2.100659,-0.520936,-0.216902,-1.481955,1.393652
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1595,Gryffindor,-0.037517,-1.089084,-0.605423,0.613411,-0.560780,1.152667,-1.852207,-1.493438,-0.813920,-0.614281,-0.799418,1.678998
1596,Slytherin,0.819302,0.942896,-0.630851,-0.336283,-0.861148,-0.469834,-0.163862,0.590565,-0.039757,-0.288870,-0.347662,0.234034
1597,Gryffindor,0.855825,-0.832556,-0.969526,0.700797,-0.330243,1.312583,-2.022296,-1.728145,-1.347558,0.128710,-0.940950,1.808504
1598,Hufflepuff,1.983776,0.441128,-0.796161,0.862811,-1.248796,-1.056958,0.192122,1.311515,-0.650260,-0.492663,-0.320732,-1.011956


In [15]:
X, y_slytherin, y_ravenclaw, y_gryffindor, y_hufflepuff,alpha, lambda_, nb_iterations = logreg(df_Normilised.copy())

  1%|          | 1/150 [00:04<11:30,  4.63s/it]


KeyboardInterrupt: 

In [16]:
y_slytherin

NameError: name 'y_slytherin' is not defined

In [18]:
y_gryffindor.to_numpy()

NameError: name 'y_gryffindor' is not defined

In [None]:
w = pd.DataFrame(columns=X.columns,
                    index=["Slytherin", "Ravenclaw", "Gryffindor", "Hufflepuff"], data=0.0)
b = pd.DataFrame(columns=["b"],
                    index=["Slytherin", "Ravenclaw", "Gryffindor", "Hufflepuff"], data=0.0)
result = pd.DataFrame(columns=["Slytherin", "Ravenclaw", "Gryffindor", "Hufflepuff"])


In [None]:
result

Unnamed: 0,Slytherin,Ravenclaw,Gryffindor,Hufflepuff


In [None]:
# for i in tqdm(range(nb_iterations)):

w, b = updateWb(X, y_slytherin,y_ravenclaw, y_gryffindor, y_hufflepuff, w, b, lambda_, alpha)

In [None]:
b

Unnamed: 0,b
Slytherin,-0.095967
Ravenclaw,-0.065897
Gryffindor,-0.088215
Hufflepuff,-0.049922


In [None]:
test = [1 , 2, 3, 4]

In [None]:
a, b, c, d = test

In [None]:
a

1

In [None]:
b

2

In [None]:
c

3

In [None]:
d

4

In [19]:
n = 12
dj_dw = [np.zeros(n), np.zeros(n), np.zeros(n), np.zeros(n)]

In [20]:
dj_dw

[array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]),
 array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]),
 array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]),
 array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.])]

In [25]:
dj_dw[0][2] = 5

In [26]:
dj_dw

[array([0., 0., 5., 0., 0., 0., 0., 0., 0., 0., 0., 0.]),
 array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]),
 array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]),
 array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.])]