In [1]:
# imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt


In [23]:
y

date
2000-01-01    nonevent
2000-01-20    nonevent
2000-01-23    nonevent
2000-02-17    nonevent
2000-03-25          Ib
                ...   
2009-08-07    nonevent
2009-08-24          II
2009-08-26    nonevent
2009-09-09          II
2009-09-13    nonevent
Name: class4, Length: 360, dtype: category
Categories (4, object): ['II', 'Ia', 'Ib', 'nonevent']

In [20]:
# import and modify data
npf_train = pd.read_csv("npf_train.csv")
npf_train = npf_train.drop("partlybad",axis=1)

npf_train = npf_train.set_index("date")
## Tell that "class4" is categorical variable. (R does this automatically.)
npf_train["class4"] = npf_train["class4"].astype("category")

## Here date column was converted to index and we do not need to get rid of it.
npf_train = npf_train.drop("id",axis=1)

## If you don't use dtype="object" array will cut strings...
class2 = np.array(["event"]*npf_train.shape[0],dtype="object")
class2[npf_train["class4"]=="nonevent"] = "nonevent"
npf_train["class2"] = class2
npf_train["class2"] = npf_train["class2"].astype("category")


# reformating X to numerics
for i in range(1,101): # 101 because we have 102 columns
    npf_train.iloc[:,i] = pd.to_numeric(list(npf_train.iloc[:,i]))

## REDUCING the set to get a training and a test set (360 ~ 80% of 458)
X = npf_train.iloc[:360, 1:-1]
y = npf_train.iloc[:360, 0]

# our test set:
X_test = npf_train.iloc[360:, 1:-1]
y_test = npf_train.iloc[360:, 0]

In [24]:
# train logistic regression model (a basic one for reference)
from sklearn.linear_model import LogisticRegression

model_lr = LogisticRegression()
t_model_lr = model_lr.fit(X, y)

In [25]:
# computing the score
score_lr = t_model_lr.score(X_test, y_test)
score_lr



0.8061224489795918

In [26]:
# retrain with higher max_iter & computing score
model_lr_higher_maxiter = LogisticRegression(max_iter= 10000)
t_model_lr_higher_maxiter = model_lr_higher_maxiter.fit(X,y)
t_model_lr_higher_maxiter_score = t_model_lr_higher_maxiter.score(X_test, y_test) 
t_model_lr_higher_maxiter_score

0.7244897959183674

In [27]:
# creating a function outputting the results of the model
def pred_proba_class(t_model, X_test_data, y_test_data): # t_model is the trained model
    df = pd.DataFrame()
    df["X"] = [X_test_data.iloc[i, :] for i in range(len(X_test_data))]
    df["y_true"] = np.array(y_test_data)
    df["y_pred_class"] = t_model.predict(X_test_data)
    proba_event= t_model.predict_proba(X_test_data)[:,0]
    df["y_pred_proba"] = proba_event
    df["correct?"] = [df.iloc[i,1] == df.iloc[i,2] for i in range(len(df))]
    
    return df

pred_proba_class(t_model_lr_higher_maxiter, X_test, y_test)

Unnamed: 0,X,y_true,y_pred_class,y_pred_proba,correct?
0,CO2168.mean 375.647561 CO2168.std 3.2...,II,II,0.667629,True
1,CO2168.mean 382.823235 CO2168.std 3.8...,II,II,0.580668,True
2,CO2168.mean 384.067500 CO2168.std 2.5...,Ib,Ia,0.122637,False
3,CO2168.mean 384.472708 CO2168.std 1.9...,II,Ib,0.149901,False
4,CO2168.mean 390.631489 CO2168.std 0.8...,nonevent,nonevent,0.028781,True
...,...,...,...,...,...
93,CO2168.mean 377.541538 CO2168.std 6.3...,nonevent,nonevent,0.161142,True
94,CO2168.mean 381.016623 CO2168.std 4.4...,nonevent,nonevent,0.000126,True
95,CO2168.mean 386.687895 CO2168.std 12.0...,nonevent,nonevent,0.001029,True
96,CO2168.mean 379.279128 CO2168.std 12.0...,nonevent,nonevent,0.006024,True


In [28]:
# modifying the logistic regression trying to improve the score:

# cross validation for evaluation 




###### predictor selection
# lbfgs and liblinear are not to be used since we want to be able to extend our model to class4 prediction

# Ridge Regression = penalty l2 --> default
model_lr_rr_newton = LogisticRegression(penalty = "l2", solver = 'newton-cg')
t_model_lr_rr_newton = model_lr_rr_newton.fit(X,y)
score_lr_rr_newton = t_model_lr_rr_newton.score(X_test, y_test)

model_lr_rr_sag = LogisticRegression(penalty = "l2", solver = 'sag')
t_model_lr_rr_sag = model_lr_rr_sag.fit(X,y)
score_lr_rr_sag = t_model_lr_rr_sag.score(X_test, y_test)

model_lr_rr_saga = LogisticRegression(penalty = "l2", solver = 'saga')
t_model_lr_rr_saga = model_lr_rr_saga.fit(X,y)
score_lr_rr_saga = t_model_lr_rr_saga.score(X_test, y_test)

# Lasso = penalty l1 
model_lr_lasso = LogisticRegression(penalty = "l1", solver = 'saga')# this solver bc it is the only option for multiclass & l1
t_model_lr_lasso = model_lr_lasso.fit(X,y)
score_lr_lasso = t_model_lr_lasso.score(X_test, y_test)

# none= penalty none
model_lr_none_newton = LogisticRegression(penalty = "none", solver = 'newton-cg')
t_model_lr_none_newton = model_lr_none_newton.fit(X,y)
score_lr_none_newton = t_model_lr_none_newton.score(X_test, y_test)

model_lr_none_sag = LogisticRegression(penalty = "none", solver = 'sag')
t_model_lr_none_sag = model_lr_none_sag.fit(X,y)
score_lr_none_sag = t_model_lr_none_sag.score(X_test, y_test)

model_lr_none_saga = LogisticRegression(penalty = "none", solver = 'saga')
t_model_lr_none_saga = model_lr_none_saga.fit(X,y)
score_lr_none_saga = t_model_lr_none_saga.score(X_test, y_test)

# Lasso + Ridge = ??? = penalty elasticnet
model_lr_elastic = LogisticRegression(penalty = "elasticnet", solver = 'saga', l1_ratio = 0.5) # this solver bc it's the only option, l1_ratio just in the middle for now
t_model_lr_elastic = model_lr_elastic.fit(X,y)
score_lr_elastic = t_model_lr_elastic.score(X_test, y_test)


# parameter tuning



# df for modifications, t_mode and score
mms = pd.DataFrame()
mms["modifications"] = ['RR newton', 'RR sag', 'RR saga', 'Lasso saga', 'none newton', 'none sag', 'none saga', 'Lasso+RR saga']
mms["t_models"] = [t_model_lr_rr_newton, t_model_lr_rr_sag, t_model_lr_rr_saga, t_model_lr_lasso, t_model_lr_none_newton, t_model_lr_none_sag, t_model_lr_none_saga, t_model_lr_elastic]
mms["simple scores"] = [score_lr_rr_newton, score_lr_rr_sag, score_lr_rr_saga, score_lr_lasso, score_lr_none_newton, score_lr_none_sag, score_lr_none_saga, score_lr_elastic]
mms

Unnamed: 0,modifications,t_models,simple scores
0,RR newton,LogisticRegression(solver='newton-cg'),0.72449
1,RR sag,LogisticRegression(solver='sag'),0.77551
2,RR saga,LogisticRegression(solver='saga'),0.785714
3,Lasso saga,"LogisticRegression(penalty='l1', solver='saga')",0.785714
4,none newton,"LogisticRegression(penalty='none', solver='new...",0.632653
5,none sag,"LogisticRegression(penalty='none', solver='sag')",0.77551
6,none saga,"LogisticRegression(penalty='none', solver='saga')",0.785714
7,Lasso+RR saga,"LogisticRegression(l1_ratio=0.5, penalty='elas...",0.785714


In [29]:
# same thing with maxiter = 1000 --> since there was a worning ...  to see if we get an improvement
# tried with max_iter = 1000 and = 10000 (gave warnings but terminated)
# --> results with 1000 ha better highest value thus here chosen:

# Ridge Regression = penalty l2 --> default
model_lr_rr_newton = LogisticRegression(penalty = "l2", solver = 'newton-cg', max_iter = 1000)
t_model_lr_rr_newton = model_lr_rr_newton.fit(X,y)
score_lr_rr_newton = t_model_lr_rr_newton.score(X_test, y_test)

model_lr_rr_sag = LogisticRegression(penalty = "l2", solver = 'sag',  max_iter = 1000)
t_model_lr_rr_sag = model_lr_rr_sag.fit(X,y)
score_lr_rr_sag = t_model_lr_rr_sag.score(X_test, y_test)

model_lr_rr_saga = LogisticRegression(penalty = "l2", solver = 'saga',  max_iter = 1000)
t_model_lr_rr_saga = model_lr_rr_saga.fit(X,y)
score_lr_rr_saga = t_model_lr_rr_saga.score(X_test, y_test)

# Lasso = penalty l1 
model_lr_lasso = LogisticRegression(penalty = "l1", solver = 'saga',  max_iter = 1000)# this solver bc it is the only option for multiclass & l1
t_model_lr_lasso = model_lr_lasso.fit(X,y)
score_lr_lasso = t_model_lr_lasso.score(X_test, y_test)

# none= penalty none
model_lr_none_newton = LogisticRegression(penalty = "none", solver = 'newton-cg',  max_iter = 1000)
t_model_lr_none_newton = model_lr_none_newton.fit(X,y)
score_lr_none_newton = t_model_lr_none_newton.score(X_test, y_test)

model_lr_none_sag = LogisticRegression(penalty = "none", solver = 'sag',  max_iter = 1000)
t_model_lr_none_sag = model_lr_none_sag.fit(X,y)
score_lr_none_sag = t_model_lr_none_sag.score(X_test, y_test)

model_lr_none_saga = LogisticRegression(penalty = "none", solver = 'saga',  max_iter = 1000)
t_model_lr_none_saga = model_lr_none_saga.fit(X,y)
score_lr_none_saga = t_model_lr_none_saga.score(X_test, y_test)

# Lasso + Ridge = ??? = penalty elasticnet
model_lr_elastic = LogisticRegression(penalty = "elasticnet", solver = 'saga', l1_ratio = 0.5,  max_iter = 1000) # this solver bc it's the only option, l1_ratio just in the middle for now
t_model_lr_elastic = model_lr_elastic.fit(X,y)
score_lr_elastic = t_model_lr_elastic.score(X_test, y_test)


# further parameter tuning possible



# df for modifications, t_mode and score
mmsMI = pd.DataFrame()
mmsMI["modifications"] = ['RR newton', 'RR sag', 'RR saga', 'Lasso saga', 'none newton', 'none sag', 'none saga', 'Lasso+RR saga']
mmsMI["t_models"] = [t_model_lr_rr_newton, t_model_lr_rr_sag, t_model_lr_rr_saga, t_model_lr_lasso, t_model_lr_none_newton, t_model_lr_none_sag, t_model_lr_none_saga, t_model_lr_elastic]
mmsMI["simple scores"] = [score_lr_rr_newton, score_lr_rr_sag, score_lr_rr_saga, score_lr_lasso, score_lr_none_newton, score_lr_none_sag, score_lr_none_saga, score_lr_elastic]
mmsMI

Unnamed: 0,modifications,t_models,simple scores
0,RR newton,"LogisticRegression(max_iter=1000, solver='newt...",0.72449
1,RR sag,"LogisticRegression(max_iter=1000, solver='sag')",0.795918
2,RR saga,"LogisticRegression(max_iter=1000, solver='saga')",0.806122
3,Lasso saga,"LogisticRegression(max_iter=1000, penalty='l1'...",0.806122
4,none newton,"LogisticRegression(max_iter=1000, penalty='non...",0.581633
5,none sag,"LogisticRegression(max_iter=1000, penalty='non...",0.795918
6,none saga,"LogisticRegression(max_iter=1000, penalty='non...",0.806122
7,Lasso+RR saga,"LogisticRegression(l1_ratio=0.5, max_iter=1000...",0.806122


In [30]:
# training the upper models using cross validation
import warnings
warnings.filterwarnings('ignore')
warnings.simplefilter('ignore')

from sklearn.model_selection import cross_val_score

cv10 = []
cv50 = []

for model in mms["t_models"]:
    cv = cross_val_score(model, X,y, cv = 10)
    cv10.append(cv)
    cv = cross_val_score(model, X,y, cv = 50)
    cv50.append(cv)
    
mms["cv score 10 fold"] = cv10
mms["cv score 50 fold"] = cv50

In [31]:
mms

Unnamed: 0,modifications,t_models,simple scores,cv score 10 fold,cv score 50 fold
0,RR newton,LogisticRegression(solver='newton-cg'),0.72449,"[0.5277777777777778, 0.6111111111111112, 0.416...","[0.375, 0.375, 0.625, 0.5, 0.75, 0.375, 0.625,..."
1,RR sag,LogisticRegression(solver='sag'),0.77551,"[0.5833333333333334, 0.6388888888888888, 0.444...","[0.625, 0.75, 0.625, 0.375, 0.75, 0.75, 0.5, 0..."
2,RR saga,LogisticRegression(solver='saga'),0.785714,"[0.6111111111111112, 0.6666666666666666, 0.472...","[0.625, 0.75, 0.625, 0.375, 0.75, 0.75, 0.625,..."
3,Lasso saga,"LogisticRegression(penalty='l1', solver='saga')",0.785714,"[0.6111111111111112, 0.6666666666666666, 0.472...","[0.625, 0.75, 0.625, 0.375, 0.75, 0.75, 0.625,..."
4,none newton,"LogisticRegression(penalty='none', solver='new...",0.632653,"[0.5, 0.6111111111111112, 0.3888888888888889, ...","[0.5, 0.375, 0.625, 0.75, 0.625, 0.625, 0.625,..."
5,none sag,"LogisticRegression(penalty='none', solver='sag')",0.77551,"[0.5833333333333334, 0.6666666666666666, 0.444...","[0.625, 0.75, 0.625, 0.375, 0.75, 0.75, 0.5, 0..."
6,none saga,"LogisticRegression(penalty='none', solver='saga')",0.785714,"[0.6111111111111112, 0.6666666666666666, 0.472...","[0.625, 0.75, 0.625, 0.375, 0.75, 0.75, 0.625,..."
7,Lasso+RR saga,"LogisticRegression(l1_ratio=0.5, penalty='elas...",0.785714,"[0.6111111111111112, 0.6666666666666666, 0.472...","[0.625, 0.75, 0.625, 0.375, 0.75, 0.75, 0.625,..."


In [32]:
# mean cv scores:
cvmean_score = []

for i in range(len(mms)):
    cvmeans = sum(mms.iloc[i,3]) / len(mms.iloc[i,3])
    cvmean_score.append(cvmeans)
    
mms["cv mean score 10"] = cvmean_score


cvmean_score = []

for i in range(len(mms)):
    cvmeans = sum(mms.iloc[i,4]) / len(mms.iloc[i,4])
    cvmean_score.append(cvmeans)
    
mms["cv mean score 50"] = cvmean_score

mms

Unnamed: 0,modifications,t_models,simple scores,cv score 10 fold,cv score 50 fold,cv mean score 10,cv mean score 50
0,RR newton,LogisticRegression(solver='newton-cg'),0.72449,"[0.5277777777777778, 0.6111111111111112, 0.416...","[0.375, 0.375, 0.625, 0.5, 0.75, 0.375, 0.625,...",0.511111,0.541071
1,RR sag,LogisticRegression(solver='sag'),0.77551,"[0.5833333333333334, 0.6388888888888888, 0.444...","[0.625, 0.75, 0.625, 0.375, 0.75, 0.75, 0.5, 0...",0.555556,0.581786
2,RR saga,LogisticRegression(solver='saga'),0.785714,"[0.6111111111111112, 0.6666666666666666, 0.472...","[0.625, 0.75, 0.625, 0.375, 0.75, 0.75, 0.625,...",0.541667,0.561786
3,Lasso saga,"LogisticRegression(penalty='l1', solver='saga')",0.785714,"[0.6111111111111112, 0.6666666666666666, 0.472...","[0.625, 0.75, 0.625, 0.375, 0.75, 0.75, 0.625,...",0.544444,0.561786
4,none newton,"LogisticRegression(penalty='none', solver='new...",0.632653,"[0.5, 0.6111111111111112, 0.3888888888888889, ...","[0.5, 0.375, 0.625, 0.75, 0.625, 0.625, 0.625,...",0.525,0.539643
5,none sag,"LogisticRegression(penalty='none', solver='sag')",0.77551,"[0.5833333333333334, 0.6666666666666666, 0.444...","[0.625, 0.75, 0.625, 0.375, 0.75, 0.75, 0.5, 0...",0.558333,0.578929
6,none saga,"LogisticRegression(penalty='none', solver='saga')",0.785714,"[0.6111111111111112, 0.6666666666666666, 0.472...","[0.625, 0.75, 0.625, 0.375, 0.75, 0.75, 0.625,...",0.541667,0.561786
7,Lasso+RR saga,"LogisticRegression(l1_ratio=0.5, penalty='elas...",0.785714,"[0.6111111111111112, 0.6666666666666666, 0.472...","[0.625, 0.75, 0.625, 0.375, 0.75, 0.75, 0.625,...",0.541667,0.558929


In [83]:
# --> none sag & RR sag with accuracy 81.3/82.2
# sourceFile = open('answer.csv', 'w')
# print('82.2', file = sourceFile)
# sourceFile.close()


In [89]:
answe=pd.read_csv('answers.csv')
answe

Unnamed: 0,class4 p
0,0 nonevent 0.154972
1,1 Ia 0.233739
2,2 Ib 0.410445
3,3 II 0.545141
4,4 nonevent 0.131124
5,.. ... ...
6,960 nonevent 0.328844
7,961 nonevent 0.066268
8,962 Ib 0.386109
9,963 Ib 0.260775


In [33]:
npf_hid=pd.read_csv('npf_test_hidden.csv')
npf_hid = npf_hid.drop("partlybad",axis=1)
npf_hid = npf_hid.drop("date",axis=1)
npf_hid = npf_hid.drop("class4",axis=1)

In [34]:
npf_hid = npf_hid.drop("id",axis=1)

In [42]:
npf_hid

Unnamed: 0,CO2168.mean,CO2168.std,CO2336.mean,CO2336.std,CO242.mean,CO242.std,CO2504.mean,CO2504.std,Glob.mean,Glob.std,...,T672.mean,T672.std,T84.mean,T84.std,UV_A.mean,UV_A.std,UV_B.mean,UV_B.std,CS.mean,CS.std
0,376.610169,0.526617,376.660339,0.500203,376.934655,0.564360,376.634746,0.471236,37.115592,24.180794,...,-2.851967,0.156421,-2.356066,0.174219,2.374336,1.318965,0.040709,0.024647,0.000644,0.000119
1,390.624932,0.453585,390.580694,0.455308,391.000685,0.505836,390.487945,0.481292,69.134531,59.895057,...,-18.646384,0.448865,-18.030984,0.478829,3.850439,2.056031,0.103446,0.060432,0.003707,0.000115
2,375.280258,1.249087,375.479806,1.241525,375.624129,1.423506,375.571474,1.200556,276.485371,201.722672,...,3.485822,1.526998,4.649689,1.978654,15.498463,11.001410,0.557332,0.494026,0.001637,0.000428
3,382.642176,3.222805,382.890412,3.026140,383.136941,3.466259,382.937706,2.837540,308.904304,287.444652,...,6.933127,3.737176,7.657725,3.823123,16.909366,13.924594,0.646806,0.602040,0.002480,0.000510
4,381.492971,4.386929,381.608000,4.333558,382.177784,4.491875,381.588857,4.263299,111.456879,104.807009,...,11.941411,2.195680,12.570801,2.458619,7.596930,6.722838,0.275559,0.312076,0.009429,0.004438
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
960,387.945654,8.594281,387.195208,6.395089,390.564398,12.503105,386.868542,5.561891,225.755653,245.560692,...,13.846229,1.908796,14.671475,2.278146,14.888953,13.891135,0.724499,0.741484,0.002978,0.000775
961,381.204386,0.172730,381.193333,0.174513,381.289123,0.174594,381.174737,0.179355,12.633440,6.528276,...,-5.539621,0.189930,-5.018333,0.192347,1.208696,0.588515,0.022685,0.011805,0.002506,0.000101
962,383.915986,2.407693,383.944965,2.349077,384.052183,2.456562,383.862958,2.212343,312.786155,206.675746,...,6.970141,2.270736,7.698230,2.916700,16.280512,10.828638,0.572551,0.464658,0.006474,0.000857
963,379.521641,1.199564,379.525194,1.147027,379.778906,1.257693,379.473203,1.078942,315.506008,166.933151,...,1.643628,2.549993,2.697919,2.916053,15.014020,8.972121,0.546366,0.416863,0.000829,0.000227


In [59]:
answers=model_lr_none_sag.predict(npf_hid) 

answers

array(['nonevent', 'Ia', 'Ib', 'II', 'nonevent', 'II', 'nonevent',
       'nonevent', 'II', 'nonevent', 'nonevent', 'nonevent', 'nonevent',
       'nonevent', 'II', 'nonevent', 'nonevent', 'nonevent', 'Ib',
       'nonevent', 'nonevent', 'nonevent', 'nonevent', 'nonevent',
       'nonevent', 'II', 'nonevent', 'II', 'nonevent', 'nonevent',
       'nonevent', 'Ib', 'nonevent', 'nonevent', 'II', 'II', 'II',
       'nonevent', 'Ia', 'nonevent', 'nonevent', 'Ib', 'II', 'nonevent',
       'II', 'nonevent', 'nonevent', 'II', 'Ib', 'nonevent', 'Ib',
       'nonevent', 'nonevent', 'II', 'nonevent', 'nonevent', 'Ib',
       'nonevent', 'II', 'nonevent', 'nonevent', 'II', 'nonevent',
       'nonevent', 'II', 'nonevent', 'nonevent', 'II', 'II', 'II', 'II',
       'Ib', 'nonevent', 'nonevent', 'nonevent', 'II', 'nonevent',
       'nonevent', 'nonevent', 'nonevent', 'II', 'nonevent', 'nonevent',
       'nonevent', 'Ib', 'II', 'nonevent', 'nonevent', 'nonevent', 'Ib',
       'nonevent', 'nonevent', '

In [79]:
p_class4=model.predict_proba(npf_hid)
p=p_class4[:, 0]
# proba=pd.Series()
# for i in p_class4:
# p=p_class4[:,0]

In [80]:
res=pd.DataFrame()
res['class4']=answers
res['p']=p


Unnamed: 0,class4,p
0,nonevent,0.154972
1,Ia,0.233739
2,Ib,0.410445
3,II,0.545141
4,nonevent,0.131124
5,II,0.542061
6,nonevent,0.042579
7,nonevent,0.033789
8,II,0.514498
9,nonevent,0.382616


In [88]:
sourceFile = open('answer.csv', 'w')
print(res, file = sourceFile)
sourceFile.close()
