In [259]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score

In [260]:
# Import data

data_df = pd.read_csv('data.csv', delimiter=';')
verify_df = pd.read_csv('verify.csv', delimiter=';')

In [261]:
data_df.describe()

Unnamed: 0,ID,feature_1,feature_2,feature_3,feature_4,feature_5,feature_6,feature_7,feature_8,feature_9,...,feature_116,feature_117,feature_118,feature_119,feature_120,feature_121,feature_122,feature_123,feature_124,target
count,255820.0,255820.0,255820.0,255820.0,255820.0,255820.0,255820.0,255820.0,255820.0,255820.0,...,255820.0,255820.0,255820.0,255820.0,255820.0,255820.0,255820.0,255820.0,255820.0,255820.0
mean,127910.5,0.356731,0.147756,0.002564,819.453049,41.053213,0.412751,0.355516,0.372434,0.063498,...,0.015581,0.026964,0.03599,0.005929,7201.150125,2866.290818,5904.254582,12100.793241,0.659698,0.197444
std,73849.017269,0.479035,0.354859,0.050574,772.966593,13.754019,0.49233,0.47867,0.483454,0.243857,...,0.123849,0.161979,0.256262,0.057655,8139.745183,3241.112692,4832.221046,9928.531737,0.343397,0.398071
min,1.0,0.0,0.0,0.0,-18.0,18.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1000.0,1000.0,1000.0,1000.0,0.016667,0.0
25%,63955.75,0.0,0.0,0.0,163.0,30.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,2000.0,1000.0,2559.02405,5000.0,0.333333,0.0
50%,127910.5,0.0,0.0,0.0,562.0,38.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,4000.0,2000.0,4277.7777,10000.0,0.705882,0.0
75%,191865.25,1.0,0.0,0.0,1329.0,52.0,1.0,1.0,1.0,0.0,...,0.0,0.0,0.0,0.0,10000.0,3000.0,7638.338725,16000.0,1.0,0.0
max,255820.0,1.0,1.0,1.0,3451.0,82.0,1.0,1.0,1.0,1.0,...,1.0,1.0,12.0,1.0,99000.0,60000.0,60000.0,99000.0,1.0,1.0


In [262]:
verify_df.describe()

Unnamed: 0,ID,feature_1,feature_2,feature_3,feature_4,feature_5,feature_6,feature_7,feature_8,feature_9,...,feature_115,feature_116,feature_117,feature_118,feature_119,feature_120,feature_121,feature_122,feature_123,feature_124
count,56030.0,56030.0,56030.0,56030.0,56030.0,56030.0,56030.0,56030.0,56030.0,56030.0,...,56030.0,56030.0,56030.0,56030.0,56030.0,56030.0,56030.0,56030.0,56030.0,56030.0
mean,283835.5,0.282474,0.204569,0.062092,876.721917,40.824951,0.376923,0.371944,0.363448,0.053668,...,0.211744,0.06327,0.104979,0.153775,0.032797,8496.420864,3748.0063,6954.350298,13743.502963,0.69827
std,16174.612128,0.450207,0.40339,0.241324,852.814244,13.702881,0.48462,0.483328,0.480997,0.225363,...,0.408548,0.24345,0.30653,0.544559,0.137901,9399.403349,4425.864312,5808.208225,11590.950763,0.336835
min,255821.0,0.0,0.0,0.0,0.0,18.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1000.0,1000.0,1000.0,1000.0,0.020202
25%,269828.25,0.0,0.0,0.0,164.0,30.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,2500.0,2000.0,3150.0,5000.0,0.375
50%,283835.5,0.0,0.0,0.0,577.0,38.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,5000.0,2000.0,5000.0,10000.0,0.857143
75%,297842.75,1.0,0.0,0.0,1428.0,51.0,1.0,1.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,10000.0,4150.0,8588.2352,20000.0,1.0
max,311850.0,1.0,1.0,1.0,3636.0,81.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,10.0,1.0,100000.0,55900.0,59350.0,100000.0,1.0


In [263]:
# We have imbalanced dataset, therefore we might want to downsize sample with target=0

data_df.target.value_counts()

0    205310
1     50510
Name: target, dtype: int64

In [264]:
from sklearn.utils import resample

# Separate majority and minority classes
df_majority = data_df[data_df.target==0]
df_minority = data_df[data_df.target==1]
 
# Downsample majority class
df_majority_downsampled = resample(df_majority, 
                                 replace=False,
                                 n_samples=50510, # Matches minority class
                                 random_state=123)
 
# Combine minority class with downsampled majority class
df_downsampled = pd.concat([df_majority_downsampled, df_minority])

# Reset index
df_downsampled.reset_index(inplace=True, drop=True)
 
df_downsampled.target.value_counts()

0    50510
1    50510
Name: target, dtype: int64

In [287]:
# Normalize all features using MinMaxScaler
min_max_scaler = preprocessing.MinMaxScaler()
X_norm = min_max_scaler.fit_transform(df_downsampled.iloc[:, 1:-1])

# Replace nan with 0 to avoid running into a problem when fitting the model
X_norm = np.nan_to_num(X_norm)

X_norm

array([[1.        , 0.        , 0.        , ..., 0.05539359, 0.08255934,
        0.20903955],
       [1.        , 0.        , 0.        , ..., 0.06235828, 0.10319917,
        0.26040062],
       [0.        , 0.        , 0.        , ..., 0.02346939, 0.01341589,
        1.        ],
       ...,
       [0.        , 0.        , 0.        , ..., 0.20680272, 0.11971104,
        1.        ],
       [0.        , 0.        , 0.        , ..., 0.30612245, 0.2373581 ,
        0.95762712],
       [0.        , 0.        , 0.        , ..., 0.00696864, 0.02063983,
        0.3220339 ]])

In [288]:
y_data = df_downsampled.target.to_numpy()
y_data

array([0, 0, 0, ..., 1, 1, 1])

In [289]:
# Split data into train and test samples

X_train, X_test, y_train, y_test = train_test_split(X_norm, y_data, test_size=0.25, random_state=123)

In [302]:
logistic_model = LogisticRegression(C=1.0, solver='liblinear')
logistic_model.fit(X_train, y_train)

LogisticRegression(solver='liblinear')

In [291]:
# Solving question 1.2
# Getting the probabilities for question 1 of the ML task

prediction_probs = logistic_model.predict_proba(X_norm)
prediction_probs

array([[0.49564865, 0.50435135],
       [0.85307615, 0.14692385],
       [0.38130565, 0.61869435],
       ...,
       [0.50368947, 0.49631053],
       [0.50310294, 0.49689706],
       [0.46115743, 0.53884257]])

In [292]:
# Solving question 1.2

probs_df = pd.DataFrame(prediction_probs, columns=['prob 0', 'prob 1'])
probs_df['target'] = y_data
probs_df.drop(columns='prob 0', inplace=True)
probs_df

Unnamed: 0,prob 1,target
0,0.504351,0
1,0.146924,0
2,0.618694,0
3,0.145544,0
4,0.731957,0
...,...,...
101015,0.170541,1
101016,0.501323,1
101017,0.496311,1
101018,0.496897,1


In [293]:
# Solving question 1.2

index = ['(0.0, 0.1]', '(0.1, 0.2]', '(0.2, 0.3]', '(0.3, 0.4]', '(0.4, 0.5]', '(0.5, 0.6]', '(0.6, 0.7]', '(0.7, 0.8]', '(0.8, 0.9]', '(0.9, 1.0]']
probs_grouped_df = pd.DataFrame(index=index, columns=['Всего в группе', 'Кол-во target 1'])
probs_grouped_df

Unnamed: 0,Всего в группе,Кол-во target 1
"(0.0, 0.1]",,
"(0.1, 0.2]",,
"(0.2, 0.3]",,
"(0.3, 0.4]",,
"(0.4, 0.5]",,
"(0.5, 0.6]",,
"(0.6, 0.7]",,
"(0.7, 0.8]",,
"(0.8, 0.9]",,
"(0.9, 1.0]",,


In [294]:
# Solving question 1.2

probs_dict = {0.0:0.1, 0.1:0.2, 0.2:0.3, 0.3:0.4, 0.4:0.5, 0.5:0.6, 0.6:0.7, 0.7:0.8, 0.8:0.9, 0.9:1.0}

for lower, upper in probs_dict.items():
    df_lower_upper = probs_df[probs_df['prob 1'].between(lower, upper, inclusive='left')]
    
    vsego_v_gruppe = len(df_lower_upper)
    kolvo_target_1 = df_lower_upper['target'].sum()
    
    # Append results to answer probs_grouped_df
    probs_grouped_df.loc['({}, {}]'.format(str(lower), str(upper)), 'Всего в группе'] = vsego_v_gruppe
    probs_grouped_df.loc['({}, {}]'.format(str(lower), str(upper)), 'Кол-во target 1'] = kolvo_target_1

probs_grouped_df

Unnamed: 0,Всего в группе,Кол-во target 1
"(0.0, 0.1]",1621,104
"(0.1, 0.2]",9960,1229
"(0.2, 0.3]",12767,2902
"(0.3, 0.4]",12525,4354
"(0.4, 0.5]",13372,6106
"(0.5, 0.6]",14462,8386
"(0.6, 0.7]",13447,9105
"(0.7, 0.8]",10739,8146
"(0.8, 0.9]",7497,6110
"(0.9, 1.0]",4630,4068


In [308]:
# Question 1.3: Calculate model metrics

y_pred = logistic_model.predict(X_test)

f1 = f1_score(y_test, y_pred)
accuracy = accuracy_score(y_test, y_pred)
roc_auc = roc_auc_score(y_test, logistic_model.predict_proba(X_test)[:, 1])

print('Results:')
print('F1_score =',round(f1, 4))
print('Accuracy_score =',round(accuracy, 4))
print('roc_auc =',round(roc_auc, 4))

Results:
F1_score = 0.7034
Accuracy_score = 0.7037
roc_auc = 0.7711


In [296]:
# Working with verify dataset now

# Normalize all features using MinMaxScaler
min_max_scaler = preprocessing.MinMaxScaler()
verify_X_norm = min_max_scaler.fit_transform(verify_df.iloc[:, 1:])

# Replace nan with 0 to avoid running into a problem when fitting the model
verify_X_norm = np.nan_to_num(verify_X_norm)

verify_X_norm

array([[0.        , 0.        , 0.        , ..., 0.06085725, 0.11111111,
        0.48969072],
       [0.        , 0.        , 0.        , ..., 0.06341045, 0.05050505,
        1.        ],
       [0.        , 0.        , 0.        , ..., 0.05369894, 0.05050505,
        0.48969072],
       ...,
       [0.        , 0.        , 0.        , ..., 0.50762639, 0.35555556,
        0.82519793],
       [0.        , 0.        , 0.        , ..., 0.03439833, 0.09090909,
        0.2243299 ],
       [0.        , 0.        , 0.        , ..., 0.26306769, 0.23737374,
        1.        ]])

In [305]:
verify_prediction_probs = logistic_model.predict_proba(verify_X_norm)

verify_output_df = pd.DataFrame(index=verify_df['ID'], columns=['score'])
verify_output_df['score'] = verify_prediction_probs[:, 1]

verify_output_df

Unnamed: 0_level_0,score
ID,Unnamed: 1_level_1
255821,0.037799
255822,0.578373
255823,0.610877
255824,0.237980
255825,0.271271
...,...
311846,0.469988
311847,0.404677
311848,0.196888
311849,0.468120


In [309]:
verify_output_df.to_csv('verify_output.csv', sep=';')