# Logistic regression

### Importing

In [1]:
import numpy
import pandas
import matplotlib.pyplot as plt
import scipy.optimize

### Load Data

In [4]:
timeuse = pandas.read_csv('/Users/luisr/Desktop/Repository/IBM_courses/Samples/timeuse_cleaned.csv')
timeuse.drop('Unnamed: 0', axis=1, inplace=True)

In [11]:
timeuse.head(2)

Unnamed: 0,SEX,GEO/ACL00,Total,Personal care,Sleep,Eating,Other and/or unspecified personal care,"Employment, related activities and travel as part of/during main and second job",Main and second job and related travel,Activities related to employment and unspecified employment,...,Unspecified leisure,Travel except travel related to jobs,Travel to/from work,Travel related to study,Travel related to shopping and services,Transporting a child,Travel related to other household purposes,"Travel related to leisure, social and associative life",Unspecified travel,Unspecified time use
0,Males,Belgium,24:00,10.75,8.25,1.816667,0.7,3.116667,3.083333,0.033333,...,0.016667,1.5,0.416667,0.033333,0.266667,0.05,0.0,0.25,0.5,0.016667
1,Males,Bulgaria,24:00,11.9,9.133333,2.116667,0.65,3.533333,3.45,0.066667,...,0.016667,1.116667,0.383333,0.0,0.2,0.016667,0.1,0.35,0.05,0.033333


### Target and Features Selection

In [9]:
dish_wash_corr = timeuse.corr()[['Dish washing']].sort_values('Dish washing', ascending=False)
dish_wash_corr.head(7)

Unnamed: 0,Dish washing
Dish washing,1.0
Food management except dish washing,0.961601
Household and family care,0.957699
Laundry,0.889684
Cleaning dwelling,0.869748
Handicraft and producing textiles and other care for textiles,0.851787
"Childcare, except teaching, reading and talking",0.804011


### Sample Data

In [81]:
sample = timeuse[['SEX','GEO/ACL00']+dish_wash_corr.index[0:6].tolist()]
sample.head(2)

Unnamed: 0,SEX,GEO/ACL00,Dish washing,Food management except dish washing,Household and family care,Laundry,Cleaning dwelling,Handicraft and producing textiles and other care for textiles
0,Males,Belgium,0.166667,0.366667,2.466667,0.016667,0.133333,0.0
1,Males,Bulgaria,0.083333,0.25,2.616667,0.016667,0.1,0.0


### Feature Transformations

#### Label Encoding

In [82]:
from sklearn import preprocessing

sample.SEX = preprocessing.LabelEncoder().fit(['Females', 'Males']).transform(sample['SEX'])
sample['GEO/ACL00'] = preprocessing.LabelEncoder().fit(sample['GEO/ACL00'].unique()).transform(sample['GEO/ACL00'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[name] = value
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sample['GEO/ACL00'] = preprocessing.LabelEncoder().fit(sample['GEO/ACL00'].unique()).transform(sample['GEO/ACL00'])


Note SEX and GEO columns labeled.

In [90]:
sample.head(3)

Unnamed: 0,SEX,GEO/ACL00,Dish washing,Food management except dish washing,Household and family care,Laundry,Cleaning dwelling,Handicraft and producing textiles and other care for textiles
0,1,0,0.166667,0.366667,2.466667,0.016667,0.133333,0.0
1,1,1,0.083333,0.25,2.616667,0.016667,0.1,0.0
2,1,5,0.133333,0.266667,2.366667,0.033333,0.183333,0.0


#### Normalization

Target and features definition

In [87]:
y = sample.values[:,0]
x = sample.values[:,1:]

Standartization of features only

In [93]:
from sklearn import linear_model
x_norm = preprocessing.StandardScaler().fit(x).transform(x)

In [97]:
y, x_norm[:4], x[:4]

(array([1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]),
 array([[-1.61245155, -0.52303646, -0.81987264, -0.76262496, -0.95350359,
         -0.87819529, -0.96157421],
        [-1.36438208, -1.00797092, -1.07174573, -0.63758821, -0.95350359,
         -0.98510602, -0.96157421],
        [-0.3721042 , -0.71701024, -1.03576386, -0.8459828 , -0.7743217 ,
         -0.71782919, -0.96157421],
        [-1.11631261, -0.91098403, -0.85585451, -0.6931601 , -0.95350359,
         -0.82473992, -0.75913753]]),
 array([[0.        , 0.16666667, 0.36666667, 2.46666667, 0.01666667,
         0.13333333, 0.        ],
        [1.        , 0.08333333, 0.25      , 2.61666667, 0.01666667,
         0.1       , 0.        ],
        [5.        , 0.13333333, 0.26666667, 2.36666667, 0.03333333,
         0.18333333, 0.        ],
        [2.        , 0.1       , 0.35      , 2.55      , 0.01666667,
         0.15      , 0.01666667]]))

### Split Sample into Train and Test Sets

In [134]:
from sklearn import model_selection

x_train, x_test, y_train, y_test = model_selection.train_test_split(x_norm, y, test_size=0.4, random_state=4)

### Train logistic Regression Model

In [135]:
from sklearn import linear_model

log_regr = linear_model.LogisticRegression(C=0.01, solver='liblinear').fit(x_train, y_train)


### Train and Test Samples Prediction

In [142]:
y_tst_pred = log_regr.predict(x_test)
y_trn_pred = log_regr.predict(x_train)

y_tst_prob = log_regr.predict_proba(x_test)
y_trn_prob = log_regr.predict_proba(x_train)

### Evaluation

#### Jaccard Similarity Evaluation

In [None]:
from sklearn.metrics import jaccard_score       

log_jacc = {'tst_scr':jaccard_similarity_score(y_test, y_tst_pred),
        'trn_scr':jaccard_similarity_score(y_train, y_trn_pred)}

#### Log loss

In [144]:
from sklearn import metrics

log_loss_tst = metrics.log_loss(y_test, y_tst_prob)
log_loss_trn = metrics.log_loss(y_train, y_trn_prob)

In [145]:
log_loss_tst, log_loss_trn

(0.5453235664927273, 0.5533774631745951)

#### Classification Report

In [149]:
from sklearn import metrics

print(metrics.classification_report(y_test, y_tst_pred))

              precision    recall  f1-score   support

         0.0       1.00      1.00      1.00         8
         1.0       1.00      1.00      1.00         4

    accuracy                           1.00        12
   macro avg       1.00      1.00      1.00        12
weighted avg       1.00      1.00      1.00        12



#### Confusion Matrix

In [155]:
from sklearn import metrics                                       

#metrics.confusion_matrix(y_test, y_tst_pred)              # Update with colormap.
metrics.confusion_matrix(y_train, y_trn_pred)


array([[ 6,  0],
       [ 0, 10]], dtype=int64)

#### Classification Accuracy Score

In [138]:
from sklearn import metrics

log_acc = {'tst_acc' : metrics.accuracy_score(y_test, y_tst_pred), 'trn_acc' : metrics.accuracy_score(y_train, y_trn_pred)}

In [139]:
print(log_acc)

{'tst_acc': 1.0, 'trn_acc': 1.0}


### Visualizing Predictions

In [156]:
pandas.DataFrame({'Out-Sample Prediction' : y_tst_pred, 'Actual Value' : y_test, 'Successful Prediction': y_tst_pred==y_test})    # Yes, 9 out of 9 :)

Unnamed: 0,Out-Sample Prediction,Actual Value,Successful Prediction
0,0.0,0.0,True
1,0.0,0.0,True
2,1.0,1.0,True
3,0.0,0.0,True
4,0.0,0.0,True
5,0.0,0.0,True
6,0.0,0.0,True
7,0.0,0.0,True
8,1.0,1.0,True
9,1.0,1.0,True


In [157]:
pandas.DataFrame({'In-Sample Prediction' : y_trn_pred, 'Actual Value' : y_train, 'Successful Prediction': y_trn_pred==y_train})

Unnamed: 0,In-Sample Prediction,Actual Value,Successful Prediction
0,1.0,1.0,True
1,1.0,1.0,True
2,1.0,1.0,True
3,1.0,1.0,True
4,0.0,0.0,True
5,1.0,1.0,True
6,1.0,1.0,True
7,1.0,1.0,True
8,0.0,0.0,True
9,0.0,0.0,True
