# Logistic Regression

In [26]:
import pandas as pd
import numpy as np
import glob

from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import StandardScaler

from sklearn.linear_model import LogisticRegression

from collections import Counter

import math

In [2]:
import matplotlib.pyplot as plt

# Load data into panda-frames and inspect

In [17]:
weekly_activity_df = pd.read_pickle('weekly_activity_format.pkl')

X_week = np.array(list(weekly_activity_df['X']))
y = np.array(weekly_activity_df['y'])

In [76]:
X_week.shape, y.shape

((28993, 20, 42), (28993,))

# Data in activity types

## Format X-data 

In [19]:
# in this analysis we do not use temporal states so we sum all weeks to one row
X = []
for week_matrix in X_week: 
    X.append(week_matrix.sum(axis=1))

In [21]:
X = np.array(X)

## Train Logistic Regression Model

In [28]:
scaler = StandardScaler()
X = scaler.fit_transform(X)

In [29]:
# split into train and test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state = 3)

In [30]:
# print the counted y_train and y test
cnt_binary = Counter(list(y_train))
print("Training data: ", cnt_binary)
cnt_binary = Counter(list(y_test))
print("testing data: ", cnt_binary)

Training data:  Counter({1: 12340, 0: 10854})
testing data:  Counter({1: 3042, 0: 2757})


In [32]:
# fit model
clf = LogisticRegression(random_state=0).fit(X_train, y_train)

y_pred = clf.predict(X_test)

print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.74      0.83      0.78      2757
           1       0.83      0.74      0.78      3042

    accuracy                           0.78      5799
   macro avg       0.78      0.78      0.78      5799
weighted avg       0.79      0.78      0.78      5799



## Cross validations Scores

In [35]:
scores = cross_val_score(clf, X, y, cv=5)

In [36]:
print("%0.2f accuracy with a standard deviation of %0.2f" % (scores.mean(), scores.std()))

0.75 accuracy with a standard deviation of 0.05


## Print latex friendly format

In [52]:
classification_report_all = classification_report(y_test, y_pred)

In [53]:
def print_latex_report(classification_report): 
    for line in classification_report.split("\n"): 
        if (len(line.split()) > 0): 
            for print_item in line.split()[:-1]: 
                print(print_item, " & ", end="")
            print(line.split()[-1], " \\\\")


In [54]:
print_latex_report(classification_report_all)

precision  & recall  & f1-score  & support  \\
0  & 0.82  & 0.89  & 0.85  & 2757  \\
1  & 0.89  & 0.82  & 0.85  & 3042  \\
accuracy  & 0.85  & 5799  \\
macro  & avg  & 0.85  & 0.85  & 0.85  & 5799  \\
weighted  & avg  & 0.85  & 0.85  & 0.85  & 5799  \\


## First half of the VLE data

In [58]:
half_week_period = int(42/2)

In [59]:
# in this analysis we do not use temporal states so we sum all weeks to one row
X_half = []
for week_matrix in X_week: 
    X_half.append(week_matrix[:, :half_week_period].sum(axis=1))

In [60]:
# standarddize data
X_half = scaler.fit_transform(X_half)

In [61]:
# split into train and test set
X_half_train, X_half_test, y_train, y_test = train_test_split(X_half, y, test_size=0.2, random_state = 3)

In [62]:
# print the counted y_train and y test
cnt_binary = Counter(list(y_train))
print("Training data: ", cnt_binary)
cnt_binary = Counter(list(y_test))
print("testing data: ", cnt_binary)

Training data:  Counter({1: 12340, 0: 10854})
testing data:  Counter({1: 3042, 0: 2757})


In [63]:
# fit model
clf = LogisticRegression(random_state=0).fit(X_half_train, y_train)

y_pred = clf.predict(X_half_test)

print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.66      0.76      0.71      2757
           1       0.75      0.65      0.70      3042

    accuracy                           0.70      5799
   macro avg       0.71      0.70      0.70      5799
weighted avg       0.71      0.70      0.70      5799



### Cross validation results

In [64]:
scores = cross_val_score(clf, X_half, y, cv=5)

In [65]:
print("%0.2f accuracy with a standard deviation of %0.2f" % (scores.mean(), scores.std()))

0.67 accuracy with a standard deviation of 0.03


### Print Latex friendly

In [66]:
classification_report_half = classification_report(y_test, y_pred)

In [67]:
print_latex_report(classification_report_half)

precision  & recall  & f1-score  & support  \\
0  & 0.66  & 0.76  & 0.71  & 2757  \\
1  & 0.75  & 0.65  & 0.70  & 3042  \\
accuracy  & 0.70  & 5799  \\
macro  & avg  & 0.71  & 0.70  & 0.70  & 5799  \\
weighted  & avg  & 0.71  & 0.70  & 0.70  & 5799  \\


## Second half of VLE data

In [70]:
# in this analysis we do not use temporal states so we sum all weeks to one row
X_last_half = []
for week_matrix in X_week: 
    X_last_half.append(week_matrix[:, half_week_period:].sum(axis=1))

In [71]:
# standarddize data
X_last_half = scaler.fit_transform(X_last_half)

In [72]:
X_last_half_train, X_last_half_test, y_train, y_test = train_test_split(X_last_half, y, test_size=0.2, random_state = 3)

In [73]:
# fit model
clf = LogisticRegression(random_state=0).fit(X_last_half_train, y_train)

y_pred = clf.predict(X_last_half_test)

print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.82      0.89      0.85      2757
           1       0.89      0.82      0.85      3042

    accuracy                           0.85      5799
   macro avg       0.85      0.85      0.85      5799
weighted avg       0.85      0.85      0.85      5799



### Cross validation 

In [74]:
scores = cross_val_score(clf, X_last_half, y, cv=5)

In [75]:
print("%0.2f accuracy with a standard deviation of %0.2f" % (scores.mean(), scores.std()))

0.84 accuracy with a standard deviation of 0.05


# Data in weekly clicks

In [77]:
# in this analysis we do not use temporal states so we sum all weeks to one row
X = []
for week_matrix in X_week: 
    X.append(week_matrix.sum(axis=0))

X = np.array(X)

## All data

In [81]:
scaler = StandardScaler()
X = scaler.fit_transform(X)

# split into train and test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state = 3)

# print the counted y_train and y test
cnt_binary = Counter(list(y_train))
print("Training data: ", cnt_binary)
cnt_binary = Counter(list(y_test))
print("testing data: ", cnt_binary)
print()

# fit model
clf = LogisticRegression(random_state=0).fit(X_train, y_train)

y_pred = clf.predict(X_test)

print(classification_report(y_test, y_pred))

Training data:  Counter({1: 12340, 0: 10854})
testing data:  Counter({1: 3042, 0: 2757})

              precision    recall  f1-score   support

           0       0.79      0.90      0.84      2757
           1       0.90      0.78      0.84      3042

    accuracy                           0.84      5799
   macro avg       0.84      0.84      0.84      5799
weighted avg       0.85      0.84      0.84      5799



### Cross validation

In [82]:
scores = cross_val_score(clf, X_last_half, y, cv=5)

print("%0.2f accuracy with a standard deviation of %0.2f" % (scores.mean(), scores.std()))

0.84 accuracy with a standard deviation of 0.05
