# Notebook for data analysis for 4th year dissertation
#### Anguel Hristozov, 2255541h

### Imports

In [None]:
#
import json
import re
from copy import deepcopy
from os import getenv, path
from sys import exit

import eli5
import googlemaps
import matplotlib.pyplot as plt
import mysql.connector
import numpy as np
import pandas as pd
import pydotplus
import seaborn as sns
import swifter
from dotenv import load_dotenv
from imblearn.ensemble import BalancedRandomForestClassifier
from imblearn.over_sampling import SMOTE
from IPython.core.interactiveshell import InteractiveShell
from IPython.display import Image
from joblib import dump, load
from mapbox import Geocoder
from scipy.sparse import csr_matrix
from six import StringIO
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.cluster import KMeans
from sklearn.ensemble import (ExtraTreesClassifier, GradientBoostingClassifier,
                              RandomForestClassifier)
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (accuracy_score, auc, classification_report,
                             confusion_matrix, fbeta_score, precision_score,
                             recall_score, roc_auc_score, roc_curve)
from sklearn.model_selection import (GridSearchCV, KFold, LeavePOut,
                                     ShuffleSplit, StratifiedKFold,
                                     StratifiedShuffleSplit, cross_val_predict,
                                     cross_val_score, cross_validate,
                                     learning_curve, train_test_split,
                                     validation_curve)
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import FeatureUnion, Pipeline
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, label_binarize, MinMaxScaler
from sklearn.tree import (DecisionTreeClassifier, ExtraTreeClassifier,
                          export_graphviz, plot_tree)

# load variables from a local .env file into the environment (MAPS_KEY is read via getenv further down)
load_dotenv()

### Globals and settings

In [None]:
#
# reload imported modules automatically before a cell is executed
%load_ext autoreload
%autoreload 2

# inject CSS that sets the height of scrolling output areas
from IPython.core.display import display, HTML
display(HTML("<style>div.output_scroll { height: 44em; }</style>"))

# globals
IGNORE_ANDROID_10 = True  # toggle to use extra android 10 data feature points or not
ANDROID_10_FEATURES = ('lastTimeForegroundServiceUsed', 'lastTimeVisible',
                       'totalTimeForegroundServiceUsed', 'totalTimeVisible'
                       )  # the android 10 data feature points
# when True add_sias_score returns three classes (0/1/2) instead of two (0/1)
DIFFERENTIATE_BETWEEN_PHOBIA_ANXIETY = False

UNKNOWN_APP_NAME = '<Unknown App>'
# label returned by add_loc_category when a coordinate has no cached geocode
UNKNOWN_CATEGORY = '<Unknown Category>'
# fraction of rows held out as the test set by train_test_split
TEST_SIZE = 0.2
# file the reverse-geocoding results are dumped to / loaded from
GEO_FILE = 'saved_reverse_geocodes'

# notebook settings
pd.set_option('display.max_columns', None)
pd.set_option('display.max_colwidth', None)
# echo the value of every expression statement in a cell, not only the last one
InteractiveShell.ast_node_interactivity = "all"

### Reading in data

In [None]:
##### IMPORTANT
# for the duration of this project data was read in from a mariadb table. I added the csv option only for project submission.
# all data was exported to csv files from the mysql data export wizard.
# however results should not change

# read data from mariadb table
def read_data_sql(com):
    """Read a whole database table into a DataFrame.

    Args:
        com: Name of the table to read.

    Returns:
        The DataFrame produced by ``pd.read_sql_table`` for that table.
    """
    # Take the connection URL from the environment (load_dotenv() has already
    # run) so credentials do not have to live in the notebook; the original
    # local development URL stays as the fallback.
    con = getenv(
        'DISSERTATION_DB_URL',
        'mysql+mysqlconnector://admin:password@127.0.0.1:3306/Dissertation')
    return pd.read_sql_table(com, con=con)

# read from csv
def read_data(file):
    """Load ``data/<file>.csv`` (comma separated) as a DataFrame."""
    csv_path = f'data/{file}.csv'
    return pd.read_csv(csv_path, sep=',')

# load data
# The exported tables carry a database `id` column that the analysis never
# needs, so it is stripped as soon as the table is read.
call_df = read_data('calls').drop(columns='id')  # not used
user_df = read_data('user')
category_df = read_data('app_categories')
location_df = read_data('locations').drop(columns='id')
session_df = read_data('user_session_data').drop(columns='id')

### Classifier Info Functions

In [None]:
#
# plotting defaults applied to every chart drawn below
sns.set_context('paper')
# NOTE(review): newer matplotlib releases renamed the bundled seaborn styles
# (e.g. 'seaborn-v0_8-paper') — confirm against the pinned matplotlib version
plt.style.use('seaborn-paper')
def evaluation_summary(description, predictions, true_labels, c, train_data, save=False, sideways=False, drop=None):
    """Print and plot an evaluation report for a fitted binary classifier.

    Prints accuracy, precision, recall and F1, shows the confusion matrix as a
    heatmap, a bar chart of the per-class F1 values and, when the classifier
    exposes ``feature_importances_``, a chart of the feature importances.

    Args:
        description: Classifier name used in the printout, titles and file names.
        predictions: Labels predicted for the test set.
        true_labels: Ground-truth labels of the test set.
        c: The fitted classifier.
        train_data: DataFrame whose columns (minus ``drop``) name the features.
        save: When True every figure is also written to a PDF file.
        sideways: When True the feature importances are drawn as horizontal bars.
        drop: Columns of ``train_data`` that are not features. Defaults to
            ``['sias']``.
    """
    if drop is None:
        drop = ['sias']

    # sklearn metrics expect (y_true, y_pred); passing them the other way
    # round exchanges precision and recall.
    precision = precision_score(true_labels, predictions, average='binary')
    recall = recall_score(true_labels, predictions, average='binary')
    accuracy = accuracy_score(true_labels, predictions)
    # beta=1 gives the F1 measure; beta has to be passed by keyword on newer sklearn
    f1_macro = fbeta_score(true_labels, predictions, beta=1, average='binary')
    print("Classifier '%s' has Acc=%0.3f P=%0.3f R=%0.3f F1=%0.3f" % (description,accuracy,precision,recall,f1_macro))

    # confusion matrix heatmap
    con_matrix = confusion_matrix(true_labels, predictions)
    df_con_matrix = pd.DataFrame(con_matrix, index=['Neither', 'Anxious'], columns=['Neither', 'Anxious'])
    plt.figure()
    heatmap = sns.heatmap(df_con_matrix, annot=True, fmt='g')
    plt.tight_layout()
    if save:
        image = heatmap.get_figure()
        image.savefig('heatmap_{}.pdf'.format(description), dpi=300)
    plt.show()

    # per-class report and F1 bar chart
    print(classification_report(true_labels, predictions, digits=3))
    report = classification_report(true_labels, predictions, digits=3, output_dict=True)
    # the last three entries of the report are the summary rows, not classes
    labels = list(report.keys())[:-3]
    f1_values = [report[label]['f1-score'] for label in labels]
    plt.bar(labels, f1_values)
    plt.xticks(rotation='vertical')
    plt.xlabel('SIAS Category')
    plt.ylabel('F1 Values')
    plt.title('F1 Values for Classifier Classes')
    plt.tight_layout()
    if save:
        plt.savefig('f1_graph_{}.pdf'.format(description), dpi=300)
    plt.show()

    if not hasattr(c, 'feature_importances_'):
        return

    # feature importances, most important first
    importances = c.feature_importances_
    idx = np.argsort(importances)[::-1]
    features = train_data.drop(columns=drop)
    feature_names = [features.columns[i] for i in idx]
    positions = range(features.shape[1])
    plt.figure()
    plt.title('Feature Importances for {}'.format(description))
    if sideways:
        plt.barh(positions, importances[idx])
        plt.yticks(positions, feature_names)
        plt.ylabel('Feature')
        plt.xlabel('Overall Importance')
    else:
        plt.xlabel('Feature')
        plt.ylabel('Overall Importance')
        plt.bar(positions, importances[idx])
        plt.xticks(positions, feature_names, rotation=45)
    if save:
        plt.savefig('feature_importance_{}.pdf'.format(description), dpi=300, bbox_inches='tight')
    plt.show()
    
def plot_bar(data, x_label, title, save=False, sideways=False):
    """Draw a count plot of one column and return the pyplot module.

    Args:
        data: DataFrame holding the column to count.
        x_label: Name of the column whose values are counted.
        title: Chart title, also used in the saved file name.
        save: When True the chart is written to ``bar_chart_<title>.pdf``.
        sideways: When True the counted column is placed on the y axis.

    Returns:
        ``plt``, so the caller can call ``show()`` on it.
    """
    plt.title(title)
    plt.tight_layout()
    # the counted column goes on y for a sideways chart, otherwise on x
    column_axis = {'y': x_label} if sideways else {'x': x_label}
    sns.countplot(data=data, palette='colorblind', **column_axis)
    if save:
        plt.savefig('bar_chart_{}.pdf'.format(title), dpi=300, bbox_inches='tight')
    return plt
    
def plot_roc(name, y_test, y_score, n_classes=2, save=False):
    """Plot the ROC curve of a classifier next to a constant-score baseline.

    Args:
        name: Classifier name used in the title and the saved file name.
        y_test: True labels of the test set.
        y_score: Score array indexed as ``[:, 1]`` to obtain the scores that
            are plotted.
        n_classes: Unused; kept for interface compatibility.
        save: When True the chart is written to ``roc_<name>.pdf``.

    Returns:
        ``plt``, so the caller can call ``show()`` on it.
    """
    baseline_scores = [0] * len(y_test)
    positive_scores = y_score[:, 1]

    baseline_fpr, baseline_tpr, _ = roc_curve(y_test, baseline_scores)
    model_fpr, model_tpr, _ = roc_curve(y_test, positive_scores)
    model_auc = roc_auc_score(y_test, positive_scores)

    # dashed baseline first, then the classifier's curve on top
    plt.plot(baseline_fpr, baseline_tpr, linestyle='--', color='navy', lw=2)
    plt.plot(model_fpr, model_tpr, color='darkorange', lw=2,
             label='ROC curve (area = {:02.2f})'.format(model_auc))

    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.legend(loc='lower right')
    plt.title('Receiver Operating Curve for {}'.format(name))
    plt.tight_layout()
    if save:
        plt.savefig('roc_{}.pdf'.format(name), dpi=300)
    return plt

def graph_correlation(title, data, save=False, rotation=60, figsize=None):
    """Draw a heatmap of the pairwise correlation of the numeric columns.

    Args:
        title: Prefix of the chart title, also used in the saved file name.
        data: DataFrame to correlate.
        save: When True the chart is written to ``correlation_<title>.pdf``;
            otherwise it is shown.
        rotation: Rotation of the x tick labels in degrees.
        figsize: Optional figure size passed to ``plt.figure``.
    """
    # figsize=None is matplotlib's default, so one call covers both cases
    plt.figure(figsize=figsize)
    # Select the numeric columns explicitly: DataFrame.corr() no longer drops
    # string columns silently on newer pandas and raises instead.
    numeric = data.select_dtypes(include=['number', 'bool'])
    cor = sns.heatmap(numeric.corr(), cmap='YlGnBu', annot_kws={'size':30}, square=True)
    plt.title('{} Correlation'.format(title))
    plt.xticks(rotation=rotation)
    plt.tight_layout()
    if save:
        image = cor.get_figure()
        image.savefig('correlation_{}.pdf'.format(title), dpi=300, bbox_inches='tight')
    else:
        plt.show()

# adapted from:        
# https://scikit-learn.org/stable/auto_examples/model_selection/plot_learning_curve.html#sphx-glr-auto-examples-model-selection-plot-learning-curve-py
def plot_learning_curve(estimator, title, X, y, axes=None, ylim=None, cv=None,
                        n_jobs=None, train_sizes=np.linspace(.1, 1.0, 5), save=False):
    """Plot learning curve, scalability and performance charts for an estimator.

    Three figures are shown: score against training set size, fit time
    against training set size, and score against fit time.

    Args:
        estimator: Estimator handed to ``learning_curve``.
        title: Title of the first chart, also used in the saved file names.
        X: Training features.
        y: Training labels.
        axes: Unused; kept for interface compatibility.
        ylim: Optional ``(low, high)`` y limits of the first chart.
        cv: Cross-validation strategy handed to ``learning_curve``.
        n_jobs: Number of parallel jobs handed to ``learning_curve``.
        train_sizes: Training set sizes handed to ``learning_curve``.
        save: When True every chart is written to a PDF file.

    Returns:
        ``plt``, so the caller can call ``show()`` on it.
    """
    train_sizes, train_scores, test_scores, fit_times, _ = \
        learning_curve(estimator, X, y, cv=cv, n_jobs=n_jobs,
                       train_sizes=train_sizes,
                       return_times=True)
    train_scores_mean = np.mean(train_scores, axis=1)
    train_scores_std = np.std(train_scores, axis=1)
    test_scores_mean = np.mean(test_scores, axis=1)
    test_scores_std = np.std(test_scores, axis=1)
    fit_times_mean = np.mean(fit_times, axis=1)
    fit_times_std = np.std(fit_times, axis=1)

    # Plot learning curve
    plt.figure()
    plt.grid()
    # ylim defaults to None, which cannot be unpacked
    if ylim is not None:
        plt.ylim(*ylim)
    plt.title(title)
    plt.xlabel("Training examples")
    plt.ylabel("Score")
    plt.fill_between(train_sizes, train_scores_mean - train_scores_std,
                     train_scores_mean + train_scores_std, alpha=0.1,
                     color="r")
    plt.fill_between(train_sizes, test_scores_mean - test_scores_std,
                     test_scores_mean + test_scores_std, alpha=0.1,
                     color="g")
    plt.plot(train_sizes, train_scores_mean, 'o-', color="r",
             label="Training score")
    plt.plot(train_sizes, test_scores_mean, 'o-', color="g",
             label="Cross-validation score")
    plt.legend(loc="best")
    if save:
        plt.tight_layout()
        plt.savefig('learning_curve_1_{}.pdf'.format(title), dpi=300, bbox_inches='tight')
    plt.show()

    # Plot fit time against number of training examples
    plt.figure()
    plt.grid()
    plt.plot(train_sizes, fit_times_mean, 'o-')
    plt.fill_between(train_sizes, fit_times_mean - fit_times_std,
                     fit_times_mean + fit_times_std, alpha=0.1)
    plt.xlabel("Training examples")
    plt.ylabel("fit_times")
    plt.title("Scalability of the model")
    if save:
        plt.tight_layout()
        plt.savefig('learning_curve_2_{}.pdf'.format(title), dpi=300, bbox_inches='tight')
    plt.show()

    # Plot score against fit time
    plt.figure()
    plt.grid()
    plt.plot(fit_times_mean, test_scores_mean, 'o-')
    plt.fill_between(fit_times_mean, test_scores_mean - test_scores_std,
                     test_scores_mean + test_scores_std, alpha=0.1)
    plt.xlabel("fit_times")
    plt.ylabel("Score")
    plt.title("Performance of the model")
    if save:
        plt.tight_layout()
        plt.savefig('learning_curve_3_{}.pdf'.format(title), dpi=300, bbox_inches='tight')
    plt.show()
    return plt


# https://scikit-learn.org/stable/auto_examples/model_selection/plot_validation_curve.html#sphx-glr-auto-examples-model-selection-plot-validation-curve-py
def plot_validation_curve(c_name, clf, train_data, train_labels, param_name, param_range, save=False):
    """Plot training and cross-validation accuracy over a parameter range.

    Args:
        c_name: Classifier name used in the title and the saved file name.
        clf: Estimator handed to ``validation_curve``.
        train_data: Training features.
        train_labels: Training labels.
        param_name: Name of the estimator parameter that is varied.
        param_range: Values of that parameter to evaluate (drawn on a log x axis).
        save: When True the chart is written to ``validation_curve_<c_name>.pdf``.

    Returns:
        ``plt``, so the caller can call ``show()`` on it.
    """
    train_scores, test_scores = validation_curve(
        clf, train_data, train_labels, param_name=param_name, param_range=param_range, scoring="accuracy", n_jobs=1)

    train_scores_mean = np.mean(train_scores, axis=1)
    train_scores_std = np.std(train_scores, axis=1)
    test_scores_mean = np.mean(test_scores, axis=1)
    test_scores_std = np.std(test_scores, axis=1)

    plt.title("Validation Curve with {}".format(c_name))
    # label the axis with the parameter actually varied instead of a fixed gamma
    plt.xlabel(param_name)
    plt.ylabel("Score")
    plt.ylim(0.0, 1.1)
    lw = 2
    plt.semilogx(param_range, train_scores_mean, label="Training score",
                 color="darkorange", lw=lw)
    plt.fill_between(param_range, train_scores_mean - train_scores_std,
                     train_scores_mean + train_scores_std, alpha=0.2,
                     color="darkorange", lw=lw)
    plt.semilogx(param_range, test_scores_mean, label="Cross-validation score",
                 color="navy", lw=lw)
    plt.fill_between(param_range, test_scores_mean - test_scores_std,
                     test_scores_mean + test_scores_std, alpha=0.2,
                     color="navy", lw=lw)
    plt.legend(loc="best")
    plt.tight_layout()
    if save:
        plt.savefig('validation_curve_{}.pdf'.format(c_name), dpi=300)
    return plt

In [None]:
# Survey results, typed in by hand from the survey data. Each section builds a
# small frame, draws it as a bar chart, saves it to a PDF and shows it.

# comfort with location tracking: number of answers per level 1-5
location_comfort_level = {
    'level': [1,2,3,4,5],
    'amount': [0,2,3,5,4]
}
p = pd.DataFrame(location_comfort_level)

plt.figure()
plt.title('How Comfortable Participants Felt About Location Tracking')
plt.ylabel('Amount of Participants')
plt.xlabel('Level (Increased Level Means More Comfortable)')
plt.bar(range(1,6), p['amount'])
plt.tight_layout()
plt.savefig('location_tracking_comfort.pdf', dpi=300, bbox_inches='tight')
plt.show()

# comfort with data analysis: number of answers per level 1-5
data_analysis_comfort = {
    'level': [1,2,3,4,5],
    'amount': [0,2,4,3,5]
}

p = pd.DataFrame(data_analysis_comfort)

plt.figure()
plt.title('How Comfortable Participants Felt About Data Analysis')
plt.ylabel('Amount of Participants')
plt.xlabel('Level (Increased Level Means More Comfortable)')
plt.bar(range(1,6), p['amount'])
plt.tight_layout()
plt.savefig('data_analysis_comfort.pdf', dpi=300, bbox_inches='tight')
plt.show()

# whether behaviour changed since recording started: yes/no counts
behaviour_change = {
    'level': ['Yes','No'],
    'amount': [2, 13]
}

p = pd.DataFrame(behaviour_change)

plt.figure()
plt.title('Did Behaviour of Participants Change Since Usage Data was Recorded')
plt.ylabel('Amount of Participants')
plt.xlabel('Answer')
plt.bar(p['level'], p['amount'])
plt.tight_layout()
plt.savefig('behaviour_change.pdf', dpi=300, bbox_inches='tight')
plt.show()

# comfort with usage data being readily available: yes/no/neither counts
usage_stats = {
    'level': ['Yes','No', 'Neither'],
    'amount': [7, 3, 4]
}

p = pd.DataFrame(usage_stats)

plt.figure()
plt.title('Did Participants Feel Comfortable Knowing That Their Usage Data is Readily Available')
plt.ylabel('Amount of Participants')
plt.xlabel('Answer')
plt.bar(p['level'], p['amount'])
plt.tight_layout()
plt.savefig('usage_stats.pdf', dpi=300, bbox_inches='tight')
plt.show()



### Initial data statistics

In [None]:
#
# count of rows in user_df per SIAS score; also saved as a PDF by plot_bar
bar = plot_bar(user_df, 'sias', 'SIAS Score Distribution', save=True)
bar.show()

### Pipeline manipulators

In [None]:
# 
class ItemSelector(BaseEstimator, TransformerMixin):
    """Transformer that picks one entry out of keyed data.

    ``transform`` returns ``data_dict[key]``; with a DataFrame this is the
    column named ``key``.
    """

    def __init__(self, key):
        self.key = key

    def fit(self, x, y=None):
        # nothing to learn
        return self

    def transform(self, data_dict):
        selected = data_dict[self.key]
        return selected
    

class SparseTranspose(BaseEstimator, TransformerMixin):
    """Transformer that converts its input to a CSR matrix and transposes it."""

    def fit(self, X, y=None):
        # nothing to learn
        return self

    def transform(self, X):
        return csr_matrix(X).transpose()


class TransposeTransformer(BaseEstimator, TransformerMixin):
    """Transformer that converts a sparse matrix into a dense array."""

    def fit(self, X, y=None):
        # Nothing to learn. fit has to return the transformer itself; the
        # previous body called toarray() on self, which does not exist.
        return self

    def transform(self, X):
        return X.toarray()

### Converters and column manipulators

In [None]:
#
# raw package-style names seen in the session data -> readable app names
_UNDEFINED_APP_NAMES = {
    'UNDEFINED_codetivelab.macfinder.bluetooth.bluetoothmacfinder':
    'Bluetooth Mac Finder',
    'UNDEFINED_com.cyclingapp': 'CyclingApp',
    'UNDEFINED_com.Slack': 'Slack',
    'UNDEFINED_com.sonelli.juicessh': 'JuiceSSH - SSH Client',
    'UNDEFINED_com.termux': 'Termux',
    'UNDEFINED_com.vsrevogroup.revouninstallermobile':
    'Revo Uninstaller Mobile',
    'UNDEFINED_com.zhiliaoapp.musically': 'Musically',
    'com.oneplus.wifiapsettings': 'OnePlus Wifi AP Settings',
    'UNDEFINED_org.schabi.newpipe': 'NewPipe',
    'UNDEFINED_com.bumble.app': 'Bumble',
    'UNDEFINED_com.backbone': 'Backbone',
    'UNDEFINED_com.elevenkings.football': 'Eleven Kings',
    'UNDEFINED_com.google.android.calendar': 'Google Calendar',
    'UNDEFINED_com.imaginecurve.curve.prd': 'Curve',
    'UNDEFINED_com.instagram.android': 'Instagram',
    'UNDEFINED_com.jamworks.alwaysondisplay': 'Always On Display',
    'UNDEFINED_com.mttnow.droid.easyjet': 'easyJet',
    'UNDEFINED_com.net.furaffrate.furaffinity': 'NOC for Fur Affinity',
    'UNDEFINED_com.plarium.vikings': 'Vikings',
    'UNDEFINED_com.shpock.android': 'Shpock',
    'UNDEFINED_com.ticketmaster.mobile.android.uk': 'TicketmasterUK',
    'UNDEFINED_com.tinder': 'Tinder',
    'UNDEFINED_com.ubercab': 'Uber',
}


def map_undefined_to_actual(string_name):
    """Translate a raw package-style app name into its readable name.

    Raises:
        KeyError: If the name is not in the table. This is deliberate, so an
            unmapped app gets noticed instead of slipping through.
    """
    return _UNDEFINED_APP_NAMES[string_name]


# data converters
def convert_session_app_data(string):
    """Parse one ``{key=value, ...}`` app usage entry into a dictionary.

    The ``name`` value is normalised (package-style names are mapped to
    readable ones, a few known names are title-cased) and the app's category
    is looked up in ``category_df`` and stored under ``'category'``. Every
    other value is converted to ``int``. Android 10 only fields are skipped
    while ``IGNORE_ANDROID_10`` is set.

    Args:
        string: The entry as text, e.g. ``'{name=Slack, totalTimeInForeground=12}'``.

    Returns:
        Dictionary with the parsed fields. ``'category'`` is missing when the
        app name is not found in ``category_df``.

    Raises:
        KeyError: If a package-style name has no readable mapping.
        ValueError: If a non-name value is not an integer.
    """
    # for some reason different phones spell these without title case on the second word
    inconsistent_case_names = [
        'keep notes', 'android system', 'file manager',
        'settings suggestions', 'call management'
    ]

    obj = dict()
    for match in re.findall(r'[\w]+=[\w ]+', string):
        # the pattern allows exactly one '=' per match
        key, value = match.split('=')

        if key != 'name':
            if IGNORE_ANDROID_10 and key in ANDROID_10_FEATURES:
                continue
            obj[key] = int(value)
            continue

        # with name can get category
        name = value
        if not name.strip():
            # only report the blank name; the old code went on to read an
            # undefined variable here and crashed with a NameError
            print("name is none...:" + str(match))

        if name.startswith('UNDEFINED') or name.startswith('com.'):
            name = map_undefined_to_actual(name)

        if name.lower() in inconsistent_case_names:
            name = name.title()

        obj[key] = name
        try:
            obj['category'] = category_df.loc[category_df['app_name'] ==
                                              name].values[0][1]
        except IndexError:
            # no row for this app in category_df: report it, leave category unset
            print(name)

    return obj


def convert_session_data_list(session_data):
    """Turn the text of a list of ``{...}`` entries into a list of dictionaries.

    Every brace-delimited entry found in ``session_data`` is parsed with
    ``convert_session_app_data``; text that matches no entry gives ``[]``.
    """
    entries = re.findall(r'(\{[A-Za-z0-9_=, ]+\})', session_data)
    return [convert_session_app_data(entry) for entry in entries]


# map a user's SIAS score to an integer class label, effectively placing everything in buckets
def add_sias_score(uid):
    """Return the class label for the user with the given uid.

    The score is read from the second column of the user's row in
    ``user_df``. Without phobia/anxiety differentiation a score of 34 or more
    gives 1, otherwise 0. With differentiation a score of 42 or more gives 2,
    34 to 41 gives 1, and anything lower gives 0.
    """
    sias = user_df.loc[user_df['uid'] == uid].values[0][1]
    if not DIFFERENTIATE_BETWEEN_PHOBIA_ANXIETY:
        return 1 if sias >= 34 else 0
    if sias >= 42:
        return 2
    if sias >= 34:
        return 1
    return 0


def add_sias_score_scalar(uid):
    """Return the raw value in the second column of the user's row in ``user_df``."""
    user_rows = user_df.loc[user_df['uid'] == uid]
    return user_rows.values[0][1]

### Location

In [None]:
#
# drop the location columns that are not used by the analysis below
location_df = location_df.drop(columns=['hAccuracy', 'altitude', 'vAccuracy', 'bearing', 'bearingAccuracy',
                                        'speedAccuracy', 'elapsedNanosSinceBoot', 'provider', 'elapsedNanosLocation', 'locationTimestamp'])
# label every location row with the class add_sias_score derives for its uid
location_df['sias'] = location_df.uid.swifter.progress_bar(True).apply(add_sias_score)

In [None]:
#
# Round lat/lng to be able to remove duplicates. The longitude key used to be
# misspelled; DataFrame.round ignores keys that are not columns, so longitude
# was silently left unrounded.
# NOTE(review): a geocode cache file written before this correction may be
# keyed by unrounded longitudes — regenerate it if lookups start to miss.
rounded_location_df = location_df.round({'latitude': 7, 'longitude': 7})
rounded_location_df['latitude'] = rounded_location_df['latitude'].astype('str')
rounded_location_df['longitude'] = rounded_location_df['longitude'].astype('str')

# one row per distinct coordinate, so every place is geocoded only once
unique_location_df = rounded_location_df.drop_duplicates(subset=['latitude', 'longitude'])

In [None]:
#
gmaps = googlemaps.Client(key=getenv('MAPS_KEY'))

def convert_coords_to_locs(row):
    """Reverse-geocode one location row and cache the result.

    The result is stored in the module-level ``saved_locations`` dictionary
    under ``(latitude, longitude, systemTimestamp)``, all as strings. The old
    body discarded the result, so the cache stayed empty.

    Args:
        row: A location row with 'latitude', 'longitude' and 'systemTimestamp'.

    Returns:
        The reverse-geocoding result for the row's coordinate.
    """
    lat = str(row['latitude'])
    lng = str(row['longitude'])
    time = str(row['systemTimestamp'])
    key = (lat,lng,time)
    r_g = gmaps.reverse_geocode((lat, lng))
    # NOTE(review): writing to a global only works while the apply runs inside
    # this process — confirm how swifter schedules it; the value is returned too
    saved_locations[key] = r_g
    return r_g

In [None]:
##### VERY IMPORTANT, ONLY RUN THIS ONCE AND THEN SAVE THE DICTIONARY TO A FILE
##### there is a limited amount of calls to the api that can be made before running out of credits
##### no point paying extra if the results can be cached
##### the saving is done below
if path.isfile(GEO_FILE):
    # a cache from an earlier run exists: reuse it instead of calling the API
    saved_locations = load(GEO_FILE)
else:
    saved_locations = {}
    unique_location_df.swifter.progress_bar(True).apply(convert_coords_to_locs, axis=1)  # takes about a half hour on 8 core i7 from 2012 and 24 gb ram
    dump(value=saved_locations, filename=GEO_FILE)

In [None]:
# work on a copy so the loaded cache itself stays untouched, in case
saved_locations_copy = deepcopy(saved_locations)
# key the lookup table by (latitude, longitude) strings because pickling converts them to floats
saved_locations_copy_with_str = {
    (str(coords[0]), str(coords[1])): geocode
    for coords, geocode in saved_locations_copy.items()
}

In [None]:
# adapted from Edward Wood's method
def add_loc_category(row):
    """Return the place category of a location row from the cached geocodes.

    The category is the type listed right after ``'establishment'`` in the
    first geocoding result that has one. ``'natural_feature'`` is treated as
    ``'home'``, and ``'home'`` is also the answer when no result names an
    establishment.

    Args:
        row: A location row with string 'latitude' and 'longitude' values.

    Returns:
        The category, or ``UNKNOWN_CATEGORY`` when the coordinate has no
        cached geocode (the coordinate is printed in that case).
    """
    key = (row['latitude'], row['longitude'])
    if key not in saved_locations_copy_with_str:
        print(key)
        return UNKNOWN_CATEGORY

    for result in saved_locations_copy_with_str[key]:
        types = result['types']
        # Stop one short of the end: an 'establishment' in last position has
        # no following type and used to raise an IndexError.
        for position, place_type in enumerate(types[:-1]):
            if place_type == 'establishment':
                category = types[position + 1]
                return 'home' if category == 'natural_feature' else category
    return 'home'

rounded_location_df['loc_category'] = rounded_location_df.swifter.progress_bar(True).apply(add_loc_category, axis=1)

In [None]:
#
# number of location rows per SIAS class
bar = plot_bar(rounded_location_df, 'sias', 'Location SIAS Scores Distribution', save=True)
bar.show()
# number of location rows per place category, drawn sideways
bar = plot_bar(rounded_location_df,
               'loc_category', 
               'Location Category Distribution', save=True, sideways=True)
bar.show()

In [None]:
# one-hot encode the place category via pd.get_dummies: one 'cat_<category>'
# column per category is appended to the frame
rounded_location_df['loc_category'] = pd.Categorical(rounded_location_df['loc_category'])
cat_dummies = pd.get_dummies(rounded_location_df['loc_category'], prefix="cat")
rounded_location_df = pd.concat([rounded_location_df, cat_dummies], axis=1)

In [None]:
#
# share of location rows per place category
rounded_location_df['loc_category'].value_counts(normalize=True)

In [None]:
# convert lat and lng back to floats for classifier. will result in some precision lost, hopefully not much
# (the string columns are kept; the floats go into separate f_* columns)
rounded_location_df['f_longitude'] = rounded_location_df['longitude'].astype('float')
rounded_location_df['f_latitude'] = rounded_location_df['latitude'].astype('float')

In [None]:
#
# spread and centre of the class label over all location rows
rounded_location_df['sias'].var()
rounded_location_df['sias'].std()
rounded_location_df['sias'].mean()
# the reason these are dropped is because original did not use them. perhaps can show with more data?
# what remains for the correlation heatmap is the label plus the one-hot category columns
graph_correlation('Location Data ', rounded_location_df.drop(columns=['uid','loc_category', 'latitude', 'longitude', 'systemTimestamp', 'f_longitude', 'f_latitude', 'speed']), save=True, rotation=90, figsize=(10,7))

In [None]:
#
# Decision tree that predicts the SIAS class from the location features.
location_tree = DecisionTreeClassifier()

# oversampler, only used by the commented-out line further down
sm = SMOTE(sampling_strategy='auto')

# the feature frame still contains the 'sias' column at this point; it is
# dropped again wherever the frame is handed to the classifier
location_train_set, location_test_set, location_train_labels, location_test_labels = train_test_split(
    rounded_location_df.drop(columns=['uid','loc_category', 'latitude', 'longitude', 'systemTimestamp', 'f_longitude', 'f_latitude', 'speed']),
    rounded_location_df['sias'],
    test_size=TEST_SIZE,
    shuffle=True)

# uncomment the line below to perform oversampling. change the file name accordingly
# location_train_set, location_train_labels = sm.fit_resample(location_train_set, location_train_labels)

x = location_tree.fit(location_train_set.drop(columns='sias'), location_train_labels)

# hard predictions feed the summary, class probabilities feed the ROC curve
y_predict = location_tree.predict(location_test_set.drop(columns='sias'))
y_predict_proba = location_tree.predict_proba(location_test_set.drop(columns='sias'))

curve = plot_roc("Decision Tree - Locations", location_test_labels, y_predict_proba, save=True)
curve.show()

evaluation_summary("Decision Tree - Locations", y_predict, location_test_labels, location_tree, location_train_set, save=True, sideways=True)

# learning curve of a fresh tree on the training split, 30 stratified folds
title = "Decision Tree - Locations"
plt_cv = StratifiedKFold(n_splits=30, shuffle=True)
curve = plot_learning_curve(DecisionTreeClassifier(), title, location_train_set.drop(columns='sias'), location_train_labels, ylim=(0.4, 1.01), cv=plt_cv, n_jobs=-1, save=True)
curve.show()

### Manipulating data

In [None]:
# convert each session_data string to a list of per-app dictionaries
session_df['session_data'] = session_df.session_data.swifter.progress_bar(True).apply(convert_session_data_list)

In [None]:
# add the class label derived from the user's sias score to each session
session_df['sias'] = session_df.uid.swifter.progress_bar(True).apply(add_sias_score)

In [None]:
#
# session start/end are parsed as millisecond timestamps into datetime columns
session_df['session_start_ts'] = pd.to_datetime(session_df['session_start'], unit='ms')
session_df['session_end_ts'] = pd.to_datetime(session_df['session_end'], unit='ms')

### Combining the data into one dataframe (flatten and expand the sessions)

In [None]:
#
def combine(row):
    """Expand one session into a frame with one row per app usage entry.

    Args:
        row: A session row with 'sias', 'session_start', 'session_end' and the
            parsed 'session_data' list.

    Returns:
        DataFrame built from the session's app entries, with the session's
        start, end, length and label repeated on every row.
    """
    session_start = int(row['session_start'])
    session_end = int(row['session_end'])

    frame = pd.DataFrame.from_records(row['session_data'])
    frame['sessionStart'] = session_start
    frame['sessionEnd'] = session_end
    frame['sessionLength'] = session_end - session_start
    frame['sias'] = row['sias']
    return frame


# Build every per-session frame first and concatenate once. Appending to a
# global frame from inside a swifter apply copied the whole frame for each
# session, and relied on the function being run exactly once per row in this
# process.
session_frames = [combine(row) for _, row in session_df.iterrows()]
if session_frames:
    combined_data = pd.concat(session_frames, axis=0, ignore_index=True)
else:
    combined_data = pd.DataFrame()


In [None]:
# remove rows whose category is 'System' as these are mainly junk apps
combined_data = combined_data[combined_data['category'] != 'System']

In [None]:
#
# centre, spread and class shares of the label over all app usage rows
combined_data['sias'].mean()
combined_data['sias'].var()
combined_data['sias'].std()
combined_data['sias'].value_counts(normalize=True)
# correlation heatmap without the raw timestamp columns; shown, not saved
graph_correlation(
    'Session Data ',
    combined_data.drop(columns=['sessionStart', 'sessionEnd', 'lastTimeUsed']),
    save=False, rotation=30
)

In [None]:
#
# number of app usage rows per SIAS class
bar = plot_bar(combined_data, 'sias', 'Application Session SIAS Scores Distribution', save=False)
bar.show()

# share of app usage rows per app category
n = combined_data['category'].value_counts(normalize=True)
# Name the columns explicitly: the names produced by DataFrame(n).reset_index()
# depend on the pandas version, and the plot below needs 'index' (category
# label) and 'category' (share).
pdn = pd.DataFrame({'index': n.index, 'category': n.values})
pdn.head(5)

plt.figure()
plt.title('App Categories Among All Users')
sns.barplot(x="category", y="index", data=pdn)
plt.ylabel('App Category')
plt.xlabel('Percentage')
plt.tight_layout()
plt.savefig('bar_chart_Application Session Category Distribution.pdf', dpi=300, bbox_inches='tight')

### Splitting up data into train/test

In [None]:
#
# raw timestamp columns are removed before splitting; 'sias' is last in the
# list so that dropped_columns[:-1] keeps the label column in the frame
dropped_columns=['sessionStart', 'sessionEnd', 'lastTimeUsed', 'sias']
print(combined_data.drop(columns=dropped_columns[:-1]).columns)
# shuffle the rows (frac=1 keeps all of them)
combined_data = combined_data.sample(frac=1)

X_train_data, X_test_data, y_train_labels, y_test_labels = train_test_split(
    combined_data.drop(columns=dropped_columns[:-1])[:], combined_data['sias'], test_size=TEST_SIZE, shuffle=True)

# columns removed from the split frames before they are handed to a model
to_drop = ['sias','name', 'category']

### Pipeline setup with the current features and an empty classifier

In [None]:
#
# Shared pipeline: the union selects the totalTimeInForeground and
# sessionLength columns, turns each into a transposed sparse matrix and
# combines them. The 'clf' step is a placeholder that every classifier cell
# fills in through set_params.
pipeline = Pipeline([
    ('union', FeatureUnion(
      transformer_list=[
          ('totalTimeInForeground', Pipeline([
              ('selector', ItemSelector(key='totalTimeInForeground')),
              ('sparse', SparseTranspose())
          ])),
          ('sessionLength', Pipeline([
              ('selector', ItemSelector(key='sessionLength')),
              ('sparse', SparseTranspose())
          ])),
      ])),
    ('clf', None)
])

### Standard sklearn DecisionTreeClassifier

In [None]:
#
c = DecisionTreeClassifier(max_depth=None)
fitted = pipeline.set_params(clf=c).fit(X_train_data.drop(columns=to_drop), y_train_labels)

# Predict through the fitted pipeline so the test rows get the same feature
# selection as the training rows; calling the bare classifier on the raw frame
# skips that step.
y_predict = fitted.predict(X_test_data.drop(columns=to_drop))
y_predict_proba = fitted.predict_proba(X_test_data.drop(columns=to_drop))

# learning curve of a fresh tree on the training split, 10 stratified folds
title = "Decision Tree - Sessions"
plt_cv = StratifiedKFold(n_splits=10, shuffle=False)
curve = plot_learning_curve(DecisionTreeClassifier(), title, X_train_data.drop(columns=to_drop), y_train_labels, ylim=(0.6, 1.01), cv=plt_cv, n_jobs=-1, save=True)
curve.show()

curve = plot_roc(title, y_test_labels, y_predict_proba, save=False)
curve.show()

evaluation_summary(title, y_predict, y_test_labels, c, X_train_data, save=False, drop=to_drop)

In [None]:
#
# share of app usage rows per category among users labelled 0
t = combined_data[combined_data['sias']==0]
n = t['category'].value_counts(normalize=True)
# Name the columns explicitly: the names produced by DataFrame(n).reset_index()
# depend on the pandas version, and the plot below needs 'index' (category
# label) and 'category' (share).
pdn = pd.DataFrame({'index': n.index, 'category': n.values})
pdn.head(5)

plt.figure()
plt.title('App Categories Among Non-Socially Anxious Users')
sns.barplot(x="category", y="index", data=pdn)
plt.ylabel('App Category')
plt.xlabel('Percentage')
plt.tight_layout()
plt.savefig('categories_among_non_anxious.pdf', dpi=300, bbox_inches='tight')


# the same chart among users labelled 1
t = combined_data[combined_data['sias']==1]
n = t['category'].value_counts(normalize=True)
pdn = pd.DataFrame({'index': n.index, 'category': n.values})
pdn.head(5)

plt.figure()
plt.title('App Categories Among Socially Anxious Users')
sns.barplot(x="category", y="index", data=pdn)
plt.ylabel('App Category')
plt.xlabel('Percentage')
plt.tight_layout()
plt.savefig('categories_among_anxious.pdf', dpi=300, bbox_inches='tight')


In [None]:
# length of every session in seconds
def session_length_sec(row):
    """Return the length of a session in whole seconds.

    Args:
        row: A session row with 'session_start_ts' and 'session_end_ts'
            timestamps.

    Returns:
        The number of seconds between start and end, as an int.
    """
    duration = row['session_end_ts'] - row['session_start_ts']
    # .seconds is only the seconds component (0-86399) and drops whole days;
    # total_seconds() covers the full duration
    return int(duration.total_seconds())

# store the session length in seconds next to each session
session_df['ts_duration'] = session_df.swifter.progress_bar(True).apply(session_length_sec, axis=1)

In [None]:
# for non socially anxious
# mean foreground time of Productivity apps among users labelled 0
x = combined_data[combined_data['sias']==0]
x = x[x['category']=='Productivity']
x['totalTimeInForeground'].mean()
# mean foreground time of Social & Communication apps among users labelled 0
x = combined_data[combined_data['sias']==0]
x = x[x['category']=='Social & Communication']
x['totalTimeInForeground'].mean()
print('----------')
# for socially anxious
# the same two means among users labelled 1
x = combined_data[combined_data['sias']==1]
x = x[x['category']=='Productivity']
x['totalTimeInForeground'].mean()
x = combined_data[combined_data['sias']==1]
x = x[x['category']=='Social & Communication']
x['totalTimeInForeground'].mean()


### Ensemble classifiers (randomforest, gradientboosting, extraforests)

In [None]:
#
extra_trees = ExtraTreesClassifier()

fitted = pipeline.set_params(clf=extra_trees).fit(X_train_data.drop(columns=to_drop), y_train_labels)
# Predict through the fitted pipeline so the test rows get the same feature
# selection as the training rows; calling the bare classifier on the raw frame
# skips that step.
y_predict = fitted.predict(X_test_data.drop(columns=to_drop))
y_predict_proba = fitted.predict_proba(X_test_data.drop(columns=to_drop))


# learning curve on the training split, 10 shuffled stratified folds
title = 'Ensemble Extra Trees - Sessions'
plt_cv = StratifiedKFold(n_splits=10, shuffle=True)
curve = plot_learning_curve(extra_trees, title, X_train_data.drop(columns=to_drop), y_train_labels, ylim=(0.5, 1.01), cv=plt_cv, n_jobs=-1, save=True)
curve.show()

curve = plot_roc(title, y_test_labels, y_predict_proba, save=True)
curve.show()

evaluation_summary(title, y_predict, y_test_labels, extra_trees, X_train_data, save=True, drop=to_drop)

In [None]:
#
random_forest = RandomForestClassifier(max_depth=None)

fitted = pipeline.set_params(clf=random_forest).fit(X_train_data.drop(columns=to_drop), y_train_labels)
# Predict through the fitted pipeline so the test rows get the same feature
# selection as the training rows; calling the bare classifier on the raw frame
# skips that step.
y_predict = fitted.predict(X_test_data.drop(columns=to_drop))
y_predict_proba = fitted.predict_proba(X_test_data.drop(columns=to_drop))

# learning curve on the training split, 10 shuffled stratified folds
title = 'Ensemble Random Forest - Sessions'
plt_cv = StratifiedKFold(n_splits=10, shuffle=True)
curve = plot_learning_curve(random_forest, title, X_train_data.drop(columns=to_drop), y_train_labels, ylim=(0.5, 1.01), cv=plt_cv, n_jobs=-1, save=True)
curve.show()

curve = plot_roc(title, y_test_labels, y_predict_proba, save=True)
curve.show()

evaluation_summary(title, y_predict, y_test_labels, random_forest, X_train_data, save=True, drop=to_drop)