In [1]:
import pandas as pd
import mysql.connector
import re
from sys import exit

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import FeatureUnion
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.feature_extraction import DictVectorizer
from sklearn.preprocessing import OneHotEncoder
from scipy.sparse import csr_matrix
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
import numpy as np

%load_ext autoreload
%autoreload 2

# globals
IGNORE_ANDROID_10 = True  # toggle to use extra android 10 data feature points or not
ANDROID_10_FEATURES = ('lastTimeForegroundServiceUsed', 'lastTimeVisible',
                      'totalTimeForegroundServiceUsed', 'totalTimeVisible')  # the android 10 data feature points
# pd.set_option('display.max_columns', None)
# pd.set_option('display.max_colwidth', None)

In [2]:
# read data from mariadb table
def read_data(com):
    con = 'mysql+mysqlconnector://admin:password@127.0.0.1:3306/Dissertation'
    return pd.read_sql_table(com, con=con)

# load data
call_df = read_data('calls')
user_df = read_data('user')
category_df = read_data('app_categories')
location_df = read_data('locations')
session_df = read_data('user_session_data')

# remove the id column that comes from the database. isn't necessary
call_df = call_df.drop(columns='id')
location_df = location_df.drop(columns='id')
session_df = session_df.drop(columns='id')

In [3]:
class ItemSelector(BaseEstimator, TransformerMixin):
    """For data grouped by feature, select subset of data at a provided key.    """

    def __init__(self, key):
        self.key = key

    def fit(self, x, y=None):
        return self

    def transform(self, data_dict):
#         print(data_dict)
#         print(self.key)
        return data_dict[self.key]
    

class SparseTransformer(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        return self
  
    def transform(self, X):
        csr = csr_matrix(X)
#         print(csr)
        return csr


class TransposeTransformer(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        return self
  
    def transform(self, X):
        return X.transpose().toarray()


In [4]:
# label encoder
category_encoder = LabelEncoder()
category_labels = category_encoder.fit_transform(category_df['category'])
category_mappings = {label: index for index, label in enumerate(category_encoder.classes_)}

# one hot it
category_one_hot = OneHotEncoder()
category_feature = category_one_hot.fit_transform(category_df[['category']]).toarray()
category_feature_labels = list(category_encoder.classes_)

app_encoder = LabelEncoder()
app_encoder_labels = app_encoder.fit_transform(category_df['app_name'])
app_mappings = {label: index for index, label in enumerate(app_encoder.classes_)}

In [10]:
# data converters
def convert_session_app_data(string):
#     takes in a dictionary as a string and convert it to dictionary
    obj = dict()
    pattern = re.compile(r'[\w]+=[\w ]+')
    matches = pattern.findall(string)
    name = ''
    for match in matches:
        split_match = match.split('=')
        if split_match[0] == 'name':  # name is the only key that does not have an integer value
#             obj[split_match[0]] = app_mappings.get(split_match[1])
            name = split_match[1]
        elif split_match[0] == 'category':
            obj[split_match][0] = int(category_mappings.get(split_match[1], -1))
        else:
            if IGNORE_ANDROID_10 and split_match[0] in ANDROID_10_FEATURES:
                continue
            obj[split_match[0]] = int(split_match[1])
#     print(name)
    if name != '':
        try:
            obj['category'] = int(category_mappings.get(category_df[category_df['app_name']==name]['category'].values[0], -1))
        except:
            obj['category'] = int(-1)
    else:
        obj['category'] = int(-1)
    return obj


def convert_session_data_list(session_data):
#     convert list of dictionary strings to list of proper dictionary objects
    obj = list()
    pattern = re.compile(r'(\{[A-Za-z0-9_=, ]+\})')
    matches = pattern.findall(session_data)
    for match in matches:
        obj.append(convert_session_app_data(match))
    return obj


# data addition functions
def add_sias_score(uid):
    return int(user_df.loc[user_df['uid'] == uid].values[0][1])


def add_category(session_data):
#     for each dictionary in the list, find the package and category for that app name, append to dict, and save new dict to list
    updated_data = list()
    for item in session_data:
        app_name = item.get('name', None)
        if app_name is None:
            print('app name is none for data: ' + item)
            continue  # shouldn't happen but just in case
        
        app = category_df.loc[category_df['app_name'] == app_name].values
        app_category = None
        
        # TODO: fix this once all data is collected. the problem is due to not all new data added yet
        try:
            app_category = app[0][1]
        except IndexError as e:
            continue
        
        item['app_category'] = category_mappings.get(app_category, app_category)
        updated_data.append(item)
    
    return updated_data

    
# TODO: add location latitude and longitude that pertains to that session


# add total session length
def add_session_length(row):
    return int(row['session_end']) - int(row['session_start'])


# add social quantifier
def add_social_quantifier(sias):
    if sias >= 43:
        return 2
    elif sias <= 33:
        return 0
    else:
        return 1


In [11]:
# convert session data string to actual python object
session_df['session_data'] = session_df.session_data.apply(convert_session_data_list)

In [12]:
# add session length
# session_df['session_length'] = session_df.apply(add_session_length, axis=1)
# # add category and package name to each app in each session object
# session_df['session_data'] = session_df.session_data.apply(add_category)
# # add sias score to each session
session_df['sias'] = session_df.uid.apply(add_sias_score)
# # add string quantifier
# session_df['level'] = session_df.sias.apply(add_social_quantifier)

In [19]:
# combine all that shit and flatten the living fuck out of it
# uid, sias, app name, category, session_interval, app_total_time_used, app
# combined_data = pd.DataFrame(columns=['uid', 'sias', 'app_name', 'category', 'session_interval', 'last_time_used', 'total_time_foreground'])
combined_data = pd.DataFrame()

def combine(row):
    global combined_data
    sias = row['sias']
    uid = row['uid']
    session_start = int(row['session_start'])
    session_end = int(row['session_end'])
    session_interval = pd.Interval(left=int(row['session_start']), right=int(row['session_end']), closed='both')
    for session in row['session_data']:
#         app_name = session.get('name')
        last_time_used = session.get('lastTimeUsed')
        total_time_in_foreground = session.get('totalTimeInForeground')
        app_category = session.get('category')
        frame = pd.DataFrame([uid, sias, app_category, session_start, session_end, last_time_used, total_time_in_foreground, session_interval])
        combined_data = pd.concat([combined_data, frame], axis=1, ignore_index=True)
#         print(frame)
#         break
    
session_df.apply(combine, axis=1)
combined_data = combined_data.T
combined_data = combined_data.rename(columns={
    0: 'uid',
    1: 'sias',
#     2: 'app_name',
    2: 'category',
    3: 'session_start',
    4: 'session_end',
    5: 'last_time_used',
    6: 'total_time_foreground',
    7: 'session_interval'
})

combined_data['uid'] = combined_data['uid'].astype('string')
combined_data['sias'] = combined_data['sias'].astype('int')
combined_data['session_interval'] = combined_data['session_interval'].astype('interval')
combined_data['category'] = combined_data['category'].astype('int')
combined_data['session_start'] = combined_data['session_start'].astype('int')
combined_data['session_end'] = combined_data['session_end'].astype('int')
combined_data['last_time_used'] = combined_data['last_time_used'].astype('int')
combined_data['total_time_foreground'] = combined_data['total_time_foreground'].astype('int')

combined_data = combined_data.drop(columns='uid')

In [20]:
combined_data.head(1)

Unnamed: 0,sias,category,session_start,session_end,last_time_used,total_time_foreground,session_interval
0,44,21,1582906131929,1582906535913,1582839465286,659,"[1582906131929, 1582906535913]"


In [23]:
# random_threads = combined_data.sample(frac=1)

# validation_split = int(len(random_threads) * 0.8)
# train_data = random_threads.iloc[:validation_split, :]
# validation_data = random_threads.iloc[validation_split:, :]

# train_labels = train_data[['sias']]

# validation_labels = validation_data['sias']

# session_train = train_data[['category', 'last_time_used', 'total_time_foreground']]
session_train, session_test, train_labels, test_labels = train_test_split(combined_data, combined_data['sias'], test_size=0.3, shuffle=True)

In [26]:
# c = DecisionTreeClassifier()
# c = c.fit(session_train, train_labels)

from sklearn.linear_model import LogisticRegression

pipeline = Pipeline([
    ('union', FeatureUnion(
      transformer_list=[
          ('category', Pipeline([
              ('selector', ItemSelector(key='category')),
              ('sparse', SparseTransformer()),
              ('transform', TransposeTransformer()),
              ('onehot', OneHotEncoder())
          ])),
          ('session_length', Pipeline([
              ('selector', ItemSelector(key='session_interval')),
#               ('sparse', SparseTransformer()),
#               ('transform', TransposeTransformer()),
#               ('onehot', OneHotEncoder())
          ])),
      ])),
    ('tree', DecisionTreeClassifier())
])

result = pipeline.fit(session_train, train_labels)

ValueError: blocks[0,:] has incompatible row dimensions. Got blocks[0,1].shape[0] == 1, expected 10497.