In [16]:
# -*- coding: utf-8 -*-
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
# implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

### Adam Candrák/Mária Matušisková - 50%/50%

# Imports

In [17]:
import pandas as pd
import numpy as np
import seaborn as sns
from sklearn.feature_selection import SelectKBest, f_classif
import matplotlib.pyplot as plt
from numpy import mean
from numpy import std
from collections import Counter
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, PowerTransformer, MinMaxScaler

## Global variables

In [18]:
# Label column of the merged dataset.
target_column = "mwra"

# Settings for the hold-out split: fixed seed, and the fraction of rows
# passed to train_test_split as the test share.
random_state = 42
test_size = 0.3

In [19]:
# Input files; both are read as tab-separated despite the .csv extension.
connections_file = "../data/Connections.csv"
processes_file = "../data/Processes.csv"

connections, processes = (
    pd.read_csv(csv_path, sep='\t')
    for csv_path in (connections_file, processes_file)
)

Rename the columns to shorter, more readable names:

In [20]:
# Original column label -> readable label for the connections table.
connection_column_names = {
    "c.katana": "facebook",
    "c.android.chrome": "chrome",
    "c.android.gm": "gmail",
    "c.dogalize": "dogalize",
    "c.android.youtube": "youtube",
    "c.updateassist": "updateassist",
    "c.UCMobile.intl": "UCMobile.intl",
    "c.raider": "raider",
    "c.android.vending": "vending",
    "c.UCMobile.x86": "UCMobile.x86",
}

# Original column label -> readable label for the processes table.
# Several target names (facebook, chrome, gmail, dogalize, vending) also
# appear in the connections mapping above.
process_column_names = {
    "p.katana": "facebook",
    "p.android.chrome": "chrome",
    "p.android.gm": "gmail",
    "p.dogalize": "dogalize",
    "p.android.vending": "vending",
    "p.android.packageinstaller": "packageinstaller",
    "p.system": "system",
    "p.android.documentsui": "documentsui",
    "p.android.settings": "settings",
    "p.android.externalstorage": "externalstorage",
    "p.android.defcontainer": "defcontainer",
    "p.inputmethod.latin": "inputmethod.latin",
    "p.process.gapps": "gapps",
    "p.simulator": "simulator",
    "p.android.gms": "google mobile services (gms)",
    "p.google": "google",
    "p.olauncher": "olauncher",
    "p.browser.provider": "browser provider",
    "p.notifier": "notifier",
    "p.gms.persistent": "gms.persistent",
}

c_connections = connections.rename(columns=connection_column_names)
p_processes = processes.rename(columns=process_column_names)

Convert the timestamp column of the connections dataset to int64:

In [21]:
# Parse `ts` as datetimes first, then store the column as int64.
connection_timestamps = pd.to_datetime(c_connections['ts'])
c_connections['ts'] = connection_timestamps.astype(np.int64)

Convert the timestamp column of the processes dataset to int64:

In [22]:
# Parse `ts` as datetimes first, then store the column as int64.
process_timestamps = pd.to_datetime(p_processes['ts'])
p_processes['ts'] = process_timestamps.astype(np.int64)

### Merge datasets connections and processes:

In [23]:
# Join the two tables on device id, timestamp and label. With pandas'
# defaults this is an inner join, and non-key columns that exist in both
# frames receive the `_x` (connections) / `_y` (processes) suffixes.
merge_keys = ['imei', 'ts', 'mwra']
merged_dataset = c_connections.merge(p_processes, on=merge_keys)

### Check for missing values and duplicates

In [24]:
# True when at least one cell of the merged frame is missing.
has_nan = merged_dataset.isna().to_numpy().any()

print("The dataset has NaN values." if has_nan else "No NaN values found in the dataset.")

No NaN values found in the dataset.


In [25]:
# True when at least one row is an exact repeat of an earlier row.
has_duplicity = merged_dataset.duplicated().any()

if not has_duplicity:
    print("No duplicity values found in the dataset.")
else:
    print("The dataset has duplicity values.")
    # Drop the repeated rows in place (the first occurrence is kept).
    merged_dataset.drop_duplicates(inplace=True)

The dataset has duplicity values.


### Drop columns that are not useful for training:

In [26]:
# Remove the timestamp and device-id columns in one call.
merged_dataset.drop(columns=['ts', 'imei'], inplace=True)

### Outlier deletion

In [27]:
# Source: https://www.kaggle.com/code/marcinrutecki/outlier-detection-methods

def StandardDevDetection(data, n, columns, n_std=3):
    """Return index labels of rows that are outliers in more than ``n`` columns.

    For each column in ``columns``, a value is flagged as an outlier when it
    lies strictly below ``mean - n_std * std`` or strictly above
    ``mean + n_std * std`` of that column. A row is reported only when it
    was flagged in strictly more than ``n`` of the inspected columns.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame holding the columns to inspect. It is not modified.
    n : int
        A row must be flagged in more than ``n`` columns to be returned.
    columns : iterable
        Labels of the columns of ``data`` to inspect. May be empty, in which
        case an empty list is returned.
    n_std : float, default 3
        Width of the accepted band, in standard deviations on each side of
        the column mean.

    Returns
    -------
    list
        Index labels of the reported rows, in order of first detection.
    """
    # Number of columns in which each row label was flagged.
    flag_counts = Counter()

    for column in columns:
        values = data[column]

        # Mean and standard deviation of the current column.
        data_mean, data_std = mean(values, axis=0), std(values, axis=0)

        # Accepted band around the mean.
        cut_off = data_std * n_std
        lower, upper = data_mean - cut_off, data_mean + cut_off

        # Labels of the rows that fall outside the band for this column.
        flag_counts.update(values[(values < lower) | (values > upper)].index)

    return [label for label, hits in flag_counts.items() if hits > n]


# Inspect every remaining column; with n=1 a row is removed only when it is
# flagged as an outlier in more than one column.
# NOTE(review): `columns` still contains the label column at this point --
# confirm that including it in the outlier check is intended.
columns = merged_dataset.columns
result = StandardDevDetection(data=merged_dataset, n=1, columns=columns)

# Remove the flagged rows and renumber the remaining ones from 0.
new_dataset = merged_dataset.drop(index=result).reset_index(drop=True)

## Data splitting

In [28]:

# Separate the label from the feature columns.
mwra = new_dataset[target_column]
data = new_dataset.drop(columns=[target_column])

# Seeded hold-out split of features and labels.
train_data, test_data, train_mwra, test_mwra = train_test_split(
    data,
    mwra,
    test_size=test_size,
    random_state=random_state,
)

In [29]:
# Features chosen in our previous analysis.
selected_features = [
    'gmail_x',
    'gapps',
    'facebook_x',
    'chrome_x',
    'vending_x',
    'youtube',
    'dogalize_x',
    'updateassist',
    'UCMobile.intl',
]

# Module-level slot for the preprocessor; process_data() assigns it so the
# same object can be reused later.
preprocessor = None

# this class was created with help from Claude.ai
# we were unsure of how to work with pipeline
class DataPreprocessor:
    """Lazily built scaling / feature-selection pipeline over ``selected_features``.

    The underlying ``ColumnTransformer`` is stored in ``self.pipeline`` and is
    ``None`` until ``create_pipeline`` or ``fit_transform`` has been called.
    """

    def __init__(self):
        # No pipeline yet; it is created on demand.
        self.pipeline = None

    def create_pipeline(self):
        """Build the column transformer, store it on the instance, return ``self``."""
        numeric_steps = [
            ('standard_scaler', StandardScaler()),
            ('power_transform', PowerTransformer(method='yeo-johnson')),
            ('minmax_scaler', MinMaxScaler()),
            # k equals the number of listed features.
            ('feature_select', SelectKBest(score_func=f_classif, k=len(selected_features))),
        ]

        # Only the listed features go through the numeric steps; every other
        # input column is dropped.
        self.pipeline = ColumnTransformer(
            transformers=[('numeric', Pipeline(numeric_steps), selected_features)],
            remainder='drop',
        )
        return self

    def fit_transform(self, X, y=None):
        """Fit the pipeline on ``X`` (and ``y``) and return the transformed ``X``.

        The pipeline is created first when it does not exist yet.
        """
        if self.pipeline is None:
            self.create_pipeline()
        fitted_output = self.pipeline.fit_transform(X, y)
        return fitted_output

    def transform(self, X):
        """Transform ``X`` with the stored pipeline.

        Raises
        ------
        ValueError
            If no pipeline has been created on this instance.
        """
        if self.pipeline is not None:
            return self.pipeline.transform(X)
        raise ValueError("Pipeline has not been fitted yet. Call fit_transform first.")

def process_data(train_data, test_data, train_mwra=None):
    """Fit a new ``DataPreprocessor`` on the training data and transform both sets.

    The new preprocessor replaces the module-level ``preprocessor`` so it can
    be reused afterwards.

    Parameters
    ----------
    train_data
        Training features; used for fitting and then transformed.
    test_data
        Test features; only transformed.
    train_mwra : optional
        Training labels, forwarded to ``fit_transform`` as ``y``.

    Returns
    -------
    tuple
        ``(transformed_train, transformed_test)``.
    """
    global preprocessor
    preprocessor = DataPreprocessor()

    # Fitting has to happen before the test data can be transformed.
    transformed_train = preprocessor.fit_transform(train_data, train_mwra)
    transformed_test = preprocessor.transform(test_data)
    return transformed_train, transformed_test

def process_with_preprocessor(preprocessor, data):
    """Return ``preprocessor.transform(data)``.

    The ``preprocessor`` parameter is the object passed in by the caller; it
    shadows the module-level variable of the same name inside this function.
    """
    transformed = preprocessor.transform(data)
    return transformed

In [30]:
X_train_processed, X_test_processed = process_data(train_data, test_data, train_mwra)

# Wrap the transformed output in frames labelled with the selected feature names.
X_train_processed_df = pd.DataFrame(X_train_processed, columns=selected_features)
X_test_processed_df = pd.DataFrame(X_test_processed, columns=selected_features)

# Frames for the target column (the label).
# NOTE(review): these are built from the split label series while the feature
# frames above are built without an explicit index -- confirm that later code
# pairs features and labels by position rather than by index label.
y_train_df = pd.DataFrame(train_mwra, columns=[target_column])
y_test_df = pd.DataFrame(test_mwra, columns=[target_column])

# Persist the processed features without the index column.
X_train_processed_df.to_csv('dataset_df_train.csv', index=False)
X_test_processed_df.to_csv('dataset_df_test.csv', index=False)

print("Processed training data shape:", X_train_processed_df.shape)
print("Processed test data shape:", X_test_processed_df.shape)

Processed training data shape: (10402, 9)
Processed test data shape: (4458, 9)
