# Phase 2 - Data preprocessing
### Authors: Karolina Skrypova (50%), Oleh Fedunchyk (50%)

## Importing libraries

In [32]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.base import TransformerMixin
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import PowerTransformer, StandardScaler

## Loading the data

In [33]:
# Both source tables are tab-separated CSV files fetched from raw GitHub URLs.
connections_data = pd.read_csv(
    'https://raw.githubusercontent.com/myrres0/IAU-2024/main/dataset-120/connections.csv',
    sep='\t',
)
processes_data = pd.read_csv(
    'https://raw.githubusercontent.com/myrres0/IAU-2024/main/dataset-120/processes.csv',
    sep='\t',
)

## Datasets for processing

In [34]:
# No `on=` / `how=` is given, so pandas performs an inner join on every
# column name the two frames have in common.
merged_data = processes_data.merge(connections_data)

### Data cleaning

In [35]:
class OutlierRemover(TransformerMixin):
    """Cleaning step: impute missing values, drop duplicate rows, clip outliers.

    The three operations are applied in that order by ``transform``.

    NOTE: ``drop_duplicates`` removes rows, so the returned frame can have
    fewer rows than the input. This class never touches ``y``; a target kept
    separately has to be re-aligned by the caller (the input index is
    preserved on the output for that purpose).
    """

    def __init__(self):
        # Median imputation for missing values.
        self.imputer = SimpleImputer(strategy='median')

    def fit(self, X, y=None):
        """Learn the per-column medians used for imputation.

        Fitting happens here rather than inside ``transform`` so that data
        transformed later is imputed with the medians of the data passed to
        ``fit`` instead of its own.

        Parameters
        ----------
        X : pandas.DataFrame
            Data to learn the imputation statistics from.
        y : ignored
            Present only for scikit-learn API compatibility.

        Returns
        -------
        OutlierRemover
            ``self``.
        """
        self.imputer.fit(X)
        return self

    def transform(self, X):
        """Return a cleaned copy of ``X``.

        Parameters
        ----------
        X : pandas.DataFrame
            Frame to clean.

        Returns
        -------
        pandas.DataFrame
            Imputed, de-duplicated frame with every column clipped to its
            IQR bounds.
        """
        X_ = self.resolve_missing_values(X)
        # Chain on the imputed frame; calling drop_duplicates on the raw ``X``
        # would silently discard the imputation result.
        X_ = X_.drop_duplicates()
        return X_.apply(self.transform_outliers)

    # Original author note: the dataset has no NA values, but this step is
    # kept so the pipeline can handle them.
    def resolve_missing_values(self, X):
        """Replace missing values in ``X`` with the imputer's medians.

        Uses the statistics learned in ``fit``. If the imputer has not been
        fitted yet, it is fitted on ``X`` itself so that calling this method
        directly keeps working.

        Returns
        -------
        pandas.DataFrame
            Imputed values with the column names and index of ``X``.
        """
        # SimpleImputer only has ``statistics_`` after it has been fitted.
        if hasattr(self.imputer, 'statistics_'):
            imputed = self.imputer.transform(X)
        else:
            imputed = self.imputer.fit_transform(X)
        # Keep the original index so rows can still be matched to a target.
        return pd.DataFrame(imputed, columns=X.columns, index=X.index)

    # Original author note: `mwra` was checked to contain no outliers, so it
    # can also be processed by this function.
    def transform_outliers(self, column):
        """Clip ``column`` to the range ``[Q1 - 1.5*IQR, Q3 + 1.5*IQR]``.

        The quartiles are computed from ``column`` itself.

        Parameters
        ----------
        column : pandas.Series
            Values to clip.

        Returns
        -------
        pandas.Series
            Copy of ``column`` with values outside the bounds replaced by the
            nearest bound.
        """
        Q1 = column.quantile(0.25)
        Q3 = column.quantile(0.75)
        IQR = Q3 - Q1
        lower_bound = Q1 - 1.5 * IQR
        upper_bound = Q3 + 1.5 * IQR

        return column.clip(lower=lower_bound, upper=upper_bound)

### Data transforming

In [36]:
class PowerTransformerGroup(TransformerMixin):
    """Pipeline step wrapping a Yeo-Johnson ``PowerTransformer``.

    ``transform`` hands back a DataFrame that carries the input's column
    names instead of the bare array produced by the wrapped transformer.
    """

    def __init__(self):
        self.power_transformer = PowerTransformer(
            method='yeo-johnson',
            standardize=True,
        )

    def fit(self, X, y=None):
        """Fit the wrapped transformer on ``X``; ``y`` is ignored. Returns ``self``."""
        self.power_transformer.fit(X)
        return self

    def transform(self, X):
        """Apply the fitted power transform to ``X``.

        The result is built without an explicit index, so it gets pandas'
        default index rather than the index of ``X``.
        """
        transformed_values = self.power_transformer.transform(X)
        return pd.DataFrame(transformed_values, columns=X.columns)

### Data scaling

In [37]:
class StandardScalerGroup(TransformerMixin):
    """Pipeline step wrapping a ``StandardScaler``.

    ``transform`` hands back a DataFrame that carries the input's column
    names instead of the bare array produced by the wrapped scaler.
    """

    def __init__(self):
        self.standard_scaler = StandardScaler()

    def fit(self, X, y=None):
        """Fit the wrapped scaler on ``X``; ``y`` is ignored. Returns ``self``."""
        self.standard_scaler.fit(X)
        return self

    def transform(self, X):
        """Scale ``X`` with the fitted scaler.

        The result is built without an explicit index, so it gets pandas'
        default index rather than the index of ``X``.
        """
        scaled_values = self.standard_scaler.transform(X)
        return pd.DataFrame(scaled_values, columns=X.columns)

### Pipeline

Our constants:

In [38]:
# Columns kept for preprocessing, in the order they appear in the frame that
# is handed to divide_df. 'mwra' is the column divide_df splits off as target.
selected_attributes = [
    'mwra',
    'p.android.externalstorage',
    'p.android.settings',
    'p.android.gm',
    'p.system',
    'p.android.packageinstaller',
    'c.android.gm',
    'c.android.youtube',
    'p.android.chrome',
    'c.android.chrome',
]

In [39]:
def divide_df(df, test_size=0.2, random_state=None):
    """Split ``df`` into train/test feature frames and target series.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the feature columns and the ``'mwra'`` target column.
    test_size : float, default 0.2
        Forwarded to ``train_test_split``.
    random_state : int or None, default None
        Forwarded to ``train_test_split``; pass an int for a reproducible split.

    Returns
    -------
    list
        ``[features_train, features_test, mwra_train, mwra_test]`` as returned
        by ``train_test_split``.
    """
    mwra = df['mwra']
    # DataFrame.drop returns a new frame; the result has to be kept, otherwise
    # the target column stays inside the features.
    features = df.drop(columns='mwra')
    return train_test_split(
        features, mwra, test_size=test_size, random_state=random_state
    )


# Fixed seed so that Restart & Run All reproduces the same split.
SPLIT_SEED = 42

train_data, test_data, mwra_train, mwra_test = divide_df(
    merged_data[selected_attributes], random_state=SPLIT_SEED
)

In [41]:
# Preprocessing pipeline: cleaning, then power transform, then scaling.
pipeline = Pipeline(steps=[
    ('outlier_remover', OutlierRemover()),
    ('power_transformer', PowerTransformerGroup()),
    ('standard_scaler', StandardScalerGroup()),
])
# Pipeline.fit returns the pipeline itself, so rebinding keeps the same object.
pipeline = pipeline.fit(train_data, mwra_train)