<a href="https://colab.research.google.com/github/omar-omar-om/gradProject-notebooks/blob/main/encoding.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np
import os
import json
import pickle
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from google.colab import drive

# Reading and Inspecting the Dataset

In [None]:
# Attach Google Drive so files stored there are reachable from the runtime
drive.mount('/content/drive')

# Location of the CSV on Drive; the whole file is read into `df`
dataset_path = '/content/drive/My Drive/output.csv'
df = pd.read_csv(dataset_path)

# First rows, then the (rows, columns) shape, each starting on its own line
print(df.head(), df.shape, sep="\n")


Mounted at /content/drive
  EngineVersion       AppVersion  AvSigVersion  RtpStateBitfield  \
0   1.1.15100.1  4.18.1807.18075  1.273.1735.0               7.0   
1   1.1.14600.4     4.13.17134.1    1.263.48.0               7.0   
2   1.1.15100.1  4.18.1807.18075  1.273.1341.0               7.0   
3   1.1.15100.1  4.18.1807.18075  1.273.1527.0               7.0   
4   1.1.15100.1  4.18.1807.18075  1.273.1379.0               7.0   

   IsSxsPassiveMode  DefaultBrowsersIdentifier  AVProductStatesIdentifier  \
0                 0                        0.0                    53447.0   
1                 0                        0.0                    53447.0   
2                 0                        0.0                    53447.0   
3                 0                        0.0                    53447.0   
4                 0                        0.0                    53447.0   

   AVProductsInstalled  AVProductsEnabled  CountryIdentifier  ...  \
0                  1.0           

In [None]:
# Print a concise summary of the frame: index range, column names and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2578105 entries, 0 to 2578104
Data columns (total 62 columns):
 #   Column                                             Dtype  
---  ------                                             -----  
 0   EngineVersion                                      object 
 1   AppVersion                                         object 
 2   AvSigVersion                                       object 
 3   RtpStateBitfield                                   float64
 4   IsSxsPassiveMode                                   int64  
 5   DefaultBrowsersIdentifier                          float64
 6   AVProductStatesIdentifier                          float64
 7   AVProductsInstalled                                float64
 8   AVProductsEnabled                                  float64
 9   CountryIdentifier                                  int64  
 10  CityIdentifier                                     float64
 11  OrganizationIdentifier                            

In [None]:
# Restrict the frame to columns whose dtype is not numeric
non_numeric_cols = df.select_dtypes(exclude=['number'])

# Number of distinct values in each of those columns
cardinality = non_numeric_cols.nunique()

# Tabulate as a two-column frame: the column name and its cardinality
cardinality_df = cardinality.rename_axis('Column').reset_index(name='Cardinality')

print(cardinality_df)

                              Column  Cardinality
0                      EngineVersion           51
1                         AppVersion           84
2                       AvSigVersion         1182
3               OsPlatformSubRelease            9
4                         OsBuildLab          241
5                         SkuEdition            7
6                        SmartScreen           10
7              Census_MDC2FormFactor           11
8         Census_PrimaryDiskTypeName            4
9             Census_ChassisTypeName           34
10      Census_PowerPlatformRoleName            9
11        Census_InternalBatteryType           11
12                  Census_OSVersion          236
13             Census_OSArchitecture            3
14                   Census_OSBranch           15
15                  Census_OSEdition           19
16          Census_OSInstallTypeName            9
17  Census_OSWUAutoUpdateOptionsName            5
18           Census_GenuineStateName            4


# Mount Google Drive & Set Paths



In [None]:


# Attach Google Drive (prints a notice instead if it is already mounted)
drive.mount('/content/drive')

# One output folder per encoding scheme, all located under the Drive root
base_path = "/content/drive/My Drive/"
paths = {
    key: os.path.join(base_path, folder)
    for key, folder in (
        ("frequency", "frequency-encoding"),
        ("target", "target-encoding"),
        ("hybrid", "hybrid-label-target-encoding"),
        ("label", "label-encoding"),
        ("label_freq", "label-frequency-encoding"),
    )
}

# Create any folder that does not exist yet; existing ones are left alone
for path in paths.values():
    os.makedirs(path, exist_ok=True)

print("Google Drive mounted and paths created.")


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Google Drive mounted and paths created.


# Identify Categorical Columns and Target



In [None]:

# Label column, used later for target encoding and for stratified splitting
target = "HasDetections"

# Columns stored with object or category dtype are treated as categorical
categorical_cols = list(df.select_dtypes(include=['object', 'category']).columns)

print(f"Dataset loaded with {df.shape[0]} rows and {df.shape[1]} columns.")
print(f" Identified {len(categorical_cols)} categorical columns.")


Dataset loaded with 2578105 rows and 62 columns.
 Identified 21 categorical columns.


# Define Encoding Functions



In [None]:


def frequency_encoding(df, col):
    """Replace ``df[col]`` in place with each value's relative frequency.

    Frequencies come from ``value_counts(normalize=True)``, which by default
    leaves missing values out of the counts; such rows therefore have no entry
    in the mapping and remain NaN in the encoded column.

    Args:
        df: DataFrame to modify in place.
        col: Name of the column to encode.

    Returns:
        dict mapping each value to its relative frequency in ``df[col]``.
    """
    freq_map = df[col].value_counts(normalize=True).to_dict()
    # Mapping a category-dtype Series can give back a categorical (non-numeric)
    # result; the cast guarantees a float column whatever the input dtype.
    df[col] = df[col].map(freq_map).astype("float64")
    return freq_map

def target_encoding(df, col, target):
    """Replace ``df[col]`` in place with the mean of ``target`` per value.

    The means are computed from every row of the ``df`` that is passed in.
    Rows whose ``col`` value is missing form no group (groupby default), so
    they have no entry in the mapping and remain NaN in the encoded column.

    Args:
        df: DataFrame to modify in place; must contain ``col`` and ``target``.
        col: Name of the column to encode.
        target: Name of the column whose per-group mean becomes the encoding.

    Returns:
        dict mapping each observed value of ``col`` to its mean ``target``.
    """
    # observed=True keeps declared-but-absent categories of a category-dtype
    # column out of the mapping; it has no effect on non-categorical columns.
    target_map = df.groupby(col, observed=True)[target].mean().to_dict()
    # Mapping a category-dtype Series can give back a categorical (non-numeric)
    # result; the cast guarantees a float column whatever the input dtype.
    df[col] = df[col].map(target_map).astype("float64")
    return target_map

# Emit a short confirmation so that running this cell produces visible output.
print("Encoding functions defined.")


Encoding functions defined.


# Initialize Encoding Storage & Dataset Copies


In [None]:


# Per-column mapping dictionaries, filled in by the encoding cells
freq_mappings, target_mappings, label_mappings = {}, {}, {}

# Five independent copies of the raw frame, one per encoding scheme
df_freq, df_target, df_hybrid, df_label, df_label_freq = (
    df.copy() for _ in range(5)
)

print("Encoding storage initialized and dataset copies created.")


Encoding storage initialized and dataset copies created.


# Apply Label Encoding First (Ensures Consistency)



In [None]:
le = LabelEncoder()

for col in categorical_cols:
    # Encode the raw column once and reuse the codes: df_label, df_hybrid and
    # df_label_freq all receive exactly the same integers, so running
    # transform() again on the same data for each frame is redundant work.
    encoded = le.fit_transform(df[col])
    for frame in (df_label, df_hybrid, df_label_freq):
        frame[col] = encoded

    # classes_ is ordered by code, so position i is the label encoded as i
    label_mappings[col] = {label: idx for idx, label in enumerate(le.classes_)}

print("Label Encoding applied consistently across all datasets.")


Label Encoding applied consistently across all datasets.


# Apply Frequency & Target Encoding



In [None]:
# NOTE(review): both kinds of mapping are fitted on the complete frames, before
# the train/validation/test split is made, so held-out rows contribute to the
# frequencies and target means — confirm this is intended.
for col in categorical_cols:
    # Frequency-encoded dataset; its mapping is kept for saving
    freq_mappings[col] = frequency_encoding(df_freq, col)

    # Target-encoded dataset; its mapping is kept for saving
    target_mappings[col] = target_encoding(df_target, col, target)

    # Label-frequency dataset: the returned mapping is discarded.
    # NOTE(review): this replaces the label codes previously written to
    # df_label_freq, and df_hybrid is not modified in this loop — confirm.
    frequency_encoding(df_label_freq, col)

print("Frequency and Target Encoding applied.")


Frequency and Target Encoding applied.


# Save Mappings (Same Label Mappings in All Folders)



In [None]:
# The same label mappings file is written into every encoding folder
for folder in paths.values():
    label_file = os.path.join(folder, "label_mappings.json")
    with open(label_file, "w") as f:
        json.dump(label_mappings, f)

# Frequency mappings are stored as JSON in the frequency folder
freq_file = os.path.join(paths["frequency"], "freq_mappings.json")
with open(freq_file, "w") as f:
    json.dump(freq_mappings, f)

# Target mappings are stored as a pickle in the target folder
target_file = os.path.join(paths["target"], "target_mappings.pkl")
with open(target_file, "wb") as f:
    pickle.dump(target_mappings, f)

print(" Encoding mappings saved in respective folders.")


 Encoding mappings saved in respective folders.


# Split Data into Train, Validation, Test Sets



In [None]:
def split_data(df):
    """Partition ``df`` into stratified train / validation / test frames.

    A first split holds out 20% of the rows; that hold-out is then divided in
    half into validation and test. Both splits stratify on the module-level
    ``target`` column and use ``random_state=42``.

    Returns:
        Tuple ``(train, val, test)``.
    """
    seed = 42
    train, holdout = train_test_split(
        df, test_size=0.2, random_state=seed, stratify=df[target]
    )
    val, test = train_test_split(
        holdout, test_size=0.5, random_state=seed, stratify=holdout[target]
    )
    return train, val, test

# Encoded frame for each scheme, keyed the same way as `paths`
encoded_frames = {
    "frequency": df_freq,
    "target": df_target,
    "hybrid": df_hybrid,
    "label": df_label,
    "label_freq": df_label_freq,
}

# Each entry becomes a (train, val, test) tuple
datasets = {name: split_data(frame) for name, frame in encoded_frames.items()}

print("Data successfully split into Train, Validation, and Test sets.")


Data successfully split into Train, Validation, and Test sets.


# Save Split Datasets in Respective Folders



In [None]:
# Write <split>_<encoding>.csv for every split into that encoding's folder
for encoding_type, (train, val, test) in datasets.items():
    out_dir = paths[encoding_type]
    for split_name, split_df in (("train", train), ("val", val), ("test", test)):
        csv_path = os.path.join(out_dir, f"{split_name}_{encoding_type}.csv")
        split_df.to_csv(csv_path, index=False)

print(" All encoded datasets saved in respective folders.")


 All encoded datasets saved in respective folders.
