In [None]:
pip install pandas scikit-learn joblib




In [None]:
!pip install xgboost




In [None]:
import torch

# Ask PyTorch once whether a CUDA device is usable and report the answer.
cuda_ready = torch.cuda.is_available()
print("CUDA Available:", cuda_ready)

# Only query the device name when CUDA is present; otherwise use a placeholder.
if cuda_ready:
    gpu_label = torch.cuda.get_device_name(0)
else:
    gpu_label = "No GPU"
print("GPU Name:", gpu_label)


CUDA Available: False
GPU Name: No GPU


In [None]:
import xgboost as xgb

import torch

# Request the GPU only when one is actually present; otherwise fall back to the
# CPU so the same parameter set works on both kinds of runtime.
xgb_device = 'cuda' if torch.cuda.is_available() else 'cpu'

# XGBoost >= 2.0 selects the accelerator through `device`; the legacy
# 'gpu_hist' tree method and the 'predictor' parameter are deprecated there.
params = {
    'tree_method': 'hist',          # histogram-based tree construction (CPU or GPU)
    'device': xgb_device,           # 'cuda' runs training and prediction on the GPU
    'objective': 'binary:logistic'
}


Preprocessing

In [None]:
# Step 1: Install and import PySpark (if in Colab)
!pip install -q pyspark

from pyspark.sql import SparkSession
from pyspark.sql.functions import col
from pyspark.sql.types import DoubleType, StringType
from pyspark.ml.feature import Imputer, StringIndexer, VectorAssembler, StandardScaler
from pyspark.ml import Pipeline

# Step 2: Start Spark session
spark = SparkSession.builder.appName("PDC_Preprocessing").getOrCreate()

# Step 3: Load the CSV file
df = spark.read.csv("/content/pdc_dataset_with_target.csv", header=True, inferSchema=True)

# OPTIONAL: Check schema before transformation
print("Before casting:")
df.printSchema()

# Step 4: Cast every non-string feature column to double; string columns are
# left untouched so they can be indexed as categoricals below.
# Types are tested with isinstance instead of comparing str(dataType) against a
# literal such as 'StringType': the textual form of a Spark type is not a stable
# contract, and when it does not match, string columns are silently cast to
# double (turning their values into nulls) and both column lists end up empty.
for field in df.schema.fields:
    if field.name != "target" and not isinstance(field.dataType, StringType):
        df = df.withColumn(field.name, col(field.name).cast("double"))

# Confirm schema
print("After casting:")
df.printSchema()

# Step 5: Identify numeric and categorical columns (the label is never a feature)
numeric_cols = [
    field.name
    for field in df.schema.fields
    if isinstance(field.dataType, DoubleType) and field.name != 'target'
]
categorical_cols = [
    field.name
    for field in df.schema.fields
    if isinstance(field.dataType, StringType)
]

print("Numeric columns:", numeric_cols)
print("Categorical columns:", categorical_cols)

# Step 6: Build preprocessing stages
pipeline_stages = []

# --- Impute numeric columns (Imputer is used with its default strategy) ---
if numeric_cols:
    imputed_cols = [c + "_imputed" for c in numeric_cols]
    imputer = Imputer(inputCols=numeric_cols, outputCols=imputed_cols)
    pipeline_stages.append(imputer)
else:
    imputed_cols = []

# --- Encode categorical columns using StringIndexer ---
# The loop variable is named `c` so it cannot be confused with the imported
# pyspark `col` function.
if categorical_cols:
    indexers = [
        StringIndexer(inputCol=c, outputCol=c + "_indexed", handleInvalid="keep")
        for c in categorical_cols
    ]
    pipeline_stages += indexers
    indexed_cat_cols = [c + "_indexed" for c in categorical_cols]
else:
    indexed_cat_cols = []

# --- Assemble all features ---
if imputed_cols or indexed_cat_cols:  # Only assemble if we have features
    assembler = VectorAssembler(
        inputCols=imputed_cols + indexed_cat_cols,
        outputCol="features_unscaled"
    )
    pipeline_stages.append(assembler)

    # --- Standardize features ---
    scaler = StandardScaler(inputCol="features_unscaled", outputCol="features", withMean=True, withStd=True)
    pipeline_stages.append(scaler)

    # Step 7: Build and apply pipeline
    pipeline = Pipeline(stages=pipeline_stages)
    model = pipeline.fit(df)
    processed_df = model.transform(df)

    # Step 8: Final dataset with features and target
    final_df = processed_df.select("features", "target")

    # Show a few rows to verify
    final_df.show(5, truncate=False)
else:
    print("Warning: No features were created - check your input data")

Before casting:
root
 |-- feature_1: double (nullable = true)
 |-- feature_2: double (nullable = true)
 |-- feature_3: string (nullable = true)
 |-- feature_4: double (nullable = true)
 |-- feature_5: string (nullable = true)
 |-- feature_6: integer (nullable = true)
 |-- feature_7: double (nullable = true)
 |-- target: integer (nullable = true)

After casting:
root
 |-- feature_1: double (nullable = true)
 |-- feature_2: double (nullable = true)
 |-- feature_3: double (nullable = true)
 |-- feature_4: double (nullable = true)
 |-- feature_5: double (nullable = true)
 |-- feature_6: double (nullable = true)
 |-- feature_7: double (nullable = true)
 |-- target: integer (nullable = true)

Numeric columns: []
Categorical columns: []


In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.impute import SimpleImputer
from joblib import Parallel, delayed

# Load dataset
df = pd.read_csv("/content/pdc_dataset_with_target.csv")

# Split types
num_cols = df.select_dtypes(include=['float64', 'int64']).columns.drop('target')
cat_cols = df.select_dtypes(include=['object']).columns

# --- Step 1: Handle Missing Values in Parallel ---

def impute_column(col):
    """Return column `col` of the global `df` with its missing values filled.

    float64/int64 columns are filled with the column mean; columns of any other
    dtype are filled with their most frequent value.

    The function deliberately does not assign into `df`: joblib may execute it
    in a separate worker process, where an assignment would only change that
    worker's copy of the frame. The caller writes the returned Series back.

    Parameters
    ----------
    col : column label present in `df`.

    Returns
    -------
    pandas.Series aligned to `df.index`, named `col`.
    """
    if df[col].dtype in ['float64', 'int64']:
        imputer = SimpleImputer(strategy='mean')
    else:
        imputer = SimpleImputer(strategy='most_frequent')
    # fit_transform yields an (n_rows, 1) array; flatten it to rebuild a Series.
    filled = imputer.fit_transform(df[[col]]).ravel()
    return pd.Series(filled, index=df.index, name=col)

# Parallel imputation: the workers compute the filled columns and this process
# assigns them, so the results are kept instead of being discarded.
cols_with_missing = [c for c in df.columns if df[c].isnull().any()]
imputed_columns = Parallel(n_jobs=-1)(
    delayed(impute_column)(c) for c in cols_with_missing
)
for c, filled_col in zip(cols_with_missing, imputed_columns):
    df[c] = filled_col

# Fail loudly if imputation left gaps rather than passing NaNs downstream.
assert not df.isnull().any().any(), "missing values remain after imputation"

# --- Step 2: Encode Categorical Variables ---

for col in cat_cols:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])

# --- Step 3: Normalize Numerical Features ---

scaler = StandardScaler()
df[num_cols] = scaler.fit_transform(df[num_cols])

# --- Final Dataset ---
X = df.drop('target', axis=1)
y = df['target']

# Output shapes
print("✅ Preprocessing done.")
print("Feature shape:", X.shape)
print("Target shape:", y.shape)


✅ Preprocessing done.
Feature shape: (41000, 7)
Target shape: (41000,)


In [None]:
import pandas as pd


# Read the raw dataset and report the dtypes pandas inferred for each column.
data = pd.read_csv('/content/pdc_dataset_with_target.csv')
print(data.dtypes)

# Convert the numeric columns to 32-bit types in a single astype call.
compact_dtypes = {
    'feature_6': 'int32',
    'feature_1': 'float32',
    'feature_2': 'float32',
    'feature_4': 'float32',
    'feature_7': 'float32',
    'target': 'int32',
}
data = data.astype(compact_dtypes)

# Report the dtypes again to confirm the conversion.
print(data.dtypes)

feature_1    float64
feature_2    float64
feature_3     object
feature_4    float64
feature_5     object
feature_6      int64
feature_7    float64
target         int64
dtype: object
feature_1    float32
feature_2    float32
feature_3     object
feature_4    float32
feature_5     object
feature_6      int32
feature_7    float32
target         int32
dtype: object


In [None]:
# Count the missing entries in every column (isna is the alias of isnull).
print(data.isna().sum())

feature_1    2054
feature_2    2050
feature_3       0
feature_4    2054
feature_5       0
feature_6       0
feature_7    2036
target          0
dtype: int64


In [None]:
# Fill every gap with the column mean, always assigning whole columns back
# (no chained assignment).

# feature_1 is converted with astype(float); the remaining columns go through
# pd.to_numeric, which turns unparseable entries into NaN.
data['feature_1'] = data['feature_1'].astype(float)
for name in ('feature_2', 'feature_4', 'feature_7'):
    data[name] = pd.to_numeric(data[name], errors='coerce')

# Replace the remaining NaNs in each column with that column's own mean.
for name in ('feature_1', 'feature_2', 'feature_4', 'feature_7'):
    column_mean = data[name].mean()
    data[name] = data[name].fillna(column_mean)

print(data.isnull().sum())


feature_1    0
feature_2    0
feature_3    0
feature_4    0
feature_5    0
feature_6    0
feature_7    0
target       0
dtype: int64


In [None]:
# Write the cleaned frame to disk, leaving the row index out of the file.
preprocessed_path = 'Preprocessed.csv'
data.to_csv(preprocessed_path, index=False)
