In [12]:
# Import necessary libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler, MinMaxScaler
from sklearn.impute import SimpleImputer
from sklearn.decomposition import PCA
from scipy import stats
import numpy as np



In [3]:
# Load the example dataset (Iris) into a single DataFrame:
# one column per feature plus a 'target' column holding the class labels.
from sklearn.datasets import load_iris

iris = load_iris()
data = pd.DataFrame(iris.data, columns=iris.feature_names).assign(target=iris.target)


In [None]:
# Check the data info
print("Data Info:")
# DataFrame.info() writes its summary to stdout itself and returns None, so it is
# called directly; wrapping it in print() would emit a stray "None" line afterwards.
data.info()

In [5]:
# Split the dataset into Training, Validation, and Test sets.
# Step 1 holds out 20% of all rows for testing; step 2 takes 25% of the remaining
# 80% for validation, i.e. roughly a 60/20/20 train/val/test split overall.
# The same random_state is used for both steps so the split is reproducible.
train_val_data, test_data = train_test_split(data, test_size=0.2, random_state=42)
train_data, val_data = train_test_split(train_val_data, test_size=0.25, random_state=42)

In [6]:
# For each split, separate the independent features (every column except
# 'target') from the target variable (the 'target' column itself).
X_train, y_train = train_data.drop(columns='target'), train_data['target']
X_val, y_val = val_data.drop(columns='target'), val_data['target']
X_test, y_test = test_data.drop(columns='target'), test_data['target']

In [7]:
# Count the missing values per column of the full dataset
# (isna() is the pandas alias of isnull(); summing the boolean mask gives the counts).
print("\nMissing Values:")
missing_per_column = data.isna().sum()
print(missing_per_column)



Missing Values:
sepal length (cm)    0
sepal width (cm)     0
petal length (cm)    0
petal width (cm)     0
target               0
dtype: int64


In [8]:
# Deal with missing values using SimpleImputer (mean strategy).
# The imputer is fitted on the training features only, and the fitted imputer is
# then applied to the training, validation and test features alike.
imputer = SimpleImputer(strategy='mean').fit(X_train)
X_train_imputed, X_val_imputed, X_test_imputed = (
    imputer.transform(features) for features in (X_train, X_val, X_test)
)

In [9]:
# Show the Categorical Features.
# The list is derived from the column dtypes of the training features instead of
# being hard-coded, so it stays correct if the dataset is swapped out.
# For the Iris features used in this example it is expected to be empty (all numeric).
# NOTE(review): extend the dtype list if the data uses other non-numeric dtypes
# (e.g. a dedicated string dtype).
categorical_features = X_train.select_dtypes(include=['object', 'category']).columns.tolist()
print("\nCategorical Features:")
print(categorical_features)


Categorical Features:
[]


In [13]:
# Check for outliers with the Z-score method: a value is flagged when its
# absolute z-score within its feature column exceeds 3.
# NOTE(review): the z-scores are computed on the full dataset (`data`), which
# includes the rows that were split off for validation and test.
feature_values = data.drop(columns='target')
abs_zscores = np.abs(stats.zscore(feature_values))
outliers_zscore = abs_zscores > 3
# np.where returns a (row positions, column positions) pair for the flagged cells.
outliers_indices = np.where(outliers_zscore)
print("\nOutliers Indices (Z-Score Method):")
print(outliers_indices)



Outliers Indices (Z-Score Method):
(array([15]), array([1]))


In [14]:
# Drop every row that contains at least one flagged value; outliers_indices[0]
# holds row positions, which are mapped to index labels before dropping.
# NOTE(review): in the cells visible here, the later scaling/statistics steps use
# X_train_imputed rather than data_no_outliers - confirm this is intended.
data_no_outliers = data.drop(index=data.index[outliers_indices[0]])

In [15]:
# Feature scaling. Each scaler is fitted on the imputed training features only
# and then applied to the training, validation and test features.
imputed_splits = (X_train_imputed, X_val_imputed, X_test_imputed)

# Min-max scaling.
scaler_minmax = MinMaxScaler().fit(X_train_imputed)
X_train_minmax, X_val_minmax, X_test_minmax = (
    scaler_minmax.transform(features) for features in imputed_splits
)

# Standardization (z-score scaling).
scaler_zscore = StandardScaler().fit(X_train_imputed)
X_train_zscore, X_val_zscore, X_test_zscore = (
    scaler_zscore.transform(features) for features in imputed_splits
)


In [19]:
# Statistical Analysis: per-feature (column-wise, axis=0) summary statistics
# of the imputed training features.
print("\nStatistical Analysis:")
summary_statistics = (
    ("Mean:\n", np.mean),
    ("Median:\n", np.median),
    ("Variance:\n", np.var),  # NumPy default ddof=0
    ("Standard Deviation:\n", np.std),  # NumPy default ddof=0
)
for heading, reducer in summary_statistics:
    print(heading, reducer(X_train_imputed, axis=0))




Statistical Analysis:
Mean:
 [5.84666667 3.11222222 3.72777778 1.18888889]
Median:
 [5.8 3.  4.3 1.3]
Variance:
 [0.6896     0.21107284 3.19600617 0.58565432]
Standard Deviation:
 [0.83042158 0.45942664 1.78773772 0.76528055]


In [21]:
# Correlation Analysis: pairwise correlation coefficients between the features
# of the imputed training data. rowvar=False tells NumPy that each column is a
# variable (feature) and each row is an observation.
correlation_matrix = np.corrcoef(X_train_imputed, rowvar=False)

print("\nCorrelation Matrix:", correlation_matrix, sep="\n")



Correlation Matrix:
[[ 1.         -0.13808394  0.86267306  0.79248668]
 [-0.13808394  1.         -0.47903831 -0.41076151]
 [ 0.86267306 -0.47903831  1.          0.95798695]
 [ 0.79248668 -0.41076151  0.95798695  1.        ]]
