# Scikit Learn Tutorial
    - We have data, split into two parts:
        - X (Features / Inputs)
        - Y (Labels / Outputs)
    - Then we give "X" and "Y" to the ML model

In [240]:
# NOTE(review): `index` is not referenced in the visible part of the notebook and
# looks like an IDE auto-import; kept in case a later cell uses it — confirm and remove.
from operator import index

from sklearn import datasets

# The cell previously also ran `from unittest.mock import inplace`. unittest.mock
# exports no such name, so that line raised ImportError on a fresh kernel and the
# dataset below was never loaded.
iris_dataset = datasets.load_iris()


- `iris_dataset` is a dictionary-like object containing the fields
    ['data', 'target', 'frame', 'target_names', 'DESCR', 'feature_names', 'filename', 'data_module']
- X = iris_dataset['data']   -> Input fields (features)
- y = iris_dataset['target'] -> Output fields (labels)

In [241]:
# Feature matrix and label vector, both read from the iris object by key.
X, y = iris_dataset['data'], iris_dataset['target']

### Now we need to give this data to ML models (e.g. Linear Regression)
    - ML models are simply Python objects

In [242]:
from sklearn.linear_model import LinearRegression

# fit() hands back the estimator itself, so creation and training are chained.
model = LinearRegression().fit(X, y)

# Predictions for the same inputs the model was trained on (shown as cell output).
model.predict(X)

In [243]:
from sklearn.neighbors import KNeighborsClassifier

# Build the classifier with default settings and train it in one expression.
model1 = KNeighborsClassifier().fit(X, y)

# Predicted labels for the training inputs (shown as cell output).
model1.predict(X)

In [244]:
import matplotlib.pyplot as plt

# Predicted label for every sample in X.
y_pred = model1.predict(X)

# Explicit figure/axes so the plot carries its own labels and title.
fig, ax = plt.subplots()
ax.scatter(y_pred, y)
ax.set(xlabel="Predicted label", ylabel="True label", title="Predicted vs. true labels")
plt.show()

## Data cleaning and Data Preprocessing
    - To maintain data consistency
    - So ML models will not be thrown off by missing or inconsistent values

In [245]:
import pandas as pd
from sklearn.datasets import fetch_openml

# fetch_openml returns a dictionary-like object;
# as_frame=True asks for its contents as pandas objects.
df = fetch_openml('titanic', version=1, as_frame=True)
x, y = df['data'], df['target']

# Count of missing values in each feature column (shown as cell output).
checking_null = x.isnull().sum()
checking_null

### Data Visualization of null values

In [246]:
import seaborn as sns
sns.set(style='dark')

# Rebuild the null-count table from `x` rather than converting the existing
# `checking_null` in place: the in-place conversion turned the Series into a
# DataFrame, so running this cell a second time failed on `.to_frame`.
checking_null = x.isnull().sum().to_frame(name="Nulls")

# Copy the column names out of the index so seaborn can use them as a data column.
checking_null['Names'] = checking_null.index

# Horizontal bars: one per feature column, length = number of nulls.
sns.barplot(x="Nulls", y='Names', data=checking_null)

### As you can see, 'body' has the most nulls
    - So we might need to remove the 'body' column
    - But that is not an ideal approach: by removing a column we are removing a factor that may contribute to our outcome

- So for that we need to use value imputation

In [247]:
# Place the feature columns and the target column side by side in one frame.
df = pd.concat([x, y], axis=1)
print("Shape Before Remove",df.shape)

# Rebind to a frame without the 'body' column.
df = df.drop(columns=['body'])
print('Shape After Remove',df.shape)

### So we will Use " Value Imputation "
    - We will use SimpleImputer from sklearn
    - It will replace missing values with some statistics calculated from other values in a column
    - Used : Mean , Median , Mode

In [248]:
from sklearn.impute import SimpleImputer
import numpy as np

print("Number of Null Values in Age Before : ",df['age'].isnull().sum())

# Wherever a null appears, the imputer puts the column mean in its place.
imp = SimpleImputer(strategy='mean')

# The imputer takes a DataFrame or 2-D array, hence the double brackets.
# Its result is 2-D as well, so ravel() flattens it before it is stored back
# into the single 'age' column (same pattern as the per-column imputation loop).
df['age'] = imp.fit_transform(df[['age']]).ravel()

print("Number of Null Values in Age After : ",df['age'].isnull().sum())
print('Means is : ',np.mean(df['age']))

In [249]:
def get_parameters(df):
    """Describe how each column of ``df`` should be imputed.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns are inspected; it is not modified.

    Returns
    -------
    dict
        One entry per column name, each of the form
        ``{'missing_values': <int>, 'strategy': <str>}`` where
        ``missing_values`` is the column's null count and ``strategy`` is
        ``'mean'`` for int64/float64 columns and ``'most_frequent'`` otherwise.
    """
    parameters = {}

    for column in df.columns:
        series = df[column]

        # Only int64/float64 columns get a mean; everything else uses the mode.
        strategy = 'mean' if series.dtype in ['int64', 'float64'] else 'most_frequent'

        parameters[column] = {
            # int() converts the numpy integer from sum() into a plain Python int.
            'missing_values': int(series.isnull().sum()),
            'strategy': strategy,
        }

    return parameters

# Build the per-column imputation description for df and show it as cell output.
parameters = get_parameters(df)
parameters

In [250]:
# Impute every column listed in `parameters` using the strategy chosen for it.
for column, parameter in parameters.items():
    imp = SimpleImputer(strategy=parameter['strategy'])
    # fit_transform returns a 2-D array; ravel() flattens it to fit one column.
    df[column] = imp.fit_transform(df[[column]]).ravel()

# Per-column null counts after imputation (shown as cell output).
df.isnull().sum()


### Feature Engineering
    🚀 Feature Engineering Techniques
    ✅ Missing Value Handling (Mean, Median, Mode)
    ✅ Categorical Encoding (One-Hot, Label Encoding)
    ✅ Feature Scaling (Normalization, Standardization)
    ✅ Feature Creation (New features like family_size, is_alone)
    ✅ Feature Selection (Remove irrelevant or correlated features)

In [251]:
# Preview the first rows of df.
df.head()

In [252]:
# New feature: the sum of the 'sibsp' and 'parch' columns.
family_size = df['sibsp'] + df['parch']
df['family'] = family_size

# travel_alone is 1 where family == 0 and 0 where family > 0; rows matching
# neither condition are left missing, which the where() mask reproduces.
alone_flag = family_size.eq(0).astype('float64')
df['travel_alone'] = alone_flag.where(family_size.ge(0))

sns.set()
df['travel_alone'].value_counts().plot.bar()

### Data Encoding
    - It simply converts categorical (string) values into a numeric format
    - so that the ML model can understand them
    - Ex: sex - [female, male] -> for ML we convert it to sex - [1, 0]


In [254]:
from sklearn.preprocessing import OneHotEncoder

encode = OneHotEncoder()

# fit_transform encodes the categorical 'sex' column as numeric indicator
# columns, one per category.
sex_encoded = encode.fit_transform(df[['sex']])

# toarray() turns the encoder output into a 2-D numpy array, one row per sample.
# Example: [[1, 0], [0, 1], [0, 1], [1, 0]]
#   -> column 0 (stored as 'female') = [1, 0, 0, 1]
#   -> column 1 (stored as 'male')   = [0, 1, 1, 0]
df[['female', 'male']] = sex_encoded.toarray()

# Original column next to its two indicator columns (shown as cell output).
df[['sex', 'female', 'male']]


In [272]:
# 2-D numpy array of the one-hot columns already stored in df.
# Re-fitting the encoder on df['sex'] here made the cell non-idempotent: once
# 'sex' has been overwritten with 0/1, a second run encodes those numbers
# instead of the original strings and flips the column. Reading the stored
# 'female'/'male' columns gives the same values on every run.
arr = df[['female', 'male']].to_numpy()

# Column 0 (the 'female' indicator) replaces the original 'sex' column.
df['sex'] = arr[:, 0]

df.head()

### Data Scaling
    - If the values of a feature are spread far apart from each other,
    - scaling is a technique to bring them onto a comparable, narrower range
    - Data scaling is done so that no feature dominates the model just because of its larger
      numerical values.
    - StandardScaler -> Standardize features by removing the mean and scaling to unit variance
    - MinMaxScaler -> Scale features to a given range

In [284]:
from IPython.display import display
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler

# StandardScaler -> standardizes features by removing the mean and scaling to unit variance.
#   Formula: X' = (X - mean) / standard deviation
#   - Centers the data around mean = 0
#   - Scales features so that standard deviation = 1
#   - Useful when data follows a normal distribution.

# Select numerical columns (returns a pandas Index object, which is immutable)
nums_cols = df.select_dtypes(include=['int64', 'float64']).columns
print(nums_cols)  # names of the numerical columns

# Initialize StandardScaler
std_scaler = StandardScaler()

# Apply StandardScaler to the selected numerical columns
df[nums_cols] = std_scaler.fit_transform(df[nums_cols])

# A bare expression in the middle of a cell is not rendered, so display() is
# called explicitly to show the first 5 rows after standard scaling.
display(df[nums_cols].head())

# MinMaxScaler -> scales features to a fixed range (default: 0 to 1)
#   Formula: X' = (X - X_min) / (X_max - X_min)
#   - Ensures all features are within a specified range (default: 0 to 1)
#   - Useful when data does not follow a normal distribution.
# Note: this is applied to the columns that were just standardized above, so
# the values left in df afterwards are the min-max scaled ones.

# Select numerical columns again (to apply MinMax scaling)
nums_cols1 = df.select_dtypes(include=['int64', 'float64']).columns

# Initialize MinMaxScaler
mnmx_scaler = MinMaxScaler()

# Apply MinMaxScaler to the selected numerical columns
df[nums_cols1] = mnmx_scaler.fit_transform(df[nums_cols1])

# Display transformed data (last expression, so it renders as the cell output)
df[nums_cols1]
