## Performing Data Preprocessing Techniques to Titanic Dataset

In [8]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

url = 'https://github.com/datasciencedojo/datasets/raw/master/titanic.csv'
df = pd.read_csv(url)

df.head(10)


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S
5,6,0,3,"Moran, Mr. James",male,,0,0,330877,8.4583,,Q
6,7,0,1,"McCarthy, Mr. Timothy J",male,54.0,0,0,17463,51.8625,E46,S
7,8,0,3,"Palsson, Master. Gosta Leonard",male,2.0,3,1,349909,21.075,,S
8,9,1,3,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",female,27.0,0,2,347742,11.1333,,S
9,10,1,2,"Nasser, Mrs. Nicholas (Adele Achem)",female,14.0,1,0,237736,30.0708,,C


In [9]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [11]:
df.describe(include="all")

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
count,891.0,891.0,891.0,891,891,714.0,891.0,891.0,891.0,891.0,204,889
unique,,,,891,2,,,,681.0,,147,3
top,,,,"Braund, Mr. Owen Harris",male,,,,347082.0,,B96 B98,S
freq,,,,1,577,,,,7.0,,4,644
mean,446.0,0.383838,2.308642,,,29.699118,0.523008,0.381594,,32.204208,,
std,257.353842,0.486592,0.836071,,,14.526497,1.102743,0.806057,,49.693429,,
min,1.0,0.0,1.0,,,0.42,0.0,0.0,,0.0,,
25%,223.5,0.0,2.0,,,20.125,0.0,0.0,,7.9104,,
50%,446.0,0.0,3.0,,,28.0,0.0,0.0,,14.4542,,
75%,668.5,1.0,3.0,,,38.0,1.0,0.0,,31.0,,


In [12]:
df.isnull().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [13]:
mean_age = df['Age'].mean()

df['Age_mean_imputed'] = df['Age'].fillna(mean_age)

In [15]:
median_age = df['Age'].median()
df['Age_median_imputed'] = df['Age'].fillna(median_age)

In [16]:
mode_embarked = df['Embarked'].mode()[0]

df['Embarked_mode_imputed'] = df['Embarked'].fillna(mode_embarked) 


In [17]:
from sklearn.impute import KNNImputer
import numpy as np

numerical_features = ['Age', 'Fare', 'Parch', 'SibSp']
df_num = df[numerical_features]

knn_imputer = KNNImputer(n_neighbors=5)
df_num_imputed = knn_imputer.fit_transform(df_num)

df_knn_imputed = pd.DataFrame(df_num_imputed, columns=numerical_features)

df['Age_knn_imputed'] = df_knn_imputed['Age']


In [21]:
missing_age_rows = df[df['Age'].isna()]
missing_age_rows

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Age_mean_imputed,Age_median_imputed,Embarked_mode_imputed,Age_knn_imputed
5,6,0,3,"Moran, Mr. James",male,,0,0,330877,8.4583,,Q,29.699118,28.0,Q,25.0
17,18,1,2,"Williams, Mr. Charles Eugene",male,,0,0,244373,13.0000,,S,29.699118,28.0,S,24.2
19,20,1,3,"Masselmani, Mrs. Fatima",female,,0,0,2649,7.2250,,C,29.699118,28.0,C,30.5
26,27,0,3,"Emir, Mr. Farred Chehab",male,,0,0,2631,7.2250,,C,29.699118,28.0,C,30.5
28,29,1,3,"O'Dwyer, Miss. Ellen ""Nellie""",female,,0,0,330959,7.8792,,Q,29.699118,28.0,Q,29.6
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
859,860,0,3,"Razi, Mr. Raihed",male,,0,0,2629,7.2292,,C,29.699118,28.0,C,24.8
863,864,0,3,"Sage, Miss. Dorothy Edith ""Dolly""",female,,8,2,CA. 2343,69.5500,,S,29.699118,28.0,S,37.0
868,869,0,3,"van Melkebeke, Mr. Philemon",male,,0,0,345777,9.5000,,S,29.699118,28.0,S,25.0
878,879,0,3,"Laleff, Mr. Kristo",male,,0,0,349217,7.8958,,S,29.699118,28.0,S,31.6


In [22]:
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer

mice_imputer = IterativeImputer(random_state=42)

df_mice_imputed = mice_imputer.fit_transform(df_num)

df_mice_imputed = pd.DataFrame(df_mice_imputed, columns=numerical_features)

df['Age_mice_imputed'] = df_mice_imputed['Age']

In [23]:
missing_age_rows = df[df['Age'].isna()]
missing_age_rows

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Age_mean_imputed,Age_median_imputed,Embarked_mode_imputed,Age_knn_imputed,Age_mice_imputed
5,6,0,3,"Moran, Mr. James",male,,0,0,330877,8.4583,,Q,29.699118,28.0,Q,25.0,31.593748
17,18,1,2,"Williams, Mr. Charles Eugene",male,,0,0,244373,13.0000,,S,29.699118,28.0,S,24.2,31.789372
19,20,1,3,"Masselmani, Mrs. Fatima",female,,0,0,2649,7.2250,,C,29.699118,28.0,C,30.5,31.540626
26,27,0,3,"Emir, Mr. Farred Chehab",male,,0,0,2631,7.2250,,C,29.699118,28.0,C,30.5,31.540626
28,29,1,3,"O'Dwyer, Miss. Ellen ""Nellie""",female,,0,0,330959,7.8792,,Q,29.699118,28.0,Q,29.6,31.568804
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
859,860,0,3,"Razi, Mr. Raihed",male,,0,0,2629,7.2292,,C,29.699118,28.0,C,24.8,31.540807
863,864,0,3,"Sage, Miss. Dorothy Edith ""Dolly""",female,,8,2,CA. 2343,69.5500,,S,29.699118,28.0,S,37.0,-4.125982
868,869,0,3,"van Melkebeke, Mr. Philemon",male,,0,0,345777,9.5000,,S,29.699118,28.0,S,25.0,31.638617
878,879,0,3,"Laleff, Mr. Kristo",male,,0,0,349217,7.8958,,S,29.699118,28.0,S,31.6,31.569520


### 4.1 Identifying Outliers

In [28]:
# z-score
from scipy.stats import zscore

df['Fare_zscore'] = zscore(df['Fare'])

outliers_zscore = df[np.abs(df['Fare_zscore'])>3]

In [29]:
outliers_zscore

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Age_mean_imputed,Age_median_imputed,Embarked_mode_imputed,Age_knn_imputed,Age_mice_imputed,Fare_zscore
27,28,0,1,"Fortune, Mr. Charles Alexander",male,19.0,3,2,19950,263.0,C23 C25 C27,S,19.0,19.0,S,19.0,19.0,4.647001
88,89,1,1,"Fortune, Miss. Mabel Helen",female,23.0,3,2,19950,263.0,C23 C25 C27,S,23.0,23.0,S,23.0,23.0,4.647001
118,119,0,1,"Baxter, Mr. Quigg Edmond",male,24.0,0,1,PC 17558,247.5208,B58 B60,C,24.0,24.0,C,24.0,24.0,4.335332
258,259,1,1,"Ward, Miss. Anna",female,35.0,0,0,PC 17755,512.3292,,C,35.0,35.0,C,35.0,35.0,9.667167
299,300,1,1,"Baxter, Mrs. James (Helene DeLaudeniere Chaput)",female,50.0,0,1,PC 17558,247.5208,B58 B60,C,50.0,50.0,C,50.0,50.0,4.335332
311,312,1,1,"Ryerson, Miss. Emily Borie",female,18.0,2,2,PC 17608,262.375,B57 B59 B63 B66,C,18.0,18.0,C,18.0,18.0,4.634417
341,342,1,1,"Fortune, Miss. Alice Elizabeth",female,24.0,3,2,19950,263.0,C23 C25 C27,S,24.0,24.0,S,24.0,24.0,4.647001
377,378,0,1,"Widener, Mr. Harry Elkins",male,27.0,0,2,113503,211.5,C82,C,27.0,27.0,C,27.0,27.0,3.610065
380,381,1,1,"Bidois, Miss. Rosalie",female,42.0,0,0,PC 17757,227.525,,C,42.0,42.0,C,42.0,42.0,3.932723
438,439,0,1,"Fortune, Mr. Mark",male,64.0,1,4,19950,263.0,C23 C25 C27,S,64.0,64.0,S,64.0,64.0,4.647001


In [31]:
# Interquartile Range (IQR) Method
Q1 = df['Fare'].quantile(0.25)
Q3 = df['Fare'].quantile(0.75)
IQR = Q3 - Q1

lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 - 1.5 * IQR

outliers_iqr = df[(df['Fare'] < lower_bound) | (df['Fare']> upper_bound)]

outliers_iqr

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Age_mean_imputed,Age_median_imputed,Embarked_mode_imputed,Age_knn_imputed,Age_mice_imputed,Fare_zscore
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S,22.000000,22.0,S,22.0,22.00000,-0.502445
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,38.000000,38.0,C,38.0,38.00000,0.786845
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S,26.000000,26.0,S,26.0,26.00000,-0.488854
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S,35.000000,35.0,S,35.0,35.00000,0.420730
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S,35.000000,35.0,S,35.0,35.00000,-0.486337
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S,27.000000,27.0,S,27.0,27.00000,-0.386671
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S,19.000000,19.0,S,19.0,19.00000,-0.044381
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S,29.699118,28.0,S,26.8,24.17135,-0.176263
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C,26.000000,26.0,C,26.0,26.00000,-0.044381


### 4.2.2 Transformation

In [None]:
# By log transformation
df['Fare_log'] = np.log1p(df['Fare'])

In [34]:
# Cap and Flooring
df['Fare_capped'] = np.where(df['Fare'] > upper_bound, upper_bound, np.where(df['Fare'] < lower_bound, lower_bound, df['Fare']))


In [35]:
df.head(10)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Age_mean_imputed,Age_median_imputed,Embarked_mode_imputed,Age_knn_imputed,Age_mice_imputed,Fare_zscore,Fare_log,Fare_capped
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S,22.0,22.0,S,22.0,22.0,-0.502445,2.110213,-3.6344
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,38.0,38.0,C,38.0,38.0,0.786845,4.280593,-3.6344
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S,26.0,26.0,S,26.0,26.0,-0.488854,2.188856,-3.6344
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S,35.0,35.0,S,35.0,35.0,0.42073,3.990834,-3.6344
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S,35.0,35.0,S,35.0,35.0,-0.486337,2.202765,-3.6344
5,6,0,3,"Moran, Mr. James",male,,0,0,330877,8.4583,,Q,29.699118,28.0,Q,25.0,31.593748,-0.478116,2.246893,-3.6344
6,7,0,1,"McCarthy, Mr. Timothy J",male,54.0,0,0,17463,51.8625,E46,S,54.0,54.0,S,54.0,54.0,0.395814,3.967694,-3.6344
7,8,0,3,"Palsson, Master. Gosta Leonard",male,2.0,3,1,349909,21.075,,S,2.0,2.0,S,2.0,2.0,-0.224083,3.094446,-3.6344
8,9,1,3,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",female,27.0,0,2,347742,11.1333,,S,27.0,27.0,S,27.0,27.0,-0.424256,2.495954,-3.6344
9,10,1,2,"Nasser, Mrs. Nicholas (Adele Achem)",female,14.0,1,0,237736,30.0708,,C,14.0,14.0,C,14.0,14.0,-0.042956,3.436268,-3.6344


## 5: Data Normalization and Standardization

### 5.1 Normalization (Min-Max Scaling)

In [38]:
from sklearn.preprocessing import MinMaxScaler

min_max_scaler = MinMaxScaler()

df['Fare_min_max_scaled'] = min_max_scaler.fit_transform(df[["Fare"]])

df[['Fare', 'Fare_min_max_scaled']].head(10)

Unnamed: 0,Fare,Fare_min_max_scaled
0,7.25,0.014151
1,71.2833,0.139136
2,7.925,0.015469
3,53.1,0.103644
4,8.05,0.015713
5,8.4583,0.01651
6,51.8625,0.101229
7,21.075,0.041136
8,11.1333,0.021731
9,30.0708,0.058694


### 5.2 Standardization (Z-Score Scaling)

In [37]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()

df['Fare_standardized'] = scaler.fit_transform(df[['Fare']])

df[["Fare", "Fare_standardized"]].head()

Unnamed: 0,Fare,Fare_standardized
0,7.25,-0.502445
1,71.2833,0.786845
2,7.925,-0.488854
3,53.1,0.42073
4,8.05,-0.486337


### 5.3 Robust Scaling

In [40]:
from sklearn.preprocessing import RobustScaler

robust_scaler = RobustScaler()
df['Fare_robust_scaled'] = robust_scaler.fit_transform(df[['Fare']])

df[["Fare", "Fare_robust_scaled"]].head(10)

Unnamed: 0,Fare,Fare_robust_scaled
0,7.25,-0.312011
1,71.2833,2.461242
2,7.925,-0.282777
3,53.1,1.673732
4,8.05,-0.277363
5,8.4583,-0.25968
6,51.8625,1.620136
7,21.075,0.286744
8,11.1333,-0.143827
9,30.0708,0.676348


## 6. Feature Engineering

### Creting new features


In [42]:
df['FamilySize'] = df['SibSp'] + df['Parch'] + 1

df[['SibSp', 'Parch', 'FamilySize']].head(10)

Unnamed: 0,SibSp,Parch,FamilySize
0,1,0,2
1,1,0,2
2,0,0,1
3,1,0,2
4,0,0,1
5,0,0,1
6,0,0,1
7,3,1,5
8,0,2,3
9,1,0,2


### Transformations

In [43]:
# Log Transformations
df['Fare_log'] = np.log1p(df['Fare'])

df[['Fare', 'Fare_log']].head(10)

Unnamed: 0,Fare,Fare_log
0,7.25,2.110213
1,71.2833,4.280593
2,7.925,2.188856
3,53.1,3.990834
4,8.05,2.202765
5,8.4583,2.246893
6,51.8625,3.967694
7,21.075,3.094446
8,11.1333,2.495954
9,30.0708,3.436268


In [47]:
# Polynomial Transformation for non linear relationships

from sklearn.preprocessing import PolynomialFeatures

# Handle missing values by imputing with the mean
df[['Age', 'Fare']] = df[['Age', 'Fare']].fillna(df[['Age', 'Fare']].mean())

poly = PolynomialFeatures(degree=2)
poly_features = poly.fit_transform(df[["Age", "Fare"]])

df_poly = pd.DataFrame(poly_features, columns=poly.get_feature_names_out(["Age", "Fare"]))

df_poly.head(10)


Unnamed: 0,1,Age,Fare,Age^2,Age Fare,Fare^2
0,1.0,22.0,7.25,484.0,159.5,52.5625
1,1.0,38.0,71.2833,1444.0,2708.7654,5081.308859
2,1.0,26.0,7.925,676.0,206.05,62.805625
3,1.0,35.0,53.1,1225.0,1858.5,2819.61
4,1.0,35.0,8.05,1225.0,281.75,64.8025
5,1.0,29.699118,8.4583,882.037589,251.204047,71.542839
6,1.0,54.0,51.8625,2916.0,2800.575,2689.718906
7,1.0,2.0,21.075,4.0,42.15,444.155625
8,1.0,27.0,11.1333,729.0,300.5991,123.950369
9,1.0,14.0,30.0708,196.0,420.9912,904.253013


### Encoding Categorical Variables


In [51]:
# One hot encode
df_encoded = pd.get_dummies(df, columns=["Embarked"], drop_first=True)

df_encoded.head(10)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,...,Age_mice_imputed,Fare_zscore,Fare_log,Fare_capped,Fare_min_max_scaled,Fare_standardized,Fare_robust_scaled,FamilySize,Embarked_Q,Embarked_S
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,...,22.0,-0.502445,2.110213,-3.6344,0.014151,-0.502445,-0.312011,2,False,True
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,...,38.0,0.786845,4.280593,-3.6344,0.139136,0.786845,2.461242,2,False,False
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,...,26.0,-0.488854,2.188856,-3.6344,0.015469,-0.488854,-0.282777,1,False,True
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,...,35.0,0.42073,3.990834,-3.6344,0.103644,0.42073,1.673732,2,False,True
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,...,35.0,-0.486337,2.202765,-3.6344,0.015713,-0.486337,-0.277363,1,False,True
5,6,0,3,"Moran, Mr. James",male,29.699118,0,0,330877,8.4583,...,31.593748,-0.478116,2.246893,-3.6344,0.01651,-0.478116,-0.25968,1,True,False
6,7,0,1,"McCarthy, Mr. Timothy J",male,54.0,0,0,17463,51.8625,...,54.0,0.395814,3.967694,-3.6344,0.101229,0.395814,1.620136,1,False,True
7,8,0,3,"Palsson, Master. Gosta Leonard",male,2.0,3,1,349909,21.075,...,2.0,-0.224083,3.094446,-3.6344,0.041136,-0.224083,0.286744,5,False,True
8,9,1,3,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",female,27.0,0,2,347742,11.1333,...,27.0,-0.424256,2.495954,-3.6344,0.021731,-0.424256,-0.143827,3,False,True
9,10,1,2,"Nasser, Mrs. Nicholas (Adele Achem)",female,14.0,1,0,237736,30.0708,...,14.0,-0.042956,3.436268,-3.6344,0.058694,-0.042956,0.676348,2,False,False


In [52]:
# label encoding
from sklearn.preprocessing import LabelEncoder

label_encoder = LabelEncoder()
df['Sex_encoded'] = label_encoder.fit_transform(df['Sex'])

df[['Sex', 'Sex_encoded']].head(10)

Unnamed: 0,Sex,Sex_encoded
0,male,1
1,female,0
2,female,0
3,female,0
4,male,1
5,male,1
6,male,1
7,male,1
8,female,0
9,female,0


In [54]:
# correlation the correlation matrix
correlation_matrix = df.select_dtypes(include=['int64', 'float64']).corr()
correlation_matrix['Survived'].sort_values(ascending=False)

Survived               1.000000
Fare_log               0.329862
Fare_robust_scaled     0.257307
Fare_standardized      0.257307
Fare_zscore            0.257307
Fare                   0.257307
Fare_min_max_scaled    0.257307
Parch                  0.081629
FamilySize             0.016639
PassengerId           -0.005007
SibSp                 -0.035322
Age_mice_imputed      -0.055297
Age_knn_imputed       -0.064346
Age_median_imputed    -0.064910
Age_mean_imputed      -0.069809
Age                   -0.069809
Pclass                -0.338481
Sex_encoded           -0.543351
Fare_capped                 NaN
Name: Survived, dtype: float64

In [60]:
# Variance Threshold

from sklearn.feature_selection import VarianceThreshold

selector = VarianceThreshold(threshold=0.1)
df_var = selector.fit_transform(df.select_dtypes(include= [np.number]))
df_var.shape

(891, 17)

## 7.2 Wrapper Methods

### Recursive Feature Elimination (RFE):

In [63]:
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression

model = LogisticRegression(max_iter=1000)  # Increase max_iter to allow more iterations
rfe = RFE(estimator=model, n_features_to_select=5)
fit = rfe.fit(df.select_dtypes(include=[np.number]), df['Survived'])

print("Selected Features: ", fit.support_)
print('Feature Ranking: ', fit.ranking_)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Selected Features:  [False  True  True False  True False False False False False False False
  True False False False False False  True]
Feature Ranking:  [14  1  1  5  1  6 12  3  4  9  7 13  1 10 15 11  8  2  1]


In [66]:
df.select_dtypes(include=[np.number])

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare,Age_mean_imputed,Age_median_imputed,Age_knn_imputed,Age_mice_imputed,Fare_zscore,Fare_log,Fare_capped,Fare_min_max_scaled,Fare_standardized,Fare_robust_scaled,FamilySize,Sex_encoded
0,1,0,3,22.000000,1,0,7.2500,22.000000,22.0,22.0,22.00000,-0.502445,2.110213,-3.6344,0.014151,-0.502445,-0.312011,2,1
1,2,1,1,38.000000,1,0,71.2833,38.000000,38.0,38.0,38.00000,0.786845,4.280593,-3.6344,0.139136,0.786845,2.461242,2,0
2,3,1,3,26.000000,0,0,7.9250,26.000000,26.0,26.0,26.00000,-0.488854,2.188856,-3.6344,0.015469,-0.488854,-0.282777,1,0
3,4,1,1,35.000000,1,0,53.1000,35.000000,35.0,35.0,35.00000,0.420730,3.990834,-3.6344,0.103644,0.420730,1.673732,2,0
4,5,0,3,35.000000,0,0,8.0500,35.000000,35.0,35.0,35.00000,-0.486337,2.202765,-3.6344,0.015713,-0.486337,-0.277363,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,27.000000,0,0,13.0000,27.000000,27.0,27.0,27.00000,-0.386671,2.639057,-3.6344,0.025374,-0.386671,-0.062981,1,1
887,888,1,1,19.000000,0,0,30.0000,19.000000,19.0,19.0,19.00000,-0.044381,3.433987,-3.6344,0.058556,-0.044381,0.673281,1,0
888,889,0,3,29.699118,1,2,23.4500,29.699118,28.0,26.8,24.17135,-0.176263,3.196630,-3.6344,0.045771,-0.176263,0.389604,4,0
889,890,1,1,26.000000,0,0,30.0000,26.000000,26.0,26.0,26.00000,-0.044381,3.433987,-3.6344,0.058556,-0.044381,0.673281,1,1


### Embedded Methods


In [67]:
# Lasso Regression
from sklearn.linear_model import Lasso

lasso = Lasso(alpha=0.01)

lasso.fit(df.select_dtypes(include= [np.number]), df['Survived'])

print("Lasso Coefficients: ", lasso.coef_)

Lasso Coefficients:  [-3.88829705e-07  9.54570341e-01 -0.00000000e+00 -5.92115572e-05
 -0.00000000e+00  0.00000000e+00  1.12796040e-04 -3.94745175e-05
 -0.00000000e+00 -0.00000000e+00 -0.00000000e+00  0.00000000e+00
  0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00
  0.00000000e+00 -0.00000000e+00 -0.00000000e+00]


In [1]:
def zscore(data):
    # Step 1: Calculate the mean
    mean = sum(data) / len(data)
    
    # Step 2: Calculate the standard deviation
    variance = sum((x - mean) ** 2 for x in data) / len(data)
    std_dev = variance ** 0.5
    
    # Step 3: Calculate Z-Score for each data point
    z_scores = [(x - mean) / std_dev for x in data]
    
    return z_scores

# Example Usage
data = [10, 15, 20, 25, 30]
print("Original Data:", data)
print("Z-Scores:", zscore(data))

Original Data: [10, 15, 20, 25, 30]
Z-Scores: [-1.414213562373095, -0.7071067811865475, 0.0, 0.7071067811865475, 1.414213562373095]
