In [5]:
!pip install scikit-learn


Collecting scikit-learn
  Downloading scikit_learn-1.5.2-cp310-cp310-macosx_12_0_arm64.whl.metadata (13 kB)
Collecting joblib>=1.2.0 (from scikit-learn)
  Using cached joblib-1.4.2-py3-none-any.whl.metadata (5.4 kB)
Collecting threadpoolctl>=3.1.0 (from scikit-learn)
  Using cached threadpoolctl-3.5.0-py3-none-any.whl.metadata (13 kB)
Downloading scikit_learn-1.5.2-cp310-cp310-macosx_12_0_arm64.whl (11.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m11.0/11.0 MB[0m [31m2.2 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hUsing cached joblib-1.4.2-py3-none-any.whl (301 kB)
Using cached threadpoolctl-3.5.0-py3-none-any.whl (18 kB)
Installing collected packages: threadpoolctl, joblib, scikit-learn
Successfully installed joblib-1.4.2 scikit-learn-1.5.2 threadpoolctl-3.5.0


In [6]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import KBinsDiscretizer
from sklearn.cluster import KMeans
import scipy.stats as stats

In [2]:
df1 = pd.read_excel('student_bucket.xlsx')
df1

Unnamed: 0,Student_id,Age,Grade,Employed,marks
0,1,19,1st Class,yes,29
1,2,20,2nd Class,no,41
2,3,18,1st Class,no,57
3,4,21,2nd Class,no,29
4,5,19,1st Class,no,57
...,...,...,...,...,...
227,228,21,1st Class,no,42
228,229,20,2nd Class,no,47
229,230,20,3rd Class,yes,21
230,231,19,1st Class,yes,64


In [5]:
def bin_with_function(data, bin_width):
    min_val = data.min()
    max_val = data.max()
    bins = np.arange(min_val, max_val + bin_width, bin_width)
    return pd.cut(data, bins=bins, include_lowest=True)

df1['age_bin_function'] = bin_with_function(df1['Age'], 5)
df1

Unnamed: 0,Student_id,Age,Grade,Employed,marks,age_bin_function
0,1,19,1st Class,yes,29,"(17.999, 23.0]"
1,2,20,2nd Class,no,41,"(17.999, 23.0]"
2,3,18,1st Class,no,57,"(17.999, 23.0]"
3,4,21,2nd Class,no,29,"(17.999, 23.0]"
4,5,19,1st Class,no,57,"(17.999, 23.0]"
...,...,...,...,...,...,...
227,228,21,1st Class,no,42,"(17.999, 23.0]"
228,229,20,2nd Class,no,47,"(17.999, 23.0]"
229,230,20,3rd Class,yes,21,"(17.999, 23.0]"
230,231,19,1st Class,yes,64,"(17.999, 23.0]"


In [12]:
df1['age_bin_function'].head(30)

0     (17.999, 23.0]
1     (17.999, 23.0]
2     (17.999, 23.0]
3     (17.999, 23.0]
4     (17.999, 23.0]
5     (17.999, 23.0]
6     (17.999, 23.0]
7     (17.999, 23.0]
8     (17.999, 23.0]
9     (17.999, 23.0]
10    (17.999, 23.0]
11    (17.999, 23.0]
12    (17.999, 23.0]
13    (17.999, 23.0]
14    (17.999, 23.0]
15    (17.999, 23.0]
16    (17.999, 23.0]
17    (17.999, 23.0]
18    (17.999, 23.0]
19    (17.999, 23.0]
20    (17.999, 23.0]
21    (17.999, 23.0]
22    (17.999, 23.0]
23    (17.999, 23.0]
24    (17.999, 23.0]
25    (17.999, 23.0]
26    (17.999, 23.0]
27    (17.999, 23.0]
28    (17.999, 23.0]
29    (17.999, 23.0]
Name: age_bin_function, dtype: category
Categories (1, interval[float64, right]): [(17.999, 23.0]]

In [8]:
df1['age_bin_user'] = pd.cut(df1['Age'], bins=[10, 15, 20, 25], include_lowest=True)
df1

Unnamed: 0,Student_id,Age,Grade,Employed,marks,age_bin_function,age_bin_user
0,1,19,1st Class,yes,29,"(17.999, 23.0]","(15.0, 20.0]"
1,2,20,2nd Class,no,41,"(17.999, 23.0]","(15.0, 20.0]"
2,3,18,1st Class,no,57,"(17.999, 23.0]","(15.0, 20.0]"
3,4,21,2nd Class,no,29,"(17.999, 23.0]","(20.0, 25.0]"
4,5,19,1st Class,no,57,"(17.999, 23.0]","(15.0, 20.0]"
...,...,...,...,...,...,...,...
227,228,21,1st Class,no,42,"(17.999, 23.0]","(20.0, 25.0]"
228,229,20,2nd Class,no,47,"(17.999, 23.0]","(15.0, 20.0]"
229,230,20,3rd Class,yes,21,"(17.999, 23.0]","(15.0, 20.0]"
230,231,19,1st Class,yes,64,"(17.999, 23.0]","(15.0, 20.0]"


In [14]:
df1['age_bin_equal_width'] = pd.cut(df1['Age'], bins=3)

# Variable Discretization using equal frequency discretization
df1['age_bin_equal_freq'] = pd.qcut(df1['Age'], q=3)

In [15]:
df1

Unnamed: 0,Student_id,Age,Grade,Employed,marks,age_bin_function,age_bin_user,age_bin_equal_width,age_bin_equal_freq
0,1,19,1st Class,yes,29,"(17.999, 23.0]","(15.0, 20.0]","(17.996, 19.333]","(17.999, 19.0]"
1,2,20,2nd Class,no,41,"(17.999, 23.0]","(15.0, 20.0]","(19.333, 20.667]","(19.0, 20.0]"
2,3,18,1st Class,no,57,"(17.999, 23.0]","(15.0, 20.0]","(17.996, 19.333]","(17.999, 19.0]"
3,4,21,2nd Class,no,29,"(17.999, 23.0]","(20.0, 25.0]","(20.667, 22.0]","(20.0, 22.0]"
4,5,19,1st Class,no,57,"(17.999, 23.0]","(15.0, 20.0]","(17.996, 19.333]","(17.999, 19.0]"
...,...,...,...,...,...,...,...,...,...
227,228,21,1st Class,no,42,"(17.999, 23.0]","(20.0, 25.0]","(20.667, 22.0]","(20.0, 22.0]"
228,229,20,2nd Class,no,47,"(17.999, 23.0]","(15.0, 20.0]","(19.333, 20.667]","(19.0, 20.0]"
229,230,20,3rd Class,yes,21,"(17.999, 23.0]","(15.0, 20.0]","(19.333, 20.667]","(19.0, 20.0]"
230,231,19,1st Class,yes,64,"(17.999, 23.0]","(15.0, 20.0]","(17.996, 19.333]","(17.999, 19.0]"


In [17]:
kmeans = KMeans(n_clusters=3)
df1['age_bin_kmeans'] = kmeans.fit_predict(df1[['Age']])

  super()._check_params_vs_input(X, default_n_init=10)


In [18]:
df1

Unnamed: 0,Student_id,Age,Grade,Employed,marks,age_bin_function,age_bin_user,age_bin_equal_width,age_bin_equal_freq,age_bin_kmeans
0,1,19,1st Class,yes,29,"(17.999, 23.0]","(15.0, 20.0]","(17.996, 19.333]","(17.999, 19.0]",1
1,2,20,2nd Class,no,41,"(17.999, 23.0]","(15.0, 20.0]","(19.333, 20.667]","(19.0, 20.0]",0
2,3,18,1st Class,no,57,"(17.999, 23.0]","(15.0, 20.0]","(17.996, 19.333]","(17.999, 19.0]",1
3,4,21,2nd Class,no,29,"(17.999, 23.0]","(20.0, 25.0]","(20.667, 22.0]","(20.0, 22.0]",2
4,5,19,1st Class,no,57,"(17.999, 23.0]","(15.0, 20.0]","(17.996, 19.333]","(17.999, 19.0]",1
...,...,...,...,...,...,...,...,...,...,...
227,228,21,1st Class,no,42,"(17.999, 23.0]","(20.0, 25.0]","(20.667, 22.0]","(20.0, 22.0]",2
228,229,20,2nd Class,no,47,"(17.999, 23.0]","(15.0, 20.0]","(19.333, 20.667]","(19.0, 20.0]",0
229,230,20,3rd Class,yes,21,"(17.999, 23.0]","(15.0, 20.0]","(19.333, 20.667]","(19.0, 20.0]",0
230,231,19,1st Class,yes,64,"(17.999, 23.0]","(15.0, 20.0]","(17.996, 19.333]","(17.999, 19.0]",1


In [19]:
df2 = pd.read_excel('winequality.xlsx')
df2

Unnamed: 0,type,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,white,7.0,0.270,0.36,20.7,0.045,45.0,170.0,1.00100,3.00,0.45,8.8,6
1,white,6.3,0.300,0.34,1.6,0.049,14.0,132.0,0.99400,3.30,0.49,9.5,6
2,white,8.1,0.280,0.40,6.9,0.050,30.0,97.0,0.99510,3.26,0.44,10.1,6
3,white,7.2,0.230,0.32,8.5,0.058,47.0,186.0,0.99560,3.19,0.40,9.9,6
4,white,7.2,0.230,0.32,8.5,0.058,47.0,186.0,0.99560,3.19,0.40,9.9,6
...,...,...,...,...,...,...,...,...,...,...,...,...,...
6492,red,6.2,0.600,0.08,2.0,0.090,32.0,44.0,0.99490,3.45,0.58,10.5,5
6493,red,5.9,0.550,0.10,2.2,0.062,39.0,51.0,0.99512,3.52,,11.2,6
6494,red,6.3,0.510,0.13,2.3,0.076,29.0,40.0,0.99574,3.42,0.75,11.0,6
6495,red,5.9,0.645,0.12,2.0,0.075,32.0,44.0,0.99547,3.57,0.71,10.2,5


In [20]:
def iqr_outlier_capping(df, feature):
    Q1 = df[feature].quantile(0.25)
    Q3 = df[feature].quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR
    
    df[feature] = np.where(df[feature] < lower_bound, lower_bound, df[feature])
    df[feature] = np.where(df[feature] > upper_bound, upper_bound, df[feature])
    return df

In [29]:
sugar_df=iqr_outlier_capping(df2,'residual sugar')
sugar_df

Unnamed: 0,type,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,white,7.0,0.270,0.36,17.55,0.045,45.0,170.0,1.00100,3.00,0.45,8.8,6
1,white,6.3,0.300,0.34,1.60,0.049,14.0,132.0,0.99400,3.30,0.49,9.5,6
2,white,8.1,0.280,0.40,6.90,0.050,30.0,97.0,0.99510,3.26,0.44,10.1,6
3,white,7.2,0.230,0.32,8.50,0.058,47.0,186.0,0.99560,3.19,0.40,9.9,6
4,white,7.2,0.230,0.32,8.50,0.058,47.0,186.0,0.99560,3.19,0.40,9.9,6
...,...,...,...,...,...,...,...,...,...,...,...,...,...
6492,red,6.2,0.600,0.08,2.00,0.090,32.0,44.0,0.99490,3.45,0.58,10.5,5
6493,red,5.9,0.550,0.10,2.20,0.062,39.0,51.0,0.99512,3.52,,11.2,6
6494,red,6.3,0.510,0.13,2.30,0.076,29.0,40.0,0.99574,3.42,0.75,11.0,6
6495,red,5.9,0.645,0.12,2.00,0.075,32.0,44.0,0.99547,3.57,0.71,10.2,5


In [22]:
def percentile_capping(df, feature, lower_percentile=1, upper_percentile=99):
    lower_bound = np.percentile(df[feature], lower_percentile)
    upper_bound = np.percentile(df[feature], upper_percentile)
    
    df[feature] = np.clip(df[feature], lower_bound, upper_bound)
    return df

In [30]:
alc_df=percentile_capping(df2,'alcohol')
alc_df

Unnamed: 0,type,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,white,7.0,0.270,0.36,17.55,0.045,45.0,170.0,1.00100,3.00,0.45,8.8,6
1,white,6.3,0.300,0.34,1.60,0.049,14.0,132.0,0.99400,3.30,0.49,9.5,6
2,white,8.1,0.280,0.40,6.90,0.050,30.0,97.0,0.99510,3.26,0.44,10.1,6
3,white,7.2,0.230,0.32,8.50,0.058,47.0,186.0,0.99560,3.19,0.40,9.9,6
4,white,7.2,0.230,0.32,8.50,0.058,47.0,186.0,0.99560,3.19,0.40,9.9,6
...,...,...,...,...,...,...,...,...,...,...,...,...,...
6492,red,6.2,0.600,0.08,2.00,0.090,32.0,44.0,0.99490,3.45,0.58,10.5,5
6493,red,5.9,0.550,0.10,2.20,0.062,39.0,51.0,0.99512,3.52,,11.2,6
6494,red,6.3,0.510,0.13,2.30,0.076,29.0,40.0,0.99574,3.42,0.75,11.0,6
6495,red,5.9,0.645,0.12,2.00,0.075,32.0,44.0,0.99547,3.57,0.71,10.2,5


In [33]:
def zscore_capping(df, feature, threshold=3):
    mean = df[feature].mean()
    std = df[feature].std()
    
    df[feature] = np.where(np.abs(stats.zscore(df[feature])) > threshold,
                           mean + threshold * std * np.sign(df[feature] - mean),
                           df[feature])
    return df

In [34]:
wine_df = zscore_capping(df2, 'pH')
wine_df

Unnamed: 0,type,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,white,7.0,0.270,0.36,17.55,0.045,45.0,170.0,1.00100,3.00,0.45,8.8,6
1,white,6.3,0.300,0.34,1.60,0.049,14.0,132.0,0.99400,3.30,0.49,9.5,6
2,white,8.1,0.280,0.40,6.90,0.050,30.0,97.0,0.99510,3.26,0.44,10.1,6
3,white,7.2,0.230,0.32,8.50,0.058,47.0,186.0,0.99560,3.19,0.40,9.9,6
4,white,7.2,0.230,0.32,8.50,0.058,47.0,186.0,0.99560,3.19,0.40,9.9,6
...,...,...,...,...,...,...,...,...,...,...,...,...,...
6492,red,6.2,0.600,0.08,2.00,0.090,32.0,44.0,0.99490,3.45,0.58,10.5,5
6493,red,5.9,0.550,0.10,2.20,0.062,39.0,51.0,0.99512,3.52,,11.2,6
6494,red,6.3,0.510,0.13,2.30,0.076,29.0,40.0,0.99574,3.42,0.75,11.0,6
6495,red,5.9,0.645,0.12,2.00,0.075,32.0,44.0,0.99547,3.57,0.71,10.2,5


In [35]:
df2['residual_sugar_bin'] = pd.cut(wine_df['residual sugar'], bins=[0, 5, 10, 15], include_lowest=True)
df2['total_sulfur_dioxide_bin'] = pd.cut(wine_df['total sulfur dioxide'], bins=[0, 50, 100, 150], include_lowest=True)
df2

Unnamed: 0,type,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality,residual_sugar_bin,total_sulfur_dioxide_bin
0,white,7.0,0.270,0.36,17.55,0.045,45.0,170.0,1.00100,3.00,0.45,8.8,6,,
1,white,6.3,0.300,0.34,1.60,0.049,14.0,132.0,0.99400,3.30,0.49,9.5,6,"(-0.001, 5.0]","(100.0, 150.0]"
2,white,8.1,0.280,0.40,6.90,0.050,30.0,97.0,0.99510,3.26,0.44,10.1,6,"(5.0, 10.0]","(50.0, 100.0]"
3,white,7.2,0.230,0.32,8.50,0.058,47.0,186.0,0.99560,3.19,0.40,9.9,6,"(5.0, 10.0]",
4,white,7.2,0.230,0.32,8.50,0.058,47.0,186.0,0.99560,3.19,0.40,9.9,6,"(5.0, 10.0]",
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6492,red,6.2,0.600,0.08,2.00,0.090,32.0,44.0,0.99490,3.45,0.58,10.5,5,"(-0.001, 5.0]","(-0.001, 50.0]"
6493,red,5.9,0.550,0.10,2.20,0.062,39.0,51.0,0.99512,3.52,,11.2,6,"(-0.001, 5.0]","(50.0, 100.0]"
6494,red,6.3,0.510,0.13,2.30,0.076,29.0,40.0,0.99574,3.42,0.75,11.0,6,"(-0.001, 5.0]","(-0.001, 50.0]"
6495,red,5.9,0.645,0.12,2.00,0.075,32.0,44.0,0.99547,3.57,0.71,10.2,5,"(-0.001, 5.0]","(-0.001, 50.0]"


In [36]:
df2['free_sulfur_dioxide_bin'] = pd.qcut(df2['free sulfur dioxide'], q=3)
df2['fixed_acidity_bin'] = pd.qcut(df2['fixed acidity'], q=3)
df2

Unnamed: 0,type,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality,residual_sugar_bin,total_sulfur_dioxide_bin,free_sulfur_dioxide_bin,fixed_acidity_bin
0,white,7.0,0.270,0.36,17.55,0.045,45.0,170.0,1.00100,3.00,0.45,8.8,6,,,"(36.0, 289.0]","(6.6, 7.4]"
1,white,6.3,0.300,0.34,1.60,0.049,14.0,132.0,0.99400,3.30,0.49,9.5,6,"(-0.001, 5.0]","(100.0, 150.0]","(0.999, 21.0]","(3.799, 6.6]"
2,white,8.1,0.280,0.40,6.90,0.050,30.0,97.0,0.99510,3.26,0.44,10.1,6,"(5.0, 10.0]","(50.0, 100.0]","(21.0, 36.0]","(7.4, 15.9]"
3,white,7.2,0.230,0.32,8.50,0.058,47.0,186.0,0.99560,3.19,0.40,9.9,6,"(5.0, 10.0]",,"(36.0, 289.0]","(6.6, 7.4]"
4,white,7.2,0.230,0.32,8.50,0.058,47.0,186.0,0.99560,3.19,0.40,9.9,6,"(5.0, 10.0]",,"(36.0, 289.0]","(6.6, 7.4]"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6492,red,6.2,0.600,0.08,2.00,0.090,32.0,44.0,0.99490,3.45,0.58,10.5,5,"(-0.001, 5.0]","(-0.001, 50.0]","(21.0, 36.0]","(3.799, 6.6]"
6493,red,5.9,0.550,0.10,2.20,0.062,39.0,51.0,0.99512,3.52,,11.2,6,"(-0.001, 5.0]","(50.0, 100.0]","(36.0, 289.0]","(3.799, 6.6]"
6494,red,6.3,0.510,0.13,2.30,0.076,29.0,40.0,0.99574,3.42,0.75,11.0,6,"(-0.001, 5.0]","(-0.001, 50.0]","(21.0, 36.0]","(3.799, 6.6]"
6495,red,5.9,0.645,0.12,2.00,0.075,32.0,44.0,0.99547,3.57,0.71,10.2,5,"(-0.001, 5.0]","(-0.001, 50.0]","(21.0, 36.0]","(3.799, 6.6]"


In [72]:
df3 = pd.read_excel('Bengaluru_House_Data.xlsx')
df3

Unnamed: 0,area_type,availability,location,size,society,total_sqft,bath,balcony,price
0,Super built-up Area,2024-12-19 00:00:00,Electronic City Phase II,2 BHK,Coomee,1056,2.0,1.0,39.07
1,Plot Area,Ready To Move,Chikka Tirupathi,4 Bedroom,Theanmp,2600,5.0,3.0,120.00
2,Built-up Area,Ready To Move,Uttarahalli,3 BHK,,1440,2.0,3.0,62.00
3,Super built-up Area,Ready To Move,Lingadheeranahalli,3 BHK,Soiewre,1521,3.0,1.0,95.00
4,Super built-up Area,Ready To Move,Kothanur,2 BHK,,1200,2.0,1.0,51.00
...,...,...,...,...,...,...,...,...,...
13315,Built-up Area,Ready To Move,Whitefield,5 Bedroom,ArsiaEx,3453,4.0,0.0,231.00
13316,Super built-up Area,Ready To Move,Richards Town,4 BHK,,3600,5.0,,400.00
13317,Built-up Area,Ready To Move,Raja Rajeshwari Nagar,2 BHK,Mahla T,1141,2.0,1.0,60.00
13318,Super built-up Area,2024-06-18 00:00:00,Padmanabhanagar,4 BHK,SollyCl,4689,4.0,1.0,488.00


In [73]:
df3.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13320 entries, 0 to 13319
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   area_type     13320 non-null  object 
 1   availability  13320 non-null  object 
 2   location      13319 non-null  object 
 3   size          13304 non-null  object 
 4   society       7818 non-null   object 
 5   total_sqft    13320 non-null  object 
 6   bath          13247 non-null  float64
 7   balcony       12711 non-null  float64
 8   price         13320 non-null  float64
dtypes: float64(3), object(6)
memory usage: 936.7+ KB


In [74]:
df4= iqr_outlier_capping(df3,'price')
df4

Unnamed: 0,area_type,availability,location,size,society,total_sqft,bath,balcony,price
0,Super built-up Area,2024-12-19 00:00:00,Electronic City Phase II,2 BHK,Coomee,1056,2.0,1.0,39.07
1,Plot Area,Ready To Move,Chikka Tirupathi,4 Bedroom,Theanmp,2600,5.0,3.0,120.00
2,Built-up Area,Ready To Move,Uttarahalli,3 BHK,,1440,2.0,3.0,62.00
3,Super built-up Area,Ready To Move,Lingadheeranahalli,3 BHK,Soiewre,1521,3.0,1.0,95.00
4,Super built-up Area,Ready To Move,Kothanur,2 BHK,,1200,2.0,1.0,51.00
...,...,...,...,...,...,...,...,...,...
13315,Built-up Area,Ready To Move,Whitefield,5 Bedroom,ArsiaEx,3453,4.0,0.0,225.00
13316,Super built-up Area,Ready To Move,Richards Town,4 BHK,,3600,5.0,,225.00
13317,Built-up Area,Ready To Move,Raja Rajeshwari Nagar,2 BHK,Mahla T,1141,2.0,1.0,60.00
13318,Super built-up Area,2024-06-18 00:00:00,Padmanabhanagar,4 BHK,SollyCl,4689,4.0,1.0,225.00


In [76]:
df4 = percentile_capping(df3, 'price')
df4

Unnamed: 0,area_type,availability,location,size,society,total_sqft,bath,balcony,price
0,Super built-up Area,2024-12-19 00:00:00,Electronic City Phase II,2 BHK,Coomee,1056,2.0,1.0,39.07
1,Plot Area,Ready To Move,Chikka Tirupathi,4 Bedroom,Theanmp,2600,5.0,3.0,120.00
2,Built-up Area,Ready To Move,Uttarahalli,3 BHK,,1440,2.0,3.0,62.00
3,Super built-up Area,Ready To Move,Lingadheeranahalli,3 BHK,Soiewre,1521,3.0,1.0,95.00
4,Super built-up Area,Ready To Move,Kothanur,2 BHK,,1200,2.0,1.0,51.00
...,...,...,...,...,...,...,...,...,...
13315,Built-up Area,Ready To Move,Whitefield,5 Bedroom,ArsiaEx,3453,4.0,0.0,225.00
13316,Super built-up Area,Ready To Move,Richards Town,4 BHK,,3600,5.0,,225.00
13317,Built-up Area,Ready To Move,Raja Rajeshwari Nagar,2 BHK,Mahla T,1141,2.0,1.0,60.00
13318,Super built-up Area,2024-06-18 00:00:00,Padmanabhanagar,4 BHK,SollyCl,4689,4.0,1.0,225.00


In [77]:
df4 = zscore_capping(df4, 'price')
df4

Unnamed: 0,area_type,availability,location,size,society,total_sqft,bath,balcony,price
0,Super built-up Area,2024-12-19 00:00:00,Electronic City Phase II,2 BHK,Coomee,1056,2.0,1.0,39.07
1,Plot Area,Ready To Move,Chikka Tirupathi,4 Bedroom,Theanmp,2600,5.0,3.0,120.00
2,Built-up Area,Ready To Move,Uttarahalli,3 BHK,,1440,2.0,3.0,62.00
3,Super built-up Area,Ready To Move,Lingadheeranahalli,3 BHK,Soiewre,1521,3.0,1.0,95.00
4,Super built-up Area,Ready To Move,Kothanur,2 BHK,,1200,2.0,1.0,51.00
...,...,...,...,...,...,...,...,...,...
13315,Built-up Area,Ready To Move,Whitefield,5 Bedroom,ArsiaEx,3453,4.0,0.0,225.00
13316,Super built-up Area,Ready To Move,Richards Town,4 BHK,,3600,5.0,,225.00
13317,Built-up Area,Ready To Move,Raja Rajeshwari Nagar,2 BHK,Mahla T,1141,2.0,1.0,60.00
13318,Super built-up Area,2024-06-18 00:00:00,Padmanabhanagar,4 BHK,SollyCl,4689,4.0,1.0,225.00
