# Import the libraries and read the data

In [1]:
import pandas as pd
import numpy as np
import plotly.express as px


In [2]:
pd.set_option('display.max_columns',None)
df=pd.read_csv('health_insurance.csv')
df

Unnamed: 0,person_id,age,sex,region,urban_rural,income,education,household_size,dependents,bmi,smoker,visits_last_year,medication_count,systolic_bp,diastolic_bp,ldl,hba1c,plan_type,deductible,copay,risk_score,annual_medical_cost,claims_count,chronic_count
0,75722,52.0,Female,North,Suburban,"$22,700",Doctorate,3,1,27.4,Never,2,4,121,76,123.8,5.28,PPO,1000.0,20,0.5714,6938.06,1,1
1,80185,,Female,North,Urban,"$12,800",No HS,3,1,26.6,Never,2,3,131,79,97.3,4.82,POS,1000.0,10,1.0000,1632.61,4,2
2,19865,68.0,Male,North,Rural,"$40,700",HS,5,3,31.5,Never,1,4,160,84,129.5,5.51,HMO,500.0,20,1.0000,7661.01,0,3
3,76700,15.0,Male,North,Suburban,"$15,600",Some College,5,3,31.6,,0,1,104,68,160.3,8.50,HMO,500.0,20,0.2857,5130.27,0,1
4,92992,53.0,Male,Central,Suburban,"$89,600",Doctorate,2,0,30.5,,3,2,136,83,171.0,5.20,POS,500.0,10,0.8681,1700.73,1,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
49995,33073,28.0,Male,South,Rural,"$19,300",Bachelors,1,0,,Never,0,1,112,64,129.5,5.12,PPO,500.0,30,0.1099,380.25,0,0
49996,95600,70.0,Male,North,Suburban,,Bachelors,3,2,33.7,Former,2,0,137,70,134.6,5.59,PPO,1000.0,10,0.8571,1496.91,3,1
49997,28007,34.0,Male,East,Urban,"$66,600",Some College,1,0,26.6,,0,2,104,78,180.4,8.30,PPO,500.0,20,0.5714,1295.18,0,2
49998,90488,,Female,West,Suburban,"$33,000",Some College,1,0,28.6,Never,0,1,99,80,100.7,5.48,EPO,1000.0,10,0.4286,1351.82,0,0


# Data understanding 

In [3]:
column_description = {
    "person_id": "Unique ID for each person",
    "age": "Age of the individual",
    "sex": "Gender of the person (Male/Female)",
    "region": "Geographical region where the person lives",
    "urban_rural": "Type of area (Urban, Rural, Suburban)",
    "income": "Annual income of the person",
    "education": "Education level (HS, Some College, Bachelor, etc.)",
    "household_size": "Number of people living in the household",
    "dependents": "Number of dependents the person supports",
    "bmi": "Body Mass Index",
    "smoker": "Whether the person is a smoker (Yes/No)",
    "visits_last_year": "Number of medical visits last year",
    "medication_count": "Number of medications the person is taking",
    "systolic_bp": "Systolic blood pressure",
    "diastolic_bp": "Diastolic blood pressure",
    "ldl": "LDL cholesterol level",
    "hba1c": "HbA1c level (indicator of diabetes)",
    "plan_type": "Health insurance plan type (HMO/PPO/etc.)",
    "deductible": "Deductible amount before insurance coverage",
    "copay": "Copayment amount per visit or service",
    "risk_score": "Health risk score of the individual",
    "annual_medical_cost": "Total medical cost per year",
    "claims_count": "Number of insurance claims filed",
    "chronic_count": "Number of chronic conditions"
}
column_description_df=pd.DataFrame(column_description.items(),columns=["Column Name", "Description"])

column_description_df

Unnamed: 0,Column Name,Description
0,person_id,Unique ID for each person
1,age,Age of the individual
2,sex,Gender of the person (Male/Female)
3,region,Geographical region where the person lives
4,urban_rural,"Type of area (Urban, Rural, Suburban)"
5,income,Annual income of the person
6,education,"Education level (HS, Some College, Bachelor, e..."
7,household_size,Number of people living in the household
8,dependents,Number of dependents the person supports
9,bmi,Body Mass Index


# Data exploration

###### check data types

In [4]:
df.info()
#income to numbers

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 24 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   person_id            50000 non-null  int64  
 1   age                  45512 non-null  float64
 2   sex                  49073 non-null  object 
 3   region               50000 non-null  object 
 4   urban_rural          50000 non-null  object 
 5   income               45424 non-null  object 
 6   education            48993 non-null  object 
 7   household_size       50000 non-null  int64  
 8   dependents           50000 non-null  int64  
 9   bmi                  45408 non-null  float64
 10  smoker               45530 non-null  object 
 11  visits_last_year     50000 non-null  int64  
 12  medication_count     50000 non-null  int64  
 13  systolic_bp          50000 non-null  int64  
 14  diastolic_bp         50000 non-null  int64  
 15  ldl                  50000 non-null 

###### check describe() for numbbers and object 

In [5]:
df.describe()
# age =0 


Unnamed: 0,person_id,age,household_size,dependents,bmi,visits_last_year,medication_count,systolic_bp,diastolic_bp,ldl,hba1c,deductible,copay,risk_score,annual_medical_cost,claims_count,chronic_count
count,50000.0,45512.0,50000.0,50000.0,45408.0,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0,48959.0,50000.0,50000.0,45460.0,50000.0,50000.0
mean,50040.40736,47.450299,2.42988,0.89776,26.979689,1.93206,1.23856,117.7268,73.58786,119.884862,5.605112,1231.693866,19.5418,0.519172,3015.351869,1.6312,0.72502
std,28770.044668,16.016059,1.073586,0.950602,5.004383,1.735288,1.209776,15.346027,8.905267,30.255013,0.843048,1022.406464,10.293793,0.251677,3165.77478,2.032946,0.806089
min,5.0,0.0,1.0,0.0,12.0,0.0,0.0,63.0,42.0,30.0,3.54,500.0,10.0,0.0,55.55,0.0,0.0
25%,25331.75,37.0,2.0,0.0,23.6,1.0,0.0,107.0,67.0,99.4,5.16,500.0,10.0,0.3297,1174.12,0.0,0.0
50%,49929.5,47.0,2.0,1.0,27.0,2.0,1.0,117.0,73.0,120.0,5.44,1000.0,20.0,0.5055,2088.14,1.0,1.0
75%,74833.25,58.0,3.0,1.0,30.4,3.0,2.0,128.0,79.0,140.2,5.76,2000.0,30.0,0.7033,3709.0825,2.0,1.0
max,100000.0,100.0,9.0,7.0,50.4,18.0,9.0,183.0,114.0,239.5,11.94,5000.0,50.0,1.0,65724.9,20.0,6.0


In [6]:
df.describe(include='object')
#unique(sex)=3

Unnamed: 0,sex,region,urban_rural,income,education,smoker,plan_type
count,49073,50000,50000,45424,48993,45530,50000
unique,3,5,3,2471,6,3,4
top,Female,South,Urban,"$21,400",Bachelors,Never,PPO
freq,24196,13975,29983,104,13733,31750,17604


###### check duplicated and drop it

In [7]:
df.duplicated().sum()

np.int64(0)

###### check messing values

In [8]:
df.isna().mean()*100
#age                    8.976
#sex                    1.854
#income                 9.152
#education              2.014
#bmi                    9.184
#smoker                 8.940
#deductible             2.082
#annual_medical_cost    9.080


person_id              0.000
age                    8.976
sex                    1.854
region                 0.000
urban_rural            0.000
income                 9.152
education              2.014
household_size         0.000
dependents             0.000
bmi                    9.184
smoker                 8.940
visits_last_year       0.000
medication_count       0.000
systolic_bp            0.000
diastolic_bp           0.000
ldl                    0.000
hba1c                  0.000
plan_type              0.000
deductible             2.082
copay                  0.000
risk_score             0.000
annual_medical_cost    9.080
claims_count           0.000
chronic_count          0.000
dtype: float64

In [9]:
100-(df.dropna().shape[0]/df.shape[0]*100)
# معني كدا اننا هنخسر 42 % لو عملنا dropna للكل 

41.55799999999999

# DATA CLEANING

###### drop the columns that we dont need

In [10]:
df.drop('person_id',inplace=True,axis=1)
df.duplicated().sum()

np.int64(0)

###### convert to numerical

In [11]:
def fix_income(x):
    if pd.isna(x):
        return None
    else :
        x=str(x)
        return float((x[1:].replace(',','')))
fix_income('$22,700')

22700.0

In [12]:
df['income']=df['income'].apply(fix_income)

###### handle age=0 , unique(sex)=3

In [13]:
# مش منطقي يبقي عندي طفل اقل من 15   لكن دخله عالي بالمنظر دا او يكون عنده   مرحلة تعليميه عاليه 

age_less_15 = (
    (df['age'] < 15) &
    (
        (df['income'] > 0) |
        (df['education'].isin(['HS','Some College','Bachelors','Masters','Doctorate'])) |
        (df['chronic_count'] > 0)
    )
)

age_less_15_index=df[age_less_15].index # 888 rows
df = df.drop(age_less_15_index)


In [14]:
df['sex'].unique()


array(['Female', 'Male', nan, 'Other'], dtype=object)

In [15]:
df=df.drop((df[df['sex']=='Other']).index  ).reset_index(drop=True)
df

Unnamed: 0,age,sex,region,urban_rural,income,education,household_size,dependents,bmi,smoker,visits_last_year,medication_count,systolic_bp,diastolic_bp,ldl,hba1c,plan_type,deductible,copay,risk_score,annual_medical_cost,claims_count,chronic_count
0,52.0,Female,North,Suburban,22700.0,Doctorate,3,1,27.4,Never,2,4,121,76,123.8,5.28,PPO,1000.0,20,0.5714,6938.06,1,1
1,,Female,North,Urban,12800.0,No HS,3,1,26.6,Never,2,3,131,79,97.3,4.82,POS,1000.0,10,1.0000,1632.61,4,2
2,68.0,Male,North,Rural,40700.0,HS,5,3,31.5,Never,1,4,160,84,129.5,5.51,HMO,500.0,20,1.0000,7661.01,0,3
3,15.0,Male,North,Suburban,15600.0,Some College,5,3,31.6,,0,1,104,68,160.3,8.50,HMO,500.0,20,0.2857,5130.27,0,1
4,53.0,Male,Central,Suburban,89600.0,Doctorate,2,0,30.5,,3,2,136,83,171.0,5.20,POS,500.0,10,0.8681,1700.73,1,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
48120,28.0,Male,South,Rural,19300.0,Bachelors,1,0,,Never,0,1,112,64,129.5,5.12,PPO,500.0,30,0.1099,380.25,0,0
48121,70.0,Male,North,Suburban,,Bachelors,3,2,33.7,Former,2,0,137,70,134.6,5.59,PPO,1000.0,10,0.8571,1496.91,3,1
48122,34.0,Male,East,Urban,66600.0,Some College,1,0,26.6,,0,2,104,78,180.4,8.30,PPO,500.0,20,0.5714,1295.18,0,2
48123,,Female,West,Suburban,33000.0,Some College,1,0,28.6,Never,0,1,99,80,100.7,5.48,EPO,1000.0,10,0.4286,1351.82,0,0


###### in depth check for object columns

In [16]:
col_name=df.select_dtypes(object).columns
for col in col_name:
    print(col)
    print(df[col].nunique())
    print(df[col].unique())
    
    print('-'*30)

sex
2
['Female' 'Male' nan]
------------------------------
region
5
['North' 'Central' 'West' 'South' 'East']
------------------------------
urban_rural
3
['Suburban' 'Urban' 'Rural']
------------------------------
education
6
['Doctorate' 'No HS' 'HS' 'Some College' 'Masters' 'Bachelors' nan]
------------------------------
smoker
3
['Never' nan 'Current' 'Former']
------------------------------
plan_type
4
['PPO' 'POS' 'HMO' 'EPO']
------------------------------


###### in depth check for number columns

In [17]:
num_col=df.select_dtypes('number').columns
for col in num_col:
    px.histogram(data_frame=df,x=col,title=col).show()


#### Handle messing value

##### drop

In [18]:
df.dropna(subset=['sex','education','deductible'],inplace=True,ignore_index=True)


#dropna 
#sex                    1.854
#education              2.014
#deductible             2.082

##### fill after splitting

# FEATURE ENGINEERING

In [19]:
# متوسط الدخل للفرد بناءا علي الاسره 

df["income_per_person"] = df["income"] / df["household_size"]

In [20]:
#نسبة ضغط الدم الانقباضي إلى الانبساطي. 

df["bp_ratio"] = df["systolic_bp"] / df["diastolic_bp"]

# preprocessing for ml model

#### input & output split

In [22]:
x=df.drop('annual_medical_cost',axis=1) # the input
y=df['annual_medical_cost']             # the outout

#### TRAIN TEST SPLIT

In [23]:
from sklearn.model_selection import train_test_split

x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=.2,random_state=42)

In [24]:
x_train

Unnamed: 0,age,sex,region,urban_rural,income,education,household_size,dependents,bmi,smoker,visits_last_year,medication_count,systolic_bp,diastolic_bp,ldl,hba1c,plan_type,deductible,copay,risk_score,claims_count,chronic_count,income_per_person,bp_ratio
7978,59.0,Male,South,Suburban,9900.0,Bachelors,1,0,27.5,Former,0,1,126,80,112.3,5.21,POS,500.0,30,0.4505,0,0,9900.0,1.575000
40274,,Male,South,Urban,73600.0,No HS,2,0,37.4,Never,3,2,92,75,81.0,5.38,EPO,1000.0,30,0.4505,4,1,36800.0,1.226667
5421,32.0,Female,South,Suburban,40500.0,HS,4,2,22.0,Never,2,2,134,84,83.5,6.09,PPO,2000.0,30,0.3516,0,1,10125.0,1.595238
14668,35.0,Male,West,Urban,87500.0,HS,1,0,16.4,Never,0,4,96,73,121.4,5.29,EPO,1000.0,10,0.3846,0,1,87500.0,1.315068
8982,42.0,Female,South,Urban,28400.0,Masters,2,0,16.3,Never,2,2,115,88,132.2,5.08,HMO,500.0,30,0.4615,1,1,14200.0,1.306818
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11284,54.0,Female,West,Rural,23200.0,Bachelors,2,1,34.0,Never,2,1,133,73,102.6,5.79,PPO,2000.0,20,0.6813,1,1,11600.0,1.821918
44732,47.0,Female,West,Urban,16100.0,Masters,4,2,22.1,Never,1,1,114,85,137.1,4.87,PPO,1000.0,30,0.5165,0,1,4025.0,1.341176
38158,41.0,Male,North,Urban,6300.0,Some College,2,0,30.1,Never,5,2,137,71,96.1,4.89,POS,500.0,20,0.7363,7,2,3150.0,1.929577
860,,Male,South,Rural,29700.0,Some College,3,1,28.6,Never,2,2,117,63,125.3,5.68,PPO,500.0,10,0.4835,2,1,9900.0,1.857143


In [25]:
x_test

Unnamed: 0,age,sex,region,urban_rural,income,education,household_size,dependents,bmi,smoker,visits_last_year,medication_count,systolic_bp,diastolic_bp,ldl,hba1c,plan_type,deductible,copay,risk_score,claims_count,chronic_count,income_per_person,bp_ratio
4337,45.0,Female,East,Urban,4600.0,Bachelors,3,1,23.0,Never,3,2,132,81,114.6,6.25,PPO,500.0,30,0.4945,1,1,1533.333333,1.629630
26834,44.0,Male,North,Urban,131500.0,Masters,1,0,26.3,,2,2,141,98,96.6,5.52,PPO,500.0,20,0.4835,1,1,131500.000000,1.438776
44174,87.0,Female,South,Suburban,45300.0,Bachelors,2,0,28.0,Never,0,3,160,91,189.9,5.92,PPO,2000.0,10,0.9560,0,1,22650.000000,1.758242
1489,76.0,Female,East,Urban,19500.0,Some College,3,1,28.9,,0,0,117,92,133.3,5.21,PPO,2000.0,10,0.6374,0,0,6500.000000,1.271739
9444,38.0,Male,Central,Urban,189500.0,HS,2,0,31.1,Former,3,0,128,63,117.8,5.81,PPO,1000.0,10,0.5055,2,1,94750.000000,2.031746
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18399,77.0,Female,West,Suburban,59200.0,No HS,3,2,27.4,Never,3,0,150,88,137.7,5.61,HMO,500.0,10,0.8462,3,1,19733.333333,1.704545
5306,58.0,Male,Central,Urban,21100.0,Some College,1,0,,Never,2,2,101,76,120.8,9.17,EPO,1000.0,10,0.8352,2,2,21100.000000,1.328947
2577,41.0,Male,Central,Urban,13300.0,Masters,3,1,28.4,Never,2,1,115,71,76.7,6.09,EPO,500.0,10,0.4505,1,1,4433.333333,1.619718
9237,30.0,Male,East,Urban,,Bachelors,1,0,30.9,Never,2,1,103,65,100.0,5.88,HMO,1000.0,30,0.4176,1,1,,1.584615


In [26]:
y_train

7978     1498.78
40274    3524.69
5421     4927.17
14668    1078.79
8982     1558.67
          ...   
11284        NaN
44732    3508.52
38158    4971.58
860       757.95
15795    1932.43
Name: annual_medical_cost, Length: 36230, dtype: float64

In [27]:
y_test

4337     3825.32
26834    1783.29
44174     876.92
1489     2976.12
9444     2372.03
          ...   
18399    3232.34
5306         NaN
2577      771.30
9237     1453.14
16690    3551.80
Name: annual_medical_cost, Length: 9058, dtype: float64

### handle messing value & scaling the numerical columns

In [28]:
x.isna().mean()*100
#age                  9.156951
#income               9.084084
#bmi                  9.114997
#smoker               8.907437
#income_per_person    9.084084


age                  9.156951
sex                  0.000000
region               0.000000
urban_rural          0.000000
income               9.084084
education            0.000000
household_size       0.000000
dependents           0.000000
bmi                  9.114997
smoker               8.907437
visits_last_year     0.000000
medication_count     0.000000
systolic_bp          0.000000
diastolic_bp         0.000000
ldl                  0.000000
hba1c                0.000000
plan_type            0.000000
deductible           0.000000
copay                0.000000
risk_score           0.000000
claims_count         0.000000
chronic_count        0.000000
income_per_person    9.084084
bp_ratio             0.000000
dtype: float64

In [29]:
from sklearn.impute import SimpleImputer
num_imputer = SimpleImputer(strategy='median')

x_train[['age']] = num_imputer.fit_transform(x_train[['age']])
x_test[['age']]  = num_imputer.transform(x_test[['age']])

In [30]:
cat_cols = ['smoker']
cat_imputer = SimpleImputer(strategy='most_frequent')

x_train[cat_cols] = cat_imputer.fit_transform(x_train[cat_cols])
x_test[cat_cols]  = cat_imputer.transform(x_test[cat_cols])

In [31]:
from sklearn.impute import KNNImputer
knn=KNNImputer()

x_train[['income','bmi','income_per_person']]=knn.fit_transform(x_train[['income','bmi','income_per_person']])
x_test[['income','bmi','income_per_person']]=knn.transform(x_test[['income','bmi','income_per_person']])

In [32]:
x_train.isna().sum()

age                  0
sex                  0
region               0
urban_rural          0
income               0
education            0
household_size       0
dependents           0
bmi                  0
smoker               0
visits_last_year     0
medication_count     0
systolic_bp          0
diastolic_bp         0
ldl                  0
hba1c                0
plan_type            0
deductible           0
copay                0
risk_score           0
claims_count         0
chronic_count        0
income_per_person    0
bp_ratio             0
dtype: int64

##### scaling

In [33]:
# use Robust Scaler
from sklearn.preprocessing import RobustScaler
robust_scaler=RobustScaler()

num_col=x_train.select_dtypes(include='number').columns
x_train[num_col]=robust_scaler.fit_transform(x_train[num_col])
x_test[num_col]=robust_scaler.transform(x_test[num_col])

In [34]:
x_train.head()

Unnamed: 0,age,sex,region,urban_rural,income,education,household_size,dependents,bmi,smoker,visits_last_year,medication_count,systolic_bp,diastolic_bp,ldl,hba1c,plan_type,deductible,copay,risk_score,claims_count,chronic_count,income_per_person,bp_ratio
7978,0.578947,Male,South,Suburban,-0.705581,Bachelors,-1.0,-1.0,0.086757,Former,-1.0,0.0,0.428571,0.636364,-0.191176,-0.383333,POS,-0.333333,1.0,-0.151682,-0.5,-1.0,-0.339943,-0.095557
40274,0.0,Male,South,Urban,0.928801,No HS,0.0,-1.0,1.689996,Never,0.5,0.5,-1.190476,0.181818,-0.958333,-0.1,EPO,0.0,1.0,-0.151682,1.5,0.0,0.930123,-1.303127
5421,-0.842105,Female,South,Suburban,0.079538,HS,2.0,1.0,-0.803931,Never,0.0,0.5,0.809524,1.0,-0.897059,1.083333,PPO,0.666667,1.0,-0.424435,-0.5,0.0,-0.32932,-0.025397
14668,-0.684211,Male,West,Urban,1.285439,HS,-1.0,-1.0,-1.710814,Never,-1.0,1.5,-1.0,0.0,0.031863,-0.25,EPO,0.0,-1.0,-0.333425,-0.5,0.0,3.32389,-0.996664
8982,-0.315789,Female,South,Urban,-0.230917,Masters,0.0,-1.0,-1.727008,Never,0.0,0.5,-0.095238,1.363636,0.296569,-0.6,HMO,-0.333333,1.0,-0.121346,0.0,0.0,-0.136922,-1.025265


### encoding

In [38]:
cat_col=x_train.select_dtypes(include='object').columns
cat_col

Index(['sex', 'region', 'urban_rural', 'education', 'smoker', 'plan_type'], dtype='object')

In [43]:
for  col in cat_col:
    print(col)
    print(x_train[col].nunique())
    print(x_train[col].unique())
    print('-'*8)
#  education, smoker --> ordinal
# 'region' ,'urban_rural', plan_type,sex -->OneHot

sex
2
['Male' 'Female']
--------
region
5
['South' 'West' 'Central' 'East' 'North']
--------
urban_rural
3
['Suburban' 'Urban' 'Rural']
--------
education
6
['Bachelors' 'No HS' 'HS' 'Masters' 'Some College' 'Doctorate']
--------
smoker
3
['Former' 'Never' 'Current']
--------
plan_type
4
['POS' 'EPO' 'PPO' 'HMO']
--------


##### encode with ordinalencoder

In [None]:
from sklearn.preprocessing import OrdinalEncoder


education_order = ['No HS', 'HS', 'Some College', 'Bachelors', 'Masters', 'Doctorate']
smoker_order = ['Never', 'Former', 'Current']

ord_enc = OrdinalEncoder(categories=[education_order, smoker_order])

x_train[['education','smoker']] = ord_enc.fit_transform(x_train[['education','smoker']])
x_test[['education','smoker']] = ord_enc.transform(x_test[['education','smoker']])



##### encode nominal

In [46]:
ohe_col_name=x_train.select_dtypes(object).columns
ohe_col_name

Index(['sex', 'region', 'urban_rural', 'education', 'smoker', 'plan_type'], dtype='object')

In [47]:
from sklearn.preprocessing import OneHotEncoder
ohe=OneHotEncoder(drop='first',sparse_output=False)

ohe_arr_train=ohe.fit_transform(x_train[ohe_col_name])
ohe_arr_test=ohe.transform(x_test[ohe_col_name])

In [48]:
ohe_train=pd.DataFrame(ohe_arr_train,columns=ohe.get_feature_names_out()).reset_index()
ohe_test=pd.DataFrame(ohe_arr_test,columns=ohe.get_feature_names_out()).reset_index()

In [49]:
x_train=x_train.reset_index()
x_test=x_test.reset_index()
y_test=y_test.reset_index()
y_test=y_test.reset_index()

In [50]:
x_train=pd.concat([x_train,ohe_train],axis=1).drop(ohe_col_name,axis=1)
x_test=pd.concat([x_test,ohe_test],axis=1).drop(ohe_col_name,axis=1)