In [1]:
import pandas as pd

# Read the dataset from its CSV file (adjust the relative path if the file lives elsewhere)
df = pd.read_csv(filepath_or_buffer="data/dataset.csv")

# Preview the top five rows to get a feel for the columns
print(df.head(n=5))


  Agency Type Distribution Channel Product Name Claim  Duration  \
0    Airlines               Online   Basic Plan    No     104.0   
1    Airlines               Online   Basic Plan    No       2.0   
2    Airlines               Online   Basic Plan    No       3.0   
3    Airlines               Online   Value Plan    No       1.0   
4    Airlines               Online   Basic Plan    No      14.0   

                 Destination  Net Sales  Commission (in value) Gender    Age  
0  TAIWAN, PROVINCE OF CHINA       35.0                  12.25      M   34.0  
1                   MALAYSIA       18.0                   6.30      M   44.0  
2                   MALAYSIA       18.0                   6.30      M   45.0  
3                      INDIA       62.0                  24.80      M  118.0  
4                  HONG KONG       26.0                   9.10      F   43.0  


In [2]:
# Per-column count of missing entries, and that count as a percentage of all rows
missing_values = df.isna().sum()
missing_percentage = missing_values.div(len(df)).mul(100)

# Put both measures side by side, columns with the largest share of gaps first
missing_summary = (
    missing_values.to_frame(name='Missing Values')
    .assign(Percentage=missing_percentage)
    .sort_values(by='Percentage', ascending=False)
)

# Show the resulting table
print(missing_summary)



                       Missing Values  Percentage
Agency Type                         0         0.0
Distribution Channel                0         0.0
Product Name                        0         0.0
Claim                               0         0.0
Duration                            0         0.0
Destination                         0         0.0
Net Sales                           0         0.0
Commission (in value)               0         0.0
Gender                              0         0.0
Age                                 0         0.0


In [3]:
# Display df with every row that contains at least one missing value removed.
# The result is not assigned, so df itself is left unchanged by this cell.
df.dropna(axis=0, how="any")


Unnamed: 0,Agency Type,Distribution Channel,Product Name,Claim,Duration,Destination,Net Sales,Commission (in value),Gender,Age
0,Airlines,Online,Basic Plan,No,104.0,"TAIWAN, PROVINCE OF CHINA",35.0,12.25,M,34.0
1,Airlines,Online,Basic Plan,No,2.0,MALAYSIA,18.0,6.30,M,44.0
2,Airlines,Online,Basic Plan,No,3.0,MALAYSIA,18.0,6.30,M,45.0
3,Airlines,Online,Value Plan,No,1.0,INDIA,62.0,24.80,M,118.0
4,Airlines,Online,Basic Plan,No,14.0,HONG KONG,26.0,9.10,F,43.0
...,...,...,...,...,...,...,...,...,...,...
4156,Airlines,Online,Bronze Plan,No,15.0,SINGAPORE,23.5,5.88,F,69.0
4157,Airlines,Online,Bronze Plan,No,20.0,SINGAPORE,27.0,6.75,F,51.0
4158,Airlines,Online,Bronze Plan,No,16.0,SINGAPORE,27.0,6.75,F,34.0
4159,Airlines,Online,Basic Plan,No,15.0,MALAYSIA,18.0,6.30,M,35.0


In [4]:
# Show the dtype currently assigned to each column of df
df.dtypes

Agency Type               object
Distribution Channel      object
Product Name              object
Claim                     object
Duration                 float64
Destination               object
Net Sales                float64
Commission (in value)    float64
Gender                    object
Age                      float64
dtype: object

In [6]:
# Cast the text columns to pandas' dedicated string dtype and pin the listed
# numeric columns to float64. Columns that are not listed (Gender, Age) keep
# whatever dtype they already have.
df = df.astype({
    **dict.fromkeys(
        ['Agency Type', 'Distribution Channel', 'Product Name', 'Claim', 'Destination'],
        'string',
    ),
    **dict.fromkeys(
        ['Duration', 'Net Sales', 'Commission (in value)'],
        'float64',
    ),
})

# Confirm the dtypes after the cast
print(df.dtypes)

Agency Type              string[python]
Distribution Channel     string[python]
Product Name             string[python]
Claim                    string[python]
Duration                        float64
Destination              string[python]
Net Sales                       float64
Commission (in value)           float64
Gender                           object
Age                             float64
dtype: object


In [8]:
# Print summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
# NOTE(review): the recorded output shows negative minimums for Duration and Net Sales
# and a maximum Age of 118 — confirm whether these values are valid before scaling.
print(df.describe())


          Duration    Net Sales  Commission (in value)          Age
count  4161.000000  4161.000000            4161.000000  4161.000000
mean     58.475366    53.000000              16.559793    46.797164
std      98.861468    73.587873              18.737965    21.122074
min      -2.000000  -389.000000               0.770000     1.000000
25%       9.000000    22.000000               7.700000    33.000000
50%      20.000000    29.000000               9.570000    43.000000
75%      51.000000    46.150000              15.750000    53.000000
max     740.000000   810.000000             283.500000   118.000000


In [9]:
from sklearn.preprocessing import MinMaxScaler

# Column groups: the numeric columns get rescaled, the categorical ones are carried over untouched
numerical_features = [
    'Duration',
    'Net Sales',
    'Commission (in value)',
    'Age',
]
categorical_features = [
    'Agency Type',
    'Distribution Channel',
    'Product Name',
    'Destination',
    'Claim',
]

# Min-max scale the numeric columns; the scaler is fitted on the same frame it transforms
scaler = MinMaxScaler()
scaled_numerical = scaler.fit_transform(df[numerical_features])

# Wrap the scaled values in a frame with the original column labels.
# No index is passed, so this frame gets a fresh 0..n-1 index.
df_scaled_numerical = pd.DataFrame(data=scaled_numerical, columns=numerical_features)

# Reset the categorical slice to a 0..n-1 index as well, so the column-wise
# concat pairs rows up by position
df_final = pd.concat(
    [
        df_scaled_numerical,
        df[categorical_features].reset_index(drop=True),
    ],
    axis='columns',
)

# Preview the combined frame
print(df_final.head())




   Duration  Net Sales  Commission (in value)       Age Agency Type  \
0  0.142857   0.353628               0.040604  0.282051    Airlines   
1  0.005391   0.339450               0.019559  0.367521    Airlines   
2  0.006739   0.339450               0.019559  0.376068    Airlines   
3  0.004043   0.376147               0.084993  1.000000    Airlines   
4  0.021563   0.346122               0.029463  0.358974    Airlines   

  Distribution Channel Product Name                Destination Claim  
0               Online   Basic Plan  TAIWAN, PROVINCE OF CHINA    No  
1               Online   Basic Plan                   MALAYSIA    No  
2               Online   Basic Plan                   MALAYSIA    No  
3               Online   Value Plan                      INDIA    No  
4               Online   Basic Plan                  HONG KONG    No  


In [10]:
# List the dtype of each column in the combined frame
df_final.dtypes

Duration                        float64
Net Sales                       float64
Commission (in value)           float64
Age                             float64
Agency Type              string[python]
Distribution Channel     string[python]
Product Name             string[python]
Destination              string[python]
Claim                    string[python]
dtype: object