In [43]:
import pandas as pd

# Load the car-insurance dataset. The path is relative to the notebook's working
# directory and must be a complete, quoted string literal.
car_insurance = pd.read_csv('datacamp/carinsurance/car_insurance.csv')

# Note: bare expressions such as car_insurance.head() only render when they are the
# last statement of a cell, so preview calls placed mid-cell were dropped here.

# Get the number of rows using the shape attribute (shape is a (rows, columns) tuple)
num_rows = car_insurance.shape[0]

# Print the number of rows
print(num_rows)

10000


In [44]:
# Build a boolean mask of missing cells, then total it per column
# (True counts as 1), giving the number of missing entries in each column.
missing_mask = car_insurance.isna()
missing_values = missing_mask.sum()

# Print the missing values count
print(missing_values)

id                       0
age                      0
gender                   0
driving_experience       0
education                0
income                   0
credit_score           982
vehicle_ownership        0
vehicle_year             0
married                  0
children                 0
postal_code              0
annual_mileage         957
vehicle_type             0
speeding_violations      0
duis                     0
past_accidents           0
outcome                  0
dtype: int64


In [45]:
# Two columns (credit_score and annual_mileage) contain missing values,
# each in fewer than 10% of the rows.
# Rather than dropping those rows, the gaps will be filled with the column average.
# Step 1: compute the mean of each affected column (missing entries are skipped).
credit_score_mean = car_insurance.loc[:, 'credit_score'].mean(skipna=True)
annual_mileage_mean = car_insurance.loc[:, 'annual_mileage'].mean(skipna=True)


0.5158128096027941
11697.003206900365


In [46]:
# Step 2: fill the missing values with the column means computed earlier.
# The result is assigned back to the column instead of calling
# car_insurance[col].fillna(..., inplace=True): that form operates on an
# intermediate Series, triggers a FutureWarning on pandas 2.x, and leaves the
# DataFrame untouched once Copy-on-Write is active.
car_insurance['credit_score'] = car_insurance['credit_score'].fillna(credit_score_mean)
car_insurance['annual_mileage'] = car_insurance['annual_mileage'].fillna(annual_mileage_mean)


0.5158128096027874
11697.00320690015


In [56]:
# Inspect the dtype of every column. As the last expression in the cell,
# the resulting Series is what the notebook displays.
# To list the distinct labels of an ordinal/categorical column, uncomment the line below.
#car_insurance['driving_experience'].unique()
# NOTE(review): this cell's execution count (56) is later than the encoding cell's (49),
# and the recorded output already lists the *_encoded and dummy columns, so the output
# does not show the frame as it stands at this point in the notebook. Re-run top to
# bottom to confirm.
car_insurance.dtypes

id                              int64
age                             int64
gender                          int64
driving_experience             object
education                      object
income                         object
credit_score                  float64
vehicle_ownership             float64
married                       float64
children                      float64
postal_code                     int64
annual_mileage                float64
speeding_violations             int64
duis                            int64
past_accidents                  int64
outcome                       float64
education_encoded               int64
income_encoded                  int64
driving_experience_encoded      int64
vehicle_year_before 2015        uint8
vehicle_type_sports car         uint8
dtype: object

In [48]:
# import logistic regression libraries
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

In [49]:
# Preprocessing stage.
# Ordinal string variables: each label is mapped to an integer rank that
# preserves the natural order of the categories.
ordinal_mapping = {
    'education': {
        'none': 0,
        'high school': 1,
        'university': 2,
    },
    'income': {
        'poverty': 0,
        'working class': 1,
        'middle class': 2,
        'upper class': 3,
    },
    'driving_experience': {
        '0-9y': 0,
        '10-19y': 1,
        '20-29y': 2,
        '30y+': 3,
    },
}

# Build every '<column>_encoded' Series first, then attach them in one step.
# Series.map leaves NaN for any label that is absent from its mapping.
encoded_columns = {
    column + '_encoded': car_insurance[column].map(levels)
    for column, levels in ordinal_mapping.items()
}
car_insurance = car_insurance.assign(**encoded_columns)

# Vehicle year and vehicle type become dummy (indicator) columns; drop_first
# removes the first level of each so the remaining indicators are not redundant.
# The source columns are replaced, so this cell cannot be re-run on its own output.
nominal_columns = ['vehicle_year', 'vehicle_type']
car_insurance = pd.get_dummies(car_insurance, columns=nominal_columns, drop_first=True)


In [50]:
# Independent variables (model inputs), one per line.
# The order is significant: it determines the column order of the design matrix.
# NOTE(review): postal_code is passed in as a plain number; confirm that is intended.
features = [
    'age',
    'gender',
    'driving_experience_encoded',
    'education_encoded',
    'income_encoded',
    'credit_score',
    'vehicle_ownership',
    'married',
    'children',
    'postal_code',
    'annual_mileage',
    'vehicle_type_sports car',
    'speeding_violations',
    'duis',
    'past_accidents',
    'vehicle_year_before 2015',
]

# Dependent variable (the column the model predicts).
target = 'outcome'

In [51]:
# Split the dataset: 20% of the rows are held out for testing, and the fixed
# random_state makes the shuffle (and therefore the split) repeatable.
feature_matrix = car_insurance[features]
target_vector = car_insurance[target]
X_train, X_test, y_train, y_test = train_test_split(
    feature_matrix,
    target_vector,
    test_size=0.20,
    random_state=42,
)



In [52]:
# Create a logistic regression model with scikit-learn's default settings,
# then fit it on the training split. The fit call is kept as the cell's last
# expression so the notebook displays the fitted estimator.
model = LogisticRegression()
model.fit(X=X_train, y=y_train)

LogisticRegression()

In [53]:
# Pair each feature name with its fitted coefficient (model.coef_[0] is the
# single row of coefficients), then order the rows by absolute value,
# largest first.
# NOTE(review): no feature scaling is visible before the fit, so magnitudes are on
# each feature's raw scale and may not be directly comparable. Confirm before
# treating this ordering as feature importance.
coefficients = (
    pd.DataFrame({'Variable': features, 'Coefficient': model.coef_[0]})
    .sort_values(by='Coefficient', key=lambda column: column.abs(), ascending=False)
)

In [54]:
# Show the coefficient table as plain text. The preceding cell already ordered it
# by absolute coefficient value, largest first.
print(coefficients)

                      Variable  Coefficient
2   driving_experience_encoded    -0.720794
6            vehicle_ownership    -0.630416
4               income_encoded    -0.495493
1                       gender     0.476971
15    vehicle_year_before 2015     0.467906
0                          age    -0.402705
14              past_accidents    -0.305538
7                      married    -0.173997
12         speeding_violations     0.127347
8                     children    -0.103509
13                        duis    -0.064639
3            education_encoded    -0.052586
5                 credit_score    -0.051554
11     vehicle_type_sports car    -0.000282
10              annual_mileage     0.000073
9                  postal_code     0.000018
