## Preparation

In [2]:
import pandas as pd

In [4]:
df=pd.read_csv('data.csv')

In [6]:
df.isnull().sum()

lead_source                 128
industry                    134
number_of_courses_viewed      0
annual_income               181
employment_status           100
location                     63
interaction_count             0
lead_score                    0
converted                     0
dtype: int64

In [7]:
df

Unnamed: 0,lead_source,industry,number_of_courses_viewed,annual_income,employment_status,location,interaction_count,lead_score,converted
0,paid_ads,,1,79450.0,unemployed,south_america,4,0.94,1
1,social_media,retail,1,46992.0,employed,south_america,1,0.80,0
2,events,healthcare,5,78796.0,unemployed,australia,3,0.69,1
3,paid_ads,retail,2,83843.0,,australia,1,0.87,0
4,referral,education,3,85012.0,self_employed,europe,3,0.62,1
...,...,...,...,...,...,...,...,...,...
1457,referral,manufacturing,1,,self_employed,north_america,4,0.53,1
1458,referral,technology,3,65259.0,student,europe,2,0.24,1
1459,paid_ads,technology,1,45688.0,student,north_america,3,0.02,1
1460,referral,,5,71016.0,self_employed,north_america,0,0.25,1


In [9]:
categorical = ['lead_source', 'industry', 'employment_status', 'location']
numerical = ['number_of_courses_viewed', 'annual_income', 'interaction_count', 'lead_score']

In [10]:
df[categorical] = df[categorical].fillna('NA')
df[numerical] = df[numerical].fillna(0.0)

In [11]:
print(df.isnull().sum())

lead_source                 0
industry                    0
number_of_courses_viewed    0
annual_income               0
employment_status           0
location                    0
interaction_count           0
lead_score                  0
converted                   0
dtype: int64


## Question 1

In [14]:
df['industry'].mode()[0]

'retail'

In [15]:
df['industry'].value_counts()

industry
retail           203
finance          200
other            198
healthcare       187
education        187
technology       179
manufacturing    174
NA               134
Name: count, dtype: int64

Answer: retail

## Question 2

In [16]:
corr_matrix = df[numerical].corr()

In [17]:
print(corr_matrix.loc['interaction_count', 'lead_score'])
print(corr_matrix.loc['number_of_courses_viewed', 'lead_score'])
print(corr_matrix.loc['number_of_courses_viewed', 'interaction_count'])
print(corr_matrix.loc['annual_income', 'interaction_count'])

0.009888182496913131
-0.004878998354681276
-0.023565222882888037
0.02703647240481443


Answer: annual_income and interaction_count

## Split the data

In [18]:
from sklearn.model_selection import train_test_split

df_full_train, df_test = train_test_split(df, test_size=0.2, random_state=42)
df_train, df_val = train_test_split(df_full_train, test_size=0.25, random_state=42)

df_train = df_train.reset_index(drop=True)
df_val = df_val.reset_index(drop=True)
df_test = df_test.reset_index(drop=True)

y_train = df_train.converted.values
y_val = df_val.converted.values
y_test = df_test.converted.values

del df_train['converted']
del df_val['converted']
del df_test['converted']

## Question 3

In [19]:
from sklearn.metrics import mutual_info_score

In [20]:
mi_scores = {}
for col in categorical:
    score = mutual_info_score(df_train[col], y_train)
    mi_scores[col] = round(score, 2)
    print(f"{col}: {mi_scores[col]}")

lead_source: 0.04
industry: 0.01
employment_status: 0.01
location: 0.0


In [21]:
max_feature = max(mi_scores, key=mi_scores.get)
print(f"\nHighest MI score: {max_feature} ({mi_scores[max_feature]})")


Highest MI score: lead_source (0.04)


Answer: lead_source

## Question 4

In [46]:
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LogisticRegression

In [47]:
train_dicts = df_train.to_dict(orient='records')
val_dicts = df_val.to_dict(orient='records')

In [48]:
dv = DictVectorizer(sparse=False)
X_train = dv.fit_transform(train_dicts)
X_val = dv.transform(val_dicts)

In [49]:
model = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42)
model.fit(X_train, y_train)

In [50]:
dict(zip(dv.get_feature_names_out(),model.coef_.round(3)))

{'annual_income': array([-0.   , -0.015,  0.034,  0.003,  0.012, -0.103, -0.025,  0.049,
        -0.02 , -0.013, -0.003, -0.009, -0.032, -0.016,  0.311,  0.051,
         0.02 , -0.012, -0.012, -0.115,  0.08 , -0.03 ,  0.004, -0.011,
        -0.011, -0.006,  0.008,  0.006, -0.033, -0.025,  0.454])}

In [51]:
y_pred = model.predict(X_val)

In [52]:
accuracy = (y_pred == y_val).mean()
print(f"Accuracy: {round(accuracy, 3)}")

Accuracy: 0.7


In [53]:
accuracy

np.float64(0.6996587030716723)

In [58]:
y_train_pred = model.predict(X_train)

In [59]:
accuracy_train = (y_train_pred == y_train).mean()
accuracy_train

np.float64(0.7385844748858448)

Answer: 0.74 (0.699 for validation)

## Question 5

In [55]:
baseline_accuracy = (model.predict(X_val) == y_val).mean()
baseline_accuracy

np.float64(0.6996587030716723)

In [56]:
features_to_test = ['industry', 'employment_status', 'lead_score']
differences = {}

for feature in features_to_test:
    # Create datasets without this feature
    df_train_subset = df_train.drop(columns=[feature])
    df_val_subset = df_val.drop(columns=[feature])
    
    train_dicts_subset = df_train_subset.to_dict(orient='records')
    val_dicts_subset = df_val_subset.to_dict(orient='records')
    
    # Encode and train
    dv_subset = DictVectorizer(sparse=False)
    X_train_subset = dv_subset.fit_transform(train_dicts_subset)
    X_val_subset = dv_subset.transform(val_dicts_subset)
    
    model_subset = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42)
    model_subset.fit(X_train_subset, y_train)
    
    accuracy_without = (model_subset.predict(X_val_subset) == y_val).mean()
    
    # Calculate difference
    diff = baseline_accuracy - accuracy_without
    differences[feature] = diff
    
    print(f"Without '{feature}': {accuracy_without:.4f} | Difference: {diff:.4f}")

# Find feature with smallest difference
min_feature = min(differences, key=lambda k: abs(differences[k]))
print(f"\nSmallest difference: '{min_feature}' ({differences[min_feature]:.4f})")

Without 'industry': 0.6997 | Difference: 0.0000
Without 'employment_status': 0.6962 | Difference: 0.0034
Without 'lead_score': 0.7065 | Difference: -0.0068

Smallest difference: 'industry' (0.0000)


Answer: industry

## Question 6

In [61]:
C_values = [0.00001, 0.01, 0.1, 1, 10, 100]
results = {}

for C in C_values:
    model = LogisticRegression(solver='liblinear', C=C, max_iter=1000, random_state=42)
    model.fit(X_train, y_train)
    
    y_pred = model.predict(X_val)
    accuracy = (y_pred == y_val).mean()
    results[C] = round(accuracy, 3)
    
    print(f"C={C:6.2f} | Accuracy: {results[C]:.3f}")

best_C = max(results, key=results.get)
print(f"\nBest C: {best_C} with accuracy: {results[best_C]:.3f}")

C=  0.00 | Accuracy: 0.556
C=  0.01 | Accuracy: 0.700
C=  0.10 | Accuracy: 0.700
C=  1.00 | Accuracy: 0.700
C= 10.00 | Accuracy: 0.700
C=100.00 | Accuracy: 0.700

Best C: 0.01 with accuracy: 0.700


Answer: 0.01 (Even though all C values gave 0.700 accuracy, I choose 0.01 because it provides the strongest regularization while maintaining the same performance, making it the most reliable choice for unseen data.)