In [1]:
!pip install ucimlrepo

Collecting ucimlrepo
  Downloading ucimlrepo-0.0.7-py3-none-any.whl.metadata (5.5 kB)
Downloading ucimlrepo-0.0.7-py3-none-any.whl (8.0 kB)
Installing collected packages: ucimlrepo
Successfully installed ucimlrepo-0.0.7


In [2]:
import numpy as np
import pandas as pd
from ucimlrepo import fetch_ucirepo

bank_marketing = fetch_ucirepo(id=222)

X = bank_marketing.data.features
y = bank_marketing.data.targets

In [3]:
bank_data = pd.read_csv('bank.csv', sep=';')

FileNotFoundError: [Errno 2] No such file or directory: 'bank.csv'

In [None]:
binary_col = ['default', 'housing', 'loan', 'y']
for col in binary_col:
    bank_data[col] = bank_data[col].map({'yes': 1, 'no': 0})

In [None]:
columns_with_unknown = ['job', 'education', 'contact', 'poutcome']
for col in columns_with_unknown:
    count = 0
    for value in bank_data[col]:
        if value == 'unknown':
            count += 1
    print(count/len(bank_data[col])*100)

0.8405220084052201
4.13625304136253
29.285556292855563
81.95089581950896


In [None]:
bank_data.drop('poutcome',
  axis='columns', inplace=True)
bank_data.drop('contact',
  axis='columns', inplace=True)
bank_data.drop('month',
  axis='columns', inplace=True)
bank_data.drop('day',
  axis='columns', inplace=True)

In [None]:
columns_with_unknown = ['job', 'education']

for col in columns_with_unknown:
    mode_value = bank_data[bank_data[col] != 'unknown'][col].mode()[0]
    bank_data[col] = bank_data[col].replace('unknown', mode_value)

In [None]:
bank_data['pdays_contacted'] = bank_data['pdays'].apply(lambda x: 0 if x == -1 else 1)
bank_data['pdays'] = bank_data['pdays'].replace(-1, 0)

In [None]:
numerical_cols = ['age', 'balance', 'duration']

for col in numerical_cols:
    Q1 = bank_data[col].quantile(0.1)
    Q3 = bank_data[col].quantile(0.9)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR
    bank_data = bank_data[(bank_data[col] >= lower_bound) & (bank_data[col] <= upper_bound)]

In [None]:
bank_data['job'] = bank_data.job.map({
    'admin.': 1, 'management': 1, 'services': 0, 'blue-collar': 0, 'technician': 1, 'housemaid': 0, 'retired': 0, 'self-employed': 1, 'entrepreneur': 1, 'unemployed': 0, 'student': 0,
})
bank_data['education'] = bank_data.education.map({
    'primary': 0, 'secondary': 0, 'tertiary': 1,
})
bank_data['marital'] = bank_data.marital.map({
    'single': 0, 'married': 1, 'divorced': 0,
})

bank_data.rename(columns={'job': 'highpjob', 'education': 'heducated', 'martial' : 'merried'}, inplace=True)

In [None]:
!pip install ydata_profiling
from ydata_profiling import ProfileReport



In [None]:
profile = ProfileReport(bank_data,title="Bank-marketing data")
profile.to_file("adult_report.html")

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]


  0%|          | 0/14 [00:00<?, ?it/s][A
 36%|███▌      | 5/14 [00:00<00:00, 33.47it/s][A
 64%|██████▍   | 9/14 [00:00<00:00, 31.03it/s][A
100%|██████████| 14/14 [00:00<00:00, 30.72it/s]


Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

#Wniosek po EDA
Pozbyłem się miesięcy i dni ostatniego konatku, gdyż uważam to za zbytnio nieskorelowane, a onehot encoding dla 12*(+-30) to sporo więcej zmiennych. Mało widoczne są korelacja pomiędzy y, a innymi zmienny z wyjątkiem czasu trawania kontatku. Usunąłem również metode kontaktu, gdyż cellular i telephone się znacząco nie różnią oraz 29% danych jest nieznanych. Usunąłem też poutcome z uwagi na  82% brakujących danych.

In [None]:
y = bank_data['y']
X = bank_data.drop('y', axis=1)

In [None]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn import tree, svm

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Logistic Regression
model = LogisticRegression(random_state=42, max_iter=1000)
model.fit(X_train_scaled, y_train)
y_pred = model.predict(X_test_scaled)

# Decision Tree
model2 = tree.DecisionTreeRegressor(random_state=42)
model2 = model2.fit(X_train_scaled, y_train)
y_pred2 = model2.predict(X_test_scaled)

# SVM
model3 = svm.SVC(random_state=42)
model3.fit(X_train_scaled, y_train)
y_pred3 = model3.predict(X_test_scaled)

In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)

accuracy2 = accuracy_score(y_test, y_pred2)
precision2 = precision_score(y_test, y_pred2)

accuracy3 = accuracy_score(y_test, y_pred3)
precision3 = precision_score(y_test, y_pred3)

print(f"Accuracy Logistic Regression (Scaled): {accuracy:.4f}")
print(f"Precision Logistic Regression (Scaled): {precision:.4f}")
print(f"")
print(f"Accuracy Decision Tree (Scaled): {accuracy2:.4f}")
print(f"Precision Decision Tree (Scaled): {precision2:.4f}")
print(f"")
print(f"Accuracy SVM (Scaled): {accuracy3:.4f}")
print(f"Precision SVM (Scaled): {precision3:.4f}")

Accuracy Logistic Regression (Scaled): 0.8985
Precision Logistic Regression (Scaled): 0.5556
Recall Logistic Regression (Scaled): 0.1190
F1-Score Logistic Regression (Scaled): 0.1961

Accuracy Decision Tree (Scaled): 0.8577
Precision Decision Tree (Scaled): 0.3495
Recall Decision Tree (Scaled): 0.4286
F1-Score Decision Tree (Scaled): 0.3850

Accuracy SVM (Scaled): 0.8973
Precision SVM (Scaled): 0.5714
Recall SVM (Scaled): 0.0476
F1-Score SVM (Scaled): 0.0879


#Wnioski
Nie umiem jeszcze tego XDD. A tak serio to starałem się jak  najbardziej poprawić precyzję, lecz niestety nie potrafiłem. Starałem się znormalizować outliery, niestety nie pomogło. Najlepiej zadziałał model SVM. Precyzja każdego modelu nie jest wysoka co wskazuje, iż modele często błędnie przewidują Wykonałem to zadanie nie poprawnie, lecz niestety ten tydzień był dla mnie ciężki prywatnie i na uczelni, stąd mało miałem czasu na to aby zgłębić temat za co przepraszam. Wolę oddać coś niedokończonego niż nic.