## Imports

In [7]:
import numpy as np
import pandas as pd
import seaborn as sns

In [8]:
df = pd.read_csv('data/train.csv', sep=';') #load the data

## Understanding the Data

### Column Descriptions:
1. **age**: Numeric value representing the age of the client.
2. **job**: Type of job (categorical):
   - "admin."
   - "unknown"
   - "unemployed"
   - "management"
   - "housemaid"
   - "entrepreneur"
   - "student"
   - "blue-collar"
   - "self-employed"
   - "retired"
   - "technician"
   - "services"
3. **marital**: Marital status (categorical):
   - "married"
   - "divorced" (includes divorced or widowed)
   - "single"
4. **education**: Education level (categorical):
   - "unknown"
   - "secondary"
   - "primary"
   - "tertiary"
5. **default**: Whether the client has credit in default (binary):
   - "yes"
   - "no"
6. **balance**: Average yearly balance in euros (numeric).
7. **housing**: Whether the client has a housing loan (binary):
   - "yes"
   - "no"
8. **loan**: Whether the client has a personal loan (binary):
   - "yes"
   - "no"

**Related to the last contact of the current campaign:**
9. **contact**: Contact communication type (categorical):
   - "unknown"
   - "telephone"
   - "cellular"
10. **day**: Last contact day of the month (numeric).
11. **month**: Last contact month of the year (categorical):
   - "jan"
   - "feb"
   - "mar"
   - "apr"
   - "may"
   - "jun"
   - "jul"
   - "aug"
   - "sep"
   - "oct"
   - "nov"
   - "dec"
12. **duration**: Last contact duration, in seconds (numeric).

**Other Attributes:**
13. **campaign**: Number of contacts performed during this campaign for this client (numeric, includes the last contact).
14. **pdays**: Number of days since the client was last contacted from a previous campaign (numeric, -1 means the client was not previously contacted).
15. **previous**: Number of contacts performed before this campaign for this client (numeric).
16. **poutcome**: Outcome of the previous marketing campaign (categorical):
   - "unknown"
   - "other"
   - "failure"
   - "success"

**Output Variable (Desired Target):**
17. **y**: Whether the client subscribed to a term deposit (binary):
   - "yes"
   - "no"

**Missing Attribute Values**: None
Attribute Values: None


In [9]:
df

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,58,management,married,tertiary,no,2143,yes,no,unknown,5,may,261,1,-1,0,unknown,no
1,44,technician,single,secondary,no,29,yes,no,unknown,5,may,151,1,-1,0,unknown,no
2,33,entrepreneur,married,secondary,no,2,yes,yes,unknown,5,may,76,1,-1,0,unknown,no
3,47,blue-collar,married,unknown,no,1506,yes,no,unknown,5,may,92,1,-1,0,unknown,no
4,33,unknown,single,unknown,no,1,no,no,unknown,5,may,198,1,-1,0,unknown,no
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
45206,51,technician,married,tertiary,no,825,no,no,cellular,17,nov,977,3,-1,0,unknown,yes
45207,71,retired,divorced,primary,no,1729,no,no,cellular,17,nov,456,2,-1,0,unknown,yes
45208,72,retired,married,secondary,no,5715,no,no,cellular,17,nov,1127,5,184,3,success,yes
45209,57,blue-collar,married,secondary,no,668,no,no,telephone,17,nov,508,4,-1,0,unknown,no


In [10]:
print(df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45211 entries, 0 to 45210
Data columns (total 17 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   age        45211 non-null  int64 
 1   job        45211 non-null  object
 2   marital    45211 non-null  object
 3   education  45211 non-null  object
 4   default    45211 non-null  object
 5   balance    45211 non-null  int64 
 6   housing    45211 non-null  object
 7   loan       45211 non-null  object
 8   contact    45211 non-null  object
 9   day        45211 non-null  int64 
 10  month      45211 non-null  object
 11  duration   45211 non-null  int64 
 12  campaign   45211 non-null  int64 
 13  pdays      45211 non-null  int64 
 14  previous   45211 non-null  int64 
 15  poutcome   45211 non-null  object
 16  y          45211 non-null  object
dtypes: int64(7), object(10)
memory usage: 5.9+ MB
None


In [11]:
df.describe()

Unnamed: 0,age,balance,day,duration,campaign,pdays,previous
count,45211.0,45211.0,45211.0,45211.0,45211.0,45211.0,45211.0
mean,40.93621,1362.272058,15.806419,258.16308,2.763841,40.197828,0.580323
std,10.618762,3044.765829,8.322476,257.527812,3.098021,100.128746,2.303441
min,18.0,-8019.0,1.0,0.0,1.0,-1.0,0.0
25%,33.0,72.0,8.0,103.0,1.0,-1.0,0.0
50%,39.0,448.0,16.0,180.0,2.0,-1.0,0.0
75%,48.0,1428.0,21.0,319.0,3.0,-1.0,0.0
max,95.0,102127.0,31.0,4918.0,63.0,871.0,275.0


## Preprocessing & EDA

In [12]:
from sklearn.preprocessing import LabelEncoder

In [13]:
le = LabelEncoder()
df['job_encoded'] = le.fit_transform(df['job'])
df['marital_status_encoded'] = le.fit_transform(df['marital'])
df['poutcome_encoded'] = le.fit_transform(df['poutcome'])
df.drop(['job_encoded', 'marital_status_encoded', 'poutcome_encoded'], axis=1, inplace=True)

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y,job_encoded,marital_status_encoded,poutcome_encoded
0,58,management,married,tertiary,no,2143,yes,no,unknown,5,may,261,1,-1,0,unknown,no,4,1,3
1,44,technician,single,secondary,no,29,yes,no,unknown,5,may,151,1,-1,0,unknown,no,9,2,3
2,33,entrepreneur,married,secondary,no,2,yes,yes,unknown,5,may,76,1,-1,0,unknown,no,2,1,3
3,47,blue-collar,married,unknown,no,1506,yes,no,unknown,5,may,92,1,-1,0,unknown,no,1,1,3
4,33,unknown,single,unknown,no,1,no,no,unknown,5,may,198,1,-1,0,unknown,no,11,2,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
45206,51,technician,married,tertiary,no,825,no,no,cellular,17,nov,977,3,-1,0,unknown,yes,9,1,3
45207,71,retired,divorced,primary,no,1729,no,no,cellular,17,nov,456,2,-1,0,unknown,yes,5,0,3
45208,72,retired,married,secondary,no,5715,no,no,cellular,17,nov,1127,5,184,3,success,yes,5,1,2
45209,57,blue-collar,married,secondary,no,668,no,no,telephone,17,nov,508,4,-1,0,unknown,no,1,1,3


## Model 1: 

## Model 2:

## Final Comparisons