In [5]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
train_df = pd.read_csv('termdeposit_train.csv')
train_df.head()

Unnamed: 0,ID,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,subscribed
0,26110,56,admin.,married,unknown,no,1933,no,no,telephone,19,nov,44,2,-1,0,unknown,no
1,40576,31,unknown,married,secondary,no,3,no,no,cellular,20,jul,91,2,-1,0,unknown,no
2,15320,27,services,married,secondary,no,891,yes,no,cellular,18,jul,240,1,-1,0,unknown,no
3,43962,57,management,divorced,tertiary,no,3287,no,no,cellular,22,jun,867,1,84,3,success,yes
4,29842,31,technician,married,secondary,no,119,yes,no,cellular,4,feb,380,1,-1,0,unknown,no


In [6]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 31647 entries, 0 to 31646
Data columns (total 18 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   ID          31647 non-null  int64 
 1   age         31647 non-null  int64 
 2   job         31647 non-null  object
 3   marital     31647 non-null  object
 4   education   31647 non-null  object
 5   default     31647 non-null  object
 6   balance     31647 non-null  int64 
 7   housing     31647 non-null  object
 8   loan        31647 non-null  object
 9   contact     31647 non-null  object
 10  day         31647 non-null  int64 
 11  month       31647 non-null  object
 12  duration    31647 non-null  int64 
 13  campaign    31647 non-null  int64 
 14  pdays       31647 non-null  int64 
 15  previous    31647 non-null  int64 
 16  poutcome    31647 non-null  object
 17  subscribed  31647 non-null  object
dtypes: int64(8), object(10)
memory usage: 4.3+ MB


In [7]:
train_df.describe()

Unnamed: 0,ID,age,balance,day,duration,campaign,pdays,previous
count,31647.0,31647.0,31647.0,31647.0,31647.0,31647.0,31647.0,31647.0
mean,22563.972162,40.957247,1363.890258,15.835466,258.113534,2.765697,39.576042,0.574272
std,13075.93699,10.625134,3028.304293,8.337097,257.118973,3.11383,99.317592,2.422529
min,2.0,18.0,-8019.0,1.0,0.0,1.0,-1.0,0.0
25%,11218.0,33.0,73.0,8.0,104.0,1.0,-1.0,0.0
50%,22519.0,39.0,450.0,16.0,180.0,2.0,-1.0,0.0
75%,33879.5,48.0,1431.0,21.0,318.5,3.0,-1.0,0.0
max,45211.0,95.0,102127.0,31.0,4918.0,63.0,871.0,275.0


In [8]:
X = train_df.drop(columns=['subscribed', 'ID'])
y = train_df['subscribed'].apply(lambda x: 1 if x == 'yes' else 0)


In [9]:
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)


In [10]:
cat_cols = X_train.select_dtypes(include=['object']).columns.tolist()
num_cols = X_train.select_dtypes(include=['int64']).columns.tolist()

In [11]:
num_transformer = Pipeline(steps=[ ('scaler', StandardScaler())])

cat_transformer = Pipeline(steps=[ ('onehot', OneHotEncoder(handle_unknown='ignore'))])


In [12]:
preprocessor1 = ColumnTransformer(
    transformers=[
        ('num', num_transformer, num_cols),
        ('cat', cat_transformer, cat_cols)
    ])

model = Pipeline(steps=[
    ('preprocessor', preprocessor1),
    ('classifier', RandomForestClassifier(random_state=42))
])

In [13]:
model.fit(X_train, y_train)


Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('num',
                                                  Pipeline(steps=[('scaler',
                                                                   StandardScaler())]),
                                                  ['age', 'balance', 'day',
                                                   'duration', 'campaign',
                                                   'pdays', 'previous']),
                                                 ('cat',
                                                  Pipeline(steps=[('onehot',
                                                                   OneHotEncoder(handle_unknown='ignore'))]),
                                                  ['job', 'marital',
                                                   'education', 'default',
                                                   'housing', 'loan', 'contact',
                                                   

In [14]:

model.score(X_val, y_val)

0.906477093206951

In [15]:
test_df = pd.read_csv("termdeposit_test.csv")

In [17]:
test_ids= test_df['ID']

In [18]:
X_test = test_df.drop(columns=['ID'])
predicted_sub = model.predict(X_test)


In [19]:
prediction_outcome = pd.DataFrame({
    'ID': test_ids,
    'Subscribed': predicted_sub
})
prediction_outcome['Subscribed'] = prediction_outcome['Subscribed'].map({1: 'yes', 0: 'no'})

In [20]:
prediction_outcome.head()


Unnamed: 0,ID,Subscribed
0,38441,no
1,40403,no
2,3709,no
3,37422,no
4,12527,no


The majority of the clients sampled are expected not to subscribe to a term deposit, according to the model's initial predictions when applied to the test dataset. This implies that either the bank's present marketing techniques are less likely to have an impact on these particular clients, or the current state of the market may make term deposits unappealing to them.


The bank needs to know this information since it can improve their targeting approach. They could think about:

modifying their strategy to attract these clients with term deposits.
Their marketing campaigns' client selection criteria have been revised to target individuals who are more inclined to subscribe.
Considering the model's high accuracy on the validation set, these forecasts are probably accurate. 
It would be beneficial for the bank to continuously refine the model and include more diverse data over time to adapt to changing customer behaviors and market conditions.