In [1]:
# Import the data
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, classification_report, balanced_accuracy_score, roc_auc_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler

In [2]:
# Read the S&P 500 CSV data into a DataFrame
df = pd.read_csv("Resources/bank/bank.csv")
df.head()

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,30,unemployed,married,primary,no,1787,no,no,cellular,19,oct,79,1,-1,0,unknown,no
1,33,services,married,secondary,no,4789,yes,yes,cellular,11,may,220,1,339,4,failure,no
2,35,management,single,tertiary,no,1350,yes,no,cellular,16,apr,185,1,330,1,failure,no
3,30,management,married,tertiary,no,1476,yes,yes,unknown,3,jun,199,4,-1,0,unknown,no
4,59,blue-collar,married,secondary,no,0,yes,no,unknown,5,may,226,1,-1,0,unknown,no


In [3]:
df.shape

(4521, 17)

In [4]:
df.info() #check types and for null elements

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4521 entries, 0 to 4520
Data columns (total 17 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   age        4521 non-null   int64 
 1   job        4521 non-null   object
 2   marital    4521 non-null   object
 3   education  4521 non-null   object
 4   default    4521 non-null   object
 5   balance    4521 non-null   int64 
 6   housing    4521 non-null   object
 7   loan       4521 non-null   object
 8   contact    4521 non-null   object
 9   day        4521 non-null   int64 
 10  month      4521 non-null   object
 11  duration   4521 non-null   int64 
 12  campaign   4521 non-null   int64 
 13  pdays      4521 non-null   int64 
 14  previous   4521 non-null   int64 
 15  poutcome   4521 non-null   object
 16  y          4521 non-null   object
dtypes: int64(7), object(10)
memory usage: 600.6+ KB


In [5]:
# Drop rows with null values
df_clean = df.dropna().copy()

# Verify changes with the info method
df_clean.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4521 entries, 0 to 4520
Data columns (total 17 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   age        4521 non-null   int64 
 1   job        4521 non-null   object
 2   marital    4521 non-null   object
 3   education  4521 non-null   object
 4   default    4521 non-null   object
 5   balance    4521 non-null   int64 
 6   housing    4521 non-null   object
 7   loan       4521 non-null   object
 8   contact    4521 non-null   object
 9   day        4521 non-null   int64 
 10  month      4521 non-null   object
 11  duration   4521 non-null   int64 
 12  campaign   4521 non-null   int64 
 13  pdays      4521 non-null   int64 
 14  previous   4521 non-null   int64 
 15  poutcome   4521 non-null   object
 16  y          4521 non-null   object
dtypes: int64(7), object(10)
memory usage: 600.6+ KB


In [6]:

# Convert y to numeric
df_clean['y'] = pd.get_dummies(df_clean['y'], drop_first = True, dtype=int)

#df_clean.loc[df_clean['poutcome']=='success'].head()
df_clean.loc[df_clean['y']==1]

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
13,20,student,single,secondary,no,502,no,no,cellular,30,apr,261,1,-1,0,unknown,1
30,68,retired,divorced,secondary,no,4189,no,no,telephone,14,jul,897,2,-1,0,unknown,1
33,32,management,single,tertiary,no,2536,yes,no,cellular,26,aug,958,6,-1,0,unknown,1
34,49,technician,married,tertiary,no,1235,no,no,cellular,13,aug,354,3,-1,0,unknown,1
36,78,retired,divorced,primary,no,229,no,no,telephone,22,oct,97,1,-1,0,unknown,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4494,26,technician,single,secondary,no,668,yes,no,unknown,28,may,576,3,-1,0,unknown,1
4503,60,self-employed,married,primary,no,362,no,yes,cellular,29,jul,816,6,-1,0,unknown,1
4504,42,blue-collar,single,secondary,no,1080,yes,yes,cellular,13,may,951,3,370,4,failure,1
4505,32,admin.,single,secondary,no,620,yes,no,unknown,26,may,1234,3,-1,0,unknown,1


In [7]:
# Drop all non-numeric columns
df_clean = df_clean.select_dtypes(include='number')

# Verify changes with the info method
df_clean.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4521 entries, 0 to 4520
Data columns (total 8 columns):
 #   Column    Non-Null Count  Dtype
---  ------    --------------  -----
 0   age       4521 non-null   int64
 1   balance   4521 non-null   int64
 2   day       4521 non-null   int64
 3   duration  4521 non-null   int64
 4   campaign  4521 non-null   int64
 5   pdays     4521 non-null   int64
 6   previous  4521 non-null   int64
 7   y         4521 non-null   int32
dtypes: int32(1), int64(7)
memory usage: 265.0 KB


In [8]:
# Split training and testing sets
# Create the features DataFrame, X
X = df_clean.copy()
X = X.drop(columns='y')

# Create the target DataFrame, y
y = df_clean['y']

In [9]:
from sklearn.linear_model import LogisticRegression


# Create a `LogisticRegression` function and assign it 
# to a variable named `logistic_regression_model`.
logistic_regression_model = LogisticRegression()

In [26]:
# Define target vector
#y_array = df_clean["y"].values.reshape(-1, 1)
#df_clean.loc[df_clean['poutcome']=='success'].head()
#y.loc[y==1]
#y_array = y.values.reshape(-1, 1)
#y_array[:5]
y_array = y.values

In [27]:
# Use train_test_split to separate the data
#X_train, X_test, y_train, y_test = train_test_split(X, y)
X_train, X_test, y_train, y_test = train_test_split(X, y_array, random_state=78)

In [28]:
X_train

Unnamed: 0,age,balance,day,duration,campaign,pdays,previous
685,46,1550,16,126,2,149,8
1586,42,100,7,64,2,-1,0
2763,57,10,8,320,3,-1,0
1857,38,434,20,411,1,-1,0
3153,46,328,21,526,8,-1,0
...,...,...,...,...,...,...,...
2163,51,1490,18,227,2,-1,0
3476,33,625,28,410,1,-1,0
2744,57,0,9,540,1,-1,0
4136,31,430,25,137,16,-1,0


In [40]:
# Scaling the X data by using StandardScaler()
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_train_scaled

array([[ 0.44791886,  0.05348427,  0.00763608, ..., -0.25722779,
         1.05155416,  4.38034552],
       [ 0.06887619, -0.46692918, -1.0913115 , ..., -0.25722779,
        -0.41395726, -0.32477847],
       [ 1.49028619, -0.49923071, -0.96920621, ...,  0.0889422 ,
        -0.41395726, -0.32477847],
       ...,
       [ 1.49028619, -0.50281977, -0.84710093, ..., -0.60339779,
        -0.41395726, -0.32477847],
       [-0.97349115, -0.34849026,  1.10658367, ...,  4.58915215,
        -0.41395726, -0.32477847],
       [-0.12064514,  2.71620664,  0.12974137, ..., -0.60339779,
        -0.41395726, -0.32477847]])

In [41]:
# Scaling the X data by using StandardScaler()
scaler = StandardScaler().fit(X_test)
X_test_scaled = scaler.transform(X_test)
X_test_scaled

array([[-0.64514265, -0.25134824,  0.25563077, ..., -0.25623994,
        -0.38710673, -0.30720408],
       [-1.30333779,  0.02179746,  1.68161549, ...,  0.5570642 ,
        -0.38710673, -0.30720408],
       [ 0.29513613, -0.16345206,  1.56278343, ..., -0.25623994,
        -0.38710673, -0.30720408],
       ...,
       [ 0.67124764, -0.17290925,  1.56278343, ..., -0.25623994,
        -0.38710673, -0.30720408],
       [-1.39736567,  0.65014382, -0.33852953, ..., -0.52734132,
        -0.38710673, -0.30720408],
       [ 1.61152642, -0.2012808 , -0.45736159, ...,  0.28596282,
        -0.38710673, -0.30720408]])

In [42]:
y_train[:5]

array([0, 0, 0, 0, 0])

In [43]:
# Fit the model
#logistic_regression_model.fit(X_train_scaled, y_train)
#logistic_regression_model.fit(X_train, y_train)
logistic_regression_model.fit(X_train_scaled, y_train)

In [44]:
# Score the model
print(f"Training Data Score: {logistic_regression_model.score(X_train_scaled, y_train)}")
print(f"Testing Data Score: {logistic_regression_model.score(X_test_scaled, y_test)}")

Training Data Score: 0.8808259587020649
Testing Data Score: 0.9045092838196287


In [None]:
Score the model
print(f"Training Data Score: {logistic_regression_model.score(X_train, y_train)}")
print(f"Testing Data Score: {logistic_regression_model.score(X_test, y_test)}")

In [9]:
#X = df_clean.copy()
#X.drop("y", axis=1, inplace=True)
#X.head()

Unnamed: 0,age,balance,day,duration,campaign,pdays,previous
0,30,1787,19,79,1,-1,0
1,33,4789,11,220,1,339,4
2,35,1350,16,185,1,330,1
3,30,1476,3,199,4,-1,0
4,59,0,5,226,1,-1,0


In [21]:
#y.value_counts()

y
0    4000
1     521
Name: count, dtype: int64

In [20]:
# Create a Logistic Regression Model
#classifier = LogisticRegression()

# Fit (train) or model using the training data
#classifier.fit(X, y)

# Calculate the accuracy of the model
#classifier.score(X, y)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


0.8823269188232692

In [22]:
# Make predictions on the test data
#predictions = classifier.predict(X)

# Create a confusion matrix
#print(confusion_matrix(y, predictions, labels = [1,0]))

[[  73  448]
 [  84 3916]]
