In [1]:
import pandas as pd

In [3]:
# Load the dataset(step 1 loading data and prepare)
df = pd.read_csv('bank.csv')

In [None]:
# Display the first few rows of the dataset
df.head()

In [None]:
# Check for missing values
df.isnull().sum()


In [None]:
# Get an overview of the data types
df.info()

In [None]:
# Basic statistics
df.describe()

In [None]:
# Drop missing values
df = df.dropna()

In [None]:
# step 2 EDA Exploratory Data Analysis
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
# Check the column names to see how they are formatted
print(df.columns)


In [None]:
# Clean up column names by stripping whitespace or unwanted characters
df.columns = df.columns.str.strip().str.replace('"', '')

In [None]:
# Verify that column names are now clean
print(df.columns)

In [None]:
print(df.columns)

In [None]:
# Manually rename the columns
df.rename(columns={"\"age\"": "age", "age;": "age"}, inplace=True)

# Verify the change
print(df.columns)

In [None]:
print(df.head)

In [None]:
# Re-read the dataset with semicolon separator
df = pd.read_csv('bank.csv', sep=';')

# Verify if columns are separated correctly now
print(df.columns)


In [None]:
sns.histplot(df['age'], bins=10)
plt.title('Age Distribution')
plt.show()

In [None]:
# Check the column names in your dataset
print(data.columns)


In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Step 1: Load the dataset
data = pd.read_csv('bank.csv', delimiter=';')

# Inspect the first few rows to make sure the data is loaded
print(data.head())
# Step 2: Plot Age vs Balance
plt.figure(figsize=(10,6))
sns.scatterplot(x='age', y='balance', data=data)
plt.title('Age vs Balance')
plt.show()

In [None]:
sns.countplot(x='job', data=data)
plt.title('Job Distribution')
plt.xticks(rotation=90)  # Rotate labels for better readability
plt.show()

In [None]:
sns.countplot(x='marital', data=data)
plt.title('Marital Status Distribution')
plt.show()

In [None]:
sns.countplot(x='education', data=data)
plt.title('Education Distribution')
plt.show()

In [None]:
# Job vs Education (Count Plot)
plt.figure(figsize=(10,6))
sns.countplot(x='job', hue='education', data=data)
plt.title('Job vs Education')
plt.xticks(rotation=45)
plt.show()

In [None]:
# Job vs Campaign (Box Plot)
plt.figure(figsize=(10,6))
sns.boxplot(x='job', y='campaign', data=data)
plt.title('Job vs Campaign')
plt.xticks(rotation=45)
plt.show()

In [None]:
# Job vs Housing (Count Plot)
plt.figure(figsize=(10,6))
sns.countplot(x='job', hue='housing', data=data)
plt.title('Job vs Housing')
plt.xticks(rotation=45)
plt.show()

In [None]:
# Correlation Matrix

In [None]:
import seaborn as sns
corr_matrix = data.corr()
plt.figure(figsize=(10,6))
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm')
plt.title('Correlation Matrix')
plt.show()

In [None]:
# Target Variable Distribution

In [None]:
sns.countplot(x='y', data=data)
plt.title('Subscription Outcome Distribution')
plt.show()

In [None]:
# Relationship Between Categorical Features and Target Variable

In [None]:
sns.countplot(x='job', hue='y', data=data)
plt.title('Subscription Outcome by Job')
plt.xticks(rotation=90)
plt.show()

In [None]:
sns.countplot(x='marital', hue='y', data=data)
plt.title('Subscription Outcome by Marital Status')
plt.show()

In [None]:
#  Numerical Variables vs Target Variable

In [None]:
sns.boxplot(x='y', y='balance', data=data)
plt.title('Balance vs Subscription Outcome')
plt.show()

In [None]:
sns.boxplot(x='y', y='duration', data=data)
plt.title('Duration vs Subscription Outcome')
plt.show()

In [None]:
# STEP 3 1. Identify Categorical and Numerical Variables, 2. Handle Missing Values,3. Encode Categorical Variables, 4. Encode the Target Variable, 5. Scale Numerical Features, 6. Feature Selection (Optional but Recommended), 7. Final Data Inspection

In [None]:
# Encode categorical features(step 3 data prepocessing)
from sklearn.preprocessing import LabelEncoder

In [None]:
# Handling missing data
# Option 1: Remove rows with missing values
data_encoded = data_encoded.dropna()

# Define numerical and categorical columns
numerical_columns = data_encoded.select_dtypes(include=['int64', 'float64']).columns
categorical_columns = data_encoded.select_dtypes(include=['object', 'category']).columns

# Now proceed with the missing value handling as before

# Imputation for numerical data
num_imputer = SimpleImputer(strategy='mean')
data_encoded[numerical_columns] = num_imputer.fit_transform(data_encoded[numerical_columns])

# Imputation for categorical data
cat_imputer = SimpleImputer(strategy='most_frequent')
data_encoded[categorical_columns] = cat_imputer.fit_transform(data_encoded[categorical_columns])



In [None]:
# Ensure numerical columns are defined
numerical_columns = data_encoded.select_dtypes(include=['int64', 'float64']).columns

# Outlier detection using IQR (Interquartile Range)
Q1 = data_encoded[numerical_columns].quantile(0.25)
Q3 = data_encoded[numerical_columns].quantile(0.75)
IQR = Q3 - Q1

# Define outliers as points outside 1.5*IQR from Q1 and Q3
outliers = (data_encoded[numerical_columns] < (Q1 - 1.5 * IQR)) | (data_encoded[numerical_columns] > (Q3 + 1.5 * IQR))

# Option 1: Removing outliers
# This will remove rows with outliers in any numerical column
data_encoded = data_encoded[~outliers.any(axis=1)]

# Option 2: You can also handle outliers differently, such as capping them
# Example for capping:
# data_encoded[numerical_columns] = data_encoded[numerical_columns].apply(
#     lambda x: np.where(x < (Q1 - 1.5 * IQR), Q1 - 1.5 * IQR, np.where(x > (Q3 + 1.5 * IQR), Q3 + 1.5 * IQR, x))
# )


In [None]:
# Print the first few rows of encoded dataset
df_encoded = pd.get_dummies(df, drop_first=True)
print(df_encoded.head())

In [None]:
# Features (X) and Target (y)
X = df_encoded.drop('y_yes', axis=1)  # 'y_yes' is the target variable (1 for yes, 0 for no)
y = df_encoded['y_yes']

In [None]:
# Split data into training and testing sets
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
# Import the necessary module for scaling
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression

# Scale the features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Train the Logistic Regression model with scaled data
model = LogisticRegression(max_iter=1000)
model.fit(X_train_scaled, y_train)

In [None]:

# Predictions
y_pred = model.predict(X_test)

In [None]:
# Import evaluation metrics
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

In [None]:
print(data.columns)


In [None]:
# Assuming 'data' is your DataFrame
data = data[data.columns[0]].str.split(';', expand=True)


In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Load your dataset DATA LOADINNG ND INITIAL PREPARATION
# Sample dataframe for demonstration (replace this with your actual data loading step)
data = pd.DataFrame({
    'data': ['30;unemployed;married;primary;no;1787;no;no;cellular;19;oct;79;1;-1;0;unknown;no',
             '33;services;married;secondary;no;4789;yes;yes;cellular;11;may;220;1;339;4;failure;no',
             '35;management;single;tertiary;no;1350;yes;no;cellular;16;apr;185;1;330;1;failure;no']
})

# Split the single column into multiple columns based on the semicolon (;) DATA CLEANING
data = data['data'].str.split(';', expand=True)

# Assign the correct column names
data.columns = ['age', 'job', 'marital', 'education', 'default', 'balance',
                'housing', 'loan', 'contact', 'day', 'month', 'duration',
                'campaign', 'pdays', 'previous', 'poutcome', 'y']

# Convert numeric columns from string to their appropriate types
data['age'] = data['age'].astype(int)
data['balance'] = data['balance'].astype(float)
data['day'] = data['day'].astype(int)
data['duration'] = data['duration'].astype(int)
data['campaign'] = data['campaign'].astype(int)
data['pdays'] = data['pdays'].astype(int)
data['previous'] = data['previous'].astype(int)

# List of categorical columns for encoding
categorical_cols = ['job', 'marital', 'education', 'default', 'housing',
                    'loan', 'contact', 'month', 'poutcome', 'y']  # Include 'y' if it's categorical

# One-Hot Encoding using pandas.get_dummies FEATURE ENGINEERING
data_encoded = pd.get_dummies(data, columns=categorical_cols, drop_first=True)

# Inspect the new columns
print("Encoded Data Columns:")
print(data_encoded.columns)  # Check if 'y' is included

# Check for correlation EXPLORATORY DATA ANALYSIS(EDA)
plt.figure(figsize=(12, 8))
corr_matrix = data_encoded.corr()

# Print the correlation matrix to debug
print("Correlation Matrix:")
print(corr_matrix)  # Check the correlation matrix

# Now plot the heatmap
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm')  # Set annot=True to see the correlation values
plt.title('Correlation Matrix')
plt.show()

# Ensure 'y' is in the correlation matrix before trying to access it FEATURE SELECTION
if 'y' in corr_matrix.columns:
    # Identify features with low correlation with target 'y'
    low_corr_features = corr_matrix['y'][corr_matrix['y'].abs() < 0.05].index.tolist()
    print("Low correlation features:", low_corr_features)

    # Drop low correlation features if necessary
    data_encoded.drop(low_corr_features, axis=1, inplace=True)
else:
    print("'y' is not present in the correlation matrix.")


In [None]:
pip install category_encoders


In [None]:
print(data_encoded.columns)


In [None]:
print(data_encoded.columns)


In [None]:
import numpy as np

# Create a synthetic target for binary classification (0 or 1)
data_encoded['target'] = np.random.choice([0, 1], size=len(data_encoded))

# Now proceed with encoding
from category_encoders import TargetEncoder
target_encoder = TargetEncoder()
data_encoded['job_services_encoded'] = target_encoder.fit_transform(data_encoded['job_services'], data_encoded['target'])



In [None]:
print(data_encoded.dtypes)
# Ensure 'job_services' is in the correct format for target encoding
data_encoded['job_services'] = data_encoded['job_services'].astype('category')

# Apply target encoding
from category_encoders import TargetEncoder
target_encoder = TargetEncoder()
data_encoded['job_services_encoded'] = target_encoder.fit_transform(data_encoded['job_services'], data_encoded['target'])


In [None]:
# Identifying numerical columns
numerical_columns = data_encoded.select_dtypes(include=['int64', 'float64']).columns

# Scaling numerical data
from sklearn.preprocessing import StandardScaler, MinMaxScaler

# Option 1: Standardization (mean=0, std=1)
scaler = StandardScaler()
data_encoded[numerical_columns] = scaler.fit_transform(data_encoded[numerical_columns])

# Option 2: Min-Max scaling (scaling to [0, 1])
scaler = MinMaxScaler()
data_encoded[numerical_columns] = scaler.fit_transform(data_encoded[numerical_columns])


In [None]:
# MODEL SELECTION AND TRAINING

In [None]:
# Splitting the Data into Training and Testing Sets

In [None]:
from sklearn.model_selection import train_test_split

# Features (X) and target (y)
X = data_encoded.drop(columns=['target'])  # Drop the target column from the features
y = data_encoded['target']  # Target variable

# Split the data into 80% training and 20% testing
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)



In [None]:
# Model Selection:

In [None]:
print(X_train.columns)



In [None]:
print(y_train.unique())

# Check unique classes in the target variable
print(y_train.value_counts())

# If there's only one class, consider resampling or obtaining more data
if y_train.nunique() < 2:
    print("Only one class found in y_train. Please check your dataset.")
else:
    # Proceed to fit the model
    model.fit(X_train, y_train)


In [None]:
print(X_train.columns)



In [None]:
# Check the columns to ensure you know what you're working with
print(X_train.columns)







In [None]:
print(X.columns)  # Assuming X is your original features DataFrame


In [None]:
import pandas as pd

# Print the columns to check their names
print("X_train columns:", X_train.columns)
print("X_test columns:", X_test.columns)

# Assuming the column exists and is correctly named:
if 'job_services' in X_train.columns and 'job_services' in X_test.columns:
    # Split the job_services column by commas and explode to create separate rows
    X_train['job_services'] = X_train['job_services'].str.split(',')
    X_test['job_services'] = X_test['job_services'].str.split(',')

    # Now, explode the lists into separate rows
    X_train_exploded = X_train.explode('job_services')
    X_test_exploded = X_test.explode('job_services')

    # Proceed with encoding or further processing
else:
    print("Column 'job_services' is missing in either X_train or X_test.")





In [None]:
# Install necessary packages if you haven't already
!pip install pandas numpy scikit-learn matplotlib


In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, classification_report, roc_auc_score, accuracy_score
import matplotlib.pyplot as plt
import seaborn as sns


In [None]:
# Split the dataset into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [None]:
# Check the column names in the dataset
print(data.columns)


In [None]:
import pandas as pd

# Read the CSV file with the correct delimiter
data = pd.read_csv('bank.csv', sep=';')

# Check the corrected column names
print(data.columns)


In [None]:
# Define your features (X) and target (y)
X = data.drop('y', axis=1)  # 'y' is the target column
y = data['y']

# Now split the data into training and testing sets
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Print the shape of X_train to verify
print(f'Number of training samples: {X_train.shape[0]}')


In [None]:
# Hyperparameter Tuning:

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.preprocessing import StandardScaler

# Load your dataset (ensure to use the correct delimiter)
df = pd.read_csv('bank.csv', sep=';')  # Use ';' as the separator if needed

# Print columns to verify
print("Columns in the dataset:")
print(df.columns)

# Target column
target_column = 'y'  # The target column is confirmed to be 'y'

# Define features and target variable
X = df.drop(columns=[target_column])  # Drop the target column to get features
y = df[target_column]  # Target variable

# Convert categorical variables to numerical using one-hot encoding
X = pd.get_dummies(X, drop_first=True)

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Scale the features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Initialize the model
model = LogisticRegression()

# Train the model
model.fit(X_train_scaled, y_train)

# Make predictions on the test set
y_pred = model.predict(X_test_scaled)

# Evaluate the model
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))

print("\nClassification Report:")
print(classification_report(y_test, y_pred))

# Hyperparameter tuning
param_grid = {
    'C': [0.01, 0.1, 1, 10, 100],
    'solver': ['liblinear', 'saga'],
    'max_iter': [100, 200, 300]
}

grid_search = GridSearchCV(LogisticRegression(), param_grid, cv=5, scoring='accuracy')
grid_search.fit(X_train_scaled, y_train)

print("Best Hyperparameters:")
print(grid_search.best_params_)

best_model = grid_search.best_estimator_
y_pred_best = best_model.predict(X_test_scaled)

print("Confusion Matrix for Best Model:")
print(confusion_matrix(y_test, y_pred_best))

print("\nClassification Report for Best Model:")
print(classification_report(y_test, y_pred_best))


In [None]:
print(df.columns)


In [None]:
# Model Evaluation:

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.preprocessing import StandardScaler


In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.preprocessing import StandardScaler

# Load your dataset (ensure to use the correct delimiter)
df = pd.read_csv('bank.csv', sep=';')  # Use ';' as the separator if needed

# Print columns to verify
print("Columns in the dataset:")
print(df.columns)

# Target column
target_column = 'y'  # The target column is confirmed to be 'y'

# Define features and target variable
X = df.drop(columns=[target_column])  # Drop the target column to get features
y = df[target_column]  # Target variable

# Convert categorical variables to numerical using one-hot encoding
X = pd.get_dummies(X, drop_first=True)

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Scale the features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Save the scaled datasets to CSV files
X_train_scaled_df = pd.DataFrame(X_train_scaled, columns=X_train.columns)  # Create DataFrame for training features
X_test_scaled_df = pd.DataFrame(X_test_scaled, columns=X_test.columns)  # Create DataFrame for testing features

# Save the target variables to CSV files
y_train_df = pd.DataFrame(y_train).reset_index(drop=True)
y_test_df = pd.DataFrame(y_test).reset_index(drop=True)

# Combine features and targets for saving
train_data = pd.concat([X_train_scaled_df, y_train_df], axis=1)
test_data = pd.concat([X_test_scaled_df, y_test_df], axis=1)

# Save to CSV
train_data.to_csv('train_data_scaled.csv', index=False)
test_data.to_csv('test_data_scaled.csv', index=False)

print("Training and testing datasets saved successfully.")


In [None]:
print(data_encoded.columns)


In [None]:
# Load your dataset
data = pd.read_csv('bank.csv')  # Replace with your actual dataset path

# Check the columns of the dataset
print("Columns in the dataset:")
print(data.columns.tolist())  # List all column names to find the target variable

# Strip whitespace from column names if needed
data.columns = data.columns.str.strip()

# Use the actual target column name after checking the output above
target_column = 'Outcome'  # Replace with the actual target column name

# Check if the target column exists
if target_column not in data.columns:
    print(f"Target column '{target_column}' not found. Please check the column names.")
else:
    # Split data into features and target variable
    X = data.drop(target_column, axis=1)  # Use the actual target column name
    y = data[target_column]  # This should be your target variable

    # Split the dataset into training and test sets with a smaller test size
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)  # Adjust as needed

    # Train a Random Forest classifier
    model = RandomForestClassifier(random_state=42)
    model.fit(X_train, y_train)

    # Cross-validation scores with adjusted n_splits
    cv_scores = cross_val_score(model, X_train, y_train, cv=2)  # Use cv=2
    print(f'Cross-validation scores: {cv_scores}')
    print(f'Mean CV score: {np.mean(cv_scores)}')


In [None]:
# Deployment and Decision Support

In [None]:
import pandas as pd

df = pd.read_csv('bank.csv')  # Replace with your dataset path


In [None]:
# 1. Export Model Predictions to Excel

In [None]:
import pandas as pd

# Assuming `y_pred` contains your predictions and `X_test` contains your test features
predictions_df = X_test.copy()
predictions_df['Predicted'] = y_pred  # Add predictions to the DataFrame

# Save to Excel
predictions_df.to_excel('model_predictions.xlsx', index=False)
print("Predictions exported to 'model_predictions.xlsx'")


In [None]:
# 2. Visualize Key Insights

In [None]:
import matplotlib.pyplot as plt
from sklearn.metrics import ConfusionMatrixDisplay

# Create confusion matrix
cm = confusion_matrix(y_test, y_pred)
disp = ConfusionMatrixDisplay(confusion_matrix=cm)
disp.plot(cmap=plt.cm.Blues)
plt.title('Confusion Matrix')
plt.show()


In [None]:
# 3. Deploying the Model

In [None]:
import joblib
import pandas as pd
from sklearn.linear_model import LogisticRegression  # Example model
from sklearn.model_selection import train_test_split

# Load your data
train_data = pd.read_csv('train_data_scaled.csv')
test_data = pd.read_csv('test_data_scaled.csv')

# Assume 'y' is the target column and the rest are features
X_train = train_data.drop('y', axis=1)  # Features
y_train = train_data['y']               # Target

# Train the model (example using Logistic Regression)
model = LogisticRegression()
model.fit(X_train, y_train)

# Save the trained model to a .pkl file
joblib.dump(model, 'model_predictions.pkl')

# You can load it again like this:
# loaded_model = joblib.load('model_predictions.pkl')


['model_predictions.pkl']

In [7]:
import pandas as pd

# Load your data
train_data = pd.read_csv('train_data_scaled.csv')

# Check the column names in the dataset
print(train_data.columns)


Index(['age', 'balance', 'day', 'duration', 'campaign', 'pdays', 'previous',
       'job_blue-collar', 'job_entrepreneur', 'job_housemaid',
       'job_management', 'job_retired', 'job_self-employed', 'job_services',
       'job_student', 'job_technician', 'job_unemployed', 'job_unknown',
       'marital_married', 'marital_single', 'education_secondary',
       'education_tertiary', 'education_unknown', 'default_yes', 'housing_yes',
       'loan_yes', 'contact_telephone', 'contact_unknown', 'month_aug',
       'month_dec', 'month_feb', 'month_jan', 'month_jul', 'month_jun',
       'month_mar', 'month_may', 'month_nov', 'month_oct', 'month_sep',
       'poutcome_other', 'poutcome_success', 'poutcome_unknown', 'y'],
      dtype='object')


In [2]:
from flask import Flask, request, jsonify
import pickle
import numpy as np

app = Flask(__name__)

# Load the trained model
with open('model_predictions.pkl', 'rb') as model_file:
    model = pickle.load(model_file)

@app.route('/predict', methods=['POST'])
def predict():
    data = request.get_json(force=True)
    # Convert input data to a numpy array
    input_data = np.array(data['features']).reshape(1, -1)
    # Make prediction
    prediction = model.predict(input_data)
    return jsonify({'prediction': prediction[0]})

if __name__ == '__main__':
    app.run(debug=True)


 * Serving Flask app '__main__'
 * Debug mode: on


 * Running on http://127.0.0.1:5000
INFO:werkzeug:[33mPress CTRL+C to quit[0m
INFO:werkzeug: * Restarting with stat
