<a href="https://colab.research.google.com/github/omid-sakaki-ghazvini/Machine-Learning/blob/main/1D_CNN_Binary_Classification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# IMPORTANT: RUN THIS CELL IN ORDER TO IMPORT YOUR KAGGLE DATA SOURCES,
# THEN FEEL FREE TO DELETE THIS CELL.
# NOTE: THIS NOTEBOOK ENVIRONMENT DIFFERS FROM KAGGLE'S PYTHON
# ENVIRONMENT SO THERE MAY BE MISSING LIBRARIES USED BY YOUR
# NOTEBOOK.
import kagglehub
# Fetch the Kaggle dataset through kagglehub and keep the value it returns
# (the location of the dataset files) in a notebook-level variable.
omidsakaki1370_data_preparation_example_path = kagglehub.dataset_download('omidsakaki1370/data-preparation-example')

print('Data source import complete.')


In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input mount and print the full path of every file found.
for folder, _subdirs, files in os.walk('/kaggle/input'):
    for file_name in files:
        print(os.path.join(folder, file_name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All"
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import pandas as pd
import numpy as np
from scipy import stats
from sklearn.preprocessing import MinMaxScaler, StandardScaler, LabelEncoder
from sklearn.impute import KNNImputer

import seaborn as sns
import matplotlib.pyplot as plt

import warnings
warnings.filterwarnings('ignore')

# **Load Data**

In [None]:
# Resolve the CSV location. On Kaggle the dataset is mounted under /kaggle/input;
# anywhere else (e.g. Colab) that mount does not exist, so fall back to the
# directory returned by kagglehub in the first cell.
csv_path = '/kaggle/input/data-preparation-example/credit_risk_dataset.csv'
if not os.path.exists(csv_path):
    csv_path = os.path.join(omidsakaki1370_data_preparation_example_path, 'credit_risk_dataset.csv')

df = pd.read_csv(csv_path)

In [None]:
# Print column dtypes and non-null counts for a first look at the raw frame.
df.info()

In [None]:
# Number of missing entries per column in the freshly loaded data.
missing_values_count = df.isna().sum()
missing_values_count

# **Data Analysis**

In [None]:
# Categorical columns whose class balance is charted below, keyed by column name.
columns = {
    column_name: df[column_name]
    for column_name in (
        'loan_status',
        'loan_grade',
        'cb_person_default_on_file',
        'person_home_ownership',
        'loan_intent',
    )
}

def plot_data(name, data):
    """Draw a pie chart of the category frequencies found in ``data``.

    Parameters
    ----------
    name : str
        Text used as the chart title (and as the label of the counted series).
    data : pandas.Series
        Categorical values to count. Missing values are left out of the counts.
    """
    plt.figure(figsize=(5, 5))
    palette_color = sns.color_palette('pastel')

    # Count from the series that was passed in instead of the notebook-level
    # ``df``: the function then works for any series and the wedges can never
    # disagree with ``data``. sort_index() keeps the categories in sorted order.
    target_counts = data.value_counts().sort_index().rename(name)

    # One offset per wedge that is actually drawn.
    explode = [0.1] * len(target_counts)

    target_counts.plot.pie(
        colors=palette_color,
        explode=explode,
        autopct="%1.1f%%",
        shadow=True,
        startangle=140,
        textprops={'fontsize': 14},
        wedgeprops={'edgecolor': 'black', 'linewidth': 1.5}
    )

    plt.title(name, fontsize=18, weight='bold')
    plt.axis('equal')
    plt.show()


# Render one pie chart per categorical column collected above.
for column_name, column_values in columns.items():
    plot_data(column_name, column_values)

# **step 1 : Handle Missing Values & Data Cleaning**

# [Data Cleaning Notebook](https://www.kaggle.com/code/omidsakaki1370/data-cleaning)
# [Missing Values Notebook](https://www.kaggle.com/code/omidsakaki1370/missing-values)

## **1-Normalization**

In [None]:
# Numeric columns whose distributions are inspected. A list (rather than a set)
# keeps the figures in the same order on every run.
columns = ['person_income', 'person_emp_length', 'loan_amnt', 'loan_int_rate',
           'loan_percent_income', 'cb_person_cred_hist_length']

def distplot_data(data):
    """Plot a density-scaled histogram with a KDE overlay for one numeric series.

    Uses ``sns.histplot(..., kde=True, stat="density")``, the replacement for the
    deprecated ``sns.distplot``.

    Parameters
    ----------
    data : pandas.Series
        Values to plot; each call opens its own 8x4 figure.
    """
    plt.figure(figsize=(8, 4))
    sns.histplot(data, kde=True, stat="density")

for i in columns:
    distplot_data(df[i])

In [None]:
# Side-by-side view of person_income before (left) and after (right) a natural-log
# transform. Explicit axes are used so each panel keeps its own handle.
fig, (ax_raw, ax_log) = plt.subplots(1, 2, figsize=(14, 4))

sns.histplot(df['person_income'], kde=True, stat="density", ax=ax_raw)
ax_raw.set_title("person_income")

# NOTE: this overwrites the column in place, so running the cell a second time
# applies the log twice. Restart the kernel and run the cells in order.
df['person_income'] = np.log(df['person_income'])

sns.histplot(df['person_income'], kde=True, stat="density", ax=ax_log)
ax_log.set_title("person_income_log");

## **2-Outliers**

In [None]:
def remove_outliers(df, column, factor=1.5, keep_na=False):
    """Return the rows of ``df`` whose ``column`` value lies inside the IQR fences.

    The fences are ``Q1 - factor * IQR`` and ``Q3 + factor * IQR`` (both inclusive),
    where Q1/Q3 are the 25th/75th percentiles of ``column``.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is not modified.
    column : str
        Name of the numeric column used to decide which rows to keep.
    factor : float, default 1.5
        Multiplier applied to the IQR when building the fences.
    keep_na : bool, default False
        Rows whose ``column`` value is missing never satisfy the fence test. With
        the default they are dropped together with the outliers; pass ``True`` to
        keep them (for example when imputation happens in a later step).

    Returns
    -------
    pandas.DataFrame
        The retained rows with their original index labels.
    """
    q1 = df[column].quantile(0.25)
    q3 = df[column].quantile(0.75)
    iqr = q3 - q1
    lower_bound = q1 - factor * iqr
    upper_bound = q3 + factor * iqr

    # between() is inclusive on both ends and evaluates to False for NaN.
    within_fences = df[column].between(lower_bound, upper_bound)
    if keep_na:
        within_fences = within_fences | df[column].isna()
    return df[within_fences]

# Apply the IQR filter one column at a time, in this order; each pass computes
# its quartiles on the rows that survived the previous pass.
for outlier_column in ('person_age', 'person_income', 'person_emp_length', 'cb_person_cred_hist_length'):
    df = remove_outliers(df, outlier_column)

# Renumber the remaining rows from 0 after filtering.
df = df.reset_index(drop=True)

## **3-Encoder**

In [None]:
encoder = LabelEncoder()

# Replace each categorical column with integer codes. The single encoder is
# refitted on every column, so after the loop it only holds the classes of the
# last column processed.
for categorical_column in ('person_home_ownership', 'loan_intent', 'loan_grade', 'cb_person_default_on_file'):
    df[categorical_column] = encoder.fit_transform(df[categorical_column])

## **4-Missing Values**

In [None]:
# Missing entries per column after outlier removal and encoding.
missing_values_count = df.isna().sum()
missing_values_count

In [None]:
# Forward-fill the remaining gaps: each missing value takes the value of the
# previous row. DataFrame.ffill() replaces the deprecated
# fillna(method='ffill') spelling.
df = df.ffill()
missing_values_count = df.isnull().sum()

# A gap in the very first row has no previous value and would still be counted here.
print(missing_values_count)
df.head(10)

In [None]:
# Separate features (X) from the target column (y).
# pop() removes 'loan_status' from the copy only, so df keeps all of its columns.
X = df.copy()
y = X.pop("loan_status")

# **step 2 : Feature Engineering**

# [Feature Engineering Notebook](https://www.kaggle.com/code/omidsakaki1370/feature-engineering)

In [None]:
# Copy of the full feature frame handed to the variance filter in the next cell.
# NOTE(review): despite its name this is not a held-out split; the name X_test is
# reassigned by train_test_split later in the notebook.
X_test = X.copy()

In [None]:
from sklearn.feature_selection import VarianceThreshold

# VarianceThreshold is used with its default settings; X_new holds the columns it keeps.
selector = VarianceThreshold()
X_new = selector.fit_transform(X_test, y)

# Compare the column counts to see whether the selector removed any feature.
print('X shape :', X.shape)
print('X_new shape :', X_new.shape)

## **Creating Features**

In [None]:
# Preview the feature frame before new columns are added.
X.head(10)

In [None]:
# Candidate feature: interest rate minus the (log-transformed) income, rounded to
# the nearest integer.
rate_minus_income = X["loan_int_rate"] - X["person_income"]
X["New_Features"] = rate_minus_income.round().astype(int)

X[["loan_int_rate", "person_income", "New_Features"]].head(10)

In [None]:
def F(x):
    """Map a number to a binary flag: 1 when ``x`` is zero or positive, otherwise 0."""
    return 1 if x >= 0 else 0

# Collapse the rounded difference into a 0/1 flag, one element at a time, via F.
X['New_Features'] = X['New_Features'].map(F)

# Show the new flag next to the target for a quick visual comparison.
print(X['New_Features'].head(10))
print(y.head(10))

## **Clustering With K-Means**

In [None]:
from sklearn.cluster import KMeans

# Fixed seed so the cluster assignment is the same on every run of the notebook.
kmeans = KMeans(n_clusters=4, random_state=42)

# Cluster on every column currently in X and store the label as a categorical feature.
X["Cluster"] = kmeans.fit_predict(X)
X["Cluster"] = X["Cluster"].astype("category")
X.head()

In [None]:
# Scatter of income against loan-to-income ratio, coloured by the K-Means cluster label.
sns.relplot(x="person_income", y="loan_percent_income", hue="Cluster", data=X, height=4);

# **step 3 : Classification**

In [None]:
#confusion matrix template
def plot_confusion_matrix(cm,
                          target_names,
                          title='Confusion matrix',
                          cmap=None,
                          normalize=True):
    """Draw a confusion matrix as an annotated heat map.

    Parameters
    ----------
    cm : array-like of shape (n_classes, n_classes)
        Confusion matrix of raw counts, with true labels on the rows and
        predicted labels on the columns.
    target_names : sequence of str or None
        Tick labels for both axes; pass ``None`` to keep numeric ticks.
    title : str, default 'Confusion matrix'
        Figure title.
    cmap : matplotlib colormap or None
        Colormap for the heat map; ``None`` selects 'Blues'.
    normalize : bool, default True
        If True, every row is divided by its total so cells show the fraction of
        each true class; otherwise the raw counts are shown.

    Notes
    -----
    Accuracy and misclassification rate in the x-axis label are always computed
    from the raw counts. The heat map is drawn from the same matrix that is
    printed in the cells, so colours, numbers and the text-contrast threshold
    agree with each other.
    """
    import matplotlib.pyplot as plt
    import numpy as np
    import itertools

    cm = np.asarray(cm)

    # Computed before any normalisation so it reflects the real sample counts.
    accuracy = np.trace(cm) / float(np.sum(cm))
    misclass = 1 - accuracy

    if cmap is None:
        cmap = plt.get_cmap('Blues')

    if normalize:
        row_totals = cm.sum(axis=1)[:, np.newaxis].astype('float')
        # A true class with no samples keeps a row of zeros instead of NaN (0/0).
        cm = np.divide(cm.astype('float'), row_totals,
                       out=np.zeros(cm.shape, dtype='float'),
                       where=row_totals != 0)

    plt.figure(figsize=(8, 6))
    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()

    if target_names is not None:
        tick_marks = np.arange(len(target_names))
        plt.xticks(tick_marks, target_names, rotation=45)
        plt.yticks(tick_marks, target_names)

    # Cells darker than this threshold get white text, lighter ones black text.
    thresh = cm.max() / 1.5 if normalize else cm.max() / 2
    cell_format = "{:0.4f}" if normalize else "{:,}"
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, cell_format.format(cm[i, j]),
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")

    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label\naccuracy={:0.4f}; misclass={:0.4f}'.format(accuracy, misclass))
    plt.show()

## **1-Split Data**

In [None]:
# Preview the cleaned, encoded frame that the models are trained on.
df.head()

In [None]:
from sklearn.model_selection import train_test_split

# Rebuild features/target from the cleaned frame. This replaces the earlier X,
# so the New_Features and Cluster columns added to it are not used by the models.
X = df.drop(columns=['loan_status'])
y = df['loan_status']

# random_state makes the split identical on every run; stratify=y keeps the
# loan_status class ratio the same in the train and test parts.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=42, stratify=y
)

print("The size of the input train data is: {}".format(X_train.shape))
print("The size of the output train data is: {}".format(y_train.shape))
print("The size of the input test data is: {}".format(X_test.shape))
print("The size of the output test data is: {}".format(y_test.shape))

## **2-Scaler**

In [None]:
scaler_data = StandardScaler()

# Learn the mean/std on the training split only, then reuse those statistics for
# the test split. Calling fit_transform on the test data would re-fit the scaler,
# putting train and test on different scales and leaking test statistics.
X_train_scaled = scaler_data.fit_transform(X_train)
X_test_scaled = scaler_data.transform(X_test)

fig = plt.figure(figsize=(10, 4))
sns.distplot(X_train_scaled);

## **3-The network architecture**

In [None]:
# import the necessary libraries
from tensorflow import keras
from tensorflow.keras import layers

# Baseline fully connected network: one hidden ReLU layer of 512 units followed
# by a single sigmoid output unit.
model = keras.Sequential()
model.add(layers.Dense(512, activation="relu"))
model.add(layers.Dense(1, activation="sigmoid"))

## **4-The compilation step**

In [None]:
# Binary cross-entropy pairs with the single sigmoid output; accuracy is tracked as the metric.
model.compile(optimizer="rmsprop", loss="binary_crossentropy", metrics=["accuracy"])

## **5-"Fitting" the model**

In [None]:
# Train for 12 epochs in batches of 128 and keep the per-epoch metrics in `history`.
# NOTE(review): the test split is passed as validation data, so the validation
# curves and the final test score come from the same rows — confirm that is intended.
history = model.fit(X_train_scaled, y_train, validation_data=(X_test_scaled, y_test), epochs=12, batch_size=128)

## **6-Evaluate the model**

In [None]:
# evaluate() returns [loss, accuracy] for the metrics compiled above; keep the accuracy.
train_acc = model.evaluate(X_train_scaled, y_train, verbose=0)[1]
test_acc = model.evaluate(X_test_scaled, y_test, verbose=0)[1]
print('Train: %.3f, Test: %.3f' % (train_acc, test_acc))

## **7-Plot loss & accuracy during training**

In [None]:
# plot loss during training
# Training curves: loss in the top panel, accuracy in the bottom panel.
fig, (loss_ax, acc_ax) = plt.subplots(2, 1)

# loss per epoch
loss_ax.set_title('Loss')
loss_ax.plot(history.history['loss'], label='train')
loss_ax.plot(history.history['val_loss'], label='test')
loss_ax.legend()

# accuracy per epoch
acc_ax.set_title('Accuracy')
acc_ax.plot(history.history['accuracy'], label='train')
acc_ax.plot(history.history['val_accuracy'], label='test')
acc_ax.legend()
plt.show()

# **step 4 : 1D Convolutional Neural Network**

## **1-Fit and Evaluate a model**

In [None]:
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Flatten
from keras.layers import Dropout
from keras.layers import Conv1D
from keras.layers import MaxPooling1D

def evaluate_model(trainX, trainy, testX, testy, epochs=12, batch_size=128):
    """Train a small 1D CNN on the training data and return its test accuracy.

    Parameters
    ----------
    trainX, testX : array-like of shape (n_samples, n_columns)
        Feature matrices. Each row is treated as a sequence of ``n_columns``
        steps with a single channel.
    trainy, testy : array-like of shape (n_samples,)
        Binary targets matching the single sigmoid output.
    epochs : int, default 12
        Number of training epochs.
    batch_size : int, default 128
        Batch size used for both fitting and evaluation.

    Returns
    -------
    float
        Accuracy reported by ``model.evaluate`` on the test data (a fraction,
        not a percentage).
    """
    # Coerce to ndarray so DataFrames are accepted too, then add a trailing
    # channel axis: (samples, columns) -> (samples, columns, 1) for Conv1D.
    trainX = np.asarray(trainX)
    testX = np.asarray(testX)
    trainX = trainX.reshape(trainX.shape[0], trainX.shape[1], 1)
    testX = testX.reshape(testX.shape[0], testX.shape[1], 1)
    n_timesteps, n_features = trainX.shape[1], trainX.shape[2]

    model = Sequential()
    model.add(Conv1D(filters=11, kernel_size=3, activation='relu', input_shape=(n_timesteps,n_features)))
    model.add(Conv1D(filters=64, kernel_size=3, activation='relu'))
    model.add(Dropout(0.5))
    model.add(MaxPooling1D(pool_size=2))
    model.add(Flatten())
    model.add(Dense(100, activation='relu'))
    model.add(Dense(1, activation='sigmoid'))
    model.compile(loss='binary_crossentropy', optimizer='rmsprop', metrics=['accuracy'])

    # fit network (silently; the caller prints the resulting score)
    model.fit(trainX, trainy, epochs=epochs, batch_size=batch_size, verbose=0)
    # evaluate model on the held-out data
    _, accuracy = model.evaluate(testX, testy, batch_size=batch_size, verbose=0)
    return accuracy

## **2-Summarize Scores**

In [None]:
from numpy import mean
from numpy import std

def summarize_results(scores):
    """Print the raw scores, then their mean and standard deviation.

    ``scores`` is printed as-is; the summary line shows the mean followed by a
    percent sign and the standard deviation, both with three decimals.
    """
    print(scores)
    average = mean(scores)
    spread = std(scores)
    print('Accuracy: %.3f%% (+/-%.3f)' % (average, spread))

## **3-Run an Experiment**

In [None]:
def run_experiment(repeats=10):
    """Train and score the CNN ``repeats`` times, then print a summary.

    Each run calls ``evaluate_model`` on the scaled train/test splits defined at
    notebook level, prints the run's accuracy as a percentage and finally hands
    all percentages to ``summarize_results``.
    """
    scores = []
    for run_number in range(1, repeats + 1):
        # evaluate_model returns a fraction; report it as a percentage.
        accuracy_pct = evaluate_model(X_train_scaled, y_train, X_test_scaled, y_test) * 100.0
        print('>#%d: %.3f' % (run_number, accuracy_pct))
        scores.append(accuracy_pct)
    # summarize results
    summarize_results(scores)

# Run the experiment with the default number of repeats (10 train/evaluate cycles).
run_experiment()