In [1]:
#import necessary libraries like Pandas and NumPy.
import pandas as pd
import numpy as np
from datetime import date
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score
import warnings
warnings.simplefilter("ignore", UserWarning)

In [2]:
#reads the CSV file using the pd.read_csv() function and assigns it to the data variable
df1 = pd.read_csv("C:\\Users\\Akon.usoh\\Desktop\\PERSONAL\\2023 PROJECTS\\Credit_Card_Prediction.csv")

In [3]:
df = df1.copy()

We need to create a copy of the df1 DataFrame and assign it to df because if we modify df directly without creating a copy, it will also modify the original df1 DataFrame. This can be problematic if we need to compare the modified and original DataFrames or if we need to run additional analysis on the original data. Creating a copy ensures that any changes we make to the new df DataFrame do not affect the original df1 DataFrame.

In [4]:
df.head()

Unnamed: 0,Name,Gender,Phone_Number,Email,Date_of_birth,Occupation,Income,Credit_Score,Credit_Limit,Credit_Card_Balance,Credit_Card_Type
0,Maxwell Dwyer,Male,7-372-875-5034,Maxwell_Dwyer3033@urn0m.center,1/23/1994,IT Support Staff,75563674.03,744,181182,46074,MasterCard
1,Trisha Redden,Female,4-683-143-6436,Trisha_Redden4518@3wbkp.zone,02/02/1996,Retail Trainee,15546374.91,301,62491,146166,VisaGold
2,Chuck Watson,Male,2-112-046-5848,Chuck_Watson4904@urn0m.com,04/06/1982,Steward,46203873.24,835,490051,78059,VisaPlatinum
3,Meredith Bryant,Female,8-742-122-4222,Meredith_Bryant1962@xqj6f.app,04/04/1971,Doctor,56957946.95,331,322623,174758,VisaInfinite
4,Brad Poulton,Male,8-381-385-4144,Brad_Poulton3135@ag5wi.design,12/08/1973,Stockbroker,69927678.15,626,232465,63160,MasterCard


In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 11 columns):
 #   Column               Non-Null Count   Dtype  
---  ------               --------------   -----  
 0   Name                 100000 non-null  object 
 1   Gender               100000 non-null  object 
 2   Phone_Number         100000 non-null  object 
 3   Email                100000 non-null  object 
 4   Date_of_birth        100000 non-null  object 
 5   Occupation           100000 non-null  object 
 6   Income               100000 non-null  float64
 7   Credit_Score         100000 non-null  int64  
 8   Credit_Limit         100000 non-null  int64  
 9   Credit_Card_Balance  100000 non-null  int64  
 10  Credit_Card_Type     100000 non-null  object 
dtypes: float64(1), int64(3), object(7)
memory usage: 8.4+ MB


In [6]:
df.shape

(100000, 11)

## DATA CLEANING

#### Drop columns not needed for prediction

The 'Name', 'Phone Number', and 'Email' columns will be dropped from the dataset because they are categorical features that do not provide any useful information for the machine learning model to predict credit card spending. 

The name, phone number, and email address of a customer are unique identifiers, but they do not provide any information that is relevant to the credit card spending patterns of customers.

In machine learning, it is important to remove irrelevant or redundant features from the dataset, as they can negatively impact the accuracy of the model. 

Therefore, it is common practice to drop categorical columns that do not provide useful information for the prediction task.

In [7]:
# Drop columns that are not needed for prediction
df = df.drop(['Name', 'Phone_Number', 'Email'], axis=1)

## FEATURE ENGINEERING
Feature engineering involves creating new features or transform existing ones to improve the performance of the model.

#### Convert Date of Birth column to Age

We converted the 'Date of Birth' column to 'Age' because age is a more meaningful and relevant feature in predicting credit card spending. Knowing a customer's age provides useful information such as their income level, spending habits, and creditworthiness. In addition, age is a continuous variable that can be easily used by machine learning models to make accurate predictions. On the other hand, the 'Date of Birth' column contains a large number of unique values that would be difficult for a machine learning model to use effectively.

In [8]:
# Convert the 'Date of birth' column to a datetime format
df['Date_of_birth'] = pd.to_datetime(df['Date_of_birth'], format='%m/%d/%Y')

# Create a new column 'Age' by subtracting the birth year from the current year
df['Age'] = pd.datetime.now().year - df['Date_of_birth'].dt.year

#Drop the date of birth column
df = df.drop('Date_of_birth', axis=1)

  df['Age'] = pd.datetime.now().year - df['Date_of_birth'].dt.year


### The rationale for selecting the below features is as follows:

Gender: Gender could play a role in spending behavior, as different genders may have different spending patterns.

Occupation: Occupation could be an indicator of income level and spending behavior. For example, a high-income earner may have a higher credit card balance than a low-income earner.

Income: Income could be a strong predictor of credit card balance, as individuals with higher incomes may have a higher credit card balance.

Credit_Score: Credit score is an important factor that affects credit card balance. Higher credit scores indicate better creditworthiness, which could lead to higher credit limits and higher balances.

Credit_Limit: Credit limit could be a strong predictor of credit card balance, as individuals with higher credit limits may have a higher credit card balance.

Credit_Card_Type: Credit card type could be an important factor, as different credit cards may have different rewards programs, interest rates, or other features that could affect an individual's spending behavior.

Age: Age could be a factor in spending behavior, as younger individuals may have different spending patterns compared to older individuals.

In [9]:
df.head()

Unnamed: 0,Gender,Occupation,Income,Credit_Score,Credit_Limit,Credit_Card_Balance,Credit_Card_Type,Age
0,Male,IT Support Staff,75563674.03,744,181182,46074,MasterCard,29
1,Female,Retail Trainee,15546374.91,301,62491,146166,VisaGold,27
2,Male,Steward,46203873.24,835,490051,78059,VisaPlatinum,41
3,Female,Doctor,56957946.95,331,322623,174758,VisaInfinite,52
4,Male,Stockbroker,69927678.15,626,232465,63160,MasterCard,50


## DATA PREPROCESSING

### Convert categorical variables (Gender, Occupation, Credit Card Type) to numerical

In machine learning algorithms, mathematical operations are performed on the data to train a model and make predictions. 

Most machine learning models are designed to work with numerical data, and cannot directly work with categorical data.

To address this issue, categorical data must be converted into numerical form. There are different techniques to do this conversion such as one-hot encoding, ordinal encoding, and label encoding. 

In this case, label encoding was used to convert the categorical variables (Gender, Occupation, Credit Card Type) to numerical form.

Label encoding assigns a unique integer to each category in a feature. This allows the algorithm to treat the categorical data as numerical data and perform mathematical operations on them. 

For example, in the Gender feature, 'Male' was assigned the integer value of 1 and 'Female' was assigned the integer value of 0. In the Occupation feature, each occupation was assigned a unique integer. 

This encoding makes it easier for the algorithm to recognize patterns and make predictions.

In [10]:
# Preprocessing
le = LabelEncoder()

# perform label encoding on categorical features
df['Gender'] = le.fit_transform(df['Gender'])
df['Occupation'] = le.fit_transform(df['Occupation'])
df['Credit_Card_Type'] = le.fit_transform(df['Credit_Card_Type'])

In [11]:
# select features and target
features = ['Income', 'Credit_Score', 'Credit_Limit', 'Age', 'Gender', 'Occupation', 'Credit_Card_Type']
target = 'Credit_Card_Balance'

## Scale the data
We scale data to ensure that all the features are on a similar scale, which is important for many machine learning algorithms. If the features are not on the same scale, some features will have a larger impact on the prediction than others, simply because of their larger magnitude, even if they are not necessarily more important.

For example, if one feature is measured in millions and another in tens or hundreds, the feature measured in millions will have a larger influence on the prediction, even if the other feature is more relevant. Scaling helps to avoid this issue by bringing all the features to a similar scale, so that they can be given equal weight in the prediction process.

In [12]:
#Preprocessing
# create a scaler object
scaler = StandardScaler()

# scale the numerical features
df[features] = scaler.fit_transform(df[features])

## MODEL EVALUATION

In [13]:
# Decision tree regression
tree_reg = DecisionTreeRegressor(random_state=42)
tree_reg.fit(df[features], df[target])

# make predictions on the entire dataset
y_pred_tree = tree_reg.predict(df[features])

# Evaluate the model
print("Decision Tree Regression:")
print("R2 Score:", r2_score(df[target], y_pred_tree))

Decision Tree Regression:
R2 Score: 1.0


This is an over-fitting problem; There are several things we can try to improve the performance of a model:

Feature selection: We can try selecting a subset of the most important features that have the highest correlation with the target variable.

Feature engineering: We can create new features from the existing ones that may better capture the relationship with the target variable. For example, you can create polynomial features, interaction terms, or apply other transformations to the existing features.

Hyperparameter tuning: We can try different hyperparameters for your regression model to optimize its performance. For example, you can try different values for the regularization parameter, the learning rate, or the number of trees in the random forest model.

Try a different algorithm: Sometimes, a different regression algorithm may work better for your dataset. In this case we will try Linear Regression and Random Forest Regression.

Increase the dataset size: Collecting more data can help improve the performance of your model, especially if the current dataset is small.

Address outliers: Outliers in the data can have a large impact on the performance of the model. We can try removing them or transforming them to reduce their impact.

In [14]:
# Linear regression
## create a linear regression model
lin_reg = LinearRegression()

# train the model on the training data
lin_reg.fit(df[features], df[target])

# make predictions on the entire dataset
y_pred_lin = lin_reg.predict(df[features])

# Evaluate the model
print("Linear Regression:")
print("R2 Score:", r2_score(df[target], y_pred_lin))

Linear Regression:
R2 Score: 7.69848906676529e-05


In [15]:
# Random forest regression
forest_reg = RandomForestRegressor(n_estimators=100, random_state=42)
forest_reg.fit(df.drop(target, axis=1), df[target])
y_pred_forest = forest_reg.predict(df.drop(target, axis=1))


# Evaluate the model
print("Random Forest Regression:")
print("R2 Score:", r2_score(df[target], y_pred_forest))

Random Forest Regression:
R2 Score: 0.8554950733467367


Different model options were used to evaluate the performance of different algorithms and choose the best one that gives the most accurate predictions. 

Each model has its own strengths and weaknesses, and the choice of the best model often depends on the specific problem and the characteristics of the dataset. 

By comparing the performance of different models, we can select the one that is most suitable for the problem at hand.

In this scenarion, the most performing model is Random Forest Regression as it has a lower R2 Score.

In [16]:
y_pred_forest

array([ 58601.63, 119632.16,  96722.02, ...,  51950.17,  28440.28,
       106065.87])

In [17]:
# create a dataframe with the predictions
df1['Credit_Card_Spending'] = pd.DataFrame(y_pred_forest)

# Add predicted values to original dataframe
#df['Predicted_Spending'] = y_pred

In [18]:
df1.isnull().sum()

Name                    0
Gender                  0
Phone_Number            0
Email                   0
Date_of_birth           0
Occupation              0
Income                  0
Credit_Score            0
Credit_Limit            0
Credit_Card_Balance     0
Credit_Card_Type        0
Credit_Card_Spending    0
dtype: int64

In [19]:
df1.head(5)

Unnamed: 0,Name,Gender,Phone_Number,Email,Date_of_birth,Occupation,Income,Credit_Score,Credit_Limit,Credit_Card_Balance,Credit_Card_Type,Credit_Card_Spending
0,Maxwell Dwyer,Male,7-372-875-5034,Maxwell_Dwyer3033@urn0m.center,1/23/1994,IT Support Staff,75563674.03,744,181182,46074,MasterCard,58601.63
1,Trisha Redden,Female,4-683-143-6436,Trisha_Redden4518@3wbkp.zone,02/02/1996,Retail Trainee,15546374.91,301,62491,146166,VisaGold,119632.16
2,Chuck Watson,Male,2-112-046-5848,Chuck_Watson4904@urn0m.com,04/06/1982,Steward,46203873.24,835,490051,78059,VisaPlatinum,96722.02
3,Meredith Bryant,Female,8-742-122-4222,Meredith_Bryant1962@xqj6f.app,04/04/1971,Doctor,56957946.95,331,322623,174758,VisaInfinite,157589.05
4,Brad Poulton,Male,8-381-385-4144,Brad_Poulton3135@ag5wi.design,12/08/1973,Stockbroker,69927678.15,626,232465,63160,MasterCard,82754.83


In [20]:
# categorize the predicted spending into risk categories
risk_thresh = np.quantile(df1['Credit_Card_Spending'], [0.25, 0.75])
low_risk_thresh, high_risk_thresh = risk_thresh

df1['Risk_Category'] = np.where(df1['Credit_Card_Spending'] <= low_risk_thresh, 'Low Risk',
                               np.where(df1['Credit_Card_Spending'] > high_risk_thresh, 'High Risk', 'Medium Risk'))

This is categorizing the predicted spending into risk categories based on their distribution.

First, the code calculates the lower and upper quartiles of the predicted spending values using the np.quantile() function. These quartiles are then used to define the risk thresholds for categorizing the predicted spending values.

Next, the np.where() function is used to categorize the predicted spending values into three risk categories - 'Low Risk', 'Medium Risk', and 'High Risk'. If the predicted spending value is less than or equal to the lower risk threshold, it is categorized as 'Low Risk'. If it is greater than the higher risk threshold, it is categorized as 'High Risk'. Otherwise, it is categorized as 'Medium Risk'.

Overall, this is useful in assigning risk categories to the predicted spending values, which can help with further analysis and decision-making.

In [21]:
df1.head(5)

Unnamed: 0,Name,Gender,Phone_Number,Email,Date_of_birth,Occupation,Income,Credit_Score,Credit_Limit,Credit_Card_Balance,Credit_Card_Type,Credit_Card_Spending,Risk_Category
0,Maxwell Dwyer,Male,7-372-875-5034,Maxwell_Dwyer3033@urn0m.center,1/23/1994,IT Support Staff,75563674.03,744,181182,46074,MasterCard,58601.63,Low Risk
1,Trisha Redden,Female,4-683-143-6436,Trisha_Redden4518@3wbkp.zone,02/02/1996,Retail Trainee,15546374.91,301,62491,146166,VisaGold,119632.16,Medium Risk
2,Chuck Watson,Male,2-112-046-5848,Chuck_Watson4904@urn0m.com,04/06/1982,Steward,46203873.24,835,490051,78059,VisaPlatinum,96722.02,Medium Risk
3,Meredith Bryant,Female,8-742-122-4222,Meredith_Bryant1962@xqj6f.app,04/04/1971,Doctor,56957946.95,331,322623,174758,VisaInfinite,157589.05,High Risk
4,Brad Poulton,Male,8-381-385-4144,Brad_Poulton3135@ag5wi.design,12/08/1973,Stockbroker,69927678.15,626,232465,63160,MasterCard,82754.83,Medium Risk


We will create a new column called Credit_Rating in the given dataset based on the values in the Credit_Score column. 
The Credit_Rating column is a categorical variable that indicates the credit rating of the individual based on their Credit_Score. 
The credit ratings are defined as follows:

Very poor: 300 to 579

Fair: 580 to 669

Good: 670 to 739

Very good: 740 to 799

Excellent: 800 to 850

In [22]:
# create the Credit_Rating column
credit_ratings = []
for score in df1['Credit_Score']:
    if 300 <= score <= 579:
        credit_ratings.append('Very poor')
    elif 580 <= score <= 669:
        credit_ratings.append('Fair')
    elif 670 <= score <= 739:
        credit_ratings.append('Good')
    elif 740 <= score <= 799:
        credit_ratings.append('Very good')
    else:
        credit_ratings.append('Excellent')
        
df1['Credit_Rating'] = credit_ratings

In [23]:
df1.head(5)

Unnamed: 0,Name,Gender,Phone_Number,Email,Date_of_birth,Occupation,Income,Credit_Score,Credit_Limit,Credit_Card_Balance,Credit_Card_Type,Credit_Card_Spending,Risk_Category,Credit_Rating
0,Maxwell Dwyer,Male,7-372-875-5034,Maxwell_Dwyer3033@urn0m.center,1/23/1994,IT Support Staff,75563674.03,744,181182,46074,MasterCard,58601.63,Low Risk,Very good
1,Trisha Redden,Female,4-683-143-6436,Trisha_Redden4518@3wbkp.zone,02/02/1996,Retail Trainee,15546374.91,301,62491,146166,VisaGold,119632.16,Medium Risk,Very poor
2,Chuck Watson,Male,2-112-046-5848,Chuck_Watson4904@urn0m.com,04/06/1982,Steward,46203873.24,835,490051,78059,VisaPlatinum,96722.02,Medium Risk,Excellent
3,Meredith Bryant,Female,8-742-122-4222,Meredith_Bryant1962@xqj6f.app,04/04/1971,Doctor,56957946.95,331,322623,174758,VisaInfinite,157589.05,High Risk,Very poor
4,Brad Poulton,Male,8-381-385-4144,Brad_Poulton3135@ag5wi.design,12/08/1973,Stockbroker,69927678.15,626,232465,63160,MasterCard,82754.83,Medium Risk,Fair


In [24]:
#Export the dataframe to a csv file to build a dashboard.
df1.to_csv('real_credit_card_predictions.csv', index=False)

You can find the visualization on my Dashboard and the project proposal document details my thought process for this project. Feel free to reach out to me via LinkedIn if you have any questions or need additional clarification.

<a href="https://linkedin.com/in/akonusoh">LinkedIn Page</a>