# 1. Cleaning

In [315]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler 

In [316]:
sc = StandardScaler()
df = pd.read_csv('Credit_Card.csv')
label_df = pd.read_csv('Credit_card_label.csv')
cc_df = pd.merge(df, label_df, on='Ind_ID')
cc_df.dtypes # if its not an int or float its categorical (probably)

Ind_ID               int64
GENDER              object
Car_Owner           object
Propert_Owner       object
CHILDREN             int64
Annual_income      float64
Type_Income         object
EDUCATION           object
Marital_status      object
Housing_type        object
Birthday_count     float64
Employed_days        int64
Mobile_phone         int64
Work_Phone           int64
Phone                int64
EMAIL_ID             int64
Type_Occupation     object
Family_Members       int64
label                int64
dtype: object

In [317]:
empty_count = cc_df.isnull().sum()
empty_columns = empty_count[empty_count > 0]
empty_columns
# dropping occupation because too many values are empty
cc_df = cc_df.drop('Type_Occupation', axis='columns')

In [318]:
# i want to see how many unique values are in the categorical columns
for col in cc_df:
    if cc_df[col].dtype == object:
        print(col, cc_df[col].nunique())

GENDER 2
Car_Owner 2
Propert_Owner 2
Type_Income 4
EDUCATION 5
Marital_status 5
Housing_type 6


In [319]:
# give all NaN values an appropriate replacement
def fill_nan(df: pd.DataFrame, col: str, tendency: str) -> pd.Series:
    '''
    takes in a dataframe, column, and what value to use in place of NaN values
    '''
    if tendency == 'mean':
        df[col] = df[col].fillna(df[col].mean())
    elif tendency == 'median':
        df[col] = df[col].fillna(df[col].median())
    elif tendency == 'mode':
        df[col] = df[col].fillna(df[col].mode())

# encode all categorical values
def encode_cat(df: pd.DataFrame, col: str) -> pd.Series:
    '''
    takes in a dataframe and a categorical column and loops through all unique values. 
    if the value hasnt already been encoded, it uses the current value as a key and assigns a numerical value to it
    '''
    val_map = {}
    code = 0
    for value in df[col].unique():
        if value not in val_map:
            val_map[value] = code
            code+=1
    df[col] = df[col].map(val_map)

fill_nan(cc_df, 'GENDER', 'mode')
fill_nan(cc_df, 'Annual_income', 'median')
fill_nan(cc_df, 'Birthday_count', 'mean')
encode_cat(cc_df, 'GENDER')
encode_cat(cc_df, 'Car_Owner')
encode_cat(cc_df, 'Propert_Owner')
encode_cat(cc_df, 'Type_Income')
encode_cat(cc_df, 'EDUCATION')
encode_cat(cc_df, 'Marital_status')
encode_cat(cc_df, 'Housing_type')


# 2. Univariate Linear Regression 
using Annual_Income to predict if a credit card application is approved

In [320]:
def linear_regression(df: pd.DataFrame, X_col: str, y_col: str) -> float:
    
    X = np.array(df[X_col]).reshape(-1,1)
    y = np.array(df[y_col]).reshape(-1,1)
    
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
    linear_reg_model = LinearRegression()
    linear_reg_model.fit(X_train, y_train)
    y_pred = linear_reg_model.predict(X_test)
    
    r2 = linear_reg_model.score(X_test, y_test)
    return r2
linear_regression(cc_df, 'Annual_income', 'label')

-0.01587415490612476

A score of -0.016 means the model performed very poorly. The problem stems from linear regression being designed for linear, continuous values, and in this context we're trying to predict a nonlinear, discrete outcome. So, linear regression isn't suitable for this dataset.  

# 3. KNN

In [321]:
def optimal_knn() -> float:
    '''
    we can find the optimal k value by calculating the accuracy score for each value of k 
    in a predetermined range and returning the value that gives us the best score
    we can then use that value to perform KNN 
    '''
    X = cc_df.drop('label', axis=1)
    y = cc_df['label']
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
    
    high_score = 0
    opt_k = 0
    for k in range(1, 20):
        knn = KNeighborsClassifier(n_neighbors = k)
        knn.fit(X_train, y_train)
        y_pred = knn.predict(X_test)
        score = knn.score(X_test, y_test)
        if score > high_score:
            high_score = score
            opt_k = k 
    return high_score, opt_k
    
optimal_knn()

(0.9075268817204301, 2)

The KNN model performed extremely well with an accuracy score of 0.91. 

# 4. Logistic Regression

In [322]:
def logistic_regression() -> float:
    X = cc_df.drop('label', axis=1)
    y = cc_df['label']
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
    
    regression = LogisticRegression(random_state=42).fit(X_train, y_train)
    r2=regression.score(X_test, y_test)
    return r2
    
logistic_regression()

0.9075268817204301

The logistic regression model had the same outcome as KNN with a score of 0.91. both models made mostly accurate predictions.

# 5. Normalization

In [323]:
def linear_regression(df: pd.DataFrame, X_col: str, y_col: str) -> float:
    
    X = np.array(df[X_col]).reshape(-1,1)
    y = np.array(df[y_col]).reshape(-1,1)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
    
    X_train = sc.fit_transform(X_train) 
    X_test = sc.fit_transform(X_test)
    
    linear_reg_model = LinearRegression()
    linear_reg_model.fit(X_train, y_train)
    y_pred = linear_reg_model.predict(X_test)
    
    r2 = linear_reg_model.score(X_test, y_test)
    return r2

In [324]:
def optimal_knn() -> float:
    X = cc_df.drop('label', axis=1)
    y = cc_df['label']
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
    
    X_train = sc.fit_transform(X_train) 
    X_test = sc.fit_transform(X_test)
    
    high_score = 0
    opt_k = 0
    for k in range(1, 20):
        knn = KNeighborsClassifier(n_neighbors = k)
        knn.fit(X_train, y_train)
        y_pred = knn.predict(X_test)
        score = knn.score(X_test, y_test)
        if score > high_score:
            high_score = score
            opt_k = k 
    return high_score, opt_k

In [325]:
def logistic_regression() -> float:
    X = cc_df.drop('label', axis=1)
    y = cc_df['label']
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

    X_train = sc.fit_transform(X_train) 
    X_test = sc.fit_transform(X_test)
    
    regression = LogisticRegression(random_state=42).fit(X_train, y_train)
    r2=regression.score(X_test, y_test)
    return r2

In [326]:
linear_regression(cc_df, 'Annual_income', 'label'), optimal_knn(), logistic_regression()

(-0.01704379390288069, (0.9096774193548387, 2), 0.9075268817204301)

After normalizing the data, the following things happened:
 - linear regression had slightly lower accuracy

 - KNN had slightly higher accuracy


 - logistic regression had the same accuracy score

I think accuracy is the best measure of performance for this dataset because our goal is to *accurately* predict who had their application approved. Precision and recall are less important here because the stakes are low for false predictions.
