In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from datetime import datetime
from scipy.stats import t
from sklearn.decomposition import PCA, IncrementalPCA, TruncatedSVD
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import RidgeClassifierCV, LogisticRegression
from sklearn.preprocessing import Imputer
from sklearn.impute import SimpleImputer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC

In [None]:
# Display settings for the whole notebook: floats are rendered with three
# decimal places, and up to 200 rows / 200 columns are shown untruncated.
pd.set_option('display.max_rows', 200)
pd.set_option('display.max_columns', 200)
pd.set_option('display.float_format', '{:.3f}'.format)

# Let's try out some stuff on a super small dataframe

In [None]:
# Toy data: one list per column (the first row has no Age).
data = {
    'Name': ['Tom', 'nick', 'krish', 'nan'],
    'Age': [np.nan, 21, 19, 22],
    'Sex': ['M', 'M', 'F', 'M'],
    'Footwear': ['Sandals', 'Shoes', 'Sandals', 'Shoes'],
}

# Build the four-row DataFrame from the dict.
df_sample = pd.DataFrame(data)

# Show it as the cell output.
df_sample

### get rid of nulls

In [None]:
imp = SimpleImputer(strategy='most_frequent')

In [None]:
imp.fit(df_sample)

In [None]:
df_imp = imp.transform(df_sample)

In [None]:
df_imp

### split the dataframe into X and y

In [None]:
# Pull the 'Footwear' column out as its own Series and display it.
# NOTE(review): scikit-learn's convention is X = features, y = target; here the
# single column is the one named X_df — confirm which side is meant to be the label.
X_df = df_sample['Footwear']
X_df

In [None]:
# Every column of the toy frame except 'Footwear'.
y_df = df_sample.drop(columns=['Footwear'])
y_df

In [None]:
# One-hot encode the non-numeric columns of y_df; the first level of each
# encoded column is dropped.
df_sample_dummies = pd.get_dummies(y_df, drop_first=True, prefix_sep='_')

# Plain-text dump of the encoded frame
print(df_sample_dummies)

In [None]:
# Same encoding applied to the full toy frame, with the dummy columns stored
# as sparse arrays.
df_sample_sparse_dummies = pd.get_dummies(
    df_sample,
    sparse=True,
    drop_first=True,
    prefix_sep='_',
)

# Plain-text dump of the encoded frame
print(df_sample_sparse_dummies)

In [None]:
df_sample_dummies.info()

In [None]:
df_sample_sparse_dummies.info()

In [None]:
from sklearn.cluster import KMeans

# KMeans rejects NaN, and df_sample_sparse_dummies was built from df_sample,
# which still carries the missing 'Age' value. Fill the gap with the same
# most-frequent strategy used earlier in the notebook before clustering.
kmeans_input = SimpleImputer(strategy='most_frequent').fit_transform(
    df_sample_sparse_dummies.to_numpy(dtype=float)
)

# Sum of squared errors (KMeans inertia) for each candidate number of clusters
sse = {}
for k in range(1, 5):
    # Fixed random_state so repeated runs give the same curve
    kmeans = KMeans(n_clusters=k, random_state=1)
    kmeans.fit(kmeans_input)
    sse[k] = kmeans.inertia_

# Elbow plot: SSE against k
sns.pointplot(x=list(sse.keys()), y=list(sse.values()))
plt.title('The Elbow Method')
plt.xlabel('k')
plt.ylabel('SSE')
plt.show()

In [None]:
# PCA does not accept NaN, and df_sample_sparse_dummies still contains the
# missing 'Age' value from df_sample, so impute (most frequent value) first.
pca = PCA(n_components=2)
y_pca = pca.fit_transform(
    SimpleImputer(strategy='most_frequent').fit_transform(
        df_sample_sparse_dummies.to_numpy(dtype=float)
    )
)

In [None]:
y_pca

# Let's try it on the big dataset

In [None]:
# Load the loan data: the first CSV column becomes the index and the listed
# columns are parsed as dates.
df = pd.read_csv(
    'Data/Loan_Storied.csv',
    index_col=0,
    low_memory=False,
    parse_dates=[
        'issue_d',
        'earliest_cr_line',
        'last_pymnt_d',
        'next_pymnt_d',
        'last_credit_pull_d',
        'debt_settlement_flag_date',
        'settlement_date',
        'sec_app_earliest_cr_line',
    ],
)

In [None]:
df.head()

In [None]:
df.info()

We need to get rid of columns that contain information that happens after a loan has been given out, such as hardships.  This is really an important step and it also helps us downsize our dataset a little bit.  

In [None]:
# Columns removed before modelling (see the markdown note above this cell).
drop_list = [
    'collection_recovery_fee',
    'debt_settlement_flag',
    'debt_settlement_flag_date',
    'deferral_term',
    'delinq_amnt',
    'hardship_amount',
    'hardship_dpd',
    'hardship_end_date',
    'hardship_flag',
    'hardship_last_payment_amount',
    'hardship_length',
    'hardship_loan_status',
    'hardship_payoff_balance_amount',
    'hardship_reason',
    'hardship_start_date',
    'hardship_status',
    'hardship_type',
    'last_pymnt_amnt',
    'last_pymnt_d',
    'next_pymnt_d',
    'orig_projected_additional_accrued_interest',
    'out_prncp',
    'out_prncp_inv',
    'payment_plan_start_date',
    'pymnt_plan',
    'recoveries',
    'settlement_amount',
    'settlement_date',
    'settlement_percentage',
    'settlement_status',
    'settlement_term',
    'total_pymnt',
    'total_pymnt_inv',
    'total_rec_int',
    'total_rec_late_fee',
    'total_rec_prncp',
    'desc',
    'zip_code',
    'title',
    'emp_title',
]
df = df.drop(columns=drop_list)

### And now let's change the dates to numeric day counts by taking their difference from the issue date

In [None]:
# Replace absolute dates with whole-day gaps measured against the issue date.
df['issue_d_minus_earliest_cr_line'] = df['issue_d'].sub(df['earliest_cr_line']).dt.days
df['last_credit_pull_d_minus_issue_d'] = df['last_credit_pull_d'].sub(df['issue_d']).dt.days
df['issue_d_minus_sec_app_earliest_cr_line'] = df['issue_d'].sub(df['sec_app_earliest_cr_line']).dt.days

### Drop the date fields

In [None]:
# The raw date columns are no longer needed once the day gaps exist.
dt_drop_list = [
    'issue_d',
    'earliest_cr_line',
    'last_credit_pull_d',
    'sec_app_earliest_cr_line',
]
df = df.drop(columns=dt_drop_list)
df.head()

### Convert Default to Charged Off since it is the same outcome 

In [None]:
# Relabel every 'Default' row as 'Charged Off'; all other statuses are untouched.
df.loc[df['loan_status'] == 'Default', 'loan_status'] = 'Charged Off'
df['loan_status'].unique()

# Get rid of null values

In [None]:
df.isnull().sum()

In [None]:
# Frequency table (NaN included) for every column of the frame.
for col, column_values in df.items():
    print(column_values.value_counts(dropna=False))

In [None]:
# Frequency table (NaN included) for each object-dtype column, in name order.
obj_list = sorted(df.select_dtypes(include=['object']).columns)
for obj_col in obj_list:
    print(df[obj_col].value_counts(dropna=False))

In [None]:
imp = SimpleImputer(strategy='most_frequent')

In [None]:
# SimpleImputer.fit_transform returns a bare NumPy array, but the result is
# later written out with `.to_csv`, which only exists on pandas objects, so
# wrap it back into a DataFrame. Columns with no observed value at all get a
# NaN statistic and are discarded by the imputer, so only the labels of the
# surviving columns are reattached.
imputed_values = imp.fit_transform(df)
kept_columns = df.columns[~pd.isnull(imp.statistics_)]
df_imp = pd.DataFrame(
    imputed_values, index=df.index, columns=kept_columns
).infer_objects()  # the imputer hands back an object array; recover numeric dtypes

In [None]:
#df_imp = imp.transform(df)

In [None]:
# Write the imputed data out to CSV.
# NOTE(review): `.to_csv` is a pandas method, while SimpleImputer.fit_transform
# returns a NumPy array by default — `df_imp` has to be a DataFrame by the time
# this cell runs; confirm.
out_csv = 'Data/Loan_noNull.csv'
df_imp.to_csv(out_csv)

### Let's use a Heatmap and see if we can get rid of correlated data

Here we are creating a quick function that will create a heatmap. This heat map will show correlation between columns. 

In [None]:
def heatMap(df, mirror=False, figsize=(100, 100)):
    """Plot an annotated correlation heatmap for the numeric columns of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Data to correlate. Only numeric and boolean columns are used.
    mirror : bool, default False
        When true, draw the full symmetric matrix. Otherwise the diagonal and
        the upper triangle are masked so each pair appears once.
    figsize : tuple of (width, height), default (100, 100)
        Figure size in inches passed to ``plt.subplots``.

    Returns
    -------
    None
        The figure is displayed with ``plt.show()``.
    """
    # Select the numeric columns explicitly: current pandas raises on
    # non-numeric columns in corr() instead of silently skipping them.
    corr = df.select_dtypes(include=['number', 'bool']).corr()

    fig, ax = plt.subplots(figsize=figsize)
    colormap = sns.diverging_palette(220, 10, as_cmap=True)

    mask = None
    if not mirror:
        # True marks a hidden cell: the diagonal (self-correlation) and
        # everything above it.
        mask = np.zeros(corr.shape, dtype=bool)
        mask[np.triu_indices_from(mask)] = True

    # xticklabels/yticklabels=True makes seaborn label every row and column
    # from the frame, centred on the cells. Setting ticks by hand at integer
    # positions put each label on a cell edge instead of its centre.
    sns.heatmap(
        corr,
        cmap=colormap,
        annot=True,
        fmt=".2f",
        mask=mask,
        xticklabels=True,
        yticklabels=True,
        ax=ax,
    )
    plt.show()

In [None]:
heatMap(df)

In [None]:
# Columns dropped after inspecting the correlation heatmap.
corr_drop_list = [
    'funded_amnt',
    'funded_amnt_inv',
    'installment',
    'open_acc',
    'tot_cur_bal',
    'total_bal_il',
    'num_rev_tl_bal_gt_0',
    'mo_sin_old_rev_tl_op',
]
df = df.drop(columns=corr_drop_list)

In [None]:
heatMap(df)

In [None]:
sorted(df)

In [None]:
df.info()

# These are the object datatype variables that will need to be changed to numeric variables.  

In [None]:
# For each object-dtype column (in name order) print its name, the number of
# distinct values, and the distinct values themselves.
obj_list = sorted(df.select_dtypes(include=['object']).columns)
for obj_col in obj_list:
    distinct_values = df[obj_col].unique()
    print(obj_col)
    print(len(distinct_values))
    print(distinct_values)
    print("--------------------------")

In [None]:
# One-hot encode the remaining non-numeric columns, dropping the first level of each.
df = pd.get_dummies(
    df,
    drop_first=True,
    prefix_sep='_',
)

In [None]:
df.info()

In [None]:
df.dtypes

# Split the dataset into X and y datasets

In [None]:
# Keep the single dummy column 'loan_status_Fully Paid' as df_X and preview it.
# NOTE(review): presumably this column is the prediction target; if so, the
# X/y names are the reverse of scikit-learn's X = features, y = target — confirm.
df_X = df['loan_status_Fully Paid']
df_X.head(20)

In [None]:
# df_y is every column of df except 'loan_status_Fully Paid'.
y_drop_list = ['loan_status_Fully Paid']
df_y = df.drop(columns=y_drop_list)
df_y.head()

### Let's split it into Testing and Training datasets
We want to split the data two times. The first split holds out a final dataset for testing our selected model. The second split divides the remainder into a dataset for training the models and selecting hyperparameters, and another dataset for testing that is used to compare the models against one another. Since we need three groups of data, we split the data twice.

In [None]:
# First split: hold out 20% of the rows as the final test set.
X_int_train, X_final_test, y_int_train, y_final_test = train_test_split(
    df_X,
    df_y,
    test_size=0.2,
    random_state=1,
)

In [None]:
# Second split: hold out 20% of the intermediate training rows as the
# model-comparison test set.
X_train, X_model_test, y_train, y_model_test = train_test_split(
    X_int_train,
    y_int_train,
    test_size=0.2,
    random_state=1,
)

In [None]:
X_train.head()

In [None]:
y_train.head()

In [None]:
#param_grid = {'n_neighbors': np.arange(1, 50)}
#knn = KNeighborsClassifier()
#knn_cv = GridSearchCV(knn, param_grid, cv=5)
#knn_cv.fit(X, y)
#knn_cv.best_params_
#knn_cv.best_score_

In [None]:
# Convert to a single-column 2-D array. np.asarray accepts both the original
# Series and an already-converted ndarray, so re-running this cell no longer
# fails with "'numpy.ndarray' object has no attribute 'to_numpy'".
X_train = np.asarray(X_train).reshape(-1, 1)

In [None]:
# Convert to a NumPy array. np.asarray is a no-op on an existing ndarray, so
# this cell can be re-run safely (ndarray has no `.to_numpy`).
y_train = np.asarray(y_train)

In [None]:
y_train[1]

In [None]:
# scikit-learn's fit(X, y) expects the feature matrix first and a 1-D target
# second. In this notebook the names are inverted: `y_train` comes from df_y
# (every column except 'loan_status_Fully Paid') and `X_train` from df_X (that
# one column, reshaped to (n, 1)). Pass them in the order fit() needs and
# flatten the target.
clf = RidgeClassifierCV(alphas=[1e-3, 1e-2, 1e-1, 1]).fit(y_train, np.ravel(X_train))

In [None]:
from sklearn import linear_model
logreg = LogisticRegression()
# fit(X, y) wants features first and a 1-D target second. Here `y_train` holds
# the feature columns (from df_y) and `X_train` the 'loan_status_Fully Paid'
# column (from df_X), so the arguments are passed in that order.
logreg.fit(y_train, np.ravel(X_train))

In [None]:
knn = KNeighborsClassifier(n_neighbors=6)
# `iris` is not defined in this notebook; train on the loan training split like
# the other models. `y_train` holds the feature columns (from df_y) and
# `X_train` the 'loan_status_Fully Paid' column (from df_X), hence the order.
knn.fit(y_train, np.ravel(X_train))
#see the example above about param grid - it is using grid search to determine the best number of neighbors

In [None]:
svclassifier = SVC(kernel='linear')
# fit(X, y) wants features first and a 1-D target second. Here `y_train` holds
# the feature columns (from df_y) and `X_train` the 'loan_status_Fully Paid'
# column (from df_X), so the arguments are passed in that order.
svclassifier.fit(y_train, np.ravel(X_train))

In [None]:
#Import Random Forest Model
from sklearn.ensemble import RandomForestClassifier

#Create a random forest classifier with 100 trees
clf = RandomForestClassifier(n_estimators=100)

#Train the model. fit(X, y) wants features first and a 1-D target second; in
#this notebook `y_train` holds the feature columns (from df_y) and `X_train`
#the 'loan_status_Fully Paid' column (from df_X).
clf.fit(y_train, np.ravel(X_train))

#Predict on the model-comparison split. `X_test` is never defined; the held-out
#feature rows are in `y_model_test` (same inverted naming). They are converted
#to an array to match the array the model was fitted on.
y_pred = clf.predict(np.asarray(y_model_test))

In [None]:
from catboost import CatBoostClassifier, Pool

# Stand-alone toy example (it does not use the loan data): three samples with
# four features each, plus a label and a weight per sample.
train_data = Pool(
    data=[
        [1, 4, 5, 6],
        [4, 5, 6, 7],
        [30, 40, 50, 60],
    ],
    label=[1, 1, -1],
    weight=[0.1, 0.2, 0.3],
)

# Classifier configured with iterations=10
model = CatBoostClassifier(iterations=10)
model.fit(train_data)

# Predictions for the same rows the model was trained on
preds_class = model.predict(train_data)

In [None]:
# Reference example pasted from an IPython session. It is kept as comments
# because the `In [n]:` / `...:` / `Out[n]:` prompts are not valid Python in a
# code cell, and `X` / `y` are not defined in the visible part of this notebook.
#
# from sklearn.metrics import roc_auc_score
# logreg = LogisticRegression()
# X_train, X_test, y_train, y_test = train_test_split(X, y,
#     test_size=0.4, random_state=42)
# logreg.fit(X_train, y_train)
# y_pred_prob = logreg.predict_proba(X_test)[:,1]
# roc_auc_score(y_test, y_pred_prob)
#
# Recorded output: 0.997466216216

In [None]:
# Reference example pasted from an IPython session. It is kept as comments
# because the `In [n]:` / `...:` prompts are not valid Python in a code cell,
# and `X` / `y` are not defined in the visible part of this notebook.
#
# from sklearn.model_selection import cross_val_score
# cv_scores = cross_val_score(logreg, X, y, cv=5,
#     scoring='roc_auc')
# print(cv_scores)
#
# Recorded output: [ 0.99673203 0.99183007 0.99583796 1. 0.96140652]

In [None]:
# `df_y_with_dummies` is never defined in the visible notebook. `df_y` is the
# dummy-encoded frame without 'loan_status_Fully Paid', so it is used here.
# NOTE(review): confirm df_y is the intended input.
pca = PCA()
y_pca = pca.fit_transform(df_y)

In [None]:
# `df_y_with_dummies` is never defined in the visible notebook. `df_y` is the
# dummy-encoded frame without 'loan_status_Fully Paid', so it is used here.
# NOTE(review): confirm df_y is the intended input.
inc_pca = IncrementalPCA()
#inc_pca = IncrementalPCA(n_components=100,copy=False,batch_size=10)
# y_pca = inc_pca.fit_transform(df_y)
inc_pca.fit(df_y)
y_pca = inc_pca.transform(df_y)

In [None]:
# `df_y_with_dummies` is never defined in the visible notebook. `df_y` is the
# dummy-encoded frame without 'loan_status_Fully Paid', so it is used here.
# NOTE(review): confirm df_y is the intended input.
tsvd = TruncatedSVD()
y_tsvd = tsvd.fit_transform(df_y)