## Imports

In [276]:
import math
import numpy as np
import pandas as pd
import requests
import sqlalchemy

from bs4 import BeautifulSoup


import os
from sklearn.preprocessing import  Imputer
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.datasets import load_boston
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score
import sklearn

## Loading Data

In [167]:
# Read the four input tables. Every CSV carries an 'Unnamed: 0' column
# (presumably an index written out by an earlier to_csv — confirm), which is
# dropped immediately after reading.
sql_data = pd.read_csv('data/database/credit_risk_features.csv')
sql_data = sql_data.drop(columns='Unnamed: 0')

api_data = pd.read_csv('data/api/api_data.csv')
api_data = api_data.drop(columns='Unnamed: 0')
# Upper-case the API column names; the merge cell joins on 'SK_ID_CURR'.
api_data.columns = api_data.columns.str.upper()

train = pd.read_csv('data/database/train.csv')
train = train.drop(columns='Unnamed: 0')

# NOTE(review): this file is read from the working directory, unlike the
# other inputs which live under data/ — confirm the path is intended.
files_data = pd.read_csv('data_files.csv')
files_data = files_data.drop(columns='Unnamed: 0')

In [170]:
# Combine the three feature sources on the applicant id. pd.merge defaults to
# an inner join, so only ids present in every source survive.
total = pd.merge(sql_data, api_data, on='SK_ID_CURR')
total = pd.merge(total, files_data, on='SK_ID_CURR')

# Attach the labels; rows without a label are dropped by the inner join.
total2 = pd.merge(total, train, on='SK_ID_CURR', how='inner')
X = total2.drop(columns='TARGET')

# Take the labels from the merged frame rather than from `train` directly.
# train_test_split pairs X and Y by row position, and the merged rows are
# not guaranteed to have the same order (or count) as `train`, so reading Y
# from `train` could silently attach labels to the wrong applicants.
Y = total2.set_index('SK_ID_CURR')[['TARGET']]

In [201]:
# Sanity check: (rows, columns) of the merged feature table.
total.shape

(100000, 102)

## Data Prep (functions)

In [107]:
def drop_cols(df, cols):
    """Index the frame by ``SK_ID_CURR`` and remove the given columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing an ``SK_ID_CURR`` column.
    cols : label or list-like of labels
        Columns to remove after re-indexing.

    Returns
    -------
    pandas.DataFrame
        A new frame; ``df`` itself is left untouched.
    """
    # set_index and drop both return new objects, so the input is never mutated.
    indexed = df.set_index('SK_ID_CURR')
    return indexed.drop(columns=cols)

def fill_NaN(df, numeric_fill=0):
    """Fill missing values in a fixed set of columns and return a new frame.

    ``BASEMENTAREA_AVG`` and ``OWN_CAR_AGE`` are filled with ``numeric_fill``;
    ``EMERGENCYSTATE_MODE`` is filled with the string ``'No'``. Columns that
    are not present in ``df`` are skipped, and no other column is touched.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is not modified.
    numeric_fill : scalar, default 0
        Replacement for missing values in the two numeric columns. The
        previous implementation called ``fillna()`` without a value, which
        always raises, so a concrete fill value has to be chosen here.
        NOTE(review): 0 is an assumption — confirm the intended value.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with the listed columns filled.
    """
    df = df.copy()
    fill_values = {
        'BASEMENTAREA_AVG': numeric_fill,
        'OWN_CAR_AGE': numeric_fill,
        'EMERGENCYSTATE_MODE': 'No',
    }
    # Only fill columns that exist, so the helper also works after some of
    # them have been dropped upstream.
    present = {name: value for name, value in fill_values.items() if name in df.columns}
    if not present:
        return df
    return df.fillna(value=present)

def convert_str_to_cat(df):
    """Return a copy of ``df`` with every object-dtype column cast to ``category``.

    Columns of any other dtype are passed through unchanged, and the input
    frame is not modified.
    """
    object_columns = [name for name, dtype in df.dtypes.items() if dtype == object]
    # astype with a per-column mapping returns a new frame and leaves the
    # unlisted columns as they are.
    return df.astype({name: 'category' for name in object_columns})

def convert_cat_to_dummies(df):
    """One-hot encode every ``category``-dtype column of ``df``.

    All category levels are kept (``drop_first=False``); columns of other
    dtypes are passed through by ``pd.get_dummies``.
    """
    return pd.get_dummies(
        df,
        columns=df.select_dtypes(include='category').columns,
        drop_first=False,
    )

def transform_categorical(df):
    """Cast object columns to ``category`` and then one-hot encode them.

    Thin composition of ``convert_str_to_cat`` followed by
    ``convert_cat_to_dummies``.
    """
    return convert_cat_to_dummies(convert_str_to_cat(df))




In [277]:
class adapter(BaseEstimator, TransformerMixin):
    """Stateless transformer that prepares the raw feature table for modelling.

    ``transform`` indexes the frame by ``SK_ID_CURR`` and drops the configured
    columns (``drop_cols``), then one-hot encodes its string/categorical
    columns (``transform_categorical``).

    ``fit`` learns nothing: the dummy columns come from whichever category
    levels occur in the frame being transformed, so two frames with different
    levels yield different column sets and must be aligned by the caller.

    Parameters
    ----------
    columns : label, list-like of labels, or ``'all'``
        Columns to drop. The default ``'all'`` is treated as "keep every
        column"; previously it was handed to ``drop`` as a column name and
        raised ``KeyError``. NOTE(review): "keep all" is the assumed intent of
        the default — confirm.
    """

    def __init__(self, columns='all'):
        self.columns = columns

    def fit(self, X=None, y=None):
        """No-op; present only to satisfy the scikit-learn estimator API."""
        return self

    def transform(self, X, y=None):
        """Return the prepared copy of ``X``; ``X`` itself is not modified.

        ``y`` is accepted for API compatibility and ignored.
        """
        to_drop = self.columns
        # The default sentinel is not a real column. Only treat it as one when
        # the frame really has a column literally named 'all'.
        if isinstance(to_drop, str) and to_drop == 'all' and 'all' not in X.columns:
            to_drop = []
        # drop_cols works on a copy, so no extra defensive copy is needed.
        prepared = drop_cols(X, to_drop)
        # fill_NaN is deliberately not applied here (it was disabled in the
        # original); missing values are left for later steps to handle.
        return transform_categorical(prepared)

## Model 1

In [278]:
# `columns_drop` has to exist before the adapter is built. It was previously
# defined only in a later cell, so a fresh Restart & Run All raised NameError.
columns_drop = ['FLAG_EMAIL']
a = adapter(columns_drop)

In [279]:
# Columns removed by the adapter before encoding.
# NOTE(review): `a` was built from `columns_drop` in the previous cell, so
# changing the list here has no effect unless the adapter is rebuilt.
columns_drop = ['FLAG_EMAIL']

# Hold-out split; random_state is fixed so the split is reproducible.
# Features and labels are paired by row position.
X_train, X_test, y_train, y_test = train_test_split(
    a.transform(X),
    Y,
    random_state=0,
)

In [288]:

# Median imputation followed by a logistic-regression classifier.
# NOTE(review): the fitted-pipeline repr recorded under the first fit cell
# shows a RandomForestClassifier as the 'model' step, so the recorded
# hold-out score was probably produced by an earlier version of this cell —
# re-run top to bottom to confirm.
# NOTE(review): `Imputer` is not available in newer scikit-learn releases
# (replaced by sklearn.impute.SimpleImputer) — confirm the target version.
pipeline = Pipeline(steps=[
    ('fill_na', Imputer(strategy='median')),
    ('model', LogisticRegression()),
])

In [284]:
# Fit on the raw training split and let the pipeline's 'fill_na' step handle
# missing values. Zero-filling X_train first made the imputer learn its
# medians from zero-padded columns, and turned training NaNs into 0 while
# NaNs in X_test (passed un-filled to predict_proba) became those medians —
# train and test rows were preprocessed differently.
pipeline.fit(X_train, y_train['TARGET'].values)

Pipeline(memory=None,
     steps=[('fill_na', Imputer(axis=0, copy=True, missing_values='NaN', strategy='median', verbose=0)), ('model', RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_i...n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False))])

In [285]:
# Class-probability predictions for the hold-out rows (one column per class).
y_pred=pipeline.predict_proba(X_test)

In [286]:
# Eyeball the predicted probabilities.
y_pred

array([[1. , 0. ],
       [0.9, 0.1],
       [0.9, 0.1],
       ...,
       [1. , 0. ],
       [1. , 0. ],
       [0.3, 0.7]])

In [287]:
# ROC AUC on the hold-out split, scored with column 1 of predict_proba.
# Select the TARGET column explicitly (as the fit cells do) so a 1-D label
# vector is passed no matter which columns the label frame carries.
score = roc_auc_score(y_test['TARGET'], y_pred[:, 1])
score

0.915328707934843

## Final

In [289]:
# Refit the pipeline on every labelled row (no hold-out) before scoring the
# unlabelled set. Columns are sorted alphabetically so their order is
# deterministic.
# NOTE(review): the two dropped dummy columns are hard-coded. They match the
# columns that the check cell at the end of the notebook reports as missing
# from X_final, so this list must be updated by hand whenever the category
# levels in either data set change.
pipeline.fit(a.transform(X).drop(columns=['NAME_INCOME_TYPE_Maternity leave', 'ORGANIZATION_TYPE_Industry: type 8']).sort_index(axis=1),Y['TARGET'].values)

Pipeline(memory=None,
     steps=[('fill_na', Imputer(axis=0, copy=True, missing_values='NaN', strategy='median', verbose=0)), ('model', LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False))])

In [290]:
# Rows to score: read the target table and drop its 'Unnamed: 0' column.
target = pd.read_csv('data/database/target.csv')
target = target.drop(columns='Unnamed: 0')

# Attach the merged features to those rows, then discard the TARGET column.
Xtarget = total.merge(target, on='SK_ID_CURR', how='inner')
Xtarget = Xtarget.drop(columns='TARGET')

In [291]:
# Prepare the unlabelled rows with the same adapter and sort the columns
# alphabetically, mirroring the column ordering used for the final fit.
# NOTE(review): the dummy columns depend on the category levels present in
# Xtarget, so the column set is not guaranteed to match the training frame.
X_final=a.transform(Xtarget).sort_index(axis=1)

In [292]:
# Sanity check: (rows, columns) of the frame about to be scored.
X_final.shape

(25000, 211)

In [296]:
# Class-probability predictions for the unlabelled rows.
# The variable name `y_traget` (sic) is read by the export cell, so it is
# left unchanged here.
y_traget=pipeline.predict_proba(X_final)

In [297]:

# Submission table: one row per applicant id (the index of X_final) with
# column 1 of the predicted probabilities as TARGET.
export = pd.DataFrame({
    'SK_ID_CURR': X_final.index.values,
    'TARGET': y_traget[:, 1],
})

In [299]:
# Write the predictions to the working directory, without the row index.
export.to_csv('5_pred.csv',index=False)

In [298]:
# Summary statistics of the exported predictions.
export.TARGET.describe()

count    25000.000000
mean         0.089902
std          0.047210
min          0.000455
25%          0.054115
50%          0.079730
75%          0.118074
max          0.294113
Name: TARGET, dtype: float64

count    75000.000000
mean         0.079880
std          0.271109
min          0.000000
25%          0.000000
50%          0.000000
75%          0.000000
max          1.000000
Name: TARGET, dtype: float64

In [225]:
# Training columns that do not exist in the frame being scored.
[col for col in X_train.columns if col not in X_final.columns]

['NAME_INCOME_TYPE_Maternity leave', 'ORGANIZATION_TYPE_Industry: type 8']

SK_ID_CURR
100004   NaN
100012   NaN
100034   NaN
100043   NaN
100055   NaN
100058   NaN
100063   NaN
100088   NaN
100112   NaN
100126   NaN
100134   NaN
100153   NaN
100157   NaN
100161   NaN
100175   NaN
100178   NaN
100186   NaN
100188   NaN
100205   NaN
100208   NaN
100215   NaN
100217   NaN
100229   NaN
100242   NaN
100261   NaN
100272   NaN
100290   NaN
100350   NaN
100369   NaN
100371   NaN
          ..
455929   NaN
455931   NaN
455932   NaN
455941   NaN
455944   NaN
455946   NaN
455947   NaN
455979   NaN
456020   NaN
456042   NaN
456050   NaN
456075   NaN
456106   NaN
456112   NaN
456113   NaN
456150   NaN
456162   NaN
456172   NaN
456179   NaN
456184   NaN
456207   NaN
456210   NaN
456215   NaN
456227   NaN
456234   NaN
456236   NaN
456237   NaN
456242   NaN
456246   NaN
456252   NaN
Name: TARGET, Length: 25000, dtype: float64