In [None]:
import pandas as pd
import numpy as np
data = pd.read_csv('HappyCustomerBank.csv')
data.info()


In [60]:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import FunctionTransformer
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder

from sklearn.model_selection import train_test_split


import numpy as np
import datetime
import json

class DateTransformer(BaseEstimator, TransformerMixin):
    def __init__(self, format="%d-%b-%y"):
        self.format = format
        self.date_now = datetime.datetime.now()
        super().__init__() 

    def transform(self, X):
        def date2float(date_str):
            """
            diff from *date_str* to today in years.
            """
            _date = datetime.datetime.strptime(date_str, self.format)
            if _date > self.date_now:
                _date = _date.replace(year = _date.year - 100)
            return (self.date_now - _date).days/365.0
        vdate2float = np.vectorize(date2float)            
       
        return vdate2float(X)

    def fit(self, x, y=None):
        return self
    
class CityInfoTransformer(BaseEstimator, TransformerMixin):
    def __init__(self, json_path="city_pop.json"):
        self.json_path = json_path
        super().__init__() 

    def transform(self, X):
        with open(self.json_path) as json_file:
            city_dict = json.load(json_file)
        def get_info(city_name):
            city_info = city_dict.get(city_name)
            if city_info is None:
                city_info = city_dict.get('Mean_city')
            return city_info['population'] # jak dodac kolumny ze wsporzednymi
        vget_info = np.vectorize(get_info)
        return vget_info(X)

    def fit(self, x, y=None):
        return self
    
class CategoricalBinaryTransformer(BaseEstimator, TransformerMixin):
    def __init__(self):
        super().__init__() 

    def transform(self, X):
        for feature in X:
            one = sorted(X.get(feature).unique())[0]
            def code_one(x_value):
                if x_value == one:
                    return 1
                return 0
            vcode_one = np.vectorize(code_one)
            X[feature] = vcode_one(X[feature])
        return X

    def fit(self, x, y=None):
        return self
    

date_features = ['DOB', 'Lead_Creation_Date']
date_transformer = DateTransformer()
city_features = ['City']
city_transformer = CityInfoTransformer()
binary_features = ['Gender', 'Device_Type','Mobile_Verified','Filled_Form']
binary_transformer = CategoricalBinaryTransformer()


float64_features = data.select_dtypes(include=['float64']).columns
categorical_features = data.select_dtypes(include=['object']).drop(
    ['ID','Employer_Name', 'Salary_Account' ]+binary_features+date_features+city_features, axis=1).columns

# int64_features = data.select_dtypes(include=['int64']).drop(['LoggedIn', 'Disbursed'],axis=1).columns
# remove int64
# ValueError: No valid specification of the columns. Only a scalar, list or slice of all integers or all strings, or boolean mask is allowed
# int64_transformer = Pipeline(steps=[
#     ('imputer', SimpleImputer(strategy='median')),
#     ('scaler', StandardScaler())])

float_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())])


categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))])

preprocessor = ColumnTransformer(
    transformers=[
        ('date', date_transformer, date_features),
        ('city', city_transformer, city_features),
        ('binary', binary_transformer, binary_features),
#         ('int64', int64_transformer, int64_transformer),
        ('flaot64', float_transformer, float64_features),
        ('categorical', categorical_transformer, categorical_features), 
    ])
data_0 = data.drop(['LoggedIn','Disbursed'], axis=1)
X_train, X_test, y_train, y_test = train_test_split(data_0, data["LoggedIn"], stratify=data["LoggedIn"], test_size=0.2)




In [None]:
class ParamLogisticRegression(LogisticRegression):
    def __init__(self,param=None, **kwargs):
        
        self.param = param
        super().__init__(**kwargs)     
        
    def predict(self,data):
        p_probability = super(ParamLogisticRegression,self).predict_proba(data)
        # (p_probability[:,1]>self.param).astype(int)
        return np.asarray(list(map(lambda x :  int(x[0]<self.param), p_probability)))

In [69]:


from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression

from sklearn.metrics import f1_score, precision_score, recall_score
from sklearn.metrics import confusion_matrix

classifiers = [
    (DecisionTreeClassifier(),{'classifier_min_samples_leaf':range(1,50,5)})
    (RandomForestClassifier(),{'classifier_n_estimators':[10,50,100], 'classifier_min_samples_leaf':[1,25,40]})
    (ParamLogisticRegression(), {'classifier_param':[0.1, 0.2, 0.3, 0.5]})
    ]


for classifier, parma_grid in classifiers:
    pipe = Pipeline(steps=[('preprocessor', preprocessor),
                      ('classifier', classifier)])
    gs = GridSearchCV(pipe, param_grid, cv=10)

    gs.fit(X_train, y_train)
    y_pred = gs.predict(X_test)
    print(classifier)
    print("model score: %.3f" % gs.score(X_test, y_test))
    print("model recall: %.3f" % recall_score(y_test, y_pred))
    print("model precision: %.3f" % precision_score(y_test, y_pred))
    print(confusion_matrix(y_test, y_pred).ravel())

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
                       max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort=False,
                       random_state=None, splitter='best')
model score: 0.943
model recall: 0.084
model precision: 0.076
[14490   462   414    38]




RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=10,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)
model score: 0.970
model recall: 0.004
model precision: 0.133
[14939    13   450     2]




LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)
model score: 0.971
model recall: 0.000
model precision: 0.000
[14952     0   452     0]


  'precision', 'predicted', average, warn_for)


In [None]:
# Author: Pedro Morales <part.morales@gmail.com>
#
# License: BSD 3 clause

import pandas as pd
import numpy as np

from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, GridSearchCV

np.random.seed(0)

# Read data from Titanic dataset.
titanic_url = ('https://raw.githubusercontent.com/amueller/'
               'scipy-2017-sklearn/091d371/notebooks/datasets/titanic3.csv')
data = pd.read_csv(titanic_url)

# We will train our classifier with the following features:
# Numeric Features:
# - age: float.
# - fare: float.
# Categorical Features:
# - embarked: categories encoded as strings {'C', 'S', 'Q'}.
# - sex: categories encoded as strings {'female', 'male'}.
# - pclass: ordinal integers {1, 2, 3}.

# We create the preprocessing pipelines for both numeric and categorical data.
numeric_features = ['age', 'fare']
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())])

categorical_features = ['embarked', 'sex', 'pclass']
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)])

# Append classifier to preprocessing pipeline.
# Now we have a full prediction pipeline.
clf = Pipeline(steps=[('preprocessor', preprocessor),
                      ('classifier', LogisticRegression(solver='lbfgs'))])

X = data.drop('survived', axis=1)
y = data['survived']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

clf.fit(X_train, y_train)
print("model score: %.3f" % clf.score(X_test, y_test))

In [88]:
import json
_dict ={}
with open('city.json') as json_file:
    data = json.load(json_file)
for row in data['data']:
    split = row['location'].split(',')
    E = float(split[1])
    N = float(split[0].split('=')[1])
    _dict[row['asciiname']] = {'population':row['population'], 'coordinates_N':N,'coordinates_E':E}
_dict['Mean_city'] = {'population':np.mean([x['population'] for x in _dict.values()]), 'coordinates_N':
np.mean([x['coordinates_N'] for x in _dict.values()]),'coordinates_E':
np.mean([x['coordinates_E'] for x in _dict.values()])}

json.dump(_dict, open('city_pop.json', 'w'))


In [29]:
corr = data.corr()
corr.style.background_gradient(cmap='coolwarm')

Unnamed: 0,Monthly_Income,Loan_Amount_Applied,Loan_Tenure_Applied,Existing_EMI,Var5,Loan_Amount_Submitted,Loan_Tenure_Submitted,Interest_Rate,Processing_Fee,EMI_Loan_Submitted,Var4,LoggedIn,Disbursed
Monthly_Income,1.0,0.00265386,-0.00468359,0.136196,0.0202949,0.0381523,-0.00170623,-0.0150832,0.0205056,0.0354957,0.00597488,-0.000175102,0.000125049
Loan_Amount_Applied,0.00265386,1.0,0.501516,0.0617917,0.32027,0.412195,0.13795,-0.176515,0.192117,0.239559,-0.00354106,0.0374942,0.0329692
Loan_Tenure_Applied,-0.00468359,0.501516,1.0,0.0518778,0.242887,-0.0389489,0.138227,-0.140012,-0.110964,-0.199499,-0.108099,0.0460606,0.0385462
Existing_EMI,0.136196,0.0617917,0.0518778,1.0,0.0159389,0.0116318,0.00146217,-0.0161865,-0.00317708,0.007366,-0.0292797,0.0137077,0.0144755
Var5,0.0202949,0.32027,0.242887,0.0159389,1.0,0.294619,-0.0817352,-0.538707,0.206929,0.27765,0.355747,0.141525,0.117313
Loan_Amount_Submitted,0.0381523,0.412195,-0.0389489,0.0116318,0.294619,1.0,0.380813,-0.320679,0.761394,0.92095,-0.0467436,0.0428695,0.0431333
Loan_Tenure_Submitted,-0.00170623,0.13795,0.138227,0.00146217,-0.0817352,0.380813,1.0,-0.0987513,0.30841,0.0944575,0.0351068,0.0069532,-0.000855524
Interest_Rate,-0.0150832,-0.176515,-0.140012,-0.0161865,-0.538707,-0.320679,-0.0987513,1.0,-0.110952,-0.242533,0.0568499,-0.111476,-0.0920129
Processing_Fee,0.0205056,0.192117,-0.110964,-0.00317708,0.206929,0.761394,0.30841,-0.110952,1.0,0.720493,0.0977931,0.0393984,0.0329787
EMI_Loan_Submitted,0.0354957,0.239559,-0.199499,0.007366,0.27765,0.92095,0.0944575,-0.242533,0.720493,1.0,0.116679,0.042187,0.0473291


In [23]:
['a']+['b']

['a', 'b']

In [62]:
?RandomForestClassifier