In [None]:
import pandas as pd
import numpy as np
import sklearn as sk
import tensorflow as tf
from sklearn import svm,metrics
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from imblearn.under_sampling import NearMiss
from imblearn.over_sampling import SMOTE 
import seaborn as sn
import warnings
warnings.filterwarnings('ignore')
pd.options.display.float_format = '{:,.2f}'.format

import matplotlib.pyplot as plt
%matplotlib inline

import seaborn as sns
sns.set(style="darkgrid")

def InputAssembleNormalize(df,colunas):
  """Return a min-max normalised copy of the selected columns.

  Each column listed in ``colunas`` is rescaled independently to the
  [0, 1] interval using ``(value - min) / (max - min)``.

  Parameters
  ----------
  df : pandas.DataFrame
      Source frame; it is not modified.
  colunas : list of str
      Names of the columns to select and normalise, in output order.

  Returns
  -------
  pandas.DataFrame
      New frame containing only ``colunas``, each scaled to [0, 1].
      A column whose max equals its min (zero range) is set to 0.0
      instead of being turned into NaN by a 0/0 division.
  """
  # Explicit copy: we assign into X below and must not write into a
  # frame derived from the caller's data.
  X = df[colunas].copy()
  for x in colunas:
    col_min = X[x].min()
    col_range = X[x].max() - col_min
    if col_range == 0:
      # Constant column: the original formula would yield 0/0 = NaN
      # for every row, poisoning the downstream estimators.
      X[x] = 0.0
    else:
      X[x] = (X[x] - col_min) / col_range
  return X


# Load the three raw exports (parsed with ',' as the decimal separator).
df_crm1 = pd.read_csv(r'CRM_v6.csv',decimal=",")
df_cred = pd.read_csv(r'Credito_v6.csv',decimal=",")
df_imov = pd.read_csv(r'Imovel_v6.csv',decimal=",")

# Fix up the primary key ('Proposta') and target ('Aprovação_Comite') names,
# and give the two long BACEN flag columns short identifiers.
df_crm1 = df_crm1.rename(columns={
    'Data Hora Aprovação Comite': 'Aprovação_Comite',
    'Nome': 'Proposta',
})
df_cred = df_cred.rename(columns={
    'Tem vencido BACEN maior que 50% valor empréstimo': 'VencidoBacen50',
    'Total a vencer BACEN maior que 50% empréstimo ?': 'VencerBacen50',
})
pd.set_option("display.max_rows", 999)

# Missing BACEN flags are treated as 'Não'.
# Assign the result back instead of calling fillna(inplace=True) on a column
# selection: the chained in-place form does not update the parent frame under
# pandas Copy-on-Write.
df_cred['VencidoBacen50'] = df_cred['VencidoBacen50'].fillna('Não')
df_cred['VencerBacen50'] = df_cred['VencerBacen50'].fillna('Não')

# Binary encoding: target is 1 when a committee approval timestamp is present;
# the BACEN flags are 1 when the value is 'Sim'.
# Plain column assignment replaces the column (and its dtype). Writing through
# `.loc[:, ['col']]` sets values in place on recent pandas and can leave the
# column with its original object dtype.
df_crm1['Aprovação_Comite'] = df_crm1['Aprovação_Comite'].notna().astype(int)
df_cred['VencidoBacen50'] = (df_cred['VencidoBacen50'] == 'Sim').astype(int)
df_cred['VencerBacen50'] = (df_cred['VencerBacen50'] == 'Sim').astype(int)

# Columns of interest from each source file.

# Possibly also: 'Tem vencido BACEN maior que 50% valor empréstimo',
# 'Total a vencer BACEN maior que 50% empréstimo ?', Score (PH3A)
colunasCRM = ['Aprovação_Comite','Proposta','Status','Valor Solicitado',
              'Quantidade de Dívidas', 'Valor Total Dívidas Real',
              'Bacen Valor à Vencer','Quantidade de Imóveis']

colunasCred = ['Proposta','Score Serasa','Score Neurolake','Total Bens e Direitos','Score (PH3A)',
               'VencidoBacen50','VencerBacen50']

colunasImov = ['Proposta','Valor Médio de Venda Forçada']
#'Valor Total Dívidas Real',

# Model input features (order defines the feature order used downstream).
colunas = ['Valor Médio de Venda Forçada','Score Serasa','Valor Solicitado','VencerBacen50',
           'Quantidade de Dívidas', 'Valor Total Dívidas Real', 'Score (PH3A)','Score Neurolake',
           'Bacen Valor à Vencer','Total Bens e Direitos','VencidoBacen50']

# Columns written to the exported CSV (features plus the target).
colunasSave = ['Score Neurolake','Score Serasa','Valor Solicitado','VencerBacen50',
           'Quantidade de Dívidas', 'Valor Total Dívidas Real', 'Valor Médio de Venda Forçada','Score (PH3A)',
           'Bacen Valor à Vencer','Total Bens e Direitos','VencidoBacen50','Aprovação_Comite']


# Explicit copies: df_crm1 is written to below, and assigning into a bare
# column subset is the chained-assignment pattern pandas warns about.
df_crm1 = df_crm1[colunasCRM].copy()
df_cred = df_cred[colunasCred].copy()
df_imov = df_imov[colunasImov].copy()

# Encode 'Status' as 1 for 'Rascunho' and 0 otherwise so the column is numeric
# and is not dropped by the groupby aggregation.
# Direct assignment (rather than `.loc[:, ['Status']] = ...`) guarantees the
# column ends up with an integer dtype.
df_crm1['Status'] = (df_crm1['Status'] == 'Rascunho').astype(int)

# Stack the CRM, credit and property frames row-wise; columns missing from a
# given source are NaN in its rows.
lista = [df_crm1,df_cred,df_imov]

df_crm2 = pd.concat(lista)

# Collapse to one row per 'Proposta'.
# NOTE(review): the original comment said the scores of the proponents are
# averaged, but the aggregation is a sum — confirm which one is intended
# before relying on the score columns.
df_crm = df_crm2.groupby('Proposta').sum()

# Export is written without the 'Proposta' index (index=False).
df_crm[colunasSave].to_csv('DF_CRMv6',index=False)
df_crm = df_crm.fillna(0)
print(df_crm.describe().T)

# Min-max normalise the model features.
X_normal = InputAssembleNormalize(df_crm,colunas)

X_train = X_normal
y_train = df_crm['Aprovação_Comite']
# Keep untouched copies of the full (pre-resampling) data; X_trainA.columns is
# used later to label feature importances.
X_trainA = X_train.copy()
y_trainA = y_train.copy()

# Balance the classes by undersampling with NearMiss.
# `fit_resample` is the supported API; `fit_sample` is the deprecated alias
# and is no longer available in current imbalanced-learn releases.
nr = NearMiss()
X_train, y_train = nr.fit_resample(X_train, y_train)



# Default figure size for every plot in the notebook (width, height in inches).
plot_size = [12, 9]
plt.rcParams["figure.figsize"] = plot_size

# Correlation matrix, with rows ordered by correlation with the target
# (strongest positive first); columns are then put in the same order.
corrMatrix = df_crm.corr().sort_values('Aprovação_Comite', ascending=False)
colu = corrMatrix.index

sn.set(font_scale=0.75)
sn.heatmap(corrMatrix[colu], annot=True,cmap='Oranges')
# `papertype` and `frameon` were removed from matplotlib's savefig and raise a
# TypeError on current versions; `dpi=None` / `format=None` were just the
# defaults, so they are omitted as well.
plt.savefig('CrossFeatures.png', facecolor='w', edgecolor='b',
        orientation='portrait',
        transparent=False, bbox_inches='tight', pad_inches=0.1)


plt.show()



In [1]:
# Rank candidate features with an XGBoost regressor.
# This cell depends on the preprocessing cell at the top of the notebook:
# X_train, y_train, colunas and the `sn` seaborn alias are defined there, so it
# fails with a NameError when run on a fresh kernel before that cell.
from xgboost import XGBRegressor
import matplotlib.pyplot as plt
import pandas as pd
import warnings
warnings.filterwarnings('ignore')
pd.options.display.float_format = '{:,.3f}'.format
# dataset
X, y = X_train,y_train
# define the model
model = XGBRegressor()
# fit the model
print(X)
print(y)
model.fit(X, y)
# fetch the feature importances
importance = model.feature_importances_


# summarise the importances, labelled by feature name (colunas is the feature
# order used to build X_train)

feature_importances = pd.DataFrame(importance,index = colunas,
                                    columns=['Importance']).sort_values('Importance')

# plot

sn.set(font_scale=0.75)
sn.heatmap(feature_importances.sort_values('Importance',ascending=False), annot=True,cmap='Oranges')
# `papertype` and `frameon` were removed from matplotlib's savefig and raise a
# TypeError on current versions; `dpi=None` / `format=None` were the defaults.
plt.savefig('XG_Importance.png', facecolor='w', edgecolor='b',
        orientation='portrait',
        transparent=False, bbox_inches='tight', pad_inches=1)
plt.show()

NameError: name 'X_train' is not defined

In [2]:
## Check feature importance with a RandomForest
# Depends on the preprocessing cell at the top of the notebook for X_train,
# y_train, X_trainA and the `sn` seaborn alias.
from sklearn.ensemble import RandomForestClassifier 
from mpl_toolkits.mplot3d import Axes3D

 
# Fixed seed so the importances are reproducible across runs.
rf = RandomForestClassifier(random_state=42) 

rf.fit(X_train, y_train) 


# X_trainA holds the pre-resampling frame; its columns give the feature names
# in the same order as the columns of X_train.
feature_importances = pd.DataFrame(rf.feature_importances_,
                                   index = X_trainA.columns,
                                    columns=['Importance']).sort_values('Importance')


# plot
sn.set(font_scale=0.75)
sn.heatmap(feature_importances.sort_values('Importance',ascending=False), annot=True,cmap='Oranges')
# `papertype` and `frameon` were removed from matplotlib's savefig and raise a
# TypeError on current versions; `dpi=None` / `format=None` were the defaults.
plt.savefig('RF_Importance.png', facecolor='w', edgecolor='b',
        orientation='portrait',
        transparent=False, bbox_inches='tight', pad_inches=0.1)
plt.show()

# Disabled exploratory 3D scatter plots of feature triples, coloured by target.
# Kept as real comments: a bare triple-quoted string at the end of a cell is an
# expression, and Jupyter would echo it as the cell's output.
#
# cols = feature_importances.index
#
# df_cluster = pd.DataFrame(X_train, columns = X_trainA.columns)
# df_cluster['Aprova'] = y_train
#
# print(df_cluster.columns)
# t = len(cols)-2
# for i in range(t):
#   for j in range(t):
#     for z in range(t):
#
#       if i==j:
#         if i==z:
#           continue
#       fig = plt.figure()
#       ax = Axes3D(fig)
#       ax.scatter(df_cluster[cols[-i]], df_cluster[cols[-j]], df_cluster[cols[-z]], c=df_cluster['Aprova'],cmap='viridis')
#       ax.set_xlabel(cols[-i], fontsize=15, rotation=150)
#       ax.set_ylabel(cols[-j], fontsize=15)
#       ax.set_zlabel(cols[-z], fontsize=15, rotation=60)
#       plt.show()
  


NameError: name 'X_train' is not defined