In [1]:
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.preprocessing import LabelEncoder
from sklearn import preprocessing, impute
from sklearn.experimental import enable_iterative_imputer
import pandas as pd

In [2]:
labelencoder = LabelEncoder()

df = pd.read_csv("../data/train_merged_clean_small_data.csv")
columns = df[['B_30', 'B_38', 'D_114', 'D_116', 'D_117', 'D_120', 'D_126', 'D_63', 'D_64', 'D_68']]


In [3]:

labelEncoder = preprocessing.LabelEncoder()
columns = columns.apply(labelencoder.fit_transform)
columns

Unnamed: 0,B_30,B_38,D_114,D_116,D_117,D_120,D_126,D_63,D_64,D_68
0,1,6,0,0,0,0,2,1,2,6
1,0,1,1,0,4,0,2,1,1,4
2,0,1,1,0,3,0,2,1,1,5
3,0,0,1,0,2,0,2,1,1,6
4,0,2,0,0,4,0,1,2,3,3
...,...,...,...,...,...,...,...,...,...,...
724601,0,2,1,0,4,0,2,1,1,6
724602,1,3,1,0,2,0,2,1,1,4
724603,0,4,1,0,4,0,2,1,1,6
724604,1,2,1,0,3,1,2,1,1,6


In [4]:
mat = preprocessing.OneHotEncoder()
mat.fit(columns)
one_hot_labels = mat.transform(columns).toarray()
one_hot_labels

array([[0., 1., 0., ..., 0., 0., 1.],
       [1., 0., 0., ..., 1., 0., 0.],
       [1., 0., 0., ..., 0., 1., 0.],
       ...,
       [1., 0., 0., ..., 0., 0., 1.],
       [0., 1., 0., ..., 0., 0., 1.],
       [0., 1., 0., ..., 0., 1., 0.]])

In [5]:
cat_col_names = mat.get_feature_names_out(list(columns.columns))
cat_col_names

array(['B_30_0', 'B_30_1', 'B_30_2', 'B_38_0', 'B_38_1', 'B_38_2',
       'B_38_3', 'B_38_4', 'B_38_5', 'B_38_6', 'D_114_0', 'D_114_1',
       'D_116_0', 'D_116_1', 'D_117_0', 'D_117_1', 'D_117_2', 'D_117_3',
       'D_117_4', 'D_117_5', 'D_117_6', 'D_120_0', 'D_120_1', 'D_126_0',
       'D_126_1', 'D_126_2', 'D_63_0', 'D_63_1', 'D_63_2', 'D_63_3',
       'D_63_4', 'D_63_5', 'D_64_0', 'D_64_1', 'D_64_2', 'D_64_3',
       'D_68_0', 'D_68_1', 'D_68_2', 'D_68_3', 'D_68_4', 'D_68_5',
       'D_68_6'], dtype=object)

In [6]:
cat_col_df = pd.DataFrame(one_hot_labels, columns = cat_col_names)
cat_col_df

Unnamed: 0,B_30_0,B_30_1,B_30_2,B_38_0,B_38_1,B_38_2,B_38_3,B_38_4,B_38_5,B_38_6,...,D_64_1,D_64_2,D_64_3,D_68_0,D_68_1,D_68_2,D_68_3,D_68_4,D_68_5,D_68_6
0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
2,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
3,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
724601,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
724602,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
724603,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
724604,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [7]:
df = df.drop(columns = columns.columns)
df = df.join(cat_col_df)

In [8]:
df_x = df.drop(columns=['target'])
df_y = df[['target']]

In [9]:
df = df.sort_values(by=['customer_ID'])

In [10]:
pca_df_x = df_x.drop(columns=['customer_ID', 'S_2'])
imp = impute.IterativeImputer()
imputed_df_x = imp.fit_transform(pca_df_x)

KeyboardInterrupt: 

In [None]:
imp_df_x = pd.DataFrame(imputed_df_x, columns = list(pca_df_x.columns))

In [None]:
from unicodedata import decomposition

pca = decomposition.PCA(n_components=25)
df_x_imp_pca_df = pca.fit.transform(imp_df_x)

In [None]:
df_x_imp_pca_df = pd.DataFrame(df_x_imp_pca_df, columns[f'column_{i}' for i in range(df_x_imp_pca_df.shape[1])])
df_x_imp_pca_df

In [None]:
df_pca = df_y.join(df_x['customer_ID','S_2']).join(df_x_imp_pca_df)
df_pca

In [None]:
clients = [n in _, n in df_pca.groupby('customer_ID', as_index = False)]
df = df_pca.groupby('customer_ID', as_index=False).mean().drop(columns=['Unnamed: 0'])
df

In [None]:
x = df.drop(columns=['S_2', 'customer_ID', 'target'])
y = df['target']

x_train, x_test, y_train, y_test = train_test_split(x,y, test_size= 0.1, random_state= 1)
clf = MLPClassifier(hidden_layer_sizes=(80, 60, 15), max_iter=500, activation = 'relu', solver = 'lbfgs', random_state=1)
clf.fit(x_train, y_train) #test mejor

ValueError: could not convert string to float: 'CO'

In [None]:
y_pred = clf.predict(x_test)
y_pred

array([1, 0])

In [None]:
clf.score(x_test, y_test)