# Parte 3 - Modelo 2

## Importo librerías y Data

In [None]:
# Mount Google Drive at /content/gdrive; the CSV paths used below live under it.
from google.colab import drive

drive.mount("/content/gdrive")

Mounted at /content/gdrive


In [None]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import classification_report, roc_auc_score
from sklearn.model_selection import RandomizedSearchCV

In [None]:
# Load the two raw training tables from the mounted Drive folder.
data_dir = "/content/gdrive/MyDrive/Orga_Datos/Data"
train_transaction = pd.read_csv(f"{data_dir}/train_transaction.csv")
train_identity = pd.read_csv(f"{data_dir}/train_identity.csv")

In [None]:
# Outer-join both tables on TransactionID, then drop the references to the
# source frames so their memory can be reclaimed.
df_train = pd.merge(train_transaction, train_identity, on="TransactionID", how="outer")
train_transaction = train_identity = None

## Separo Train y Validation

In [None]:
# 70th percentile of TransactionDT, used as the train/validation cut-off.
quantile_70 = df_train["TransactionDT"].quantile(q=0.7)

In [None]:
# Rows strictly before the cut-off form the training set. Take an explicit
# copy so that later column assignments act on an independent frame instead
# of raising pandas' SettingWithCopyWarning.
X_train = df_train.loc[df_train["TransactionDT"] < quantile_70].copy()

In [None]:
# Training target: the isFraud column of the training rows.
Y_train = X_train.loc[:, "isFraud"]

In [None]:
# Rescale TransactionDT by 1/60. Not idempotent: re-running this cell divides
# the column again.
X_train["TransactionDT"] = X_train["TransactionDT"].div(60)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [None]:
# Rows at or after the cut-off form the validation set. `>=` (rather than `>`)
# keeps rows whose TransactionDT equals the cut-off, which the training filter
# (`<`) excludes; the explicit copy avoids SettingWithCopyWarning on the
# column assignments that follow.
X_validation = df_train.loc[df_train["TransactionDT"] >= quantile_70].copy()

In [None]:
# Validation target: the isFraud column of the validation rows.
Y_validation = X_validation.loc[:, "isFraud"]

In [None]:
# Same 1/60 rescaling of TransactionDT as applied to the training rows.
# Not idempotent: re-running this cell divides the column again.
X_validation["TransactionDT"] = X_validation["TransactionDT"].div(60)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [None]:
# Both splits are materialised; drop the references to the cut-off and the full frame.
quantile_70 = df_train = None

## Train

### One Hot Encoding

In [None]:
# Low-cardinality categorical columns that get one-hot encoded.
low_card = (
    ["ProductCD", "card4", "card6"]
    + [f"M{n}" for n in range(1, 10)]
    + [f"id_{n}" for n in (12, 15, 16, 23, 27, 28, 29, 34, 35, 36, 37, 38)]
    + ["DeviceType"]
)

In [None]:
# One-hot encode the low-cardinality columns of the training rows.
dum_train = pd.get_dummies(data=X_train[low_card])

In [None]:
# Append the dummy columns to the training frame, then release the temporary.
X_train = pd.concat(objs=[X_train, dum_train], axis="columns")
dum_train = None

### Vectorizer

In [None]:
# Missing id_31 values become the literal token "unknown" so the vectorizer
# below receives strings only. Assign the column explicitly: an inplace
# operation on the attribute-accessed Series is chained assignment and is not
# guaranteed to update X_train (it does not under pandas copy-on-write).
X_train["id_31"] = X_train["id_31"].fillna("unknown")

In [None]:
# Token counts over the id_31 strings, with the vocabulary capped at 25 features.
vectorizer = CountVectorizer(max_features=25)
X = vectorizer.fit_transform(raw_documents=X_train["id_31"])

In [None]:
# One new column per vocabulary token, holding its count for each row.
token_columns = list(vectorizer.get_feature_names_out())
X_train[token_columns] = X.toarray()

### Mean Encoding

In [None]:
# High-cardinality categorical columns that get mean (target) encoded.
high_card = [
    "P_emaildomain",
    "R_emaildomain",
    "id_30",
    "id_33",
    "DeviceInfo",
]

In [None]:
# Mean (target) encoding: replace every category of each high-cardinality
# column with the mean of isFraud observed for that category in X_train.
# Every column's mapping is kept in `mean_encodings`, keyed by column name, so
# the encoding of any column can be re-applied to other data later on.
mean_encodings = {}
for col in high_card:
  Mean_encoded_subject = X_train.groupby([col])['isFraud'].mean().to_dict()
  mean_encodings[col] = Mean_encoded_subject
  X_train[col] = X_train[col].map(Mean_encoded_subject)
# After the loop `Mean_encoded_subject` is still bound to the mapping of the
# LAST column in high_card only; it is not valid for the other columns.

### Drops

In [None]:
# Remove the target, the raw id_31 text and the raw low-cardinality columns;
# their encoded counterparts were added above.
X_train.drop(columns=["isFraud", "id_31", *low_card], inplace=True)

### Nulls

In [None]:
# Every remaining missing value is replaced by 0.
X_train.replace(to_replace=np.nan, value=0, inplace=True)

## Fit Model

In [None]:
# Base estimator for the search; the seed is fixed for reproducibility.
rf_model = RandomForestClassifier(
    random_state=1,
)

In [None]:
# Candidate forest sizes for the randomized search: every integer from 75 to 125.
param_dist = {"n_estimators": list(range(75, 126))}

In [None]:
# Randomized search scored by ROC AUC: a single sampled candidate (n_iter=1)
# evaluated with 2-fold cross-validation on one worker.
clf = RandomizedSearchCV(
    estimator=rf_model,
    param_distributions=param_dist,
    n_iter=1,
    cv=2,
    random_state=1,
    scoring="roc_auc",
    n_jobs=1,
)

In [None]:
# Run the search on the encoded training features and their target.
clf.fit(X_train, y=Y_train)

RandomizedSearchCV(cv=2, estimator=RandomForestClassifier(random_state=1),
                   n_iter=1, n_jobs=1,
                   param_distributions={'n_estimators': [75, 76, 77, 78, 79, 80,
                                                         81, 82, 83, 84, 85, 86,
                                                         87, 88, 89, 90, 91, 92,
                                                         93, 94, 95, 96, 97, 98,
                                                         99, 100, 101, 102, 103,
                                                         104, ...]},
                   random_state=1, scoring='roc_auc')

## Validation

### One Hot Encoding

In [None]:
# One-hot encode the low-cardinality columns of the validation rows.
dum_val = pd.get_dummies(data=X_validation[low_card])

In [None]:
# Append the dummy columns to the validation frame, then release the temporary.
X_validation = pd.concat(objs=[X_validation, dum_val], axis="columns")
dum_val = None

### Vectorizer

In [None]:
# Missing id_31 values become the literal token "unknown", as done for the
# training rows. Assign the column explicitly: an inplace operation on the
# attribute-accessed Series is chained assignment and is not guaranteed to
# update X_validation (it does not under pandas copy-on-write).
X_validation["id_31"] = X_validation["id_31"].fillna("unknown")

In [None]:
# Token counts over the validation id_31 strings, vocabulary capped at 25 features.
# NOTE(review): this rebinds `vectorizer` to a NEW CountVectorizer fitted on the
# validation rows, so its vocabulary can differ from the one learned on X_train.
# The drop / zero-fill cells further down hard-code the resulting column
# differences; reusing the training vocabulary via transform() looks like the
# intended behaviour, but those cells would have to change with it - confirm.
vectorizer = CountVectorizer(max_features=25)
X = vectorizer.fit_transform(X_validation.id_31)

In [None]:
# One new column per vocabulary token, holding its count for each validation row.
token_columns = list(vectorizer.get_feature_names_out())
X_validation[token_columns] = X.toarray()

### Mean Encoding

In [None]:
# Map the categories of each high-cardinality column to the fraud means
# computed on the training rows; categories missing from the mapping become NaN.
# NOTE(review): `Mean_encoded_subject` is reassigned on every iteration of the
# training loop, so at this point it only holds the mapping built for the last
# entry of high_card. Every column here is mapped through that single
# dictionary, which looks unintended for all columns but the last - confirm.
for i in high_card:
  X_validation[i] =  X_validation[i].map(Mean_encoded_subject)

### Drops

In [None]:
# Drop every column the model was not trained on: the target, the raw id_31
# text, the raw low-cardinality columns and any token/dummy column that exists
# only in the validation rows. The list is derived from X_train instead of
# being hard-coded, so it stays valid when the data or the vocabulary changes.
trained_columns = set(X_train.columns)
extra_columns = [col for col in X_validation.columns if col not in trained_columns]
X_validation = X_validation.drop(columns=extra_columns)

### Nulls

In [None]:
# Columns the model was trained on but that are absent from X_validation
# (dummy categories or tokens never produced for this split) are added as
# all-zero columns. Only missing columns are created, so real values are
# never overwritten.
missing_columns = [col for col in X_train.columns if col not in X_validation.columns]
if missing_columns:
    X_validation[missing_columns] = 0

In [None]:
# Keep exactly the training columns, in the training order.
X_validation = X_validation[X_train.columns.tolist()]

In [None]:
# Every remaining missing value is replaced by 0, as for the training rows.
X_validation.replace(to_replace=np.nan, value=0, inplace=True)

## Predict

In [None]:
# ROC AUC on the validation split, from the predicted probability of the
# second class (column 1 of predict_proba).
validation_scores = clf.predict_proba(X_validation)[:, 1]
roc_auc_score(Y_validation, validation_scores)

0.8809431991860457

## Test

In [None]:
# Load the two raw test tables from the mounted Drive folder.
data_dir = "/content/gdrive/MyDrive/Orga_Datos/Data"
test_transaction = pd.read_csv(f"{data_dir}/test_transaction.csv")
test_identity = pd.read_csv(f"{data_dir}/test_identity.csv")

In [None]:
# Outer-join both test tables on TransactionID.
df_test = pd.merge(test_transaction, test_identity, on="TransactionID", how="outer")

In [None]:
# The merged frame is all that is needed; drop the references to the source tables.
test_transaction = test_identity = None

In [None]:
# Column names containing "id-" are rewritten with "id_" so they match the
# names used in low_card / high_card.
df_test.columns = [name.replace("id-", "id_") for name in df_test.columns]

In [None]:
# Same 1/60 rescaling of TransactionDT as applied to train and validation.
# Not idempotent: re-running this cell divides the column again.
df_test["TransactionDT"] = df_test["TransactionDT"].div(60)

In [None]:
# Continue under the name X_test (same object, no copy) and retire the old name.
X_test, df_test = df_test, None

In [None]:
# One-hot encode the low-cardinality columns of the test rows.
dum_df = pd.get_dummies(data=X_test[low_card])

In [None]:
# The raw low-cardinality columns are no longer needed once their dummies exist.
X_test.drop(columns=low_card, inplace=True)

In [None]:
# Append the dummy columns to the test frame.
X_test = pd.concat(objs=[X_test, dum_df], axis="columns")

In [None]:
# Map the categories of each high-cardinality column to the fraud means
# computed on the training rows; categories missing from the mapping become NaN.
# A dedicated loop variable is used so the `high_card` list is not rebound to
# one of its own elements, which would break any re-run of this or earlier cells.
# NOTE(review): `Mean_encoded_subject` is reassigned on every iteration of the
# training loop, so it only holds the mapping of the last entry of high_card;
# mapping every column through it looks unintended - confirm.
for col in high_card:
  X_test[col] = X_test[col].map(Mean_encoded_subject)
Mean_encoded_subject = None

In [None]:
# The dummies are already part of X_test; drop the reference to the temporary frame.
dum_df = None

In [None]:
# Every missing value in the test frame is replaced by 0.
X_test.replace(to_replace=np.nan, value=0, inplace=True)

In [None]:
# Columns the model was trained on but that are absent from X_test are added
# as all-zero columns. Only missing columns are created, so dummy columns that
# do exist in the test data keep their real values instead of being zeroed.
# NOTE(review): no vectorizer is applied to id_31 anywhere in the test
# pipeline, so every id_31 token feature ends up as 0 for the test rows -
# confirm whether the training vocabulary should be applied here.
missing_columns = [col for col in X_train.columns if col not in X_test.columns]
if missing_columns:
    X_test[missing_columns] = 0

In [None]:
# Keep exactly the training columns, in the training order.
X_test = X_test[X_train.columns.tolist()]

## Preds

In [None]:
# Class probabilities for every test row (one column per class).
resultados = clf.predict_proba(X=X_test)

In [None]:
# Wrap the probability array in a DataFrame (columns 0 and 1, default index).
df_resultados = pd.DataFrame(data=resultados)

In [None]:
# Pair each TransactionID with the probability in column 1. concat aligns the
# two pieces on their index.
fraud_probability = df_resultados[1]
resulado_final = pd.concat([X_test["TransactionID"], fraud_probability], axis="columns")

In [None]:
# Name the two output columns: the transaction id and its predicted fraud probability.
output_columns = ["TransactionID", "isFraud"]
resulado_final.columns = output_columns

In [None]:
# Preview the first five predictions.
resulado_final.head(5)

Unnamed: 0,TransactionID,isFraud
0,3663549,0.010309
1,3663550,0.041237
2,3663551,0.041237
3,3663552,0.010309
4,3663553,0.041237


In [None]:
# Write the predictions to preds_RF.csv in the working directory, without the index column.
resulado_final.to_csv(path_or_buf="preds_RF.csv", index=False)