# 1. Importando librerías

In [1]:
%matplotlib inline
import csv
import os
import warnings
from getpass import getpass

import numpy as np
import pandas as pd
import psycopg2 as pg2

from sklearn.base import clone
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score, roc_curve, roc_auc_score, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import BernoulliNB
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

warnings.filterwarnings('ignore')

# 2. Leyendo archivos

In [20]:
# Training data: load the CSV and cast every column to int in a single pass.
df_train = pd.read_csv('train_cupid.csv').astype(int)

In [21]:
# Keep the training column labels; they are reused further down to name the
# columns of the frame rebuilt from the train_data table.
columns_all = df_train.columns

In [3]:
# Validation data: load the CSV and cast every one of ITS OWN columns to int.
# The cast no longer iterates df_train.columns, so this cell does not depend on
# the training cell having run and no test column is left uncast.
df_test = pd.read_csv('test_cupid.csv').astype(int)

In [4]:
# Persist both integer-cast frames back to the CSV files they were read from
# (the source files are overwritten; the index is not written).
for frame, csv_path in ((df_train, 'train_cupid.csv'), (df_test, 'test_cupid.csv')):
    frame.to_csv(csv_path, index=False)

In [5]:
# Disabled one-off helper: when enabled it prints the CREATE TABLE statement
# pandas derives for df_train (with the index reset) under the name 'train_data'.
# NOTE(review): presumably the origin of the hand-written schema in section 3 - confirm.
#print(pd.io.sql.get_schema(df_train.reset_index(), 'train_data'))

# 3. Creando base de datos e importando tablas

In [6]:
# Connect to the PostgreSQL server.
# Credentials are read from the standard libpq environment variables
# (PGUSER / PGPASSWORD); when no password is set the user is prompted, so no
# secret is ever stored in the notebook.
# NOTE(review): the password that used to be hardcoded here is present in the
# notebook history and should be rotated.
user = os.environ.get('PGUSER', 'postgres')
password = os.environ.get('PGPASSWORD') or getpass(f'PostgreSQL password for {user}: ')

# Keyword arguments avoid hand-building a DSN string, which breaks when the
# password contains spaces or quotes.
conn_db = pg2.connect(user=user, password=password)

In [9]:
# Create the project database (only if missing) and switch the session to it.
DB_NAME = 'apellido_nombre'

# CREATE DATABASE cannot run inside a transaction block, hence autocommit.
conn_db.autocommit = True
cursor = conn_db.cursor()

# Make the cell re-runnable: only create the database when it does not exist.
cursor.execute('SELECT 1 FROM pg_database WHERE datname = %s', (DB_NAME,))
if cursor.fetchone() is None:
    cursor.execute(f'CREATE DATABASE {DB_NAME}')

# The initial connection targets the server's default database; reconnect so
# the tables created below actually live inside DB_NAME.
cursor.close()
conn_db.close()
conn_db = pg2.connect(dbname=DB_NAME, user=user, password=password)
conn_db.autocommit = True
cursor = conn_db.cursor()

In [26]:
# Create the train_data / test_data tables.
# Both tables share the same schema: one INTEGER column per attribute, listed
# here in the order the rows are later inserted.
TABLE_COLUMNS = [
    'age', 'height',
    'virgo', 'taurus', 'scorpio', 'pisces', 'libra', 'leo', 'gemini',
    'aries', 'aquarius', 'cancer', 'sagittarius',
    'asian', 'hispanic_latin', 'black', 'indian', 'pacific_islander',
    'native_american', 'middle_eastern',
    'colorado', 'new_york', 'oregon', 'arizona', 'hawaii', 'montana',
    'wisconsin', 'virginia', 'spain', 'nevada', 'illinois', 'vietnam',
    'ireland', 'louisiana', 'michigan', 'texas', 'united_kingdom',
    'massachusetts', 'north_carolina', 'idaho', 'mississippi', 'new_jersey',
    'florida', 'minnesota', 'georgia', 'utah', 'washington', 'west_virginia',
    'connecticut', 'tennessee', 'rhode_island', 'district_of_columbia',
    'canada', 'missouri', 'germany', 'pennsylvania', 'netherlands',
    'switzerland', 'mexico', 'ohio',
    'agnosticism', 'atheism', 'catholicism', 'buddhism', 'judaism',
    'hinduism', 'islam',
    'pro_dogs', 'pro_cats',
    'spanish', 'chinese', 'french', 'german',
    # the next three columns are the targets separated out before training
    'single', 'seeing_someone', 'available',
    'employed', 'income_between_25_50', 'income_between_50_75', 'income_over_75',
    'drugs_often', 'drugs_sometimes',
    'drinks_not_at_all', 'drinks_often', 'drinks_rarely', 'drinks_socially',
    'drinks_very_often',
    'orientation_gay', 'orientation_straight', 'sex_m',
    'smokes_sometimes', 'smokes_trying_to_quit', 'smokes_when_drinking',
    'smokes_yes',
    'body_type_overweight', 'body_type_regular',
    'education_high_school', 'education_undergrad_university',
]

# Build the column definitions once instead of relying on backslash line
# continuations inside a string literal.
column_defs = ', '.join(f'"{name}" INTEGER' for name in TABLE_COLUMNS)

for table_ in ['train_data', 'test_data']:
    # Drop-and-recreate keeps the cell re-runnable; the rows are reloaded from
    # the CSV files by the ingestion cell below, so nothing is lost.
    cursor.execute(f'DROP TABLE IF EXISTS {table_}')
    cursor.execute(f'CREATE TABLE {table_} ({column_defs});')

In [27]:
# Build the "<table> values (%s, %s, ...)" fragments used by the INSERT
# statements: one %s placeholder per data column (a column literally named
# 'index' is excluded).
value_columns = df_train.columns[df_train.columns != 'index']

# str.join places the separators, so the result no longer depends on which
# column happens to be last.
placeholders = ', '.join(['%s'] * len(value_columns))

train_set = f'train_data values ({placeholders})'
test_set = f'test_data values ({placeholders})'

In [28]:
# Load the training and validation CSV files into their respective tables.
insert_targets = {'train_cupid': train_set, 'test_cupid': test_set}

for set_, insert_string in insert_targets.items():
    # newline='' is what the csv module expects of the files it reads.
    with open(f'./{set_}.csv', 'r', newline='') as file:
        reader = csv.reader(file)
        next(reader, None)  # skip the header row (tolerates an empty file)

        # The statement is the same for every row of a file: build it once.
        insert_sql = f"INSERT INTO {insert_string}"
        for row in reader:
            cursor.execute(insert_sql, row)

In [29]:
# Explicit commit after the bulk load.
# NOTE(review): conn_db.autocommit was set to True earlier, so the INSERTs have
# presumably been committed already and this call changes nothing - confirm.
conn_db.commit()

# 4. Entrenamiento de los modelos

In [45]:
# Training set: read every row of train_data back from the database.
cursor.execute("SELECT * FROM train_data;")
columnas_train = cursor.fetchall()

# Attach the column labels at construction time: an empty result then yields an
# empty, correctly labelled frame instead of a length-mismatch error, and the
# extra list copy is not needed.
# The labels are applied by position, so the table's column order is assumed to
# match columns_all.
X_train = pd.DataFrame(columnas_train, columns=columns_all)

In [46]:
# Pull the three target series out of the frame before they are removed from
# the feature matrix.
y_single_train, y_seeing_train, y_aval_train = (
    X_train[target] for target in ('single', 'seeing_someone', 'available')
)

In [47]:
# Remove the target columns so that X_train holds only the features.
target_columns = ['single', 'seeing_someone', 'available']
X_train = X_train.drop(columns=target_columns)

In [48]:
# Models to train, keyed by a short name.
# Stochastic estimators share one seed so that results are reproducible.
SEED = 42

model_dict = {
    'logistic': LogisticRegression(),
    'dec_tree': DecisionTreeClassifier(max_depth=5, random_state=SEED),
    'rdm_forest': RandomForestClassifier(max_depth=5, random_state=SEED),
    'grad_boost': GradientBoostingClassifier(random_state=SEED),
    # The weak learner is passed positionally: its keyword was renamed from
    # `base_estimator` to `estimator` in newer scikit-learn releases, while the
    # first positional slot works with both.
    'ada_boost': AdaBoostClassifier(
        DecisionTreeClassifier(max_depth=5, random_state=SEED),
        random_state=SEED,
    ),
    'bernoulli': BernoulliNB(),
    'svc': SVC(kernel='rbf')
    }

In [None]:
# Fit every model against every target and keep the fitted estimators in a
# dict keyed 'model_<model name>_<target name>'.
# (Assigning to the result of str.format(), as before, is a SyntaxError.)
targets = {
    'single': y_single_train,
    'seeing_someone': y_seeing_train,
    'available': y_aval_train,
}

trained_models = {}
for name_, model_ in model_dict.items():
    for target_name, var_ in targets.items():
        # clone() yields a fresh, unfitted copy: fitting model_ itself would
        # make the three targets share (and overwrite) one estimator instance.
        trained_models[f'model_{name_}_{target_name}'] = clone(model_).fit(X_train, var_)