In [1]:
import pandas as pd
import numpy as np
import seaborn as sns

## Mostrando o dataset

In [2]:
# Load the admissions CSV and replace its header with short, identifier-friendly
# column names; the labels are applied positionally, one per column in the file.
admission_df = pd.read_csv('Admission_Predict.csv').set_axis(
    [
        'id',
        'GRE_Score',
        'TOEFL_Score',
        'University_Rating',
        'SOP',
        'LOR',
        'CGPA',
        'Research',
        'Chance_Admit',
    ],
    axis=1,
)
# Preview the first rows to confirm the rename.
admission_df.head()

Unnamed: 0,id,GRE_Score,TOEFL_Score,University_Rating,SOP,LOR,CGPA,Research,Chance_Admit
0,1,337,118,4,4.5,4.5,9.65,1,0.92
1,2,324,107,4,4.0,4.5,8.87,1,0.76
2,3,316,104,3,3.0,3.5,8.0,1,0.72
3,4,322,110,3,3.5,2.5,8.67,1,0.8
4,5,314,103,2,2.0,3.0,8.21,0,0.65


In [3]:
# Remove the 'id' column (it appears to be a sequential row identifier) before
# analysis and modelling.
# Rebinding instead of inplace=True, together with errors='ignore', keeps this
# cell idempotent: re-running it after the column is gone no longer raises KeyError.
admission_df = admission_df.drop(columns='id', errors='ignore')

## Descrevendo o dataset

In [4]:
# Summary statistics (count, mean, std, min, quartiles, max) for each column.
admission_df.describe()

Unnamed: 0,GRE_Score,TOEFL_Score,University_Rating,SOP,LOR,CGPA,Research,Chance_Admit
count,400.0,400.0,400.0,400.0,400.0,400.0,400.0,400.0
mean,316.8075,107.41,3.0875,3.4,3.4525,8.598925,0.5475,0.72435
std,11.473646,6.069514,1.143728,1.006869,0.898478,0.596317,0.498362,0.142609
min,290.0,92.0,1.0,1.0,1.0,6.8,0.0,0.34
25%,308.0,103.0,2.0,2.5,3.0,8.17,0.0,0.64
50%,317.0,107.0,3.0,3.5,3.5,8.61,1.0,0.73
75%,325.0,112.0,4.0,4.0,4.0,9.0625,1.0,0.83
max,340.0,120.0,5.0,5.0,5.0,9.92,1.0,0.97


## Verificando a relação entre eles

In [5]:
# Pairwise correlation matrix between all columns; in the rendered output CGPA
# has the strongest correlation with Chance_Admit.
admission_df.corr()

Unnamed: 0,GRE_Score,TOEFL_Score,University_Rating,SOP,LOR,CGPA,Research,Chance_Admit
GRE_Score,1.0,0.835977,0.668976,0.612831,0.557555,0.83306,0.580391,0.80261
TOEFL_Score,0.835977,1.0,0.69559,0.657981,0.567721,0.828417,0.489858,0.791594
University_Rating,0.668976,0.69559,1.0,0.734523,0.660123,0.746479,0.447783,0.71125
SOP,0.612831,0.657981,0.734523,1.0,0.729593,0.718144,0.444029,0.675732
LOR,0.557555,0.567721,0.660123,0.729593,1.0,0.670211,0.396859,0.669889
CGPA,0.83306,0.828417,0.746479,0.718144,0.670211,1.0,0.521654,0.873289
Research,0.580391,0.489858,0.447783,0.444029,0.396859,0.521654,1.0,0.553202
Chance_Admit,0.80261,0.791594,0.71125,0.675732,0.669889,0.873289,0.553202,1.0


## Função com todos os métodos de regressão

In [6]:
def modelRegress(a,b,c,d):
    """Fit four linear regressors and print how each one scores on held-out data.

    Plain linear regression, Ridge, Lasso and ElasticNet (with the fixed
    hyper-parameters listed below) are trained on the same data, evaluated
    with each estimator's ``score`` method, and the label with the highest
    score is reported.

    Args:
        a: Training features.
        b: Training target.
        c: Test features.
        d: Test target.

    Returns:
        None. The individual scores and the winning model are written with
        ``print``.
    """
    from sklearn.linear_model import ElasticNet, Lasso, LinearRegression, Ridge

    # Label -> estimator; insertion order also decides ties in ``max`` below.
    estimators = {
        'Linear': LinearRegression(),
        'Ridge': Ridge(alpha=10.0),
        'Lasso': Lasso(alpha=1, max_iter=1000, tol=0.2),
        'Elastic': ElasticNet(alpha=1, l1_ratio=0.5, max_iter=1000, tol=0.1),
    }

    # Train every estimator first, then score them all on the test data.
    for estimator in estimators.values():
        estimator.fit(a, b)
    scores = {label: estimator.score(c, d) for label, estimator in estimators.items()}

    winner = max(scores, key=scores.get)
    print('Regressão Linear:', scores['Linear'], 'Regressão Ridge', scores['Ridge'], 'Regressão Lasso', scores['Lasso'], 'Regressão Elastic Net:', scores['Elastic'])
    print('o melhor modelo foi:', winner, 'com o valor:', scores[winner])

## Função para o KFold

In [38]:
def modelKfold(a, b, n_splits=5, shuffle=True, random_state=None):
    """Cross-validate a plain LinearRegression with K-fold and print the scores.

    Args:
        a: Feature matrix passed to ``cross_val_score``.
        b: Target values passed to ``cross_val_score``.
        n_splits: Number of folds the data is divided into (default 5).
        shuffle: Whether rows are shuffled before being split into folds
            (default True).
        random_state: Seed for the shuffle. The default ``None`` gives a
            different shuffle on every call; pass an integer to make the
            printed scores reproducible. Ignored when ``shuffle`` is False.

    Returns:
        None. The per-fold scores and their mean are written with ``print``.
    """
    from sklearn.linear_model import LinearRegression
    from sklearn.model_selection import KFold, cross_val_score

    model = LinearRegression()
    # KFold rejects a seed when shuffling is disabled, so only forward it when
    # it will actually be used.
    kfold = KFold(
        n_splits=n_splits,
        shuffle=shuffle,
        random_state=random_state if shuffle else None,
    )
    result = cross_val_score(model, a, b, cv=kfold)
    print('Resultado dos testes: ', result)
    print('A media do resultado gerou um R2:', result.mean())

## Separando os dados para o treino

In [39]:
from sklearn.model_selection import train_test_split

In [40]:
# Features: every column except the target.
X = admission_df.drop(columns='Chance_Admit')
# Target: the admission chance the models will predict.
y = admission_df.loc[:, 'Chance_Admit']

In [44]:
# Hold out 30% of the rows for testing. A fixed random_state makes the split,
# and every score computed from it, reproducible across kernel restarts;
# without it each run produces a different partition.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=42)

## Usando a função de Regressão

In [42]:
# Fit the four linear models on the training split and print their scores on
# the held-out test split.
modelRegress(X_train,y_train,X_test,y_test)

Regressão Linear: 0.8651715881914968 Regressão Ridge 0.8623653346950542 Regressão Lasso 0.2829670410652184 Regressão Elastic Net: 0.5596175125690105
o melhor modelo foi: Linear com o valor: 0.8651715881914968


## Criando testes de variáveis para o ElasticNet com o RandomizedSearchCV

In [45]:
from sklearn.model_selection import RandomizedSearchCV
from sklearn.linear_model import ElasticNet

In [47]:
# Candidate hyper-parameter values to be tried by the ElasticNet searches.
value = dict(
    alpha=[
        0.1, 0.5, 1, 2, 5, 10, 25, 50, 100, 150,
        200, 300, 500, 750, 1000, 1500, 2000, 3000, 5000,
    ],
    l1_ratio=[0.02, 0.05, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9],
)

In [52]:
# Build the base estimator and the randomized hyper-parameter search around it.
model = ElasticNet()
search = RandomizedSearchCV(
    estimator=model,
    param_distributions=value,
    # Number of parameter combinations to try. 209 equals the 19 alphas times
    # the 11 l1_ratios listed in `value`, i.e. the size of the whole grid.
    n_iter=209,
    cv=5,
)
search.fit(X, y)

RandomizedSearchCV(cv=5, estimator=ElasticNet(), n_iter=209,
                   param_distributions={'alpha': [0.1, 0.5, 1, 2, 5, 10, 25, 50,
                                                  100, 150, 200, 300, 500, 750,
                                                  1000, 1500, 2000, 3000,
                                                  5000],
                                        'l1_ratio': [0.02, 0.05, 0.1, 0.2, 0.3,
                                                     0.4, 0.5, 0.6, 0.7, 0.8,
                                                     0.9]})

In [53]:
# Report the best cross-validated score found by the randomized search and the
# alpha / l1_ratio of the estimator that achieved it.
print('Melhor score:', search.best_score_)
print('Melhor Alpha:', search.best_estimator_.alpha)
print('Melhor l1_ratio:', search.best_estimator_.l1_ratio)

Melhor score: 0.7408292165331447
Melhor Alpha: 0.1
Melhor l1_ratio: 0.02


## Criando testes de variáveis com GridSearchCV

In [54]:
from sklearn.model_selection import GridSearchCV

In [57]:
# Exhaustive search: GridSearchCV evaluates every alpha / l1_ratio combination
# listed in `value`.
modelGrid = ElasticNet()
# Use the estimator created in this cell. The previous code passed `model`, a
# variable left over from the randomized-search cell, which left `modelGrid`
# unused and made this cell depend on hidden kernel state.
searchGrid = GridSearchCV(estimator=modelGrid, param_grid=value, cv=5)
searchGrid.fit(X, y)
print('Melhor score:', searchGrid.best_score_)
print('Melhor Alpha:', searchGrid.best_estimator_.alpha)
print('Melhor l1_ratio:', searchGrid.best_estimator_.l1_ratio)

Melhor score: 0.7408292165331447
Melhor Alpha: 0.1
Melhor l1_ratio: 0.02


## Usando o KFold para obter um resultado mais robusto

In [43]:
# Cross-validate LinearRegression over the full dataset (shuffled 5-fold) and
# print the per-fold scores with their mean.
modelKfold(X,y)

Resultado dos testes:  [0.7739137  0.81378637 0.80877522 0.80982407 0.75634586]
A media do resultado gerou um R2: 0.7925290446868971
