In [None]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import skew
%matplotlib inline

In [None]:
# Read the Kaggle house-prices training and test CSV files into DataFrames.
df_train, df_test = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/kaggle/input/house-prices-advanced-regression-techniques/train.csv',
        '/kaggle/input/house-prices-advanced-regression-techniques/test.csv',
    )
)

In [None]:
# Show (rows, columns) of the training frame, then of the test frame.
print(df_train.shape, df_test.shape, sep='\n')

In [None]:
# Print a summary of df_train (columns, non-null counts, dtypes).
df_train.info()

In [None]:
# Histogram of the raw target column to inspect its distribution.
df_train['SalePrice'].hist()

In [None]:
# Put the raw sale price next to its log(1 + x) transform for comparison.
raw_price = df_train['SalePrice']
prices = pd.DataFrame({
    'price': raw_price,
    'log(price + 1)': np.log1p(raw_price),
})

print(prices, '\n')

In [None]:
# Sample skewness of the raw price, then of the log-transformed price.
for column in ('price', 'log(price + 1)'):
    print(skew(prices[column]))

In [None]:
# Set the default figure size (this rcParams change stays in effect for
# later plots too), then draw one histogram per column of `prices`.
plt.rcParams.update({'figure.figsize': (12.0, 6.0)})
prices.hist()

In [None]:
# Store the log(1 + price) target as an extra column on df_train.
# The column label used to be the typo 'SalePrace'; it now has a descriptive
# name. It must NOT overwrite 'SalePrice': that column is passed through
# np.log1p again when the target `y` is built, so overwriting it here would
# log-transform the target twice.
df_train['LogSalePrice'] = np.log1p(df_train['SalePrice'])

In [None]:
# Stack the train and test feature columns (MSSubClass through SaleCondition)
# into a single frame so the following transforms are applied to both.
feature_cols = slice('MSSubClass', 'SaleCondition')
all_data = pd.concat(
    (df_train.loc[:, feature_cols], df_test.loc[:, feature_cols])
)

print(all_data.shape)
print(all_data)

In [None]:
# Labels of the columns whose dtype is anything other than "object".
is_not_object = all_data.dtypes != "object"
numeric_feats = all_data.dtypes[is_not_object].index
print(numeric_feats)

In [None]:
# Skewness of each numeric feature, measured on the training rows only and
# with missing values dropped before the computation.
def _column_skew(column):
    return skew(column.dropna())

skewed_feats = df_train[numeric_feats].apply(_column_skew)
print(skewed_feats)

In [None]:
# Keep only the features whose skewness exceeds 0.75.
skewed_feats = skewed_feats.loc[skewed_feats.gt(0.75)]
print(skewed_feats)

In [None]:
# Keep only the index of skewed_feats (the feature names); the skewness
# values themselves are discarded.
skewed_feats = skewed_feats.index

In [None]:
# Apply log(1 + x) to the skewed columns of the combined train+test frame.
# NOTE: all_data is overwritten in place, so re-running this cell applies the
# transform a second time.
all_data[skewed_feats] = np.log1p(all_data[skewed_feats])
all_data[skewed_feats]

In [None]:
# Replace categorical columns with indicator (dummy) columns.
all_data = pd.get_dummies(all_data)

In [None]:
# Fill missing values with per-column means computed from the training rows
# only (the first df_train.shape[0] rows of all_data).
n_train = df_train.shape[0]
train_means = all_data[:n_train].mean()
all_data = all_data.fillna(train_means)

In [None]:
# Split the combined frame back into its train and test rows.
n_train_rows = len(df_train)
X_train, X_test = all_data[:n_train_rows], all_data[n_train_rows:]
# Regression target: log(1 + SalePrice).
y = np.log1p(df_train['SalePrice'])

In [None]:
from sklearn.model_selection import cross_val_score

def rmse_cv(model, cv=5):
    """
    Compute the cross-validated root mean squared error (RMSE) of a model.

    The model is scored on the notebook-level training data ``X_train`` / ``y``.

    Parameters:
        model (obj): Estimator to evaluate.
        cv (int): Number of cross-validation folds (or any other value
            accepted by ``cross_val_score``'s ``cv`` argument). Defaults to 5.

    Returns:
        (numpy.ndarray): One RMSE value per cross-validation fold.
    """
    # scikit-learn reports the *negated* mean squared error so that larger is
    # better; flip the sign back before taking the square root.
    neg_mse = cross_val_score(
        model, X_train, y,
        scoring="neg_mean_squared_error",
        cv=cv,
    )
    return np.sqrt(-neg_mse)

In [None]:
# Train with ridge regression
from sklearn.linear_model import Ridge

# Create a ridge regression model with default settings
model_ridge = Ridge()

# Ten distinct L2 regularization strengths to compare.
# The list used to contain 0.1 twice, which cross-validated the same model
# twice and produced a duplicate index label in the score Series; the
# duplicate is replaced with 0.3 to fill the gap between 0.1 and 1.
alphas = [0.05,0.1,0.3,1,5,10,15,30,50,75]
# For each alpha, run cross-validation and keep the mean RMSE over the folds
cv_ridge = [rmse_cv(Ridge(alpha = alpha)).mean()
            for alpha in alphas]

In [None]:
# Index the mean cross-validation scores by their alpha value.
cv_ridge = pd.Series(data=cv_ridge, index=alphas)
# Print the score for each alpha
print(cv_ridge, '\n')
# Print the mean of the scores across all alphas
print(cv_ridge.mean(), '\n')

In [None]:
# Plot the cross-validated score against the regularization strength
plt.figure(figsize = (10,5))
plt.plot(cv_ridge)
plt.grid()
plt.title('Validation - by regularization strength')
plt.xlabel('Alpha')
plt.ylabel('RMSE')
# plt.show has to be *called*; the bare attribute reference did nothing
# except echo the function object as the cell output.
plt.show()

In [None]:
# Train with lasso regression
from sklearn.linear_model import LassoCV

# Candidate regularization strengths handed to LassoCV.
lasso_alphas = [1,0.1,0.001,0.0005]
model_lasso = LassoCV(alphas = lasso_alphas)
model_lasso.fit(X_train,y)

# Per-fold RMSE of the lasso model
print(rmse_cv(model_lasso))

In [None]:
# Run the (expensive) cross-validation once and reuse the fold scores for
# both statistics instead of re-running it for each print.
lasso_rmse = rmse_cv(model_lasso)
print(lasso_rmse.mean())
print(lasso_rmse.min())
# Regularization strength stored on the fitted LassoCV model
print(model_lasso.alpha_)

In [None]:
# Train with gradient boosting
import xgboost as xgb

dtrain = xgb.DMatrix(X_train, label=y)

# Booster settings: tree depth 3, learning rate (eta) 0.1
params = {"max_depth":3,"eta":0.1}

# Cross-validate for up to 1000 boosting rounds with an early-stopping
# window of 50 rounds.
cross_val = xgb.cv(
    params=params,
    dtrain=dtrain,
    num_boost_round=1000,
    early_stopping_rounds=50,
)
cross_val

In [None]:
# Plot mean train and test RMSE per boosting round, starting at index label 10.
plt.figure(figsize=(8, 6))
rmse_curves = cross_val.loc[10:, ["train-rmse-mean", "test-rmse-mean"]]
plt.plot(rmse_curves)
plt.grid()
plt.xlabel('num_boost_round')
plt.ylabel('RMSE')
plt.show()

In [None]:
# Fit the gradient-boosting regressor on the full training set.
# NOTE(review): n_estimators=236 is hard-coded — presumably read off the
# early-stopped xgb.cv result; confirm it still matches after any data change.
model_xgb = xgb.XGBRegressor(n_estimators=236, max_depth=3, learning_rate=0.1)
model_xgb.fit(X_train, y)

# Mean cross-validated RMSE of the XGBoost model
print(rmse_cv(model_xgb).mean())

In [None]:
# Predict on the test rows, then undo the log(1 + x) target transform
# with expm1.
lasso_log_preds = model_lasso.predict(X_test)
xgb_log_preds = model_xgb.predict(X_test)
lasso_preds, xgb_preds = np.expm1(lasso_log_preds), np.expm1(xgb_log_preds)

In [None]:
# Blend the two models: 70% lasso, 30% XGBoost.
preds = 0.7 * lasso_preds + 0.3 * xgb_preds
# NOTE(review): the id column is written as lowercase "id" and the file is
# named "ridge_sol.csv" although the blend is lasso + XGBoost — confirm both
# are what the submission target expects.
solution = pd.DataFrame({"id": df_test.Id, "SalePrice": preds})
solution.to_csv("ridge_sol.csv", index=False)

print()