In [1]:
import utils as Util

In [3]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

# データを読み込む（例としてdfというDataFrameを仮定）
train = pd.read_csv("./train.csv")

train["horsepower"] = Util.transform_missing_value(train["horsepower"], "?")

# 特徴量とターゲット変数を分ける
X = train[["cylinders","displacement","horsepower","weight","acceleration","model year","origin"]]
y = train["mpg"]

# 訓練データとテストデータに分割
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# 線形回帰モデルを作成して訓練
model = LinearRegression()
model.fit(X_train, y_train)

# テストデータで予測
y_pred = model.predict(X_test)

# 二乗平均平方根誤差を評価
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print("Mean Squared Error:", rmse)


Mean Squared Error: 5.110136995294997


In [22]:
from sklearn.linear_model import Ridge
from sklearn.model_selection import GridSearchCV
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

train = pd.read_csv("./train.csv")
train["horsepower"] = Util.transform_missing_value(train["horsepower"], "?")

# 特徴量とターゲット変数を分ける
X = train[["cylinders","displacement","horsepower","weight","acceleration","model year","origin"]]
y = train["mpg"]

# Ridge回帰モデルを作成
ridge_model = Ridge()

# 調整したいハイパーパラメータの範囲を指定
param_grid = {
    "alpha": [0.01, 0.1, 1.0, 10.0]  # alphaはRidge回帰の正則化項の強さを制御するハイパーパラメータ
}

# グリッドサーチを使用して最適なハイパーパラメータを探索
grid_search = GridSearchCV(ridge_model, param_grid, cv=5)
grid_search.fit(X, y)

# 最適なハイパーパラメータを表示
print("Best alpha:", grid_search.best_params_["alpha"])

# 最適なハイパーパラメータを使ってモデルを再構築
best_model = grid_search.best_estimator_

# 訓練データとテストデータに分割
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.9, random_state=42)
best_model.fit(X_train, y_train)

# テストデータで予測
y_pred = model.predict(X_test)

# 二乗平均平方根誤差を評価
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print("Mean Squared Error:", rmse)

Best alpha: 10.0
Mean Squared Error: 4.311192703702179


In [7]:
from sklearn.linear_model import Ridge
from sklearn.model_selection import GridSearchCV
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

train = pd.read_csv("./train.csv")
train["horsepower"] = Util.transform_missing_value(train["horsepower"], "?")

# 特徴量とターゲット変数を分ける
X = train[["cylinders","displacement","horsepower","weight"]]
y = train["mpg"]

# Ridge回帰モデルを作成
ridge_model = Ridge()

# 調整したいハイパーパラメータの範囲を指定
param_grid = {
    "alpha": [0.01, 0.1, 1.0, 10.0]  # alphaはRidge回帰の正則化項の強さを制御するハイパーパラメータ
}

# グリッドサーチを使用して最適なハイパーパラメータを探索
grid_search = GridSearchCV(ridge_model, param_grid, cv=5)
grid_search.fit(X, y)

# 最適なハイパーパラメータを表示
print("Best alpha:", grid_search.best_params_["alpha"])

# 最適なハイパーパラメータを使ってモデルを再構築
best_model = grid_search.best_estimator_

# 訓練データとテストデータに分割
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=42)
best_model.fit(X_train, y_train)

# テストデータで予測
y_pred = best_model.predict(X_test)

# 二乗平均平方根誤差を評価
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print("Mean Squared Error:", rmse)

Best alpha: 10.0
Mean Squared Error: 4.80807260914431


In [4]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.neural_network import MLPRegressor
from sklearn.metrics import mean_squared_error

train = pd.read_csv("./train.csv")
train["horsepower"] = Util.transform_missing_value(train["horsepower"], "?")

# 特徴量とターゲット変数を分ける
X = train[["cylinders","displacement","horsepower","weight","acceleration","model year","origin"]]
y = train["mpg"]

# 訓練データとテストデータに分割
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=42)

# モデルをリストで定義
models = [
    LinearRegression(), # 線形回帰
    DecisionTreeRegressor(random_state=42), # 決定木
    RandomForestRegressor(random_state=42), # ランダムフォレスト
    SVR(), # サポートベクターマシン（SVM）
    MLPRegressor(random_state=42) # ニューラルネットワーク
]

# モデルごとに繰り返し処理
for model in models:
    model_name = model.__class__.__name__
    print(f"Training {model_name}...")
    model.fit(X_train, y_train)

    # テストデータで予測
    y_pred = model.predict(X_test)

    # 平均二乗誤差を評価
    mse = mean_squared_error(y_test, y_pred)
    print(f"{model_name} - Mean Squared Error: {mse}")
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    print("Root Mean Squared Error:", rmse)


Training LinearRegression...
LinearRegression - Mean Squared Error: 20.78220295708337
Root Mean Squared Error: 4.55875015295677
Training DecisionTreeRegressor...
DecisionTreeRegressor - Mean Squared Error: 18.606381470018444
Root Mean Squared Error: 4.313511501088
Training RandomForestRegressor...
RandomForestRegressor - Mean Squared Error: 12.471978301199021
Root Mean Squared Error: 3.5315688158662604
Training SVR...
SVR - Mean Squared Error: 43.973197305980875
Root Mean Squared Error: 6.631228943867107
Training MLPRegressor...
MLPRegressor - Mean Squared Error: 754.2665598445003
Root Mean Squared Error: 27.463913775070374


In [3]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

train = pd.read_csv("./train.csv")
train["horsepower"] = Util.transform_missing_value(train["horsepower"], "?")

# 特徴量とターゲット変数を分ける
X = train[["cylinders","displacement","horsepower","weight","acceleration","model year","origin"]]
y = train["mpg"]

# ランダムフォレスト回帰モデルを作成
random_model = RandomForestRegressor()

# 調整したいハイパーパラメータの範囲を指定
param_grid = {
    'n_estimators'  : [4, 8, 16, 32, 64, 100, 200],
    'max_depth' : [8, 16, 32, 64, 100, 150],
    'n_jobs': [-1],
    'random_state': [None, 10, 21, 42, 84]
}

# グリッドサーチを使用して最適なハイパーパラメータを探索
grid_search = GridSearchCV(random_model, param_grid, cv=5)
grid_search.fit(X, y)

# 最適なハイパーパラメータを表示
print("Best paramater:", grid_search.best_params_)

# 最適なハイパーパラメータを使ってモデルを再構築
best_model = grid_search.best_estimator_

# 訓練データとテストデータに分割
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=42)
best_model.fit(X_train, y_train)

# テストデータで予測
y_pred = best_model.predict(X_test)

# 二乗平均平方根誤差を評価
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print("Mean Squared Error:", rmse)

Best paramater: {'max_depth': 8, 'n_estimators': 64, 'n_jobs': -1, 'random_state': None}
Mean Squared Error: 3.1908399527203355


In [None]:
Best paramater: {'max_depth': 150, 'n_estimators': 64, 'n_jobs': -1}
Mean Squared Error: 3.4392213424153613