In [1]:
import pandas as pd
import warnings
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error
from math import sqrt
from sklearn.metrics import r2_score
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Lasso
import numpy as np

warnings.filterwarnings('ignore')

In [2]:
df_new = pd.read_csv("train.csv")
df_new.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


In [3]:
def ridge(filename, alpha):
    df = pd.read_csv(filename)
    df = df.drop("Id", axis=1)
    y = df["SalePrice"]
    x = df.drop("SalePrice", axis=1)

    x = x.fillna(x.mean())

    catagorical_data = x.select_dtypes(include=['object']).columns
    le = LabelEncoder()
    for i in catagorical_data:
        x[i] = le.fit_transform(x[i])
    
    scaler = MinMaxScaler()
    x = scaler.fit_transform(x)

    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

    ridge = Ridge(alpha=alpha)
    ridge.fit(x_train, y_train)
    y_pred = ridge.predict(x_test)
    print("RMSE for Ridge Regression: ", sqrt(mean_squared_error(y_test, y_pred)))
    print("R square value for Ridge Regression: ", r2_score(y_test, y_pred))
    
ridge("train.csv", 10)

RMSE for Ridge Regression:  35956.79698707077
R square value for Ridge Regression:  0.8314424687234322


In [4]:
def lasso_regression(filename, alpha):
    df = pd.read_csv(filename)
    df = df.drop("Id", axis=1)
    y = df["SalePrice"]
    x = df.drop("SalePrice", axis=1)

    x = x.fillna(x.mean())

    catagorical_data = x.select_dtypes(include=['object']).columns
    le = LabelEncoder()
    for i in catagorical_data:
        x[i] = le.fit_transform(x[i])
    
    scaler = MinMaxScaler()
    x = scaler.fit_transform(x)

    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

    lasso = Lasso(alpha=alpha)
    lasso.fit(x_train, y_train)
    y_pred = lasso.predict(x_test)
    print("RMSE for Lasso Regression: ", sqrt(mean_squared_error(y_test, y_pred)))
    print("R square value for Lasso Regression: ", r2_score(y_test, y_pred))

lasso_regression("train.csv", 0.1)

RMSE for Lasso Regression:  34615.68869497832
R square value for Lasso Regression:  0.8437816239542865


In [5]:
def mlr(filename):
    df = pd.read_csv(filename)
    df = df.drop("Id", axis=1)
    y = df["SalePrice"]
    x = df.drop("SalePrice", axis=1)

    x = x.fillna(x.mean())

    catagorical_data = x.select_dtypes(include=['object']).columns
    le = LabelEncoder()
    for i in catagorical_data:
        x[i] = le.fit_transform(x[i])
    
    scaler = MinMaxScaler()
    x = scaler.fit_transform(x)

    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

    mlr = LinearRegression()
    mlr.fit(x_train, y_train)
    y_pred = mlr.predict(x_test)
    print("RMSE for MLR: ", sqrt(mean_squared_error(y_test, y_pred)))
    print("R square value for MLR: ", r2_score(y_test, y_pred))

mlr("train.csv")

RMSE for MLR:  35724.83567058214
R square value for MLR:  0.833610220841164


In [6]:
def find_best_alpha_ridge(filename):
    df = pd.read_csv(filename)
    df = df.drop("Id", axis=1)
    y = df["SalePrice"]
    x = df.drop("SalePrice", axis=1)

    x = x.fillna(x.mean())

    catagorical_data = x.select_dtypes(include=['object']).columns
    le = LabelEncoder()
    for i in catagorical_data:
        x[i] = le.fit_transform(x[i])
    
    scaler = MinMaxScaler()
    x = scaler.fit_transform(x)

    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

    alpha = [0.0001, 0.001, 0.01, 0.1, 1, 10, 100, 1000]
    rmse = []
    r2 = []
    for i in alpha:
        ridge = Ridge(alpha=i)
        ridge.fit(x_train, y_train)
        y_pred = ridge.predict(x_test)
        rmse.append(sqrt(mean_squared_error(y_test, y_pred)))
        r2.append(r2_score(y_test, y_pred))
    print("RMSE for Ridge Regression: ", rmse, '\n')
    print("R square value for Ridge Regression: ", r2 ,'\n')
    print("Best alpha value for Ridge Regression: ", alpha[r2.index(max(r2))])

    
find_best_alpha_ridge("train.csv")

RMSE for Ridge Regression:  [34616.535058951406, 34612.77573275271, 34576.991039059845, 34344.96761642196, 34199.51097581455, 35956.79698707077, 44825.15236481994, 68097.06268049889] 

R square value for Ridge Regression:  [0.8437739846885787, 0.8438079148702616, 0.8441307087832884, 0.8462155625835306, 0.8475154094634288, 0.8314424687234322, 0.7380431770103069, 0.39543510661036674] 

Best alpha value for Ridge Regression:  1


In [12]:
def top_10_independent_variables(filename, VarName):
    df = pd.read_csv(filename)
    df = df.drop("Id", axis=1)
    y = df["SalePrice"]
    # take column name from VarName
    x = df[VarName]
    null_value = df.isnull().values.any()
    
    if null_value:
        df = df.dropna()
        #x = x.fillna(x.mean())
    

    # check if x is catagorical or not if yes then convert it into numerical
    # x is single column so we can use x.dtype
    if x.dtype == 'object':
        le = LabelEncoder()
        x = le.fit_transform(x)

    # scale the data
    scaler = MinMaxScaler()
    #x = x.reshape(-1, 1)


    x = scaler.fit_transform(x)
    
    # split the data
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

    # fit the model
    mlr = LinearRegression()
    mlr.fit(x_train, y_train)
    y_pred = mlr.predict(x_test)
    print("RMSE for MLR: ", sqrt(mean_squared_error(y_test, y_pred)))
    print("R square value for MLR: ", r2_score(y_test, y_pred))
    

columns = df_new.columns
for i in range(1,len(columns)-1):
    top_10_independent_variables("train.csv", columns[i])

ValueError: Expected 2D array, got 1D array instead:
array=[60. 20. 60. ... 70. 20. 20.].
Reshape your data either using array.reshape(-1, 1) if your data has a single feature or array.reshape(1, -1) if it contains a single sample.