In [32]:
# IMPORT PACKAGES
import pandas as pd
import numpy as np
from sklearn.decomposition import PCA
from sklearn.linear_model import ElasticNet
from sklearn.svm import SVR
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score

In [2]:
# READ CSV DATA
def read_data_csv(filename):
    return pd.read_csv(filename)

In [3]:
# DROP FEATURES THAT WILL NOT BE USED
def drop_useless_columns(df):
    df_ = df.copy()
    return df_.drop(columns=['Id'])

In [4]:
# SPLITS FEATURES BETWEEN NUMERICAL AND CATEGORICAL
def split_numerical_categorical(df):
    df_ = df.copy()
    df_dtypes = df_.dtypes.to_dict()
    
    feature_numbers = []
    feature_objects = []
    
    for i,j in df_dtypes.items():
        if j.char == 'O':
            feature_objects.append(i)
        else:
            feature_numbers.append(i)
            
    return feature_numbers, feature_objects

In [5]:
# NA FILLER FOR CATEGORICAL AND NUMERICAL DATA
def na_filler_function(df):
    df_ = df.copy()

    feature_numbers, feature_objects = split_numerical_categorical(df_)
            
    if len(feature_objects) > 0:
        df_[feature_objects] = df_[feature_objects].fillna('None').copy()
    if len(feature_numbers) > 0:
        df_[feature_numbers] = df_[feature_numbers].fillna(0).copy()
    
    return df_

In [6]:
# ONE INSTANCE OF OHE
class OHE:
    def __init__(self, column_name, drop=False):
        self.column_name = column_name
        self.drop = drop
        self.new_cols = []
        
    def fit(self, X_, y=None):
        # UNIQUE VALUES ON COLUMN
        self.new_cols = X_[self.column_name].unique()
    
    def transform(self, X_, y=None):
        X = X_.copy()

        # CREATE COLUMNS
        for i in self.new_cols:
            j = X[
                X[self.column_name] == i
            ].index
            
            new_label = self.column_name + '_' + str(i)
            X[new_label] = 0
            X.loc[j, new_label] = 1
            
        # DROP ORIGINAL COLUMN
        if self.drop:
            return X.drop(columns=[self.column_name])
        else:
            return X
    
    def fit_transform(self, X_, y=None):
        self.fit(X_)
        return self.transform(X_)

In [7]:
# CONVERTS ALL CATEGORICAL DATA INTO OHE REPRESENTATION
class OHE_Categorical:
    def __init__(self):
        self.ohe = []
    
    def fit(self, df, y=None):
        df_ = df.copy()
        feature_numbers, feature_objects = split_numerical_categorical(df_)

        for i in feature_objects:
            self.ohe.append(OHE(i, drop=True))
            
        for i in range(len(self.ohe)):
            self.ohe[i].fit(df_)
        
    def transform(self, df, y=None):
        df_ = df.copy()
        for i in range(len(self.ohe)):
            df_ = self.ohe[i].transform(df_)

        return df_
    
    def fit_transform(self, df, y=None):
        self.fit(df)
        return self.transform(df)

In [8]:
# SPLIT X AND Y
def split_x_and_y(df):
    y = df['SalePrice']
    X = df.drop(columns='SalePrice')
    
    return X,y

### LOAD DATA

In [9]:
df = read_data_csv('train.csv')

### PRE-PROCESS DATA

In [54]:
df1 = df.copy()
df1 = na_filler_function(df1)
df1['MSSubClass'] = df1['MSSubClass'].astype('object')
ohe = OHE_Categorical()
df1 = ohe.fit_transform(df1)
X, y = split_x_and_y(df1)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
pca = PCA(n_components=30)
x_train = pca.fit_transform(X_train)
x_test = pca.transform(X_test)

In [55]:
elasticnet = ElasticNet()

In [56]:
elasticnet.fit(X_train, y_train)

  model = cd_fast.enet_coordinate_descent(


ElasticNet()

In [57]:
y_pred = elasticnet.predict(X_test)

In [58]:
r2_score(y_pred, y_test)

0.8228698106908036

### TEST PREDICTION

In [59]:
df_test = read_data_csv('test.csv')

In [60]:
df1 = df_test.copy()
df1 = na_filler_function(df1)
df1['MSSubClass'] = df1['MSSubClass'].astype('object')
df1 = ohe.transform(df1)

In [61]:
y_pred = elasticnet.predict(df1)

In [64]:
# CREATE SUBMISSION FILE
id = df1['Id'].to_list()
target = y_pred.tolist()

# CREATE DICTIONARY FOR DATAFRAME
dataframe_dictionary = {
    'id': id,
    'SalePrice': target
}

# CREATE DATAFRAME
submission_df = pd.DataFrame(dataframe_dictionary, index=None)
submission_df.to_csv('submission.csv', index=None)