# Linear Regression

#### Implement regression and accuracy calculation.

In [1]:
import pandas as pd
import numpy as np
import sys
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
import warnings
warnings.filterwarnings("ignore")

In [2]:
def generate_model(data, y, features):
    """
    Generate linear regression model coefficients by ordinary least squares.

    Solves the normal equations (X^T X) b = X^T y, where X holds the
    'intercept' column followed by the requested feature columns of `data`.

    Params:
        data: a dataframe containing an 'intercept' column and explanatory variables
        y: a series of the response variable (matched to `data` by row position)
        features: sequence of features to include ('intercept' is automatically added)

    Returns:
        An array of computed model coefficients, ordered as ['intercept'] + features

    Raises:
        numpy.linalg.LinAlgError: if X^T X is singular
    """

    # list(...) lets callers pass any sequence (list, tuple, pandas Index)
    columns = ["intercept"] + list(features)
    # DataFrame.as_matrix()/Series.as_matrix() were removed in pandas 1.0;
    # np.asarray works on every pandas version.
    mat_data = np.asarray(data[columns], dtype=float)
    response = np.asarray(y, dtype=float)
    gram = np.matmul(mat_data.T, mat_data)
    moment = np.matmul(mat_data.T, response)
    # Solving the system directly is cheaper and numerically more stable
    # than forming the explicit inverse of X^T X.
    betas = np.linalg.solve(gram, moment)
    return betas

In [3]:
def calc_accuracy(X_test, y, coef, features, R2=False):
    """
    Calculate accuracy metric for a model

    Params:
        X_test: a dataframe containing an 'intercept' column and explanatory variables
        y: a series of the response variable (matched to `X_test` by row position)
        coef: array containing model coefficients, ordered as ['intercept'] + features
        features: sequence of features to include ('intercept' is automatically added)
        R2: boolean of whether to print R^2

    Returns:
        A float representing prediction accuracy (MSE)
    """

    columns = ["intercept"] + list(features)
    # DataFrame.as_matrix()/Series.as_matrix() were removed in pandas 1.0;
    # np.asarray works on every pandas version.
    mat_dat = np.asarray(X_test[columns], dtype=float)
    actual = np.asarray(y, dtype=float)
    n = len(X_test)
    # ravel() keeps predictions 1-D even if coef arrives as a row/column vector,
    # so the residual subtraction cannot broadcast to an (n, n) matrix.
    predictions = np.matmul(mat_dat, np.ravel(coef))
    residuals = actual - predictions
    sse = np.sum(residuals ** 2)
    if R2:
        # Total sum of squares about the mean. (Series.var() defaults to ddof=1,
        # so var * n would overstate SST by a factor of n / (n - 1).)
        sst = np.sum((actual - actual.mean()) ** 2)
        print("R^2 = " + str(1 - sse / sst))
    return sse / n

#### Import and set up dataset.

In [4]:
# Load the dataset (first CSV column becomes the index), then min-max scale
# every column: (value - column min) / (column max - column min).
df = pd.read_csv("data.csv", index_col=0)
col_min = df.min()
col_span = df.max() - col_min
df = (df - col_min) / col_span

In [5]:
# Separate the response ('price_doc') from the explanatory variables.
y = df['price_doc'].copy()
X = df.drop(columns='price_doc')

# Random train/test split (default proportions, unseeded).
X_train, X_test, y_train, y_test = train_test_split(X, y)

# Copies of each split with a constant 'intercept' column for the hand-rolled model.
X_traini = X_train.assign(intercept=1)
X_testi = X_test.assign(intercept=1)

# Feature names: every column of X except the last one.
# NOTE(review): the target was already dropped from X, so this excludes a real
# explanatory column — confirm that omitting the last column is intentional.
feat = list(X.columns[:-1])

In [9]:
# Display the names of the explanatory-variable columns.
X.columns

Index(['full_sq', 'life_sq', 'floor', 'num_room', 'state', 'product_type',
       'raion_popul', 'green_zone_part', 'indust_part', 'children_preschool',
       'children_school', 'healthcare_centers_raion',
       'university_top_20_raion', 'culture_objects_top_25_raion',
       'shopping_centers_raion', 'oil_chemistry_raion', 'radiation_raion',
       'railroad_terminal_raion', 'big_market_raion', 'nuclear_reactor_raion',
       'detention_facility_raion', 'work_all', 'ekder_all', 'park_km',
       'public_transport_station_km', 'big_road1_km', 'fitness_km',
       'big_church_count_5000', 'mosque_count_5000', 'cafe_avg_price_5000',
       'office_count_5000', 'ecology', 'salary', 'cpi', 'usdrub',
       'mortgage_rate', 'unemployment', 'bandwidth_sports',
       'rent_price_2room_eco'],
      dtype='object')

#### Run linear regression.

In [10]:
# Fit the hand-rolled regression on the training split (with intercept column)
# using the feature names in `feat`.
betas = generate_model(X_traini,y_train,feat)

In [11]:
# Evaluate on the test split: prints R^2 (R2=True) and returns the test MSE.
calc_accuracy(X_testi, y_test, betas, feat,R2=True)

R^2 = 0.46160285118697375


0.0011031736542436797

#### Compare to scikit-learn.

In [12]:
# Reference fit with scikit-learn's LinearRegression on the same train/test split.
linreg = LinearRegression()
# NOTE(review): X_train carries every column of X, whereas the hand-rolled model
# was fit on `feat` (all columns but the last) — the two fits may not use
# identical feature sets.
fit = linreg.fit(X_train,y_train)
# Score the fitted model on the test split (the notebook treats this value as R^2).
fit.score(X_test,y_test)

0.4614794639603329

R^2 is very close to scikit-learn's.

#### Optimize feature set.

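Greedy forward selection: starting from an intercept-only model, repeatedly add the feature that most reduces the mean squared prediction error under 5-fold cross-validation, and stop when no remaining feature improves it.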

In [15]:
# Greedy forward feature selection scored by 5-fold cross-validated MSE.
last_err = sys.maxsize    # Lowest cross-validated error found so far
model_err = sys.maxsize   # Cross-validated error of the accepted model
pred_features = []        # Features accepted into the model, in selection order
target_feature = None     # Best candidate feature found so far
best_coefs = None         # Coefficients of the best candidate (from its last fold's fit)
features = feat
X['intercept'] = 1        # generate_model / calc_accuracy select an 'intercept' column

folds = 5
n_rows = len(X)
# Contiguous (unshuffled) fold boundaries, computed once. Integer arithmetic
# avoids float rounding in the boundaries and guarantees the last fold ends at n_rows.
fold_bounds = [(n_rows * i // folds, n_rows * (i + 1) // folds)
               for i in range(folds)]

while len(pred_features) < len(features):
    # Walk the remaining candidates in column order so that ties are broken
    # reproducibly (iterating a set difference has no stable order).
    for feature in [f for f in features if f not in pred_features]:
        try_features = pred_features + [feature]
        err = []
        for lower, upper in fold_bounds:
            test_x = X.iloc[lower:upper]
            test_y = y.iloc[lower:upper]
            train_x = pd.concat((X.iloc[:lower], X.iloc[upper:]))
            train_y = pd.concat((y.iloc[:lower], y.iloc[upper:]))

            coefs = generate_model(train_x, train_y, try_features)
            err.append(calc_accuracy(test_x, test_y, coefs, try_features))
        cur_err = sum(err) / len(err)
        # Record the candidate if it has the lowest error so far. last_err carries
        # over between rounds, so only candidates that beat the accepted model qualify.
        if cur_err < last_err:
            last_err = cur_err
            target_feature = feature
            best_coefs = coefs
    # If the best candidate improves the model's prediction error, accept it
    if last_err < model_err:
        pred_features.append(target_feature)
        model_err = last_err
        print("Chose: " + target_feature + " (" + str(model_err) + ")")
    # Otherwise no remaining feature helps: stop searching
    else:
        break

# Report the final model whether the search stopped early or every feature was
# accepted (previously the summary was skipped when all features were chosen).
print("Found Optimal Model, (%d features of %d)" % (len(pred_features), len(features)))
print(pred_features)
print(best_coefs)
if best_coefs is not None:
    # best_coefs came from the fit that held out the last fold, so evaluate on
    # that fold explicitly rather than on leftover loop variables.
    lower, upper = fold_bounds[-1]
    test_x = X.iloc[lower:upper]
    test_y = y.iloc[lower:upper]
    print(calc_accuracy(test_x, test_y, best_coefs, pred_features, True))

Chose: life_sq (0.0014645458910447437)
Chose: product_type (0.0013254011334959189)
Chose: park_km (0.0012672458829675703)
Chose: bandwidth_sports (0.0012296944083203007)
Chose: office_count_5000 (0.001199473392690629)
Chose: culture_objects_top_25_raion (0.0011726579879252682)
Chose: floor (0.0011512000631567207)
Chose: ekder_all (0.0011395880252318608)
Chose: num_room (0.0011289097240660764)
Chose: big_church_count_5000 (0.0011192005550694438)
Chose: indust_part (0.0011117174383623138)
Chose: mosque_count_5000 (0.0011064358148436839)
Chose: ecology (0.0011006496538824198)
Chose: healthcare_centers_raion (0.0010962179574878724)
Chose: railroad_terminal_raion (0.0010938169271714517)
Chose: fitness_km (0.001091741219474969)
Chose: big_market_raion (0.0010893603683215206)
Chose: big_road1_km (0.0010872450522384104)
Chose: green_zone_part (0.00108614447370202)
Chose: nuclear_reactor_raion (0.001085363248575072)
Chose: work_all (0.001084697519828985)
Chose: cpi (0.0010840996561873972)
Chose