# Modeling & predictions

### Import libraries

In [4]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn as sk
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, make_scorer
#from sklearn.model_selection import cross_val_score, KFold
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.dummy import DummyRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.preprocessing import StandardScaler
#from sklearn.pipeline import make_pipeline
from sklearn.linear_model import Lasso
#from sklearn.utils import resample
from sklearn.metrics import r2_score

### Load data

In [5]:
# --- Training split -------------------------------------------------------
# Two feature files are read; the "_scaled" one is presumably the same
# features already standardized upstream -- confirm against the
# preprocessing notebook.
prep_X_train = pd.read_csv('data/preprocessed_X_train.csv')
prep_X_train_scaled = pd.read_csv('data/preprocessed_X_train_scaled.csv')
prep_y_train = pd.read_csv('data/preprocessed_y_train.csv')

# --- Test split -----------------------------------------------------------
prep_X_test = pd.read_csv('data/preprocessed_X_test.csv')
prep_X_test_scaled = pd.read_csv('data/preprocessed_X_test_scaled.csv')

# Un-preprocessed test file; further down only its 'Id' column is read.
orig_X_test = pd.read_csv('data/test.csv')

## Compare basic versions of different models

In [6]:
def test_basic_versions_of_models(X, y, test_size=0.2, random_state=42, use_scaler=True):
    """Fit a handful of baseline regressors on one hold-out split and print each R^2.

    The data is split once with ``train_test_split``; every regressor is then
    fitted on the training part and scored with ``r2_score`` on the held-out
    part. Nothing is returned -- one line per model is printed.

    Parameters
    ----------
    X
        Feature table handed to ``train_test_split``.
    y
        Target handed to ``train_test_split``. The training part must expose
        ``.values`` (e.g. a pandas DataFrame or Series), since it is flattened
        to 1-D before fitting.
    test_size
        Forwarded to ``train_test_split``.
    random_state
        Seed for the split and for the estimators that accept one here
        (Random Forest and Lasso).
    use_scaler
        When true, a ``StandardScaler`` is fitted on the training part only
        and applied to both parts.
    """
    X_fit, X_holdout, y_fit, y_holdout = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )

    if use_scaler:
        # Scaling statistics come from the training part alone; the hold-out
        # part is only transformed with them.
        scaler = StandardScaler()
        X_fit = scaler.fit_transform(X_fit)
        X_holdout = scaler.transform(X_holdout)

    # (label, estimator) pairs, evaluated in this order.
    baselines = [
        ('Dummy Model', DummyRegressor(strategy='mean')),
        ('OLS Linear Regression', LinearRegression()),
        ('Random Forest', RandomForestRegressor(random_state=random_state)),
        ('SVR', SVR()),
        ('Lasso', Lasso(random_state=random_state)),
    ]

    # The estimators are fitted on a flat 1-D target.
    y_fit_1d = y_fit.values.ravel()

    for name, regressor in baselines:
        predictions = regressor.fit(X_fit, y_fit_1d).predict(X_holdout)
        r2 = r2_score(y_holdout, predictions)
        print(f'{name}: R^2 Score = {r2:.4f}')

### Without standard scaler

In [7]:
# Baselines on the features as loaded: no StandardScaler inside the helper.
test_basic_versions_of_models(
    X=prep_X_train,
    y=prep_y_train,
    test_size=0.2,
    random_state=42,
    use_scaler=False,
)

Dummy Model: R^2 Score = -0.0000
OLS Linear Regression: R^2 Score = 0.6957
Random Forest: R^2 Score = 0.7105
SVR: R^2 Score = 0.4713
Lasso: R^2 Score = 0.3295


### With standard scaler

In [8]:
# Same split and seed as the unscaled run; the helper now standardizes the
# features itself, so the unscaled frame is passed in on purpose.
test_basic_versions_of_models(
    X=prep_X_train,
    y=prep_y_train,
    test_size=0.2,
    random_state=42,
    use_scaler=True,
)

Dummy Model: R^2 Score = -0.0000
OLS Linear Regression: R^2 Score = 0.6944
Random Forest: R^2 Score = 0.7106
SVR: R^2 Score = 0.7383
Lasso: R^2 Score = 0.2664


## Predict on actual test data

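### SVR basic version

SVR with default hyperparameters had the highest hold-out R² of the scaled baselines above (0.7383), so it is the model used for the final predictions.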

In [9]:
# Default-configured SVR, trained on the scaled training file with the target
# flattened to 1-D.
model = SVR()
model.fit(prep_X_train_scaled, prep_y_train.values.ravel())

predictions = model.predict(prep_X_test_scaled)

# Pair every prediction with an Id from the original test file.
# NOTE(review): this assumes the rows of data/test.csv and of the scaled test
# file are in the same order -- confirm against the preprocessing step.
results = pd.DataFrame({'Id': orig_X_test['Id'], 'target': predictions})

results.head()

Unnamed: 0,Id,target
0,1000001,1.100978
1,1000002,-1.457233
2,1000003,-1.999204
3,1000004,-3.772044
4,1000005,-4.5086


In [10]:
# Write the Id/target table; index=False keeps the row index out of the file.
results.to_csv(path_or_buf='data/results.csv', index=False)